# Generate data

This notebook generates and collates all the data from the various models and fields and puts it into a single accessible form.

---

# 1. How do the fields themselves change when going from V15 to V20?

In [1]:
import xarray as xr
import pandas as pd
def calculate_delta_field(filename,parameter,root='/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/'):
    """
    Determine the change in a parameter when going from V15 to V20.

    The same GRIB file is opened from the `climate.v015` and `climate.v020`
    directories under `root`, restricted to the GRIB message whose shortName
    is `parameter`. The V20 - V15 difference is returned as a flat DataFrame.

    Parameters
    ----------
    filename : str
        Name of the GRIB file inside `climate.<version>/climate.<version>/639l_2/`.
    parameter : str
        GRIB shortName to read. The sea-ice shortName '10si' is stored under
        the variable name 'si10', which is the name used for the output columns.
    root : str, optional
        Directory holding the `climate.v015` / `climate.v020` trees. Defaults
        to the original hard-coded location; a missing trailing '/' is tolerated.

    Returns
    -------
    pandas.DataFrame
        One row per grid point with the coordinate columns plus
        `change_in_<p>` (V20 - V15), `V15_<p>`, `V20_<p>` and
        `percentage_change_in_<p>`. Despite its name the last column is the
        fractional change (V20 - V15) / V15, not multiplied by 100; it is
        NaN or infinite wherever the V15 value is zero.
    """

    versions = ('v015', 'v020')

    # Accept roots with or without a trailing separator (an empty root stays relative)
    prefix = root if (not root or root.endswith('/')) else root + '/'

    datasets = [] #Opened datasets, in the order of `versions`
    try:
        for v in versions:
            path = f'{prefix}climate.{v}/climate.{v}/639l_2/{filename}'
            #only want this one parameter
            datasets.append(xr.open_dataset(path,engine='cfgrib',backend_kwargs={'indexpath': ''},filter_by_keys={'shortName':parameter}))

        v15_ds, v20_ds = datasets

        #Take the difference, and make it a nice pandas df
        delta_field = v20_ds - v15_ds #V20 - V15
        # Wrap longitudes from [0, 360) into [-180, 180)
        delta_field = delta_field.assign_coords({"longitude": (((delta_field.longitude + 180) % 360) - 180)})
        delta_field = delta_field.to_dataframe().reset_index()

        V20_df = v20_ds.to_dataframe().reset_index()
        V15_df = v15_ds.to_dataframe().reset_index()
    finally:
        # Everything needed is now held in DataFrames, so release the file handles
        for dataset in datasets:
            dataset.close()

    # The sea-ice shortName is the reverse of the variable name in the dataset
    column = 'si10' if parameter == '10si' else parameter

    # All three frames come from reset_index() on the same grid, so rows line up by position
    delta_field[f'V15_{column}'] = V15_df[column]
    delta_field[f'V20_{column}'] = V20_df[column]
    delta_field = delta_field.rename(columns={f'{column}': f'change_in_{column}'})
    delta_field[f'percentage_change_in_{column}'] = (delta_field[f'V20_{column}'] - delta_field[f'V15_{column}']) / delta_field[f'V15_{column}']

    return delta_field


In [2]:
# Build one delta table per climate field, i.e. the change in that field V15 --> V20.
# `filename` is the GRIB file to read and `parameter` the GRIB shortName inside it.
clake_delta_field  = calculate_delta_field(filename='clake',   parameter='cl')    # lake cover
seaice_delta_field = calculate_delta_field(filename='cicecap', parameter='10si')  # sea ice
cvh_delta_field    = calculate_delta_field(filename='cvh',     parameter='cvh')   # cvh
cvl_delta_field    = calculate_delta_field(filename='cvl',     parameter='cvl')   # cvl
lsm_delta_field    = calculate_delta_field(filename='lsmoro',  parameter='lsm')   # lsm
dl_delta_field     = calculate_delta_field(filename='lakedl',  parameter='dl')    # lake depth

In [3]:
# Put the per-field tables side by side, then keep only the first occurrence of
# each column name (the coordinate columns are repeated once per field by the concat).
change_in_fields = (
    pd.concat(
        [clake_delta_field, seaice_delta_field, cvh_delta_field,
         cvl_delta_field, lsm_delta_field, dl_delta_field],
        axis=1,
    )
    .pipe(lambda combined: combined.loc[:, ~combined.columns.duplicated()])
)

In [4]:
    
def classify(x,tol):
    """Return True when the magnitude of `x` is strictly greater than `tol`, else False."""
    return bool(abs(x) > tol)
    




# If the magnitude of the change in a variable is greater than `tolerance`
# (e.g. abs(change_in_cl) > tolerance), the change is marked as `significant`.
# These booleans are later combined into a grouping bitstring category.
# For lake depth dl, `tolerance` is applied to the fractional change,
# e.g. is the change in dl > 10% of its V15 value?
tolerance = 0.1

# Classify every grid point according to the delta fields.
# The comparisons are done on whole columns rather than row by row with
# DataFrame.apply, which gives the same booleans far more cheaply on a
# table with one row per grid point.
# Columns are cast to float64 so the ratio and the threshold comparison are
# carried out in double precision regardless of the stored dtype.
for parameter in ['cl','si10','cvh','cvl','lsm']:
    change_in_fields[f'{parameter}_change_is_significant'] = change_in_fields[f'change_in_{parameter}'].astype(float).abs() > tolerance

# Handle dl separately since it is not a fractional 0-1 feature: compare the
# change relative to the V15 depth. A zero V15 depth gives inf (flagged as
# significant) or NaN (not flagged) rather than raising.
change_in_fields['dl_change_is_significant'] = (change_in_fields['change_in_dl'].astype(float) / change_in_fields['V15_dl'].astype(float)).abs() > tolerance
    

In [5]:
        
def bit(x):
    """Map a truthy value to 1 and a falsy value to 0."""
    return 1 if x else 0


def bitstring(list_of_columns):
    """Concatenate the 0/1 digit of every flag in `list_of_columns`, in order, into one string."""
    return ''.join(str(bit(flag)) for flag in list_of_columns)



# Collapse the six significance booleans of each grid point into a single
# group label. The flags are concatenated in ALPHABETICAL order of field name.
change_in_fields['bitstring'] = change_in_fields.apply(
    lambda row: bitstring([row[flag] for flag in ('cl_change_is_significant',
                                                  'cvh_change_is_significant',
                                                  'cvl_change_is_significant',
                                                  'dl_change_is_significant',
                                                  'lsm_change_is_significant',
                                                  'si10_change_is_significant')]),
    axis=1,
)

Let's have a look at what the `change_in_fields` DataFrame looks like:

In [6]:
# Show a preview of the classified grid points, then cache the table on disk
# so that later sections can reload it without recomputing the delta fields.
display(change_in_fields)
change_in_fields.to_pickle(path='tmp_data/change_in_fields.pkl')

Unnamed: 0,values,step,surface,latitude,longitude,change_in_cl,V15_cl,V20_cl,percentage_change_in_cl,heightAboveGround,...,V15_dl,V20_dl,percentage_change_in_dl,cl_change_is_significant,si10_change_is_significant,cvh_change_is_significant,cvl_change_is_significant,lsm_change_is_significant,dl_change_is_significant,bitstring
0,0,0 days,0.0,89.784877,0.0,0.0,0.0,0.0,,10.0,...,4192.697266,4174.519531,-0.004336,False,False,False,False,False,False,000000
1,1,0 days,0.0,89.784877,20.0,0.0,0.0,0.0,,10.0,...,4153.591797,4187.108398,0.008069,False,False,False,False,False,False,000000
2,2,0 days,0.0,89.784877,40.0,0.0,0.0,0.0,,10.0,...,4152.509766,4205.511719,0.012764,False,False,False,False,False,False,000000
3,3,0 days,0.0,89.784877,60.0,0.0,0.0,0.0,,10.0,...,4158.060547,4217.100586,0.014199,False,False,False,False,False,False,000000
4,4,0 days,0.0,89.784877,80.0,0.0,0.0,0.0,,10.0,...,4157.796875,4219.924805,0.014943,False,False,False,False,False,False,000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542075,542075,0 days,0.0,-89.784877,-100.0,0.0,0.0,0.0,,10.0,...,25.000000,10.000000,-0.600000,False,False,False,False,False,True,000100
542076,542076,0 days,0.0,-89.784877,-80.0,0.0,0.0,0.0,,10.0,...,25.000000,10.000000,-0.600000,False,False,False,False,False,True,000100
542077,542077,0 days,0.0,-89.784877,-60.0,0.0,0.0,0.0,,10.0,...,25.000000,10.000000,-0.600000,False,False,False,False,False,True,000100
542078,542078,0 days,0.0,-89.784877,-40.0,0.0,0.0,0.0,,10.0,...,25.000000,10.000000,-0.600000,False,False,False,False,False,True,000100


This has 542080 rows. This is the number of grid points we expect given the reduced Gaussian grid, and every one of these grid points is classified according to a bitstring computed with a tolerance of 0.1.

---

# 1a How do the fields change? Ms Classification


An alternative classification of the grid points, provided by Margerita, which is also based on orography.



In [4]:
import pandas as pd
# Read the externally prepared per-grid-point table and note which of its
# columns hold V20-vs-V15 changes (their names contain 'changeAbs').
df = pd.read_csv('tmp_data/all_yearly_points_with_oro_MC.csv')
categorical_columns = df.columns[df.columns.str.contains('changeAbs')]

In [21]:
# Field values in each version of the climate files
V15_columns = [f'V15_{field}' for field in ('clFr', 'cvhFr', 'cvlFr', 'dl', 'oceanFr', 'si10Fr', 'z')]
V20_columns = [f'V20_{field}' for field in ('clFr', 'cvhFr', 'cvlFr', 'dl', 'oceanFr', 'si10Fr', 'z')]

# Columns holding the V20-vs-V15 change, split by how significance is judged:
# fractional (0-1) quantities are compared with the tolerance directly ...
fractional_categorisation_columns = [f'changeAbs_V20V15_{field}' for field in ('clFr', 'cvhFr', 'cvlFr', 'oceanFr', 'si10Fr')]
# ... whereas these quantities are not fractions
quantity_categorisation_columns = [f'changeAbs_V20V15_{field}' for field in ('dl', 'z')]

In [22]:
# Keep just the coordinates, the V15/V20 field values and the change columns.
# The copy makes the selection independent of `df`, so columns can be added to it.
change_in_fields = df.loc[:, ['latitude_ERA', 'longitude_ERA',
                              *V15_columns,
                              *V20_columns,
                              *fractional_categorisation_columns,
                              *quantity_categorisation_columns]].copy()

In [23]:
# Classify every grid point according to the delta fields.
# First create extra boolean columns for whether the change in each field is
# significant, where significance is determined by `tolerance`.
# The comparisons are done on whole columns rather than row by row with
# DataFrame.apply, which gives the same booleans at a fraction of the cost.
for parameter in fractional_categorisation_columns:
    p = parameter.split('_')[-1]  # field name, e.g. 'clFr'
    change_in_fields[f'{p}_change_is_significant'] = change_in_fields[parameter].abs() > tolerance

# Handle dl and z separately since these are not fractional 0-1 features:
# the change is judged relative to the V15 value. A zero V15 value gives inf
# (flagged as significant) or NaN (not flagged).
for parameter in ['changeAbs_V20V15_dl']:
    p = parameter.split('_')[-1]
    change_in_fields[f'{p}_change_is_significant'] = (change_in_fields[parameter] / change_in_fields[f'V15_{p}']).abs() > tolerance

# z uses a threshold ten times tighter than the other fields
for parameter in ['changeAbs_V20V15_z']:
    p = parameter.split('_')[-1]
    change_in_fields[f'{p}_change_is_significant'] = (change_in_fields[parameter] / change_in_fields[f'V15_{p}']).abs() > tolerance/10.0
    
    
    
# Collapse the seven significance booleans of each grid point into a single
# group label. The flags are concatenated in ALPHABETICAL order of field name.
change_in_fields['bitstring'] = change_in_fields.apply(
    lambda row: bitstring([row[flag] for flag in ('clFr_change_is_significant',
                                                  'cvhFr_change_is_significant',
                                                  'cvlFr_change_is_significant',
                                                  'dl_change_is_significant',
                                                  'oceanFr_change_is_significant',
                                                  'si10Fr_change_is_significant',
                                                  'z_change_is_significant')]),
    axis=1,
)
    

In [24]:
# Show a preview of the alternative classification, then cache it on disk
# under its own file name so it does not overwrite the first classification.
display(change_in_fields)
change_in_fields.to_pickle(path='tmp_data/change_in_fields_Marg.pkl')

Unnamed: 0,latitude_ERA,longitude_ERA,V15_clFr,V15_cvhFr,V15_cvlFr,V15_dl,V15_oceanFr,V15_si10Fr,V15_z,V20_clFr,...,changeAbs_V20V15_dl,changeAbs_V20V15_z,clFr_change_is_significant,cvhFr_change_is_significant,cvlFr_change_is_significant,oceanFr_change_is_significant,si10Fr_change_is_significant,dl_change_is_significant,z_change_is_significant,bitstring
0,-54.941427,-66.56250,0.147125,0.007210,0.731774,5.005859,0.0,0.000000,1952.714722,0.004767,...,15.304688,11.432007,True,False,False,True,False,True,False,1001100
1,-54.660397,-70.31250,0.182325,0.193664,0.301594,6.428711,0.0,0.250059,5086.468750,0.048344,...,9.113281,16.056641,True,True,True,False,True,True,False,1111010
2,-54.660397,-69.84375,0.092898,0.067518,0.178301,6.771484,0.0,0.637169,7635.152344,0.011764,...,18.868164,78.724609,False,False,True,False,True,True,True,0011011
3,-54.660397,-69.37500,0.136888,0.053549,0.190927,100.000000,0.0,0.514738,7530.796875,0.003248,...,51.657227,199.744141,True,False,True,False,True,True,True,1011011
4,-54.660397,-68.90625,0.039358,0.159833,0.631237,100.000000,0.0,0.137348,6271.117188,0.011331,...,16.914062,59.607422,False,False,True,False,True,True,False,0011010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149805,64.496451,169.37500,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,2940.859375,0.000030,...,15.000000,0.045166,False,False,False,False,False,True,False,0001000
149806,65.058510,95.00000,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,5395.695312,0.000030,...,15.000000,0.095703,False,False,False,False,False,True,False,0001000
149807,65.620570,98.12500,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,5872.917969,0.000030,...,15.000000,0.357422,False,False,False,False,False,True,False,0001000
149808,69.554988,-159.00000,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,1342.546753,0.000030,...,15.000000,0.228882,False,False,False,False,False,True,False,0001000


Note the difference in the number of rows from the previous method



# 2. Load the model predictions

In [24]:
    
def load_predictions(model):

    """
    Load the predictions of one trained model and derive its prediction errors.

    `model` is the model directory as a string ending in a path separator;
    the file `predictions.parquet` is read from it.

    Two columns are added (the spelling `predicion_` is kept because later
    cells read these exact names):
      * `predicion_bias`  : MODIS_LST - predictions (signed)
      * `predicion_error` : absolute value of the bias

    Summary statistics of both columns are printed.

    Returns the full table and a copy averaged over every
    (latitude_ERA, longitude_ERA) grid point.
    """

    df = pd.read_parquet(model+'predictions.parquet')

    # Signed residual, computed once and reused for both derived columns
    residual = df.MODIS_LST - df.predictions
    df['predicion_bias'] = residual
    df['predicion_error'] = residual.abs()

    bias = df['predicion_bias']
    error = df['predicion_error']
    print("Mean/Median/Std prediction bias:", bias.mean(), bias.median(), bias.std())
    print("Mean/Median/Std prediction error:", error.mean(), error.median(), error.std())

    # Collapse all rows of a grid point into their mean
    grid_point_keys = ['latitude_ERA', 'longitude_ERA']
    df_grouped = df.groupby(grid_point_keys, as_index=False).mean()

    return df,df_grouped



In [25]:
# Earlier model directories, kept for reference:
# V15_model =  '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V15_2016_augmented/'
# V20_model =  '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V20_2016_augmented/'
# V20X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V20_2016_augmented_monthlyclake/'

# Current model directories (new files, version 2). Each must end with '/'
# because the predictions file name is appended directly to it.
V15_model =  '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V15_2016_2_V2/'
V20_model =  '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V20_2016_2_V2/'
V20X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V20_2016_X2_V3/'
# Alternative V20X run:
#V20X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V20_2016_X2_V4/'

# For every model keep both the full prediction table and the table
# averaged over the year at each grid point.
print('----------V15----------')
V15_predictions, V15_predictions_averaged = load_predictions(model=V15_model)
print('----------V20----------')
V20_predictions, V20_predictions_averaged = load_predictions(model=V20_model)
print('----------V20X----------')
V20X_predictions, V20X_predictions_averaged = load_predictions(model=V20X_model)

----------V15----------
Mean/Median/Std prediction bias: 0.06721186 0.20108032 4.0190535
Mean/Median/Std prediction error: 2.9916244 2.283081 2.6846745
----------V20----------
Mean/Median/Std prediction bias: 0.05126155 0.20895386 3.9501467
Mean/Median/Std prediction error: 2.9394135 2.2425537 2.639345
----------V20X----------
Mean/Median/Std prediction bias: 0.106212065 0.25463867 3.915779
Mean/Median/Std prediction error: 2.9127858 2.2189636 2.6192172


In [40]:
# Any bonus models. Only one V15X directory is active at a time; the
# alternatives are kept as comments.
#V15X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V15_2016_X/'
V15X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V15_2016_X_V5/' # this one used for manuscript
#V15X_model = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/V15_2016_X_V6/' # this is an extra one

# Full table plus the table averaged over the year at each grid point
V15X_predictions, V15X_predictions_averaged = load_predictions(model=V15X_model)


Mean/Median/Std prediction bias: -0.0022098676 0.12011719 4.0012007
Mean/Median/Std prediction error: 2.9743292 2.2615356 2.6763718


In [41]:
# Save all these dfs.

# Full prediction tables, one file per model
V15_predictions.to_pickle(path='tmp_data/V15_predictions.pkl')
V20_predictions.to_pickle(path='tmp_data/V20_predictions.pkl')
V20X_predictions.to_pickle(path='tmp_data/V20X_predictions.pkl')

# Yearly averages per grid point, one file per model
V15_predictions_averaged.to_pickle(path='tmp_data/V15_predictions_yearly_average.pkl')
V20_predictions_averaged.to_pickle(path='tmp_data/V20_predictions_yearly_average.pkl')
V20X_predictions_averaged.to_pickle(path='tmp_data/V20X_predictions_yearly_average.pkl')


In [42]:
# Bonus save: the V15X model, full table and yearly average
V15X_predictions.to_pickle(path='tmp_data/V15X_predictions.pkl')
V15X_predictions_averaged.to_pickle(path='tmp_data/V15X_predictions_yearly_average.pkl')

---



# 3. Create a yearly file


Get the annual average and the grid point types


First let's load the data:

In [43]:
# Yearly-averaged predictions of each model, as written by the saving cells
V15  = pd.read_pickle(filepath_or_buffer='tmp_data/V15_predictions_yearly_average.pkl')
V20  = pd.read_pickle(filepath_or_buffer='tmp_data/V20_predictions_yearly_average.pkl')
V20X = pd.read_pickle(filepath_or_buffer='tmp_data/V20X_predictions_yearly_average.pkl')
V15X = pd.read_pickle(filepath_or_buffer='tmp_data/V15X_predictions_yearly_average.pkl')

# Grid-point classification. The alternative (orography-aware) table is the
# one in use; the first classification can be selected instead:
#change_in_fields = pd.read_pickle('tmp_data/change_in_fields.pkl')
change_in_fields = pd.read_pickle(filepath_or_buffer='tmp_data/change_in_fields_Marg.pkl')

# Rounded coordinates, just used for joining due to loss of precision from Marg's file
change_in_fields['latitude_join'] = change_in_fields['latitude_ERA'].round(3)
change_in_fields['longitude_join'] = change_in_fields['longitude_ERA'].round(3)

In [44]:
# Combine the yearly-averaged predictions of all four models into one table and
# attach the grid-point classification.
# NOTE(review): the columns below are combined row by row, which assumes the four
# frames list the grid points in the same order; the shape check alone does not
# verify this - TODO confirm.
if V20.shape == V15.shape == V20X.shape == V15X.shape:

    #Create a new df that will just hold the differences
    data  = {'latitude_ERA':     V20.latitude_ERA,
             'longitude_ERA':    V20.longitude_ERA,

              'MODIS_LST':       V20.MODIS_LST,

              'V15_prediction':  V15.predictions,
              'V20_prediction':  V20.predictions,
              'V20X_prediction': V20X.predictions,
              'V15X_prediction': V15X.predictions,

              'V15_error':       V15.predicion_error,
              'V20_error':       V20.predicion_error,
              'V20X_error':      V20X.predicion_error,
              'V15X_error':      V15X.predicion_error,

              # Change in error of each model relative to V15
              'delta':           V20.predicion_error - V15.predicion_error,
              'deltaX':          V20X.predicion_error - V15.predicion_error,
              'deltaX15':        V15X.predicion_error - V15.predicion_error,

              # Rounded coordinates used as join keys against the classification table
              'latitude_join':    round(V20.latitude_ERA,3),
              'longitude_join':    round(V20.longitude_ERA,3)
              }

    df = pd.DataFrame(data)


    #Join and save
    fname = 'tmp_data/clean_yearly_data.pkl'
    print (f'Joining and saving yearly file to: {fname}')
    # Columns present in both tables keep their name on the left and get '_y' on the right
    df_cat = pd.merge(df,change_in_fields,how='inner',left_on=['latitude_join', 'longitude_join'], right_on=['latitude_join', 'longitude_join'],suffixes=('', '_y'))

    # Only write the file when the join matched at least one grid point.
    # (Previously this compared the builtin `len` itself with 0, which is always
    # True, so an empty join was saved silently.)
    if len(df_cat) != 0:
        df_cat.to_pickle(fname)
    else:
        print ("WHOOPS! Length is zero after the join")
else:

    print ("Shapes are different - don't join")

Joining and saving yearly file to: tmp_data/clean_yearly_data.pkl


 ---
 


## 