# Generate data

This notebook collates all the data from the various models and fields and puts it into a single accessible form.


It uses the file from Margarita in order to determine the relevant bitstrings.

---


# 1. How do the fields themselves change when going from V15 to V20?

For this we use the categorisation file provided by Margarita.


In [4]:
import pandas as pd
# Load the categorisation table (one row per grid point).
df = pd.read_csv('../tmp_data/all_yearly_points_with_oro_MC.csv')

# Keep the names of every column whose label contains 'changeAbs'.
is_change_column = df.columns.str.contains(pat = 'changeAbs')
categorical_columns = df.columns[is_change_column]

In [32]:
# Short names of the fields, in alphabetical order.
field_names = ['clFr', 'cvhFr', 'cvlFr', 'dl', 'oceanFr', 'si10Fr', 'z']
# dl and z are not fractional quantities; every other field is.
non_fractional_fields = ['dl', 'z']

# The same set of fields as stored for each version.
V15_columns = [f'V15_{field}' for field in field_names]
V20_columns = [f'V20_{field}' for field in field_names]

# V15 -> V20 change columns, split by whether the field is fractional.
fractional_categorisation_columns = [
    f'changeAbs_V20V15_{field}'
    for field in field_names
    if field not in non_fractional_fields
]  # These are fractional quantities
quantity_categorisation_columns = [
    f'changeAbs_V20V15_{field}' for field in non_fractional_fields
]  # ...these are not

In [33]:
# Keep only the coordinates, the V15/V20 field values and the change columns.
# `.copy()` gives an independent frame, so adding columns later does not touch `df`.
selected_columns = (
    ['latitude_ERA', 'longitude_ERA']
    + V15_columns
    + V20_columns
    + fractional_categorisation_columns
    + quantity_categorisation_columns
)
change_in_fields = df[selected_columns].copy()

In [35]:

# If the change in a variable is greater than tolerance (e.g. abs(cl) > tolerance), the change is marked as `significant`.
# The "is significant"/"is not significant" boolean is then used to define a grouping bitstring category.
# For lake depth dl, tolerance specifies the fractional change, e.g. is the change in dl > 10%?
tolerance = 0.1 
    
def classify(x,tol):
    """Return True when the magnitude of `x` is strictly greater than `tol`, else False."""
    return bool(abs(x) > tol)
    



# Classify every grid point according to the delta fields.
# For each field, add a boolean `<field>_change_is_significant` column, where
# significance is determined by `tolerance`.
#
# The comparisons are done column-wise (vectorised) rather than with a row-by-row
# `apply`: the result is the same boolean column, but it is computed in one pass
# and a zero V15 value simply yields inf/NaN in the ratio (NaN compares as False).

# Fractional (0-1) fields: compare the absolute change directly with `tolerance`.
for parameter in fractional_categorisation_columns:
    p = parameter.split('_')[-1]
    change_in_fields[f'{p}_change_is_significant'] = change_in_fields[parameter].abs() > tolerance

# Handle dl and z separately since these are not fractional 0-1 features:
# the change is taken relative to the V15 value, and z uses a tolerance ten
# times smaller than dl.
relative_tolerances = {
    'changeAbs_V20V15_dl': tolerance,
    'changeAbs_V20V15_z': tolerance/10.0,
}
for parameter, relative_tolerance in relative_tolerances.items():
    p = parameter.split('_')[-1]
    relative_change = change_in_fields[parameter] / change_in_fields[f'V15_{p}']
    change_in_fields[f'{p}_change_is_significant'] = relative_change.abs() > relative_tolerance
    
    
    
    
    

Now use these "significance" booleans to determine a bitstring

In [36]:
        
def bit(x):
    """Map a truthy value to 1 and a falsy value to 0."""
    return 1 if x else 0


def bitstring(list_of_columns):
    """Concatenate the bit (1/0) of each flag, in order, into a single string.

    An empty sequence gives the empty string.
    """
    return ''.join(str(bit(flag)) for flag in list_of_columns)



# Then create a single group bitstring for all combinations of booleans.
# The order of this list fixes the meaning of each bit position: it is
# ALPHABETICAL by field name.
significance_columns = [
    'clFr_change_is_significant',
    'cvhFr_change_is_significant',
    'cvlFr_change_is_significant',
    'dl_change_is_significant',
    'oceanFr_change_is_significant',
    'si10Fr_change_is_significant',
    'z_change_is_significant',
]
change_in_fields['bitstring'] = change_in_fields.apply(
    lambda row: bitstring([row[column] for column in significance_columns]),
    axis = 1,
)

Let's have a look at what this change in fields df looks like:

In [37]:
# Show the frame as the cell output, then persist it as a pickle.
display(change_in_fields)
# NOTE(review): this writes to 'tmp_data/' relative to the working directory, whereas the
# input CSV was read from '../tmp_data/' -- confirm the two locations are meant to differ.
change_in_fields.to_pickle('tmp_data/change_in_fields.pkl')

Unnamed: 0,latitude_ERA,longitude_ERA,V15_clFr,V15_cvhFr,V15_cvlFr,V15_dl,V15_oceanFr,V15_si10Fr,V15_z,V20_clFr,...,changeAbs_V20V15_dl,changeAbs_V20V15_z,clFr_change_is_significant,cvhFr_change_is_significant,cvlFr_change_is_significant,oceanFr_change_is_significant,si10Fr_change_is_significant,dl_change_is_significant,z_change_is_significant,bitstring
0,-54.941427,-66.56250,0.147125,0.007210,0.731774,5.005859,0.0,0.000000,1952.714722,0.004767,...,15.304688,11.432007,True,False,False,True,False,True,False,1001100
1,-54.660397,-70.31250,0.182325,0.193664,0.301594,6.428711,0.0,0.250059,5086.468750,0.048344,...,9.113281,16.056641,True,True,True,False,True,True,False,1111010
2,-54.660397,-69.84375,0.092898,0.067518,0.178301,6.771484,0.0,0.637169,7635.152344,0.011764,...,18.868164,78.724609,False,False,True,False,True,True,True,0011011
3,-54.660397,-69.37500,0.136888,0.053549,0.190927,100.000000,0.0,0.514738,7530.796875,0.003248,...,51.657227,199.744141,True,False,True,False,True,True,True,1011011
4,-54.660397,-68.90625,0.039358,0.159833,0.631237,100.000000,0.0,0.137348,6271.117188,0.011331,...,16.914062,59.607422,False,False,True,False,True,True,False,0011010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149805,64.496451,169.37500,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,2940.859375,0.000030,...,15.000000,0.045166,False,False,False,False,False,True,False,0001000
149806,65.058510,95.00000,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,5395.695312,0.000030,...,15.000000,0.095703,False,False,False,False,False,True,False,0001000
149807,65.620570,98.12500,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,5872.917969,0.000030,...,15.000000,0.357422,False,False,False,False,False,True,False,0001000
149808,69.554988,-159.00000,0.000000,0.000000,1.000000,25.000000,0.0,0.000000,1342.546753,0.000030,...,15.000000,0.228882,False,False,False,False,False,True,False,0001000


This has 149810 rows. This is not the entire reduced Gaussian grid; it contains only the grid points where we have MODIS observations.

---

# 1a. How do the fields change? - Margarita's classification

# 2. Load the model predictions

In [5]:
    
def load_predictions(model):
    """Load one model's predictions and derive its prediction bias and error.

    Parameters
    ----------
    model : str
        Path prefix of the model output; 'predictions.parquet' is appended to it
        directly, so a directory path must end with a path separator.

    Returns
    -------
    tuple
        `(df, df_grouped)`: the full predictions frame with the extra columns
        'predicion_bias' (MODIS_LST - predictions) and 'predicion_error'
        (absolute value of the same difference), and the mean of every column
        per (latitude_ERA, longitude_ERA) pair.

    Summary statistics of both extra columns are printed as a side effect.
    """
    df = pd.read_parquet(model + 'predictions.parquet')

    # Extra columns. The 'predicion_*' spelling is the stored column name.
    residual = df.MODIS_LST - df.predictions
    df['predicion_bias'] = residual
    df['predicion_error'] = abs(residual)

    bias = df['predicion_bias']
    error = df['predicion_error']
    print("Mean/Median/Std prediction bias:", bias.mean(), bias.median(), bias.std())
    print("Mean/Median/Std prediction error:", error.mean(), error.median(), error.std())

    # Average predictions and errors over the year, per grid point
    yearly_mean = df.groupby(['latitude_ERA', 'longitude_ERA'], as_index=False).mean()

    return df, yearly_mean



In [6]:
# Directory that holds one sub-directory per trained model.
TRAINED_MODELS_DIR = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/'

# Superseded model directories, kept here for provenance only:
#   original files: V15_2016_augmented/, V20_2016_augmented/, V20_2016_augmented_monthlyclake/
#   new files:      V15_2016_2/, V20_2016_2/, V20_2016_X2/

# new files, version 2 -- these are the models actually loaded below.
# The trailing slash matters: load_predictions appends the file name directly.
V15_model =  TRAINED_MODELS_DIR + 'V15_2016_2_V2/'
V20_model =  TRAINED_MODELS_DIR + 'V20_2016_2_V2/'
V20X_model = TRAINED_MODELS_DIR + 'V20_2016_X2_V3/'


print ('----------V15----------')
V15_predictions,V15_predictions_averaged = load_predictions(V15_model) #averaged = averaged over the year
print ('----------V20----------')
V20_predictions,V20_predictions_averaged = load_predictions(V20_model)
print ('----------V20X----------')
V20X_predictions,V20X_predictions_averaged = load_predictions(V20X_model)

----------V15----------
Mean/Median/Std prediction bias: 0.06721186 0.20108032 4.0190535
Mean/Median/Std prediction error: 2.9916244 2.283081 2.6846745
----------V20----------
Mean/Median/Std prediction bias: 0.05126155 0.20895386 3.9501467
Mean/Median/Std prediction error: 2.9394135 2.2425537 2.639345
----------V20X----------
Mean/Median/Std prediction bias: 0.106212065 0.25463867 3.915779
Mean/Median/Std prediction error: 2.9127858 2.2189636 2.6192172


In [7]:
# Any bonus models: the V15X model is loaded the same way as the main three.
V15X_model = (
    '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/'
    'V15_2016_X/'
)
# The second item is the average over the year.
V15X_loaded = load_predictions(V15X_model)
V15X_predictions, V15X_predictions_averaged = V15X_loaded


Mean/Median/Std prediction bias: 0.27741387 0.41308594 4.010094
Mean/Median/Std prediction error: 2.9977965 2.2907715 2.677875


In [8]:
# Save all these dfs: first the full prediction frames, then the yearly averages.
pickle_targets = [
    (V15_predictions, 'tmp_data/V15_predictions.pkl'),
    (V20_predictions, 'tmp_data/V20_predictions.pkl'),
    (V20X_predictions, 'tmp_data/V20X_predictions.pkl'),
    (V15_predictions_averaged, 'tmp_data/V15_predictions_yearly_average.pkl'),
    (V20_predictions_averaged, 'tmp_data/V20_predictions_yearly_average.pkl'),
    (V20X_predictions_averaged, 'tmp_data/V20X_predictions_yearly_average.pkl'),
]
for prediction_frame, pickle_path in pickle_targets:
    prediction_frame.to_pickle(pickle_path)


In [9]:
# Bonus save: the V15X frames, full and yearly-averaged.
bonus_pickle_targets = [
    (V15X_predictions, 'tmp_data/V15X_predictions.pkl'),
    (V15X_predictions_averaged, 'tmp_data/V15X_predictions_yearly_average.pkl'),
]
for bonus_frame, bonus_path in bonus_pickle_targets:
    bonus_frame.to_pickle(bonus_path)

---

In [2]:
import numpy as np
# Scratch check: variance of a widely spread sample.
np.var([4,5,10,12])

11.1875

In [3]:
# Scratch check: variance of a tightly clustered sample.
np.var([2,2.1,2.2,2.3])

0.01249999999999999