### Noise calculations for paper revisions


This notebook explores the model training noise as requested during the first round of paper review.

We use the conda environment `analysis`


#### Define some useful functions

In [17]:
import pandas as pd
pd.options.mode.chained_assignment = None #Ignore SettingWithCopyWarning - we are safe here

def load_predictions(path):

    df = pd.read_parquet(path+'predictions.parquet')
    
    
    #Calculate some extra columns
    df['model_predicion_bias'] = df.MODIS_LST - df.predictions
    df['model_predicion_error'] = df.MODIS_LST - df.predictions
        
    df['ERA_predicion_bias'] = df.MODIS_LST - df.skt_unnormalised
    df['ERA_predicion_error'] = abs(df.MODIS_LST - df.skt_unnormalised)


    print ('----------------------------------MODEL-------------------------------------------------')
    print("Mean/Median/Std prediction bias:", df['model_predicion_bias'].mean(), df['model_predicion_bias'].median(),df['model_predicion_bias'].std())
    print("Mean/Median/Std prediction error:", df['model_predicion_error'].mean(), df['model_predicion_error'].median(),df['model_predicion_error'].std())

    
    #Average predictions and errors over the year
    df_grouped = df.groupby(['latitude_ERA', 'longitude_ERA'],as_index=False).mean() 
    
    
    
    
    
    return df,df_grouped



def surface_noise_numbers(df):
    
 
    
    print("Numer of grid points:", len(df))
    print("Median variance:", df['variance'].median())
    print("Mean variance:", df['variance'].mean())
    
    print("Median median error:", df['median_error'].median())
    print("Mean mean error:", df['mean_error'].mean())
    
  

    return df['variance'].median()
    
    
    
    
def pipeline(models_to_compare,ID):
    
    annually_averaged_dfs = []
    for m in models_to_compare:
        predictions,predictions_averaged = load_predictions(m)
        annually_averaged_dfs.extend([predictions_averaged])
        
        
        
    change_in_fields = pd.read_pickle('tmp_data/change_in_fields.pkl')
    change_in_fields['latitude_join'] = round(change_in_fields.latitude_ERA,3)
    change_in_fields['longitude_join'] = round(change_in_fields.longitude_ERA,3) #just used for joining due to loss of precision from Margs file
    
    
    
    
    #Create a new df that will just hold the differences   

    df =annually_averaged_dfs[0] #pick a df to get latitudes 
    data = { 'latitude_ERA':     df.latitude_ERA, 
             'longitude_ERA':    df.longitude_ERA,
             'MODIS_LST':        df.MODIS_LST,
             'latitude_join':    round(df.latitude_ERA,3), 
             'longitude_join':    round(df.longitude_ERA,3)}


    i = 1
    selected_cols = [] #we will use this later when computing variances
    for df in annually_averaged_dfs:
        data[f'prediction_error_{i}'] = df['model_predicion_error']
        data[f'prediction_{i}'] = df['predictions']

        selected_cols.extend([f'prediction_error_{i}'])
        i = i+1

    df_new = pd.DataFrame(data)  
    
    
    
    #Join and save
    df_cat = pd.merge(df_new,change_in_fields,how='inner',on=['latitude_join', 'longitude_join'],suffixes=('', '_y')) #inner join.
    df_cat = df_cat.drop(['latitude_join', 'longitude_join','latitude_ERA_y', 'longitude_ERA_y'], axis=1) #Get rid of junk columns
    
    
    

        
    df_cat['median_error']=df_cat[selected_cols].median(axis=1) # median over the prediciton errors for each grid point
    df_cat['mean_error']  =df_cat[selected_cols].mean(axis=1)     # mean over the prediciton errors for each grid point
    df_cat['variance']    =df_cat[selected_cols].std(axis=1)        # variance over the prediciton errors for each grid point i.e. noise
    
    df_cat['label'] = ID
    return df_cat
            

In [18]:
import matplotlib.pyplot as plt 

In [19]:
root = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/processed_data/trained_models/'

In [20]:
import warnings
warnings.filterwarnings("ignore") #ignore FutureWarning statements

In [None]:

#V15
model_1 = f'{root}V15_noise_expt_1/'
model_2 = f'{root}V15_noise_expt_2/'
model_3 = f'{root}V15_noise_expt_3/'
model_4 = f'{root}V15_noise_expt_4/'
models_to_compare = [model_1,model_2, model_3,model_4]
df_V15 = pipeline(models_to_compare,'V15')


----------------------------------MODEL-------------------------------------------------
Mean/Median/Std prediction bias: 0.08259364 0.226959228515625 3.8190529346466064
Mean/Median/Std prediction error: 0.08259364 0.226959228515625 3.8190529346466064
----------------------------------MODEL-------------------------------------------------
Mean/Median/Std prediction bias: 0.17329945 0.29364013671875 3.8254127502441406
Mean/Median/Std prediction error: 0.17329945 0.29364013671875 3.8254127502441406
----------------------------------MODEL-------------------------------------------------
Mean/Median/Std prediction bias: 0.0777295 0.20306396484375 3.8673462867736816
Mean/Median/Std prediction error: 0.0777295 0.20306396484375 3.8673462867736816
----------------------------------MODEL-------------------------------------------------
Mean/Median/Std prediction bias: 0.06500694 0.195526123046875 3.8166935443878174
Mean/Median/Std prediction error: 0.06500694 0.195526123046875 3.816693544387817

In [None]:

#V20
model_1 = f'{root}V20_noise_expt_1/'
model_2 = f'{root}V20_noise_expt_2/'
model_3 = f'{root}V20_noise_expt_3/'
model_4 = f'{root}V20_noise_expt_4/'
models_to_compare = [model_1,model_2, model_3,model_4]
df_V20 = pipeline(models_to_compare,'V20')

In [None]:
#V15X
model_1 = f'{root}V15X_noise_expt_1/'
model_2 = f'{root}V15X_noise_expt_2/'
model_3 = f'{root}V15X_noise_expt_3/'
model_4 = f'{root}V15X_noise_expt_4/'
models_to_compare = [model_1,model_2, model_3,model_4]
df_V15X = pipeline(models_to_compare,'V15X')

In [None]:
#V20X
model_1 = f'{root}V20X_noise_expt_1/'
model_2 = f'{root}V20X_noise_expt_2/'
model_3 = f'{root}V20X_noise_expt_3/'
model_4 = f'{root}V20X_noise_expt_4/'
models_to_compare = [model_1,model_2, model_3,model_4]
df_V20X = pipeline(models_to_compare,'V20X')

## Categories 

In [None]:
lake_condition        = 'clFr_change_is_significant & dl_change_is_significant & not oceanFr_change_is_significant & not si10Fr_change_is_significant'
lake_ground_condition = 'clFr_change_is_significant & dl_change_is_significant & not oceanFr_change_is_significant & not cvhFr_change_is_significant & not cvlFr_change_is_significant'
vegetation_condition  = 'cvhFr_change_is_significant & not clFr_change_is_significant'
glacier_condition     = 'si10Fr_change_is_significant'


#### LAKES

In [None]:
df_lake_v15 = df_V15.query(lake_condition)
df_lake_v20 = df_V20.query(lake_condition)
df_lake_v15X = df_V15X.query(lake_condition)
df_lake_v20X = df_V20X.query(lake_condition)

In [None]:

import matplotlib.pyplot as plt




def histograms_and_deltas(df,df_reference,label):
    
    fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
    #Histogram of the mean error
    df_reference['mean_error'].hist(bins=100,ax=ax,label="V15")
    print("Mean = ", df_reference['mean_error'].mean())
    print("Median = ", df_reference['mean_error'].median())

    #Histogram of the mean error
    df['mean_error'].hist(bins=100,ax=ax,label=label)
    print("Mean = ", df['mean_error'].mean())
    print("Median = ", df['mean_error'].median())

    ax.legend()
    ax.grid(False)
    
    
    print("delta = ",df['mean_error'].mean() - df_reference['mean_error'].mean())
    plt.show()
    
    
    
    print("Noise plots")
    fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
    #Histogram of the mean error
    df_reference['variance'].hist(bins=100,ax=ax,label="V15")
    print("Mean = ", df_reference['variance'].mean())
    print("Median = ", df_reference['variance'].median())

    #Histogram of the mean error
    df['variance'].hist(bins=100,ax=ax,label=label)
    print("Mean = ", df['variance'].mean())
    print("Median = ", df['variance'].median())

    ax.legend()
    ax.grid(False)
    
    plt.show()
    
    
    
    


import matplotlib.pyplot as plt




def histograms_and_deltas_smart(df,df_reference,label):
    
    plt.rcParams["font.family"] = "serif"
    
    
    fig, ax = plt.subplots(figsize=(10, 10), dpi=300)
    #Histogram of the mean error
    df_reference['mean_error'].hist(bins=100,ax=ax,label="V15")
    print("Mean = ", df_reference['mean_error'].mean())
    print("Median = ", df_reference['mean_error'].median())

    #Histogram of the mean error
    df['mean_error'].hist(bins=100,ax=ax,label=label)
    print("Mean = ", df['mean_error'].mean())
    print("Median = ", df['mean_error'].median())

    ax.legend()
    ax.grid(False)
    ax.set_xlabel('LST MAE [K]')
    
    
#     V15_median =  df_reference['mean_error'].median()
#     V20_median =  df['mean_error'].median()
    
#     ax.axvline(V15_median,linestyle='--',alpha=0.5,c='C0')
#     ax.axvline(V20_median,linestyle='--',alpha=0.5,c="C1")

    plt.savefig(f"images/categories_lakes_histogram.png", bbox_inches="tight",dpi=300)

    plt.show()
    
 

In [None]:
histograms_and_deltas_smart(df_lake_v20,df_lake_v15,label="V20")


In [None]:
histograms_and_deltas(df_lake_v20,df_lake_v15,label="V20")
print('------------------------------')
histograms_and_deltas(df_lake_v20X,df_lake_v15,label="V20X")
print('-------------------------------')
histograms_and_deltas(df_lake_v15X,df_lake_v15,label="V15X")

#### Lake ground

In [None]:
df_lake_groundv15 = df_V15.query(lake_ground_condition)
df_lake_groundv20 = df_V20.query(lake_ground_condition)
df_lake_groundv15X = df_V15X.query(lake_ground_condition)
df_lake_groundv20X = df_V20X.query(lake_ground_condition)


histograms_and_deltas(df_lake_groundv20,df_lake_groundv15,label="V20")
print('------------------------------')
histograms_and_deltas(df_lake_v20X,df_lake_groundv15,label="V20X")
print('-------------------------------')
histograms_and_deltas(df_lake_groundv15X,df_lake_groundv15,label="V15X")

# Vegetation

In [None]:
df_veg_v15 = df_V15.query(vegetation_condition)
df_veg_v20 = df_V20.query(vegetation_condition)
df_veg_v15X = df_V15X.query(vegetation_condition)
df_veg_v20X = df_V20X.query(vegetation_condition)


histograms_and_deltas(df_veg_v20,df_veg_v15,label="V20")
print('------------------------------')
histograms_and_deltas(df_veg_v20X,df_veg_v15,label="V20X")
print('-------------------------------')
histograms_and_deltas(df_veg_v15X,df_veg_v15,label="V15X")

# Glacier

In [None]:
df_glacier_v15 = df_V15.query(glacier_condition)
df_glacier_v20 = df_V20.query(glacier_condition)
df_glacier_v15X = df_V15X.query(glacier_condition)
df_glacier_v20X = df_V20X.query(glacier_condition)


histograms_and_deltas(df_glacier_v20,df_glacier_v15,label="V20")
print('------------------------------')
histograms_and_deltas(df_glacier_v20X,df_glacier_v15,label="V20X")
print('-------------------------------')
histograms_and_deltas(df_glacier_v15X,df_glacier_v15,label="V15X")

#### Deep dive on individual bad points

In [None]:
df_lake = pd.concat([df_lake_v15,df_lake_v20,df_lake_v15X,df_lake_v20X])
df_glacier = pd.concat([df_glacier_v15,df_glacier_v20,df_glacier_v15X,df_glacier_v20X])

In [None]:
lake_natron = df_lake.query('latitude_ERA == -2.3887580539270044 & longitude_ERA == 36.0000')
lake_natron_northern_edge =df_lake.query('latitude_ERA == -2.10772769472398 & longitude_ERA == 36.0000') 
lake_blanche =df_lake.query('latitude_ERA ==  -29.367671674745356 & longitude_ERA == 139.6875') 
salt_lake_city =df_lake.query('latitude_ERA ==  41.17094491970063 & longitude_ERA ==  -113.39999999999998') 
farah_province =df_lake.query('latitude_ERA ==  31.615914311651938 & longitude_ERA == 61.120000000000005') 
gujarat_province =df_lake.query('latitude_ERA ==  24.028095261448925 & longitude_ERA ==  69.0') 
toshka_lakes =df_lake.query('latitude_ERA ==  23.18500423251539 & longitude_ERA ==  30.900000000000006') 
all_northern_canada_points = df_lake.query('50.0 < latitude_ERA & -130 < longitude_ERA < -80')

caspain_1 = df_lake.query('latitude_ERA ==  46.22948997297545 & longitude_ERA ==   49.125')
caspain_2 = df_lake.query('latitude_ERA ==  46.22948997297545 & longitude_ERA ==    49.5')
caspain_3 = df_lake.query('latitude_ERA ==  46.51052023808231 & longitude_ERA ==    49.5')
caspain_4 = df_lake.query('latitude_ERA ==  47.072580762649004 & longitude_ERA ==    51.599999999999994')

caspian_edge = pd.concat([caspain_1,caspain_2,caspain_3,caspain_4])

bering_glacier = df_glacier.query('latitude_ERA == 60.280999861571715   & longitude_ERA ==    -143.4666666666667')
juncal_glacier = df_glacier.query('latitude_ERA == -33.021065936911214   & longitude_ERA ==    -70.07999999999998')


### Lake natron

In [None]:
surface_cols = ['latitude_ERA', 'longitude_ERA','prediction_error_1','prediction_error_2','prediction_error_3','prediction_error_4','mean_error','variance','label']
display(lake_natron[surface_cols])
print("delta V20= ", lake_natron.query("label == 'V20'")['mean_error'] - lake_natron.query("label == 'V15'")['mean_error'])
print("delta V20X= ", lake_natron.query("label == 'V20X'")['mean_error'] - lake_natron.query("label == 'V15'")['mean_error'])
print("delta V15X= ", lake_natron.query("label == 'V15X'")['mean_error'] - lake_natron.query("label == 'V15'")['mean_error'])

### Lake natron, northern edge

In [None]:
surface_cols = ['latitude_ERA', 'longitude_ERA','prediction_error_1','prediction_error_2','prediction_error_3','prediction_error_4','mean_error','variance','label']
display(lake_natron_northern_edge[surface_cols])
print("delta V20= ", lake_natron_northern_edge.query("label == 'V20'")['mean_error'] - lake_natron_northern_edge.query("label == 'V15'")['mean_error'])
print("delta V20X= ", lake_natron_northern_edge.query("label == 'V20X'")['mean_error'] - lake_natron_northern_edge.query("label == 'V15'")['mean_error'])
print("delta V15X= ", lake_natron_northern_edge.query("label == 'V15X'")['mean_error'] - lake_natron_northern_edge.query("label == 'V15'")['mean_error'])

### Lake Blanche

In [None]:
display(lake_blanche[surface_cols])
print("delta V20= ", lake_blanche.query("label == 'V20'")['mean_error'] - lake_blanche.query("label == 'V15'")['mean_error'])
print("delta V20X= ", lake_blanche.query("label == 'V20X'")['mean_error'] - lake_blanche.query("label == 'V15'")['mean_error'])
print("delta V15X= ", lake_blanche.query("label == 'V15X'")['mean_error'] - lake_blanche.query("label == 'V15'")['mean_error'])

### Salt lake city

In [None]:
display(salt_lake_city[surface_cols])
print("delta V20= ", salt_lake_city.query("label == 'V20'")['mean_error'] - salt_lake_city.query("label == 'V15'")['mean_error'])
print("delta V20X= ", salt_lake_city.query("label == 'V20X'")['mean_error'] - salt_lake_city.query("label == 'V15'")['mean_error'])
print("delta V15X= ", salt_lake_city.query("label == 'V15X'")['mean_error'] - salt_lake_city.query("label == 'V15'")['mean_error'])

### Farah province

In [None]:
display(farah_province[surface_cols])
print("delta V20= ", farah_province.query("label == 'V20'")['mean_error'] - farah_province.query("label == 'V15'")['mean_error'])
print("delta V20X= ", farah_province.query("label == 'V20X'")['mean_error'] - farah_province.query("label == 'V15'")['mean_error'])
print("delta V15X= ", farah_province.query("label == 'V15X'")['mean_error'] - farah_province.query("label == 'V15'")['mean_error'])

### Gujarat, India

In [None]:
display(gujarat_province[surface_cols])
print("delta V20= ", gujarat_province.query("label == 'V20'")['mean_error'] - gujarat_province.query("label == 'V15'")['mean_error'])
print("delta V20X= ", gujarat_province.query("label == 'V20X'")['mean_error'] - gujarat_province.query("label == 'V15'")['mean_error'])
print("delta V15X= ", gujarat_province.query("label == 'V15X'")['mean_error'] - gujarat_province.query("label == 'V15'")['mean_error'])

### Toshka lakes

In [None]:
display(toshka_lakes[surface_cols])
print("delta V20= ", toshka_lakes.query("label == 'V20'")['mean_error'] - toshka_lakes.query("label == 'V15'")['mean_error'])
print("delta V20X= ", toshka_lakes.query("label == 'V20X'")['mean_error'] - toshka_lakes.query("label == 'V15'")['mean_error'])
print("delta V15X= ", toshka_lakes.query("label == 'V15X'")['mean_error'] - toshka_lakes.query("label == 'V15'")['mean_error'])

### Northern Canada

In [None]:


display(all_northern_canada_points[surface_cols])



histograms_and_deltas(all_northern_canada_points.query("label == 'V20'"),all_northern_canada_points.query("label == 'V15'"),label="V20")
print('------------------------------')
histograms_and_deltas(all_northern_canada_points.query("label == 'V20X'"),all_northern_canada_points.query("label == 'V15'"),label="V20X")
print('-------------------------------')
histograms_and_deltas(all_northern_canada_points.query("label == 'V15X'"),all_northern_canada_points.query("label == 'V15'"),label="V15X")







### Caspian

In [None]:
caspian_edge



display(caspian_edge[surface_cols])



histograms_and_deltas(caspian_edge.query("label == 'V20'"),caspian_edge.query("label == 'V15'"),label="V20")
print('------------------------------')
histograms_and_deltas(caspian_edge.query("label == 'V20X'"),caspian_edge.query("label == 'V15'"),label="V20X")
print('-------------------------------')
histograms_and_deltas(caspian_edge.query("label == 'V15X'"),caspian_edge.query("label == 'V15'"),label="V15X")







### Bering glacier

In [None]:
display(bering_glacier[surface_cols])
print("delta V20= ", bering_glacier.query("label == 'V20'")['mean_error'] - bering_glacier.query("label == 'V15'")['mean_error'])
print("delta V20X= ", bering_glacier.query("label == 'V20X'")['mean_error'] - bering_glacier.query("label == 'V15'")['mean_error'])
print("delta V15X= ", bering_glacier.query("label == 'V15X'")['mean_error'] - bering_glacier.query("label == 'V15'")['mean_error'])

### Juncal glacier

In [None]:
display(juncal_glacier[surface_cols])
print("delta V20= ", juncal_glacier.query("label == 'V20'")['mean_error'] - juncal_glacier.query("label == 'V15'")['mean_error'])
print("delta V20X= ", juncal_glacier.query("label == 'V20X'")['mean_error'] - juncal_glacier.query("label == 'V15'")['mean_error'])
print("delta V15X= ", juncal_glacier.query("label == 'V15X'")['mean_error'] - juncal_glacier.query("label == 'V15'")['mean_error'])

## Single grid points

In [None]:

singles = [lake_natron, 
           lake_natron_northern_edge, 
           lake_blanche,
           salt_lake_city, 
           farah_province, 
           gujarat_province, 
           toshka_lakes, 
           bering_glacier, 
           juncal_glacier,
          ]

names = ['Lake Natron centre', 
           'Lake Natrom, north', 
           'Lake Blanche',
           'Great Salt Lake Desert', 
           'Farah Province', 
           'Gujarat Province', 
           'Toshka Lakes', 
           'Bering Glacier', 
           'Juncal Glacier',
          ]







In [None]:
import sys
import numpy as np 
from matplotlib.lines import Line2D


plt.rcParams["font.family"] = "serif"

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)


x = np.arange(len(names))



for i in range(len(names)):
    print(i,names[i])
    df = singles[i]
    xi=x[i]
    
    
    y = df[['prediction_error_1','prediction_error_2','prediction_error_3','prediction_error_4']].to_numpy()
    
    V15  = y[0,:]
    V20  = y[1,:]
    V15X = y[2,:]
    V20X = y[3,:]
    
    
    for j in range(4):
        ax.scatter(V15[j],xi,c="C0",label="V15")
        ax.scatter(V20[j],xi,c="C1",label="V20")
        #ax.scatter(V20X[j],xi,c="C2",label="V20X")


    

# get the xticks, which are the numeric location of the ticks
yticks = ax.get_yticks()

# get the xticks and convert the values in the array to str type
yticklabels = list(map(str, ax.get_yticks()))

# update the string to be changed
for k in range(len(names)):
    
    yticklabels[k+1] = names[k]
    
yticklabels[0] = ''
yticklabels[-1] = ''


# set the xticks and the labels
_ = ax.set_yticks(yticks, yticklabels)
    

    
ax.set_xlabel('LST MAE [K]')
#plt.legend()

colors = ['C0', 'C1']
lines = [Line2D([0], [0], marker='o', color='w',markerfacecolor=c, markersize=7) for c in colors]
labels = ['V15','V20']
plt.legend(lines, labels)

plt.savefig(f"images/categories_v15_v20.png", bbox_inches="tight",dpi=300)


In [None]:
import sys
import numpy as np 
from matplotlib.lines import Line2D


plt.rcParams["font.family"] = "serif"

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)


x = np.arange(len(names))



for i in range(len(names)):
    print(i,names[i])
    df = singles[i]
    xi=x[i]
    
    
    y = df[['prediction_error_1','prediction_error_2','prediction_error_3','prediction_error_4']].to_numpy()
    
    V15  = y[0,:]
    V20  = y[1,:]
    V15X = y[2,:]
    V20X = y[3,:]
    
    
    for j in range(4):
        ax.scatter(V15[j],xi,c="C0",label="V15")
        #ax.scatter(V20[j],xi,c="C1",label="V20")
        ax.scatter(V20X[j],xi,c="C2",label="V20X")


    

# get the xticks, which are the numeric location of the ticks
yticks = ax.get_yticks()

# get the xticks and convert the values in the array to str type
yticklabels = list(map(str, ax.get_yticks()))

# update the string to be changed
for k in range(len(names)):
    
    yticklabels[k+1] = names[k]
    
yticklabels[0] = ''
yticklabels[-1] = ''


# set the xticks and the labels
_ = ax.set_yticks(yticks, yticklabels)
    

    
ax.set_xlabel('LST MAE [K]')
#plt.legend()

colors = ['C0', 'C2']
lines = [Line2D([0], [0], marker='o', color='w',markerfacecolor=c, markersize=7) for c in colors]
labels = ['V15','V20']
plt.legend(lines, labels)

plt.savefig(f"images/categories_v15_v20X.png", bbox_inches="tight",dpi=300)


In [None]:
toshka_lakes