# Yearly average classification table

This notebook combines the model predicitons and the grid point categories to produce a table of the improvements when going from

* V15 ---> V20
* V15 ---> V20X
* V15 ---> V15X

This improvement is denoted by `delta` i.e. $\delta = \text{V20 prediction error} - \text{V15 prediction error}$

---

#### Load all the data


In [75]:
import pandas as pd
V15 = pd.read_pickle('tmp_data/V15_predictions_yearly_average.pkl')
V20 = pd.read_pickle('tmp_data/V20_predictions_yearly_average.pkl')
V20X= pd.read_pickle('tmp_data/V20X_predictions_yearly_average.pkl')
V15X= pd.read_pickle('tmp_data/V15X_predictions_yearly_average.pkl')


change_in_fields = pd.read_pickle('tmp_data/change_in_fields.pkl')
change_in_fields['latitude_join'] = round(change_in_fields.latitude_ERA,3)
change_in_fields['longitude_join'] = round(change_in_fields.longitude_ERA,3) #just used for joining due to loss of precision from Margs file

#### Join it together - V15/V20/V20X


In [76]:
V20.shape == V15.shape == V20X.shape

True

In [77]:
#Create a new df that will just hold the differences   
data = { 'latitude_ERA':     V20.latitude_ERA, 
         'longitude_ERA':    V20.longitude_ERA,
          'MODIS_LST':       V20.MODIS_LST,
          'V15_prediction':  V15.predictions,
          'V20_prediction':  V20.predictions,
          'V20X_prediction': V20X.predictions,
          'V15_error':       V15.predicion_error,
          'V20_error':       V20.predicion_error,
          'V20X_error':      V20X.predicion_error,
          'delta':           V20.predicion_error - V15.predicion_error,
          'deltaX':          V20X.predicion_error - V15.predicion_error,
          'latitude_join':    round(V20.latitude_ERA,3), 
          'longitude_join':    round(V20.longitude_ERA,3)}

df = pd.DataFrame(data)   

In [78]:
#Join and save
df_cat = pd.merge(df,change_in_fields,how='inner',on=['latitude_join', 'longitude_join'],suffixes=('', '_y')) #inner join.
df_cat = df_cat.drop(['latitude_join', 'longitude_join','latitude_ERA_y', 'longitude_ERA_y'], axis=1) #Get rid of junk columns
df_cat.to_pickle('tmp_data/clean_yearly_data.pkl')

#### Join it together - V15/V20/V20X/V15X

Note that V15X has a different shape to the other files (part of the RML pipeline, rather than original ML pipeline)

We therefore first do an inner join to just get the grid points shared by all files.


In [79]:
#Now inner join on V15X shape
df.shape[0] == V15X.shape[0]

False

In [80]:
dfcat_withV15X = pd.merge(df_cat,V15X,how='inner',on=['latitude_ERA', 'longitude_ERA']) #inner join.

In [81]:
#Create new columns
dfcat_withV15X['V15X_prediction'] = dfcat_withV15X.predictions
dfcat_withV15X['V15X_error']      = dfcat_withV15X.predicion_error
dfcat_withV15X['deltaX15']        = dfcat_withV15X.V15X_error - dfcat_withV15X.V15_error 

#Drop the V15X columns
dfcat_withV15X = dfcat_withV15X.drop(['MODIS_LST_y', 'skt_unnormalised',
       'number_of_modis_observations', 'predictions', 'predicion_bias',
       'predicion_error'], axis=1)

#Save it
dfcat_withV15X.to_pickle('tmp_data/clean_yearly_data_w_V15X.pkl')

## Create classification table

In [82]:
df= pd.read_pickle('tmp_data/clean_yearly_data.pkl') #Load the file we just created


In [83]:



from scipy.stats import ttest_ind
import numpy as np

def significance_test(v1,v2):
    try:
        return ttest_ind(v1, v2)
    except:
        return np.nan

def significance_boolean(x):
    
    try:
        pval = x[-1]
    except:
        #Nans are not subscriptable
        return 'Insignificant'
    
    if (pval > 0.01) or np.isnan(pval): 
        return 'Insignificant'
    else:
        return 'Significant'


def create_classification_table(ds,table_type):

    if table_type == 'V20':
        q = 'delta'
        x1 = 'V20_error'
    if table_type == 'V20X':
        q = 'deltaX'
        x1 = 'V20X_error'
    if table_type == 'V15X':
        q = 'deltaX15'
        x1 = 'V15X_error'


    classification_table                                = ds.groupby('bitstring').agg(**{'Number of Pixels':pd.NamedAgg(q,'size'),'AverageDelta':pd.NamedAgg(q,'mean')})     # For each group, count number of pixels and get the average delta      
    classification_table["Percentage"]                  = 100.0* classification_table['Number of Pixels'] / sum(classification_table["Number of Pixels"])                                            # Express number of pixels as a percentage
    
    classification_table['Stats (t-statistic,p-value)'] = ds.groupby('bitstring').apply(lambda x: significance_test(x['V15_error'], x[x1]))                                     # For each group, do a ttest between the V15_errors and the V20_errors
    classification_table['Significant Change?']         = classification_table['Stats (t-statistic,p-value)'].apply(lambda x: significance_boolean(x))

    return classification_table[['Number of Pixels', 'Percentage','AverageDelta','Stats (t-statistic,p-value)','Significant Change?']].sort_values(by=['Significant Change?', 'Number of Pixels'],ascending=False)





In [62]:
table_v20 = create_classification_table(df,'V20')


In [74]:
table_v20.sort_values(by=['Significant Change?'],ascending=False).head(40) #.iloc[0:ncircles]


Unnamed: 0_level_0,Number of Pixels,Percentage,AverageDelta,"Stats (t-statistic,p-value)",Significant Change?
bitstring,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,133954,89.415927,-0.01944,"(5.46191139608136, 4.714524033989624e-08)",Significant
1001000,529,0.353114,-0.791683,"(5.97010715075186, 3.2344400560504356e-09)",Significant
11010,163,0.108804,-0.444421,"(3.62253368821502, 0.0003385281408867883)",Significant
11001,21,0.014018,0.525767,"(-2.727655672247146, 0.009427473548313947)",Significant
101000,14,0.009345,4.729481,"(-5.701036087875889, 5.341755397665172e-06)",Significant
101001,8,0.00534,9.040051,"(-9.023153472137292, 3.2849290989162147e-07)",Significant
0,8821,5.888125,-0.051784,"(3.0757597216455403, 0.0021028817911008402)",Significant
1100000,4,0.00267,-0.088943,"(0.7576328365115681, 0.4773611502664379)",Insignificant
11110,2,0.001335,-1.524353,"(0.7904001133557881, 0.5121295850583774)",Insignificant
1111000,3,0.002003,4.240709,"(-1.7265867229874154, 0.1593173907592276)",Insignificant


In [84]:
table_v202 = create_classification_table(df,'V20')


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [90]:
table_v202.sort_values(by=['Significant Change?'],ascending=False).head(60) #.iloc[0:ncircles]


Unnamed: 0_level_0,Number of Pixels,Percentage,AverageDelta,"Stats (t-statistic,p-value)",Significant Change?
bitstring,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,130227,86.928109,-0.017689,"(5.018260060848922, 5.217579476526685e-07)",Significant
1001,4431,2.957746,-0.081862,"(2.7548139055604492, 0.005884540947922857)",Significant
1001000,455,0.303718,-0.741537,"(5.443607346182554, 6.720801363659759e-08)",Significant
11010,161,0.107469,-0.443137,"(3.616956184999805, 0.00034623551938193944)",Significant
11001,43,0.028703,0.496087,"(-3.6206744459901947, 0.00050144658748286)",Significant
101001,14,0.009345,8.04589,"(-10.835664011871849, 3.8850642308916806e-11)",Significant
0,7734,5.162539,-0.05279,"(3.258149268359118, 0.001123833128654373)",Significant
1100000,3,0.002003,-0.108881,"(2.3384231763678276, 0.07951541422321837)",Insignificant
110001,2,0.001335,3.808017,"(-1.2408311539978356, 0.3404741913611623)",Insignificant
11011,2,0.001335,-0.547783,"(0.5797794025369344, 0.620673692626469)",Insignificant
