# Yearly average classification table

This notebook combines the model predicitons and the grid point categories to produce a table of the improvements when going from

* V15 ---> V20
* V15 ---> V20X
* V15 ---> V15X

This improvement is denoted by `delta` i.e. delta = V20_prediction_error - V15_prediction_error

#### Load all the data


In [30]:
import pandas as pd
V15 = pd.read_pickle('tmp_data/V15_predictions_yearly_average.pkl')
V20 = pd.read_pickle('tmp_data/V20_predictions_yearly_average.pkl')
V20X= pd.read_pickle('tmp_data/V20X_predictions_yearly_average.pkl')
V15X= pd.read_pickle('tmp_data/V15X_predictions_yearly_average.pkl')


change_in_fields = pd.read_pickle('tmp_data/change_in_fields.pkl')


#### Join it together

Note that V15X has a different shape to the other files (part of the RML pipeline, rather than original ML pipeline)

We therefore first do an inner join to just get the grid points shared by all files.

In [54]:
V20.shape == V15.shape == V20X.shape

True

In [68]:
#Create a new df that will just hold the differences   
data1 = {'latitude_ERA':     V20.latitude_ERA, 
         'longitude_ERA':    V20.longitude_ERA,
          'MODIS_LST':       V20.MODIS_LST,
          'V15_prediction':  V15.predictions,
          'V20_prediction':  V20.predictions,
          'V20X_prediction': V20X.predictions,
          'V15_error':       V15.predicion_error,
          'V20_error':       V20.predicion_error,
          'V20X_error':      V20X.predicion_error,
          'delta':           V20.predicion_error - V15.predicion_error,
          'deltaX':          V20X.predicion_error - V15.predicion_error}

df1 = pd.DataFrame(data1)   

In [69]:
#Join and save
df_cat = pd.merge(df1,change_in_fields,how='inner',left_on=['latitude_ERA', 'longitude_ERA'], right_on=['latitude', 'longitude']) #inner join.
df_cat.to_pickle('tmp_data/clean_yearly_data.pkl')



In [61]:
#Now inner join on V15X shape
df1.shape[0] == V15X.shape[0]

False

In [74]:
dfcat_withV15X = pd.merge(df_cat,V15X,how='inner',on=['latitude_ERA', 'longitude_ERA']) #inner join.

In [75]:
#Create new columns
dfcat_withV15X['V15X_prediction'] = dfcat_withV15X.predictions
dfcat_withV15X['V15X_error']      = dfcat_withV15X.predicion_error
dfcat_withV15X['deltaX15']        = dfcat_withV15X.V15_error - dfcat_withV15X.V15X_error  

#Drop the V15X columns
dfcat_withV15X = dfcat_withV15X.drop(['MODIS_LST_y', 'skt_unnormalised',
       'number_of_modis_observations', 'predictions', 'predicion_bias',
       'predicion_error'], axis=1)





In [76]:
#Save it
dfcat_withV15X.to_pickle('tmp_data/clean_yearly_data_w_V15X.pkl')

#### Create a classification table

In [77]:
df_cat = pd.read_pickle('tmp_data/clean_yearly_data.pkl')
df_cat_w_v15 = pd.read_pickle('tmp_data/clean_yearly_data_w_V15X.pkl')

In [78]:



from scipy.stats import ttest_ind
import numpy as np

def significance_test(v1,v2):
    try:
        return ttest_ind(v1, v2)
    except:
        return np.nan

def significance_boolean(x):
    
    try:
        pval = x[-1]
    except:
        #Nans are not subscriptable
        return 'Insignificant'
    
    if (pval > 0.01) or np.isnan(pval): 
        return 'Insignificant'
    else:
        return 'Significant'


def create_classification_table(ds,table_type):

    if table_type == 'V20':
        q = 'delta'
        x1 = 'V20_error'
    if table_type == 'V20X':
        q = 'deltaX'
        x1 = 'V20X_error'
    if table_type == 'V15X':
        q = 'deltaX15'
        x1 = 'V15X_error'


    classification_table                                = ds.groupby('bitstring').agg(**{'Number of Pixels':pd.NamedAgg(q,'size'),'AverageDelta':pd.NamedAgg(q,'mean')})     # For each group, count number of pixels and get the average delta      
    classification_table["Percentage"]                  = 100.0* classification_table['Number of Pixels'] / sum(classification_table["Number of Pixels"])                                            # Express number of pixels as a percentage
    
    classification_table['Stats (t-statistic,p-value)'] = ds.groupby('bitstring').apply(lambda x: significance_test(x['V15_error'], x[x1]))                                     # For each group, do a ttest between the V15_errors and the V20_errors
    classification_table['Significant Change?']         = classification_table['Stats (t-statistic,p-value)'].apply(lambda x: significance_boolean(x))

    return classification_table[['Number of Pixels', 'Percentage','AverageDelta','Stats (t-statistic,p-value)','Significant Change?']].sort_values(by=['Significant Change?', 'Number of Pixels'],ascending=False)







In [79]:
table_v20 = create_classification_table(df_cat,'V20')
table_v20X = create_classification_table(df_cat,'V20X')
table_v15X = create_classification_table(df_cat_w_v15,'V15X')

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [80]:
table_v20

Unnamed: 0_level_0,Number of Pixels,Percentage,AverageDelta,"Stats (t-statistic,p-value)",Significant Change?
bitstring,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100,135005,90.117482,-0.020074,"(5.6015351428002385, 2.1266764789306387e-08)",Significant
0,9119,6.087044,-0.051633,"(3.025451326903979, 0.0024860820906935925)",Significant
100110,1512,1.009278,-0.33601,"(6.428846519940273, 1.4890045307006747e-10)",Significant
101,834,0.556705,-0.146436,"(2.603472646240921, 0.00931042147086133)",Significant
11100,18,0.012015,3.905514,"(-4.470645229389644, 8.249553308371896e-05)",Significant
10100,13,0.008678,7.586944,"(-13.562981643142237, 9.521420939065104e-13)",Significant
101100,8,0.00534,0.486319,"(-3.3043994483637684, 0.00521642231127351)",Significant
100100,2392,1.596689,-0.053458,"(2.174490351294765, 0.02971719330884196)",Insignificant
110,247,0.164876,-0.151713,"(1.5120205599473904, 0.1311706602165086)",Insignificant
100000,234,0.156198,-0.19249,"(1.996783817109343, 0.04642985343989715)",Insignificant


In [81]:
table_v20X

Unnamed: 0_level_0,Number of Pixels,Percentage,AverageDelta,"Stats (t-statistic,p-value)",Significant Change?
bitstring,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100,135005,90.117482,-0.081465,"(23.56030193871013, 1.3102858471844417e-122)",Significant
0,9119,6.087044,-0.104934,"(6.350991251617155, 2.189890540772654e-10)",Significant
100100,2392,1.596689,-0.111606,"(4.718440796562972, 2.4443631137646333e-06)",Significant
100110,1512,1.009278,-0.395243,"(7.593134806378133, 4.135401079128569e-14)",Significant
101,834,0.556705,-0.284028,"(4.890430864789042, 1.1031513823102672e-06)",Significant
100000,234,0.156198,-0.270278,"(2.896794175583813, 0.003947403841422364)",Significant
110,247,0.164876,-0.194843,"(2.0249189795464706, 0.043415219271146346)",Insignificant
111,114,0.076096,-0.082715,"(0.6609266848660991, 0.5093324991958698)",Insignificant
100010,104,0.069421,-0.19276,"(1.7034173947827063, 0.0899980578333079)",Insignificant
1,43,0.028703,-0.247474,"(1.00072653938051, 0.3198331285204762)",Insignificant


In [82]:
table_v15X

Unnamed: 0_level_0,Number of Pixels,Percentage,AverageDelta,"Stats (t-statistic,p-value)",Significant Change?
bitstring,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100,134995,90.121034,0.047434,"(13.381725782528799, 7.968659631344665e-41)",Significant
100110,1512,1.009393,0.155566,"(2.666214976885041, 0.007711922637627785)",Significant
0,9112,6.083061,0.041698,"(2.449874999086847, 0.014299922775501722)",Insignificant
100100,2392,1.59687,-0.047229,"(-1.7809028321789768, 0.07499178781649538)",Insignificant
101,834,0.556768,0.093064,"(1.6528111522634759, 0.09855772184422609)",Insignificant
110,247,0.164894,0.085473,"(0.8422448951159572, 0.4000601099065684)",Insignificant
100000,234,0.156216,-0.041637,"(-0.3915560834585783, 0.6955652253087852)",Insignificant
111,114,0.076105,0.095107,"(0.828567642868861, 0.4082232792686128)",Insignificant
100010,104,0.069429,0.044828,"(0.3248941762462097, 0.7455908704843667)",Insignificant
1,43,0.028706,0.062258,"(0.25507554108193187, 0.7992882436895232)",Insignificant


---

# Checks

Checking the V15X values look sensible

In [83]:
df_cat_w_v15 = pd.read_pickle('tmp_data/clean_yearly_data_w_V15X.pkl')

In [85]:
df_cat_w_v15.query("24.4 < latitude_ERA < 24.6 & 67.1 < longitude_ERA < 67.3")[['latitude_ERA','longitude_ERA','MODIS_LST_x','V15X_prediction','V15X_error']]

Unnamed: 0,latitude_ERA,longitude_ERA,MODIS_LST_x,V15X_prediction,V15X_error
72433,24.590156,67.2,300.773254,311.070892,10.297641
