## Core Workflow: Calculate precision scores 
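Purpose: Calculate precision scores for duplicate imagery, i.e. how much repeated predictions for the same building footprint differ (reported as the mean absolute difference and the RMSE between the duplicated predictions).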
<br>

In [1]:
import csv
import pandas as pd 

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Scratch sanity check: mean and standard deviation of a tiny sample.
# numpy is already bound to `np` by the notebook's import cell, so it is not
# imported a second time here.
a = [1, 2, 5, 6, 9]
print(np.mean(a))
# np.std defaults to ddof=0, i.e. the population standard deviation.
print(np.std(a))

In [2]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the cell output.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, left to right. Each one is converted with
        ``DataFrame.to_html()``.

    Notes
    -----
    Only the opening ``<table`` tag of each frame receives the inline style.
    A blanket ``replace('table', ...)`` would also rewrite the closing
    ``</table>`` tags and any cell whose text contains the word "table".
    """
    html_str = ''.join(
        # One table per frame, so a single replacement per frame is enough.
        df.to_html().replace('<table', '<table style="display:inline"', 1)
        for df in args
    )
    display_html(html_str, raw=True)

In [3]:
def svfig(path):
    """Save the current matplotlib figure to ``path`` with a white background.

    Parameters
    ----------
    path : str or path-like
        Destination file; with ``format=None`` matplotlib infers the output
        format from the file extension.

    Notes
    -----
    The ``papertype`` and ``frameon`` keywords that used to be passed here
    (both as ``None``) are no longer accepted by ``savefig`` in newer
    Matplotlib releases, so they are omitted. Leaving them out does not
    change the saved figure.
    """
    plt.savefig(path, dpi=None, facecolor='w', edgecolor='w',
        orientation='portrait', format=None,
        # pad_inches only takes effect when bbox_inches='tight'.
        transparent=False, bbox_inches=None, pad_inches=1,
        metadata=None)

In [4]:
def group_dupls(df, overlap_code=0):
    """Keep the duplicated rows of ``df`` and add the spread of their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'ScoredLabels', 'min' and 'max'. The callers in
        this notebook pass a per-footprint aggregate in which 'ScoredLabels'
        holds the number of predictions for that footprint.
    overlap_code : int, default 0
        0 -- keep only rows with exactly 2 instances (ScoredLabels == 2).
        1 -- keep every row with more than 1 instance (ScoredLabels > 1).

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with an extra 'range' column
        (``max - min``). The input frame is not modified.

    Raises
    ------
    ValueError
        If ``overlap_code`` is neither 0 nor 1. Previously this case printed
        '??' and returned None, which made callers fail later with an
        unrelated AttributeError.

    Side effects
    ------------
    Prints the number of non-null 'range' values.
    """
    if overlap_code == 0:
        selected = df['ScoredLabels'] == 2
    elif overlap_code == 1:
        selected = df['ScoredLabels'] > 1
    else:
        raise ValueError(
            'overlap_code must be 0 (exactly two instances) or 1 '
            '(all duplicates), got %r' % (overlap_code,)
        )

    dupls = df[selected].copy()
    dupls['range'] = dupls['max'] - dupls['min']
    print(dupls['range'].count())
    return dupls

In [5]:
# Default size for every figure drawn in this notebook:
# 14 inches wide by 10 inches tall.
fig_size = [14, 10]
plt.rcParams["figure.figsize"] = fig_size

#### Prediction Distribution

In [12]:
# Prediction CSVs scored by the cells below, one list per loop.
# The "*_09" lists feed the loops that also compare predictions against
# 'gg_prediction'; the other two lists only get the duplicate statistics.
# The "decision" lists are read through the 'Scored Label Mean' column, the
# "all" lists through 'Scored Labels'.

datasets_all_09 = [
    'pred_LA_2009_linear_sika+gg+ss+smote_imitgg_norm_rm-hi-alb_9-14.csv',
    'pred_LA_2009_linear_sika+gg+ss+smote_imitgg_org_rm-hi-alb_9-14.csv',
]

# Disabled:
#   'pred_LA_2014_boosted_sika+gg_imitgg_norm_9-3.csv'
datasets_all = []

# Disabled:
#   'pred_LA_2009_decision_sika+gg+ss+smote_imitgg_norm_rm-outlr_9-16.csv'
#   'pred_LA_2009_decision_sika+gg+ss+smote_imitgg_norm_rm-outlr_9-16_v2.csv'
#   'pred_LA_cnty_2009_train-50m_pred-rf-norm_10-23.csv'
datasets_decision_09 = []

# Disabled:
#   'pred_LA_2014_decision_sika+gg_imitgg_norm-all_9-9.csv'
#   'pred_LA_2016_decision_sika+gg_imitgg_norm-all_9-9.csv'
datasets_decision = [
    'pred_LA_cnty_2018_train-50m_pred-rf-norm_10-23.csv',
]

### Stat for all models

In [None]:
for dataset in datasets_all_09:
    print (dataset)

    # Parse the CSV once; the accuracy and the duplicate statistics below
    # both start from this frame.
    df_raw = pd.read_csv(dataset)
    nan = float('nan')

    # --- Accuracy of 'Scored Labels' against 'gg_prediction' ----------------
    # Rows with a missing value in ANY column are left out of this part.
    dfv = df_raw.dropna().copy()
    dfv['ScoredLabels'] = dfv['Scored Labels']
    dfv['residual'] = dfv['ScoredLabels'] - dfv['gg_prediction']
    dfv['AME'] = dfv['residual'].abs()
    dfv = dfv[['gg_prediction', 'ScoredLabels', 'residual', 'AME']]

    # Subsets by 'gg_prediction', which the printed labels below call the
    # expected albedo.
    low_alb = dfv[dfv['gg_prediction'] < 0.2]
    high_alb = dfv[dfv['gg_prediction'] > 0.4]

    # mean_squared_error raises on zero samples, so an empty subset is
    # reported as NaN instead of aborting the whole loop.
    rmse = sqrt(mean_squared_error(dfv['gg_prediction'], dfv['ScoredLabels'])) if len(dfv) else nan
    rmsel = sqrt(mean_squared_error(low_alb['gg_prediction'], low_alb['ScoredLabels'])) if len(low_alb) else nan
    rmseh = sqrt(mean_squared_error(high_alb['gg_prediction'], high_alb['ScoredLabels'])) if len(high_alb) else nan
    ame = dfv['AME'].mean()
    # Per-subset MAE, if needed: low_alb['AME'].mean() / high_alb['AME'].mean()

    print ("RMSE: ", str(round(rmse, 4)), ' (entire valdiation set, all expected albedos)')
    print ("RMSE: ", str(round(rmsel, 4)), ' (low expected albedos, <0.2)')
    print ("RMSE: ", str(round(rmseh, 4)), ' (high expected albedos, >0.4)')
    print ("MAE:  ", str(round(ame, 4)), ' (entire valdiation set, all expected albedos)')

    # --- Precision across duplicated footprints ------------------------------
    # Rows whose 'gg_prediction' equals 0 are excluded here; rows with
    # missing values are kept.
    df = (
        df_raw.loc[df_raw['gg_prediction'] != 0, ['footprint_shapes', 'Scored Labels']]
        .rename(columns={'Scored Labels': 'ScoredLabels'})
    )
    # 'min' / 'max' start as copies of the prediction so one groupby can
    # return the count, the minimum and the maximum per footprint.
    df = df.assign(min=df['ScoredLabels'], max=df['ScoredLabels'])

    aggfunc = {'ScoredLabels': 'count', 'min': 'min', 'max': 'max'}
    df_grp = df.groupby('footprint_shapes').agg(aggfunc).reset_index()

    # Default overlap_code=0: footprints predicted exactly twice. Also prints
    # how many such footprints there are.
    df_grp = group_dupls(df_grp)

    # With exactly two predictions, max - min is their absolute difference.
    mrange = df_grp['range'].mean()
    print ("MAE for duplicates: ", str(round(mrange, 4)))

    rmsed = sqrt(mean_squared_error(df_grp['min'], df_grp['max'])) if len(df_grp) else nan
    print ('rmse for duplicates: ' + str(rmsed))
    print ()

### For predictions other than 2009

In [None]:
for dataset in datasets_all:
    print (dataset)
    df_raw = pd.read_csv(dataset)

    # --- Precision across duplicated footprints ------------------------------
    # Build the working frame as a new object instead of assigning columns
    # onto a slice of the raw frame.
    df = (
        df_raw[['footprint_shapes', 'Scored Labels']]
        .rename(columns={'Scored Labels': 'ScoredLabels'})
    )
    # 'min' / 'max' start as copies of the prediction so one groupby can
    # return the count, the minimum and the maximum per footprint.
    df = df.assign(min=df['ScoredLabels'], max=df['ScoredLabels'])

    aggfunc = {'ScoredLabels': 'count', 'min': 'min', 'max': 'max'}
    df_grp = df.groupby('footprint_shapes').agg(aggfunc).reset_index()

    # Default overlap_code=0: footprints predicted exactly twice. Also prints
    # how many such footprints there are.
    df_grp = group_dupls(df_grp)

    # With exactly two predictions, max - min is their absolute difference.
    mrange = df_grp['range'].mean()
    print ("MAE for duplicates: ", str(round(mrange, 4)))

    # mean_squared_error raises on zero samples; report NaN when the file
    # contains no duplicated footprints.
    rmsed = sqrt(mean_squared_error(df_grp['min'], df_grp['max'])) if len(df_grp) else float('nan')
    print ('rmse for duplicates: ' + str(rmsed))
    print ()

### Stat for Decision Forest models

In [None]:
for dataset in datasets_decision_09:
    print (dataset)

    # Parse the CSV once; the accuracy and the duplicate statistics below
    # both start from this frame.
    df_raw = pd.read_csv(dataset)
    nan = float('nan')

    # --- Accuracy of 'Scored Label Mean' against 'gg_prediction' ------------
    # Rows with a missing value in ANY column are left out of this part.
    dfv = df_raw.dropna().copy()
    dfv['ScoredLabels'] = dfv['Scored Label Mean']
    dfv['residual'] = dfv['ScoredLabels'] - dfv['gg_prediction']
    dfv['AME'] = dfv['residual'].abs()
    dfv = dfv[['gg_prediction', 'ScoredLabels', 'residual', 'AME']]

    # Subsets by 'gg_prediction', which the printed labels below call the
    # expected albedo.
    low_alb = dfv[dfv['gg_prediction'] < 0.2]
    high_alb = dfv[dfv['gg_prediction'] > 0.4]

    # mean_squared_error raises on zero samples, so an empty subset is
    # reported as NaN instead of aborting the whole loop.
    rmse = sqrt(mean_squared_error(dfv['gg_prediction'], dfv['ScoredLabels'])) if len(dfv) else nan
    rmsel = sqrt(mean_squared_error(low_alb['gg_prediction'], low_alb['ScoredLabels'])) if len(low_alb) else nan
    rmseh = sqrt(mean_squared_error(high_alb['gg_prediction'], high_alb['ScoredLabels'])) if len(high_alb) else nan
    ame = dfv['AME'].mean()
    # Per-subset MAE, if needed: low_alb['AME'].mean() / high_alb['AME'].mean()

    print ("RMSE: ", str(round(rmse, 4)), ' (entire valdiation set, all expected albedos)')
    print ("RMSE: ", str(round(rmsel, 4)), ' (low expected albedos, <0.2)')
    print ("RMSE: ", str(round(rmseh, 4)), ' (high expected albedos, >0.4)')
    print ("MAE:  ", str(round(ame, 4)), ' (entire valdiation set, all expected albedos)')

    # --- Precision across duplicated footprints ------------------------------
    # Rows whose 'gg_prediction' equals 0 are excluded here; rows with
    # missing values are kept.
    df = (
        df_raw.loc[df_raw['gg_prediction'] != 0, ['footprint_shapes', 'Scored Label Mean']]
        .rename(columns={'Scored Label Mean': 'ScoredLabels'})
    )
    # 'min' / 'max' start as copies of the prediction so one groupby can
    # return the count, the minimum and the maximum per footprint.
    df = df.assign(min=df['ScoredLabels'], max=df['ScoredLabels'])

    aggfunc = {'ScoredLabels': 'count', 'min': 'min', 'max': 'max'}
    df_grp = df.groupby('footprint_shapes').agg(aggfunc).reset_index()

    # Default overlap_code=0: footprints predicted exactly twice. Also prints
    # how many such footprints there are.
    df_grp = group_dupls(df_grp)

    # With exactly two predictions, max - min is their absolute difference.
    mrange = df_grp['range'].mean()
    print ("MAE for duplicates: ", str(round(mrange, 4)))

    rmsed = sqrt(mean_squared_error(df_grp['min'], df_grp['max'])) if len(df_grp) else nan
    print ('rmse for duplicates: ' + str(rmsed))
    print ()

### For predictions other than 2009

In [13]:
for dataset in datasets_decision:
    print (dataset)
    df_raw = pd.read_csv(dataset)

    # --- Precision across duplicated footprints ------------------------------
    # Build the working frame as a new object instead of assigning columns
    # onto a slice of the raw frame.
    df = (
        df_raw[['footprint_shapes', 'Scored Label Mean']]
        .rename(columns={'Scored Label Mean': 'ScoredLabels'})
    )
    # 'min' / 'max' start as copies of the prediction so one groupby can
    # return the count, the minimum and the maximum per footprint.
    df = df.assign(min=df['ScoredLabels'], max=df['ScoredLabels'])

    aggfunc = {'ScoredLabels': 'count', 'min': 'min', 'max': 'max'}
    df_grp = df.groupby('footprint_shapes').agg(aggfunc).reset_index()

    # Default overlap_code=0: footprints predicted exactly twice. Also prints
    # how many such footprints there are.
    df_grp = group_dupls(df_grp)

    # With exactly two predictions, max - min is their absolute difference.
    mrange = df_grp['range'].mean()
    print ("MAE for duplicates: ", str(round(mrange, 4)))

    # mean_squared_error raises on zero samples; report NaN when the file
    # contains no duplicated footprints.
    rmsed = sqrt(mean_squared_error(df_grp['min'], df_grp['max'])) if len(df_grp) else float('nan')
    print ('rmse for duplicates: ' + str(rmsed))
    print ()

pred_LA_cnty_2018_train-50m_pred-rf-norm_10-23.csv
439038
MAE for duplicates:  0.1967
rmse for duplicates: 0.3689749002220586

