# Compare methods on validation dataset

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Render floats in DataFrame output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format
# Set matplotlib's figure resolution to 300 dpi.
plt.rcParams["figure.dpi"] = 300
# Apply seaborn's 'darkgrid' style to subsequent plots.
sns.set(style='darkgrid')
from sklearn.metrics import mean_absolute_error, mean_squared_error
from IPython.display import display

In [2]:
def _selection_metrics(groups, label: str, method: str):
    """Score the model that ``method`` ranks first within every target.

    Args:
        groups: Iterable of ``(target, frame)`` pairs, e.g. a pandas GroupBy.
        label: Column holding the ground-truth quality value.
        method: Column holding the predicted score used to pick a model.

    Returns:
        Two Series indexed by target: the loss (best ``label`` value of the
        target minus the ``label`` of the selected model) and the z-score of the
        selected model's ``label`` among all models of that target.
    """
    losses, zscores = {}, {}
    for target, frame in groups:
        truth = frame[label].to_numpy(dtype=float)
        predicted = frame[method].to_numpy(dtype=float)
        # Work with positions instead of index labels so the lookup stays
        # correct even when the DataFrame index is not unique. Like
        # Series.idxmax, nanargmax ignores NaN and returns the first maximum.
        picked = int(np.nanargmax(predicted))
        losses[target] = np.nanmax(truth) - truth[picked]
        zscores[target] = stats.zscore(truth)[picked]
    return pd.Series(losses, dtype=float), pd.Series(zscores, dtype=float)


# NOTE: this name shadows Python's built-in ``eval``; it is kept unchanged because
# existing cells call the function under this name.
def eval(df: pd.DataFrame, base_method: str, comparison_method_list: list, label: str = 'GDT_TS', target_column_name: str = 'target', discard_low_model: bool = True, discard_thre=40) -> pd.DataFrame:
    """Compare a base scoring method against other methods, target by target.

    For every target (group of rows sharing ``target_column_name``) four metrics
    are computed per method and then averaged over targets:

    * ``pearson`` / ``spearman``: correlation between the method's score and ``label``.
    * ``loss``: best ``label`` in the target minus the ``label`` of the model the
      method scores highest.
    * ``zscore``: z-score (within the target) of the ``label`` of the model the
      method scores highest.

    Each comparison method is additionally tested against the base method with
    a Wilcoxon signed-rank test over the per-target values of each metric.

    Args:
        df: One row per model, with the target column, the label column and one
            score column per method.
        base_method: Name of the reference method column.
        comparison_method_list: Names of the method columns compared with the
            base method (any iterable of column names; may be empty).
        label: Name of the ground-truth column.
        target_column_name: Name of the column identifying the target.
        discard_low_model: If True, drop targets whose best ``label`` does not
            exceed ``discard_thre``.
        discard_thre: Threshold used when ``discard_low_model`` is True.

    Returns:
        DataFrame indexed by method name (base method first) with columns
        ``pearson``, ``spearman``, ``loss``, ``zscore`` and the matching
        ``*_p`` p-value columns. The base method row has no p-values.
    """
    # Build the result index up front: it must exist even when there is nothing
    # to compare against, and must not reuse the loop variable's name.
    methods = [base_method] + list(comparison_method_list)

    if discard_low_model:
        df = df.groupby(target_column_name).filter(lambda x: x[label].max() > discard_thre)

    # Only the label and the method scores take part in the statistics; leaving
    # unrelated (possibly non-numeric) columns out keeps corr() well-defined.
    value_columns = list(dict.fromkeys([label] + methods))
    group = df.groupby(target_column_name)[value_columns]

    # Both results are Series indexed by (target, column name).
    pearson = group.corr()[label]
    spearman = group.corr(method='spearman')[label]
    # Average over targets for each column; computed once instead of per method.
    mean_pearson = pearson.groupby(level=1).mean()
    mean_spearman = spearman.groupby(level=1).mean()

    base_pearson = pearson.xs(base_method, level=1)
    base_spearman = spearman.xs(base_method, level=1)
    loss, zscore = _selection_metrics(group, label, base_method)

    pearson_list = [mean_pearson[base_method]]
    spearman_list = [mean_spearman[base_method]]
    loss_list = [loss.mean()]
    zscore_list = [zscore.mean()]
    # The base method is not tested against itself.
    p_pearson_list = [None]
    p_spearman_list = [None]
    p_loss_list = [None]
    p_zscore_list = [None]

    for method in methods[1:]:
        pearson_list.append(mean_pearson[method])
        _, p_pearson = stats.wilcoxon(base_pearson.to_numpy(), pearson.xs(method, level=1).to_numpy())
        p_pearson_list.append(p_pearson)

        spearman_list.append(mean_spearman[method])
        _, p_spearman = stats.wilcoxon(base_spearman.to_numpy(), spearman.xs(method, level=1).to_numpy())
        p_spearman_list.append(p_spearman)

        com_loss, com_zscore = _selection_metrics(group, label, method)
        loss_list.append(com_loss.mean())
        _, p_loss = stats.wilcoxon(loss.to_numpy(), com_loss.to_numpy())
        p_loss_list.append(p_loss)

        zscore_list.append(com_zscore.mean())
        _, p_zscore = stats.wilcoxon(zscore.to_numpy(), com_zscore.to_numpy())
        p_zscore_list.append(p_zscore)

    result_df = pd.DataFrame({'pearson': pearson_list, 'spearman': spearman_list, 'loss': loss_list, 'zscore': zscore_list, 'pearson_p': p_pearson_list, 'spearman_p': p_spearman_list, 'loss_p': p_loss_list, 'zscore_p': p_zscore_list}, index=methods)
    return result_df

In [3]:
def _load_renamed_scores(csv_path, score_name):
    """Read one prediction CSV (first column as index) and rename its
    ``global_score`` column to ``score_name``."""
    return pd.read_csv(csv_path, index_col=0).rename(columns={'global_score': score_name})


# One frame per feature combination; the score column is named after the features used.
val_pro_df = _load_renamed_scores('each_feature/proposed_2.csv', 'atom_pssm_local')
val_atom_only_df = _load_renamed_scores('each_feature/atom_only_4.csv', 'atom')
val_add_pssm_df = _load_renamed_scores('each_feature/add_pssm_3.csv', 'atom_pssm')
val_without_pssm_df = _load_renamed_scores('each_feature/without_pssm_5.csv', 'atom_local')
val_without_atom_df = _load_renamed_scores('each_feature/without_atom_5.csv', 'pssm_local')

# Put the frames side by side, then keep only the last copy of every repeated column.
val_frames = [val_pro_df, val_atom_only_df, val_add_pssm_df, val_without_pssm_df, val_without_atom_df]
val_df = pd.concat(val_frames, axis=1)
is_last_occurrence = ~val_df.columns.duplicated(keep='last')
val_df = val_df.loc[:, is_last_occurrence]
val_df = val_df.rename(columns={'global_label': 'GDT_TS', 'target_name': 'target'})
# Multiply the label column by 100.
val_df = val_df.assign(GDT_TS=val_df['GDT_TS'] * 100)
val_df

Unnamed: 0,atom_pssm_local,atom,atom_pssm,atom_local,pssm_local,GDT_TS,target,model_path
0,0.201,0.177,0.163,0.262,0.237,32.140,casp7/T0347,casp7/T0347/ROBETTA_TS3.npz
1,0.003,0.052,0.019,0.047,0.006,5.190,casp9/T0547,casp9/T0547/YASARA_TS1.npz
2,0.198,0.161,0.104,0.273,0.222,21.860,casp10/T0674,casp10/T0674/SAM-T06-server_TS1.npz
3,0.571,0.384,0.273,0.592,0.520,33.780,casp10/T0751,casp10/T0751/PconsD_TS4.npz
4,0.866,0.829,0.894,0.833,0.882,71.770,casp7/T0359,casp7/T0359/3D-JIGSAW_RECOM_TS5.npz
...,...,...,...,...,...,...,...,...
23089,0.917,0.754,0.829,0.900,0.890,83.490,casp9/T0538,casp9/T0538/Pcons_TS3.npz
23090,0.023,0.097,0.057,0.088,0.030,8.160,casp10/T0715,casp10/T0715/RBO-MBS_TS1.npz
23091,0.782,0.644,0.662,0.733,0.766,71.230,casp9/T0538,casp9/T0538/gws_TS1.npz
23092,0.020,0.024,0.012,0.035,0.022,11.800,casp10/T0741,casp10/T0741/BhageerathH_TS4.npz


In [4]:
# Persist the merged validation table to disk.
val_df.to_csv(path_or_buf='val_comparison_for_each_feature.csv')

In [5]:
# Load the merged validation table from disk; the first CSV column is the index.
val_df = pd.read_csv(filepath_or_buffer='val_comparison_for_each_feature.csv', index_col=0)

In [6]:
# Name the compared methods explicitly rather than slicing columns by position,
# so a change in the CSV column order cannot silently alter what is compared.
val_result_df = eval(
    val_df,
    base_method='atom_pssm_local',
    comparison_method_list=['atom', 'atom_pssm', 'atom_local', 'pssm_local'],
)
val_result_df

Unnamed: 0,pearson,spearman,loss,zscore,pearson_p,spearman_p,loss_p,zscore_p
atom_pssm_local,0.865,0.751,2.519,4.866,,,,
atom,0.757,0.645,8.518,4.244,0.0,0.0,0.002,0.002
atom_pssm,0.834,0.729,9.86,4.239,0.0,0.001,0.0,0.0
atom_local,0.847,0.724,3.883,4.742,0.0,0.0,0.398,0.398
pssm_local,0.858,0.742,4.818,4.666,0.304,0.126,0.005,0.005


In [7]:
# Write the per-method comparison table to disk.
val_result_df.to_csv(path_or_buf='val_result.csv')