In [1]:
import tensorflow as tf

import pandas as pd
import numpy as np

from ionmob.preprocess.data import to_tf_dataset
from ionmob.preprocess.helpers import tokenizer_from_json

from scipy.stats import pearsonr

In [2]:
def calculate_mean(a, b, c):
    """Return the arithmetic mean of ``a``, ``b`` and ``c``, rounded to two decimals."""
    values = np.array([a, b, c])
    return np.round(values.mean(), 2)

def median_absolute_percent_error(y, y_hat):
    """Median of ``|y - y_hat| / y`` expressed in percent, rounded to two decimals.

    ``y`` holds the reference values and ``y_hat`` the predictions; both must
    support elementwise arithmetic (this notebook passes pandas Series).
    ``y`` is used as the divisor, so it should not contain zeros.
    """
    relative_error = np.abs((y - y_hat) / y)
    percent_error = relative_error * 100
    return np.round(np.median(percent_error), 2)

def median_absolute_error(y, y_hat):
    """Median of ``|y - y_hat|``, rounded to two decimals.

    ``y`` holds the reference values and ``y_hat`` the predictions; both must
    support elementwise subtraction (this notebook passes pandas Series).
    """
    residuals = y - y_hat
    return np.round(np.median(np.abs(residuals)), 2)

In [3]:
# Chang dataset: per-charge accuracy (MAE, MAPE, Pearson r) of the LSTM, CONV,
# GRU and SQRT CCS predictions against the measured `ccs`; result goes to `df_1`.
ds = 'Chang'

# GRU results; the same file also supplies the `ccs_sqrt_predicted` column used below.
grus = pd.read_csv(f'accuracies/GRU/{ds}_GRU.csv')
# String sort key derived from the tokenized sequence column.
grus['sequence'] = grus.apply(lambda r: ''.join(r['sequence-tokenized']), axis=1)
grus = grus.rename(columns={'ccs_predicted':'ccs_gru'})

conv = pd.read_csv(f'accuracies/CONV/{ds}_CONV.csv')

lstm = pd.read_csv(f'accuracies/MEIER/{ds}_MEIER.csv')
lstm = lstm.rename(columns={'CCS': 'ccs', 'CCS_predicted': 'ccs_predicted'})

# For Chang only, the last row of the GRU and CONV tables is discarded.
# NOTE(review): presumably so their row counts match the MEIER table — confirm.
if ds == 'Chang':
    grus = grus.iloc[:-1]
    conv = conv.iloc[:-1]

# Sort every table by charge, then sequence, so rows can be combined by position.
# NOTE(review): the positional `.values` assignments below assume the three
# files describe the same peptides and that their sequence columns sort
# identically — confirm, otherwise predictions are paired with the wrong rows.
conv = conv.sort_values(['charge', 'sequence'])
grus = grus.sort_values(['charge', 'sequence']).drop(columns=['sequence'])
lstm = lstm.sort_values(['Charge', 'Modified sequence'])

# One column per model, all living on the GRU frame (which holds `ccs` and `charge`).
grus['GRU'] = grus.ccs_gru
grus['LSTM'] = lstm.ccs_predicted.values
grus['CONV'] = conv.ccs_predicted.values
grus['SQRT'] = grus.ccs_sqrt_predicted
# grus['Ensemble'] = (grus.GRU + grus.Meier + grus.Samukhina) / 3.0

row_list = []

# Charge states 2, 3 and 4 are evaluated separately.
for charge in range(2, 5):
    tmp = grus[grus.charge == charge]

    for model in ['LSTM', 'CONV', 'GRU', 'SQRT']:
        mape = median_absolute_percent_error(tmp.ccs, tmp[model])
        # Kept as a plain scalar (no trailing comma), matching the other dataset cells.
        mae = median_absolute_error(tmp.ccs, tmp[model])
        row_list.append({'model': model, 'charge': charge, 'MAE': mae, 'MAPE': mape,
                         'pearson-R': np.round(pearsonr(tmp.ccs, tmp[model])[0], 2),
                         'dataset': 'Chang'})

# Summary table; 'pearson-R' is computed above but left out of the displayed columns.
df_1 = pd.DataFrame(row_list).sort_values(by=['model', 'charge', 'MAPE'])[['dataset',
                                                                           'model',
                                                                           'charge',
                                                                           'MAE', 'MAPE',
                                                                           #'pearson-R'
                                                                          ]]

In [4]:
# Show the Chang summary table (dataset, model, charge, MAE, MAPE).
df_1

Unnamed: 0,dataset,model,charge,MAE,MAPE
1,Chang,CONV,2,3.83,1.0
5,Chang,CONV,3,10.62,1.93
9,Chang,CONV,4,15.2,2.0
2,Chang,GRU,2,4.33,1.15
6,Chang,GRU,3,9.61,1.71
10,Chang,GRU,4,15.82,2.24
0,Chang,LSTM,2,3.95,1.04
4,Chang,LSTM,3,11.49,2.09
8,Chang,LSTM,4,20.85,2.7
3,Chang,SQRT,2,6.32,1.61


In [6]:
# Tenzer dataset: per-charge accuracy (MAE, MAPE, Pearson r) of the LSTM, CONV,
# GRU and SQRT CCS predictions against the measured `ccs`; result goes to `df_2`.
ds = 'Tenzer'

# GRU results, with the prediction column renamed and a string sort key added.
grus = pd.read_csv(f'accuracies/GRU/{ds}_GRU.csv')
grus['sequence'] = [''.join(tokens) for tokens in grus['sequence-tokenized']]
grus = grus.rename(columns={'ccs_predicted': 'ccs_gru'})

conv = pd.read_csv(f'accuracies/CONV/{ds}_CONV.csv')

lstm = pd.read_csv(f'accuracies/MEIER/{ds}_MEIER.csv').rename(
    columns={'CCS': 'ccs', 'CCS_predicted': 'ccs_predicted'}
)

# Only the Chang files have their last GRU/CONV row trimmed; inactive for this dataset.
if ds == 'Chang':
    grus = grus.iloc[:-1]
    conv = conv.iloc[:-1]

# Bring all three tables into the same (charge, sequence) order so that the
# predictions can be attached to the GRU frame purely by position.
conv = conv.sort_values(by=['charge', 'sequence'])
grus = grus.sort_values(by=['charge', 'sequence'])
grus = grus.drop(columns=['sequence'])
lstm = lstm.sort_values(by=['Charge', 'Modified sequence'])

grus = grus.assign(
    GRU=grus['ccs_gru'],
    SQRT=grus['ccs_sqrt_predicted'],
    LSTM=lstm['ccs_predicted'].to_numpy(),
    CONV=conv['ccs_predicted'].to_numpy(),
)
# grus['Ensemble'] = (grus.GRU + grus.Meier + grus.Samukhina) / 3.0

row_list = []

# Charge states 2, 3 and 4 are scored independently.
for charge in range(2, 5):
    tmp = grus.loc[grus['charge'] == charge]
    observed = tmp['ccs']

    for model in ('LSTM', 'CONV', 'GRU', 'SQRT'):
        predicted = tmp[model]
        mape = median_absolute_percent_error(observed, predicted)
        mae = median_absolute_error(observed, predicted)
        pearson_r = pearsonr(observed, predicted)[0]
        row_list.append({
            'model': model,
            'charge': charge,
            'MAE': mae,
            'MAPE': mape,
            'pearson-R': np.round(pearson_r, 2),
            'dataset': 'Tenzer',
        })

# 'pearson-R' stays in `row_list` but is not part of the displayed summary.
df_2 = pd.DataFrame(row_list).sort_values(by=['model', 'charge', 'MAPE'])
df_2 = df_2[['dataset', 'model', 'charge', 'MAE', 'MAPE']]

In [7]:
# Show the Tenzer summary table (dataset, model, charge, MAE, MAPE).
df_2

Unnamed: 0,dataset,model,charge,MAE,MAPE
1,Tenzer,CONV,2,4.26,1.17
5,Tenzer,CONV,3,10.07,1.99
9,Tenzer,CONV,4,20.59,2.86
2,Tenzer,GRU,2,4.23,1.17
6,Tenzer,GRU,3,9.09,1.8
10,Tenzer,GRU,4,18.47,2.59
0,Tenzer,LSTM,2,4.94,1.36
4,Tenzer,LSTM,3,11.31,2.24
8,Tenzer,LSTM,4,21.58,3.06
3,Tenzer,SQRT,2,5.74,1.6


In [8]:
# Files keyed 'Sara': per-charge accuracy (MAE, MAPE, Pearson r) of the LSTM,
# CONV, GRU and SQRT CCS predictions; rows are labelled 'Feola' in `df_3`.
ds = 'Sara'

# GRU results, with the prediction column renamed and a string sort key added.
grus = pd.read_csv(f'accuracies/GRU/{ds}_GRU.csv')
grus['sequence'] = [''.join(tokens) for tokens in grus['sequence-tokenized']]
grus = grus.rename(columns={'ccs_predicted': 'ccs_gru'})

conv = pd.read_csv(f'accuracies/CONV/{ds}_CONV.csv')

lstm = pd.read_csv(f'accuracies/MEIER/{ds}_MEIER.csv').rename(
    columns={'CCS': 'ccs', 'CCS_predicted': 'ccs_predicted'}
)

# Only the Chang files have their last GRU/CONV row trimmed; inactive for this dataset.
if ds == 'Chang':
    grus = grus.iloc[:-1]
    conv = conv.iloc[:-1]

# Bring all three tables into the same (charge, sequence) order so that the
# predictions can be attached to the GRU frame purely by position.
conv = conv.sort_values(by=['charge', 'sequence'])
grus = grus.sort_values(by=['charge', 'sequence'])
grus = grus.drop(columns=['sequence'])
lstm = lstm.sort_values(by=['Charge', 'Modified sequence'])

grus = grus.assign(
    GRU=grus['ccs_gru'],
    SQRT=grus['ccs_sqrt_predicted'],
    LSTM=lstm['ccs_predicted'].to_numpy(),
    CONV=conv['ccs_predicted'].to_numpy(),
)
# grus['Ensemble'] = (grus.GRU + grus.Meier + grus.Samukhina) / 3.0

row_list = []

# Charge states 2, 3 and 4 are scored independently.
for charge in range(2, 5):
    tmp = grus.loc[grus['charge'] == charge]
    observed = tmp['ccs']

    for model in ('LSTM', 'CONV', 'GRU', 'SQRT'):
        predicted = tmp[model]
        mape = median_absolute_percent_error(observed, predicted)
        mae = median_absolute_error(observed, predicted)
        pearson_r = pearsonr(observed, predicted)[0]
        row_list.append({
            'model': model,
            'charge': charge,
            'MAE': mae,
            'MAPE': mape,
            'pearson-R': np.round(pearson_r, 2),
            # The dataset label intentionally differs from the file key `ds`.
            'dataset': 'Feola',
        })

# 'pearson-R' stays in `row_list` but is not part of the displayed summary.
df_3 = pd.DataFrame(row_list).sort_values(by=['model', 'charge', 'MAPE'])
df_3 = df_3[['dataset', 'model', 'charge', 'MAE', 'MAPE']]

In [9]:
# Show the Feola summary table (dataset, model, charge, MAE, MAPE).
df_3

Unnamed: 0,dataset,model,charge,MAE,MAPE
1,Feola,CONV,2,5.27,1.37
5,Feola,CONV,3,10.5,2.11
9,Feola,CONV,4,19.28,2.86
2,Feola,GRU,2,4.32,1.15
6,Feola,GRU,3,9.11,1.79
10,Feola,GRU,4,16.29,2.37
0,Feola,LSTM,2,7.98,2.19
4,Feola,LSTM,3,13.34,2.79
8,Feola,LSTM,4,26.03,3.71
3,Feola,SQRT,2,7.81,2.08


In [14]:
# Ogata dataset: only GRU and SQRT predictions are available, both taken from
# the GRU results file. Per-charge MAE / MAPE are collected into `df_4`.
ogata = pd.read_csv('accuracies/GRU/Ogata_GRU.csv')
ogata = ogata.assign(
    GRU=ogata['ccs_predicted'],
    SQRT=ogata['ccs_sqrt_predicted'],
)

row_list = []

# Charge states 2, 3 and 4 are scored independently.
for charge in range(2, 5):
    tmp = ogata.loc[ogata['charge'] == charge]
    observed = tmp['ccs']

    for model in ('GRU', 'SQRT'):
        predicted = tmp[model]
        mape = median_absolute_percent_error(observed, predicted)
        mae = median_absolute_error(observed, predicted)
        pearson_r = pearsonr(observed, predicted)[0]
        row_list.append({
            'model': model,
            'charge': charge,
            'MAE': mae,
            'MAPE': mape,
            'pearson-R': np.round(pearson_r, 2),
            'dataset': 'Ogata',
        })

# 'pearson-R' stays in `row_list` but is not part of the displayed summary.
df_4 = pd.DataFrame(row_list).sort_values(by=['model', 'charge', 'MAPE'])
df_4 = df_4[['dataset', 'model', 'charge', 'MAE', 'MAPE']]

In [15]:
# Show the Ogata summary table (dataset, model, charge, MAE, MAPE).
df_4

Unnamed: 0,dataset,model,charge,MAE,MAPE
0,Ogata,GRU,2,4.07,0.95
2,Ogata,GRU,3,10.89,1.99
4,Ogata,GRU,4,20.73,2.7
1,Ogata,SQRT,2,10.56,2.43
3,Ogata,SQRT,3,33.61,6.05
5,Ogata,SQRT,4,30.25,4.07


In [16]:
# Tenzer-phospho dataset: only GRU and SQRT predictions are available, both
# taken from the GRU results file. Per-charge MAE / MAPE are collected into `df_5`.
tenzer_phos = pd.read_csv('accuracies/GRU/Tenzer_phos_GRU.csv')
tenzer_phos = tenzer_phos.assign(
    GRU=tenzer_phos['ccs_predicted'],
    SQRT=tenzer_phos['ccs_sqrt_predicted'],
)

row_list = []

# Charge states 2, 3 and 4 are scored independently.
for charge in range(2, 5):
    tmp = tenzer_phos.loc[tenzer_phos['charge'] == charge]
    observed = tmp['ccs']

    for model in ('GRU', 'SQRT'):
        predicted = tmp[model]
        mape = median_absolute_percent_error(observed, predicted)
        mae = median_absolute_error(observed, predicted)
        pearson_r = pearsonr(observed, predicted)[0]
        row_list.append({
            'model': model,
            'charge': charge,
            'MAE': mae,
            'MAPE': mape,
            'pearson-R': np.round(pearson_r, 2),
            'dataset': 'Tenzer-phospho',
        })

# 'pearson-R' stays in `row_list` but is not part of the displayed summary.
df_5 = pd.DataFrame(row_list).sort_values(by=['model', 'charge', 'MAPE'])
df_5 = df_5[['dataset', 'model', 'charge', 'MAE', 'MAPE']]

In [17]:
# Show the Tenzer-phospho summary table (dataset, model, charge, MAE, MAPE).
df_5

Unnamed: 0,dataset,model,charge,MAE,MAPE
0,Tenzer-phospho,GRU,2,5.13,1.19
2,Tenzer-phospho,GRU,3,9.45,1.7
4,Tenzer-phospho,GRU,4,21.99,2.88
1,Tenzer-phospho,SQRT,2,10.87,2.52
3,Tenzer-phospho,SQRT,3,23.5,4.15
5,Tenzer-phospho,SQRT,4,29.5,3.87
