In [8]:
import numpy as np
from matplotlib import pyplot as plt 
import pandas as pd
import seaborn as sns
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions
import pickle
from utils.algorithms import greedy_baseline
from evaluation.evaluation import cdf_wdud, dfs2wds, dfs2mpd
import random   
import pickle 

from utils.algorithms import greedy_wasserstein
# Print the RDKit version string so the library version used for this run is visible in the output.
print(rdBase.rdkitVersion)




# Experiment configuration shared by the cells below.
tasks = ['delaney', 'sampl', 'lipo']  # keys used to index both `df` and `properties`
model_name = '_attentivefp_'
nums = ['_1', '_2', '_3', '_4', '_5']  # per-task run keys inside df[task]
# Property column name(s) evaluated for each task.
properties = {'delaney':['measured log solubility in mols per litre'], 'sampl':['expt'], 'lipo':['exp']}

# load data
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy
# of the result pickle before running.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
RESULT_PICKLE_PATH = "/Users/nakamura.tomohiro/Codes/SciRepRevision/SUBMO/save_pickle/result_df_others.p"
# Use a context manager so the file handle is closed deterministically.
with open(RESULT_PICKLE_PATH, "rb") as result_file:
    df = pickle.load(result_file)

# Passed as the third argument to dfs2wds / dfs2mpd (presumably the number of
# selected molecules -- confirm against evaluation.evaluation).
n_select = 100


2020.09.1


In [9]:
# Reuse the fingerprint-based rankings stored under run '_1' for every run of each task.
shared_ranking_keys = (
    'maxsum_dissim_ranking_maccs',
    'maxsum_dissim_ranking_ecfp',
    'maxmin_dissim_ranking_maccs',
    'maxmin_dissim_ranking_ecfp',
)
for task_name in tasks:
    for num in nums:
        reference_run = df[task_name]['_1']
        for ranking_key in shared_ranking_keys:
            df[task_name][num][ranking_key] = reference_run[ranking_key]

# Calculate Wasserstein Distances

In [10]:
# Build the wasserstein_distances table: one dfs2wds result per (task, method group).
gnn_based = ['logdet_ranking', 'maxsum_dissim_ranking', 'maxmin_dissim_ranking', 'random_ranking']
binary_based = ['maxsum_dissim_ranking_maccs', 'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp', 'maxmin_dissim_ranking_ecfp']
wasser_based = ['wgreedy_ranking']

# Collect the partial results and concatenate once at the end instead of growing
# a DataFrame with pd.concat inside the loop (which re-copies all earlier rows
# on every iteration).
distance_frames = []
for task in tasks:
    distance_frames.append(dfs2wds(df[task], gnn_based, n_select, properties[task]))

    # Same methods evaluated on the '<task>_normalize' entry of `df`; tag the
    # method names so they stay distinguishable from the rows above.
    distances = dfs2wds(df[task+'_normalize'], gnn_based, n_select, properties[task])
    distances['method'] = distances['method'].apply(lambda x: x+'_normalize')
    distance_frames.append(distances)

    distance_frames.append(dfs2wds(df[task], binary_based, n_select, properties[task]))
    distance_frames.append(dfs2wds(df[task], wasser_based, n_select, properties[task]))

# The original index of each partial frame is kept (no ignore_index), matching
# the previous incremental concatenation.
wasserstein_distances = pd.concat(distance_frames)

In [4]:
# add random mean values for rescaling
# Initialise the column as float: it is filled with float means below, and
# writing floats into an int column through .loc is deprecated upcasting in
# recent pandas releases.
wasserstein_distances['random_average'] = 0.0
# Rows of the random baseline; loop-invariant, so computed once.
is_random = wasserstein_distances['method'] == 'random_ranking'
for task in tasks:
    for prop in properties[task]:
        has_prop = wasserstein_distances['property'] == prop
        # Mean distance of the 'random_ranking' rows for this property ...
        random_average = wasserstein_distances.loc[is_random & has_prop, 'distance'].mean()
        # ... written to every row of the same property.
        wasserstein_distances.loc[has_prop, 'random_average'] = random_average

# Distance expressed relative to the random-baseline mean of the same property.
wasserstein_distances['scaled_distance'] = wasserstein_distances['distance'] / wasserstein_distances['random_average']

In [5]:
# save wdud values
# Context manager guarantees the pickle is flushed and closed before it is re-read later.
with open("./evaluation/wdud_values_others.p", "wb") as wdud_file:
    pickle.dump(wasserstein_distances, wdud_file)

# Aggregate only the numeric columns: mean()/std() over string columns was
# silently dropped by older pandas and raises on pandas >= 2.0.
numeric_columns = list(wasserstein_distances.select_dtypes(include='number').columns)
wdud_by_method = wasserstein_distances.groupby(['method', 'property'])[numeric_columns]

df_mean = wdud_by_method.mean()
df_mean.to_csv("./evaluation/wdud_mean_others.csv")

df_std = wdud_by_method.std()
df_std.to_csv("./evaluation/wdud_std_others.csv")

# Barplot Wasserstein Distances

In [11]:
# Reload the saved WDUD table from disk so the plotting cells can start from the pickle.
wdud_pickle = './evaluation/wdud_values_others.p'
with open(wdud_pickle, 'rb') as wdud_file:
    wasserstein_distances = pickle.load(wdud_file)

# Legend labels used by the plots below.
main_method_labels = [
    'SubMo-GNN',
    'MS-MK',
    'MM-MK',
    'MS-EF',
    'MM-EF',
    'WG-GNN',
    'Random',
]
ablation_method_labels = [
    'SubMo w/o N',
    'SubMo w/ N',
    'MS w/o N',
    'MS w/ N',
    'MM w/o N',
    'MM w/ N',
    'Random',
]

In [None]:
# Methods shown in the comparison plot. This list is also the hue (bar) order,
# so it must line up one-to-one with main_method_labels used for the legend.
methods = ['logdet_ranking_normalize', 'maxsum_dissim_ranking_maccs', 'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp', 'maxmin_dissim_ranking_ecfp', 'wgreedy_ranking', 'random_ranking']
plotdf = wasserstein_distances[wasserstein_distances['method'].isin(methods)]

# One entry per output figure: the property values to plot ...
properties_lists = [
    ['measured log solubility in mols per litre', 'expt', 'exp'],
]

# ... and the tick label shown for each of them, in the same order.
property_labels_lists = [
    ['ESOL', 'SAMPL', 'Lipophilicity'],  
]

# The loop variable is deliberately NOT called `properties`: that name is the
# task -> property dict used by the computation cells, and rebinding it to a
# list here would break re-running them.
for i, (selected_properties, property_labels) in enumerate(zip(properties_lists, property_labels_lists)):
    plotdf_i = plotdf[plotdf['property'].isin(selected_properties)]

    sns.set(font_scale=2)
    # Create exactly one figure per plot (no stray empty figure from plt.clf()).
    fig, ax = plt.subplots(figsize=(20, 6))
    # `order` pins the x categories so the tick labels below are guaranteed to
    # match, independent of the row order in the data.
    sns.barplot(x="property", y="scaled_distance", hue='method', hue_order=methods,
                order=selected_properties, ci='sd', data=plotdf_i, ax=ax)
    ax.set(ylabel='WDUD (rescaled)', xlabel='Property')
    ax.set_xticklabels(property_labels)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=main_method_labels, loc = 'lower right', ncol=3)
    fig.tight_layout(pad=0, w_pad=0, h_pad=0)
    fig.savefig('./evaluation/weval_main_{}_others.png'.format(i+1))

# Calculate MPD

In [13]:
# Build the mpds table: one dfs2mpd result per (task, method group).
gnn_based = ['logdet_ranking', 'maxsum_dissim_ranking', 'maxmin_dissim_ranking', 'random_ranking']
binary_based = ['maxsum_dissim_ranking_maccs', 'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp', 'maxmin_dissim_ranking_ecfp']
wasser_based = ['wgreedy_ranking']

# Collect the partial results and concatenate once at the end instead of growing
# a DataFrame with pd.concat inside the loop.
mpd_frames = []
for task in tasks:
    mpd_frames.append(dfs2mpd(df[task], gnn_based, n_select, task))

    # Same methods evaluated on the '<task>_normalize' entry of `df`; tag the
    # method names so they stay distinguishable from the rows above.
    mpd = dfs2mpd(df[task+'_normalize'], gnn_based, n_select, task)
    mpd['method'] = mpd['method'].apply(lambda x: x+'_normalize')
    mpd_frames.append(mpd)

    mpd_frames.append(dfs2mpd(df[task], binary_based, n_select, task))
    mpd_frames.append(dfs2mpd(df[task], wasser_based, n_select, task))

# The original index of each partial frame is kept (no ignore_index), matching
# the previous incremental concatenation.
mpds = pd.concat(mpd_frames)

In [14]:
# Display the combined MPD table built in the previous cell.
mpds

Unnamed: 0,method,fingerprint,task,mpd,num
0,logdet_ranking,maccs,delaney,0.822897,_1
1,maxsum_dissim_ranking,maccs,delaney,0.851041,_1
2,maxmin_dissim_ranking,maccs,delaney,0.788990,_1
3,random_ranking,maccs,delaney,0.826316,_1
0,logdet_ranking,ecfp,delaney,0.935376,_1
...,...,...,...,...,...
0,wgreedy_ranking,ecfp,lipo,0.857766,_3
0,wgreedy_ranking,maccs,lipo,0.580470,_4
0,wgreedy_ranking,ecfp,lipo,0.865275,_4
0,wgreedy_ranking,maccs,lipo,0.607150,_5


In [7]:
# save mpd values
# Context manager guarantees the pickle is flushed and closed before it is re-read later.
with open("./evaluation/mpd_values_others.p", "wb") as mpd_file:
    pickle.dump(mpds, mpd_file)

# Aggregate only the 'mpd' column: the remaining non-key columns ('task', 'num')
# hold strings, which older pandas silently dropped from mean()/std() and
# pandas >= 2.0 rejects with an error.
mpd_by_method = mpds.groupby(['method', 'fingerprint'])[['mpd']]

df_mean = mpd_by_method.mean()
df_mean.to_csv("./evaluation/mpd_mean_others.csv")

df_std = mpd_by_method.std()
df_std.to_csv("./evaluation/mpd_std_others.csv")

# Barplot MPD

In [18]:
# Reload the saved MPD table from disk so this cell always starts from the pickled values.
mpd_pickle = './evaluation/mpd_values_others.p'
with open(mpd_pickle, 'rb') as mpd_file:
    mpds = pickle.load(mpd_file)

# Replace the fingerprint name with '<task> <fingerprint>' so each
# task/fingerprint pair becomes its own x category in the plot.
task_names = mpds['task']
fingerprint_names = mpds['fingerprint']
mpds['fingerprint'] = task_names + ' ' + fingerprint_names

In [None]:
# Methods shown in the MPD plot. This list is also the hue (bar) order, so it
# must line up one-to-one with main_method_labels used for the legend.
methods = ['logdet_ranking_normalize', 'maxsum_dissim_ranking_maccs', 'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp', 'maxmin_dissim_ranking_ecfp', 'wgreedy_ranking', 'random_ranking']
main_method_labels = ['SUBMO-GNN', 'MS-MK', 'MM-MK', 'MS-EF', 'MM-EF', 'WG-GNN', 'RANDOM']
plotdf = mpds[mpds['method'].isin(methods)]

# x category ('<task> <fingerprint>', as built when the MPD table is loaded)
# mapped to its tick label. Passing the keys as `order` guarantees that the
# labels match the bars regardless of the row order in the data.
fingerprint_tick_labels = {
    'delaney maccs': 'MPD-ESOL-MK',
    'delaney ecfp': 'MPD-ESOL-EF',
    'sampl maccs': 'MPD-SAMPL-MK',
    'sampl ecfp': 'MPD-SAMPL-EF',
    'lipo maccs': 'MPD-Lipophilicity-MK',
    'lipo ecfp': 'MPD-Lipophilicity-EF',
}

sns.set(font_scale=2)
# Create exactly one figure (no stray empty figure from plt.clf()).
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(x="fingerprint", y="mpd", hue='method', hue_order=methods,
            order=list(fingerprint_tick_labels), ci='sd', data=plotdf, ax=ax)
ax.set(ylabel='Value', xlabel='')
ax.set_xticklabels(list(fingerprint_tick_labels.values()))
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=main_method_labels, loc = 'lower right', ncol=2)
# Zoom the y axis to the [0.5, 1] range.
ax.set_ylim([.5, 1])
fig.tight_layout()
fig.savefig('./evaluation/mpd_eval_others.png')