In [1]:
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime

import cairosvg
#import dataframe_image as dfi
import numpy as np
import pandas as pd
import skunk
from scipy.stats import spearmanr

sys.path.insert(1,'../../')
from dglgcn import compute_threshold_from_split

# Put the application folder on the import path.
# NOTE(review): this looks like a placeholder path — confirm it points at the real app folder.
sys.path.insert(1, '/path/to/application/app/folder')

# Read the clock once so the date and the timestamp describe the same instant;
# two separate reads could straddle midnight and disagree with each other.
_session_start = datetime.today()
today_date = _session_start.date()
time_now = _session_start.ctime()

# packages & settings for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Download the IBM Plex Mono regular face into the working directory and
# register it with matplotlib under the short family name 'plexmono'.
urllib.request.urlretrieve(
    'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf',
    'IBMPlexMono-Regular.ttf',
)
fe = font_manager.FontEntry(fname='IBMPlexMono-Regular.ttf', name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Notebook-wide matplotlib defaults.
plt.rcParams.update(
    {
        # colours
        'axes.facecolor': '#f5f4e9',
        'grid.color': '#AAAAAA',
        'axes.edgecolor': '#333333',
        'figure.facecolor': '#FFFFFF',
        'axes.prop_cycle': plt.cycler('color', plt.cm.Dark2.colors),
        # grid and ticks
        'axes.grid': False,
        'ytick.left': True,
        'xtick.bottom': True,
        # typography and figure geometry
        'font.family': fe.name,
        'figure.figsize': (3.5, 3.5 / 1.2),
        'figure.dpi': 300,
    }
)

In [2]:
# Experiment configuration read by the loading cells.
# censor_region: passed to compute_threshold_from_split and embedded in the results directory name.
censor_region: str = 'above'
# censor_splits: the censor-split values whose result tables are loaded.
censor_splits: list = [0.1, 0.5, 0.9]
# run_date: date stamp embedded in the results file name (rebound by later cells).
run_date: str = '2024-05-06'

In [3]:
# Fetch the Lipophilicity CSV into the working directory.
urllib.request.urlretrieve(
    "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
    "./lipophilicity.csv",
)

# Reduce the table to a list of (smiles, exp) pairs; the raw frame keeps its
# own name so `lipodata` only ever refers to the list of pairs.
lipo_table = pd.read_csv("./lipophilicity.csv")
lipodata = list(zip(lipo_table["smiles"], lipo_table["exp"]))

In [4]:
def load_dataframe(censor_split, censor_type, output_dir=None, ending="", hasNaN=False):
    """Load the saved GCN results table for one censoring experiment.

    The results directory is named
    ``gcn_{censor_type}_results_split{censor_split}_{censor_region}{ending}`` and is
    looked up under ``../all_results`` (default) or ``<output_dir>/all_results``.
    The file inside it is ``dataframe_{run_date}.json``, or
    ``dataframe_{run_date}_revised.json`` when ``hasNaN`` is true and
    ``censor_split == 0.9``.

    Reads the notebook globals ``censor_region``, ``run_date`` and ``lipodata``
    at call time, so rebinding them between calls changes which file is loaded.

    Args:
        censor_split: Censor split value used in the directory name and for the
            printed threshold.
        censor_type: Experiment prefix used in the directory name
            (the notebook passes 'omit', 'ynoise' and 'xnoise').
        output_dir: Optional base directory that contains ``all_results``.
            A trailing path separator is accepted but not required.
        ending: Optional suffix appended to the directory name.
        hasNaN: When true, the 0.9 split loads the ``_revised`` file instead.

    Returns:
        The ``pandas.DataFrame`` parsed from the JSON results file.
    """
    task = f'{censor_type}_results_split{censor_split}_{censor_region}{ending}'
    if output_dir is None:
        results_root = '../all_results'
    else:
        # os.path.join supplies the separator, so 'some/dir' and 'some/dir/'
        # both resolve to 'some/dir/all_results' (plain string concatenation
        # produced 'some/dirall_results' for the former).
        results_root = os.path.join(output_dir, 'all_results')
    dir_name = f'{results_root}/gcn_{task}'

    # The threshold is computed only so it can be reported; it does not affect
    # which file is loaded.
    labels = [label for _, label in lipodata]
    threshold = compute_threshold_from_split(labels, censor_split, censor_region)
    print(f'For censor split {censor_split}, Threshold =',threshold)

    if hasNaN and censor_split == 0.9:
        # special case for omitting 90% sensitive data --> often gets NaN for correlation values
        file_path = f'{dir_name}/dataframe_{run_date}_revised.json'
    else:
        file_path = f'{dir_name}/dataframe_{run_date}.json'

    return pd.read_json(file_path)

In [9]:
# The omit results are read from the 2024-05-11 run; this rebinds the global
# that load_dataframe uses to build the file name.
run_date = '2024-05-11'
df1 = load_dataframe(
    censor_split=censor_splits[0],
    censor_type='omit',
    ending="_150epochs",
    hasNaN=True,
)
df1

For censor split 0.1, Threshold = 3.6


Unnamed: 0,omit frac,overall rmse,overall rmse std,lower rmse,lower rmse std,upper rmse,upper rmse std,overall corr,overall corr std,lower corr,lower corr std,upper corr,upper corr std
0,0.0,0.854813,0.054284,0.80753,0.045036,1.148644,0.114771,0.745833,0.018929,0.723968,0.021996,0.086934,0.068402
1,0.1,0.861079,0.020184,0.78715,0.02518,1.289385,0.017027,0.735844,0.01739,0.7204,0.022033,-0.01218,0.063214
2,0.2,0.854298,0.014681,0.776316,0.010976,1.294078,0.038675,0.738294,0.014284,0.726674,0.01285,-0.015097,0.056368
3,0.3,0.856777,0.01327,0.784684,0.013085,1.274783,0.029244,0.735718,0.006457,0.721928,0.00862,-0.019505,0.026809
4,0.4,0.847806,0.019722,0.775039,0.018451,1.257414,0.054387,0.740396,0.014414,0.728462,0.015377,-0.004005,0.067004
5,0.5,0.84654,0.010576,0.77028,0.013369,1.278729,0.069963,0.740669,0.006302,0.730408,0.009537,0.027167,0.044312
6,0.6,0.849569,0.015109,0.773632,0.011433,1.277015,0.068647,0.740975,0.008702,0.730889,0.008186,0.030498,0.038071
7,0.7,0.843783,0.014109,0.770727,0.012753,1.258891,0.097381,0.744091,0.0072,0.73625,0.008732,-0.00935,0.035695
8,0.8,0.859106,0.030422,0.786301,0.036815,1.26901,0.043976,0.730585,0.018831,0.718778,0.023393,0.009872,0.060157
9,0.9,0.853446,0.013102,0.78319,0.006599,1.258337,0.059451,0.736552,0.006358,0.725073,0.008178,0.005584,0.042086


In [28]:
# Keep the 'omit frac' column aside as df0, then narrow df1 to the two
# correlation columns. Because df1 is rebound here, this cell cannot be re-run
# without first re-running the cell that loads df1.
df0, df1 = df1.loc[:, ['omit frac']], df1.loc[:, ['lower corr', 'upper corr']]
df1

Unnamed: 0,lower corr,upper corr
0,0.723968,0.086934
1,0.7204,-0.01218
2,0.726674,-0.015097
3,0.721928,-0.019505
4,0.728462,-0.004005
5,0.730408,0.027167
6,0.730889,0.030498
7,0.73625,-0.00935
8,0.718778,0.009872
9,0.725073,0.005584


In [45]:
# Load the omit results for the second and third censor splits (in that order)
# and keep only the two correlation columns of each.
df2, df3 = (
    load_dataframe(split, 'omit', ending="_150epochs", hasNaN=True)[['lower corr', 'upper corr']]
    for split in (censor_splits[1], censor_splits[2])
)

For censor split 0.5, Threshold = 2.36
For censor split 0.9, Threshold = 0.52


In [46]:
# Place the three per-split correlation tables side by side, one top-level
# column key per sensitive split.
omit_df = pd.concat(
    dict(SensitiveSplit10=df1, SensitiveSplit50=df2, SensitiveSplit90=df3),
    axis=1,
)
# set_index returns a new frame, so rebind it; otherwise the CSV is written
# with the default 0..9 row index instead of the omit fraction.
omit_df = omit_df.set_index(df0['omit frac'])
omit_df.to_csv('gcn_omit_table.csv')
# todo: save using dataframe_image after installing it
omit_df

Unnamed: 0_level_0,SensitiveSplit10,SensitiveSplit10,SensitiveSplit50,SensitiveSplit50,SensitiveSplit90,SensitiveSplit90
Unnamed: 0_level_1,lower corr,upper corr,lower corr,upper corr,lower corr,upper corr
omit frac,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.0,0.723968,0.086934,0.561438,0.371851,0.478747,0.682783
0.1,0.7204,-0.01218,0.452903,0.271802,0.339216,0.526872
0.2,0.726674,-0.015097,0.458093,0.233703,0.094698,0.318302
0.3,0.721928,-0.019505,0.471033,0.257009,0.228367,0.381561
0.4,0.728462,-0.004005,0.463881,0.242119,0.112609,0.205086
0.5,0.730408,0.027167,0.493865,0.267267,0.105346,0.209811
0.6,0.730889,0.030498,0.488376,0.273452,-0.049307,0.036708
0.7,0.73625,-0.00935,0.473385,0.251018,-0.021834,0.075149
0.8,0.718778,0.009872,0.493362,0.262609,0.039874,0.141946
0.9,0.725073,0.005584,0.475103,0.243579,0.185547,0.228237


In [50]:
# The y-noise results are read from the 2024-05-11 run.
run_date = '2024-05-11'

# First split: keep the noise-level column aside as df0 and narrow df1 to the
# two correlation columns.
df1 = load_dataframe(censor_splits[0], 'ynoise')
df0, df1 = df1[['y noise level']], df1[['lower corr', 'upper corr']]

# Remaining splits, loaded in order, correlation columns only.
df2, df3 = (
    load_dataframe(split, 'ynoise')[['lower corr', 'upper corr']]
    for split in (censor_splits[1], censor_splits[2])
)

For censor split 0.1, Threshold = 3.6
For censor split 0.5, Threshold = 2.36
For censor split 0.9, Threshold = 0.52


In [51]:
# Place the three per-split correlation tables side by side, one top-level
# column key per sensitive split.
ynoise_df = pd.concat(
    dict(SensitiveSplit10=df1, SensitiveSplit50=df2, SensitiveSplit90=df3),
    axis=1,
)
# set_index returns a new frame, so rebind it; otherwise the CSV is written
# with the default 0..9 row index instead of the y noise level.
ynoise_df = ynoise_df.set_index(df0['y noise level'])
ynoise_df.to_csv('gcn_ynoise_table.csv')
ynoise_df

Unnamed: 0_level_0,SensitiveSplit10,SensitiveSplit10,SensitiveSplit50,SensitiveSplit50,SensitiveSplit90,SensitiveSplit90
Unnamed: 0_level_1,lower corr,upper corr,lower corr,upper corr,lower corr,upper corr
y noise level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.0,0.723968,0.086934,0.561438,0.371851,0.478747,0.682783
0.2,0.747395,0.095902,0.579338,0.375655,0.443202,0.697265
0.4,0.746898,0.113296,0.555389,0.365592,0.527335,0.670387
0.6,0.735435,0.104719,0.543698,0.365108,0.525239,0.666457
0.8,0.73825,0.101998,0.548451,0.348608,0.541888,0.638916
1.0,0.742593,0.102629,0.517627,0.348631,0.513152,0.633647
1.2,0.736721,0.103674,0.513882,0.343916,0.542007,0.620837
1.4,0.720238,0.106123,0.518719,0.338993,0.489379,0.596192
1.6,0.729865,0.096207,0.501464,0.342242,0.538032,0.588672
1.8,0.724703,0.103935,0.495017,0.356816,0.514227,0.560911


In [55]:
# The x-noise results are read from the 2024-05-06 run.
run_date = '2024-05-06'

# First split: keep the similarity-score column aside as df0 and narrow df1 to
# the two correlation columns.
df1 = load_dataframe(censor_splits[0], 'xnoise')
df0, df1 = df1[['similarity scores']], df1[['lower corr', 'upper corr']]

# Remaining splits, loaded in order, correlation columns only.
df2, df3 = (
    load_dataframe(split, 'xnoise')[['lower corr', 'upper corr']]
    for split in (censor_splits[1], censor_splits[2])
)

For censor split 0.1, Threshold = 3.6
For censor split 0.5, Threshold = 2.36
For censor split 0.9, Threshold = 0.52


In [56]:
# Place the three per-split correlation tables side by side, one top-level
# column key per sensitive split.
xnoise_df = pd.concat(
    dict(SensitiveSplit10=df1, SensitiveSplit50=df2, SensitiveSplit90=df3),
    axis=1,
)
# set_index returns a new frame, so rebind it; otherwise the CSV is written
# with the default 0..9 row index instead of the similarity-score bins.
xnoise_df = xnoise_df.set_index(df0['similarity scores'])
xnoise_df.to_csv('gcn_xnoise_table.csv')
xnoise_df

Unnamed: 0_level_0,SensitiveSplit10,SensitiveSplit10,SensitiveSplit50,SensitiveSplit50,SensitiveSplit90,SensitiveSplit90
Unnamed: 0_level_1,lower corr,upper corr,lower corr,upper corr,lower corr,upper corr
similarity scores,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1.0-1.0,0.723968,0.086934,0.561438,0.371851,0.478747,0.682783
0.8-1.0,0.728362,0.083342,0.551485,0.328075,0.469385,0.618586
0.7-0.8,0.733737,0.091266,0.530359,0.306976,0.459492,0.604322
0.6-0.7,0.718844,0.0784,0.536205,0.297782,0.461457,0.596626
0.5-0.6,0.714828,0.066449,0.509201,0.2607,0.401499,0.570469
0.4-0.5,0.710219,0.069312,0.492694,0.239154,0.414183,0.553829
0.35-0.4,0.71095,0.008033,0.468542,0.180642,0.415845,0.490968
0.3-0.35,0.685596,0.006672,0.457249,0.202647,0.30871,0.473961
0.25-0.3,0.70503,0.054433,0.493953,0.199417,0.381538,0.44004
0.2-0.25,0.69272,0.018776,0.425805,0.187184,0.343169,0.409927
