In [None]:
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime

import cairosvg
#import dataframe_image as dfi
import numpy as np
import pandas as pd
import skunk
from scipy.stats import spearmanr

sys.path.insert(1,'../../')
from dglgcn import compute_threshold_from_split

# Put the application folder on the import path (placeholder path; adjust locally).
sys.path.insert(1, '/path/to/application/app/folder')
# Date object and ctime-formatted string captured when this cell runs.
today_date, time_now = datetime.today().date(), datetime.today().ctime()

# packages & settings for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Fetch the IBM Plex Mono font only when no local copy exists, so that
# "Restart & Run All" neither re-downloads it nor needs network access.
# The download goes to a temporary name first and is renamed once complete,
# so an interrupted transfer never leaves a truncated font file behind.
FONT_URL = 'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf'
FONT_FILE = 'IBMPlexMono-Regular.ttf'
if not os.path.exists(FONT_FILE):
    font_partial = FONT_FILE + '.part'
    urllib.request.urlretrieve(FONT_URL, font_partial)
    os.replace(font_partial, FONT_FILE)

# Register the downloaded file with matplotlib under the family name 'plexmono'.
fe = font_manager.FontEntry(
    fname=FONT_FILE,
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Notebook-wide plot style; 'font.family' points at the font registered above.
plt.rcParams.update({
    'axes.facecolor': '#f5f4e9',
    'grid.color': '#AAAAAA',
    'axes.edgecolor': '#333333',
    'figure.facecolor': '#FFFFFF',
    'axes.grid': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Dark2.colors),
    'font.family': fe.name,
    'figure.figsize': (3.5, 3.5 / 1.2),
    'ytick.left': True,
    'xtick.bottom': True,
    'figure.dpi': 300,
})

In [None]:
# Experiment configuration shared by the cells below.
# run_date is the date stamp embedded in the result file names; later cells
# reassign it before loading each family of results.
run_date = '2024-05-06'
# Censoring fractions; indexed positionally ([0], [1], [2]) by the loading cells.
censor_splits = [0.1, 0.5, 0.9]
# Region label used in the results directory name and passed to
# compute_threshold_from_split.
censor_region = "above"

In [None]:
# Download the Lipophilicity dataset only when no local copy exists, so a
# full re-run of the notebook works offline once the file has been fetched.
# The transfer is written to a temporary name and renamed when complete so a
# partial download is never mistaken for the finished CSV.
LIPO_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
LIPO_CSV = "./lipophilicity.csv"
if not os.path.exists(LIPO_CSV):
    lipo_partial = LIPO_CSV + ".part"
    urllib.request.urlretrieve(LIPO_URL, lipo_partial)
    os.replace(lipo_partial, LIPO_CSV)

# Keep the raw frame under its own name; `lipodata` is only ever the list of
# (smiles, exp) pairs that load_dataframe iterates over.
lipo_frame = pd.read_csv(LIPO_CSV)
lipodata = list(zip(lipo_frame["smiles"], lipo_frame["exp"]))

In [None]:
def load_dataframe(censor_split, censor_type, output_dir=None, ending="", hasNaN=False):
    """Load the saved results table of one GCN censoring experiment.

    Besides its arguments, the function reads the notebook-level globals
    ``censor_region``, ``run_date`` and ``lipodata``, so those must be set
    before it is called.

    Parameters
    ----------
    censor_split : float
        Censoring fraction; part of the results directory name and passed to
        ``compute_threshold_from_split``.
    censor_type : str
        Prefix of the results directory name (the notebook uses ``'omit'``,
        ``'ynoise'`` and ``'xnoise'``).
    output_dir : str, optional
        Folder that contains ``all_results``. Defaults to the parent
        directory (``..``). A trailing path separator is optional.
    ending : str, optional
        Suffix appended to the results directory name.
    hasNaN : bool, optional
        When True and ``censor_split == 0.9``, the ``_revised`` JSON file is
        read instead of the regular one.

    Returns
    -------
    pandas.DataFrame
        The table parsed from the selected JSON file.
    """
    task = f'{censor_type}_results_split{censor_split}_{censor_region}{ending}'
    # Join path components instead of concatenating strings so that an
    # output_dir without a trailing separator still resolves correctly.
    root = '..' if output_dir is None else output_dir
    dir_name = os.path.join(root, 'all_results', f'gcn_{task}')

    # The threshold is reported for reference only; it does not affect which
    # file is loaded.
    labels = [label for _, label in lipodata]
    threshold = compute_threshold_from_split(labels, censor_split, censor_region)
    print(f'For censor split {censor_split}, Threshold =',threshold)

    if hasNaN and censor_split == 0.9:
        # special case for omitting 90% sensitive data --> often gets NaN for
        # correlation values, so a revised results file is used instead
        # (plots of this case may want an asterisk in the title).
        file_name = f'dataframe_{run_date}_revised.json'
    else:
        file_name = f'dataframe_{run_date}.json'

    return pd.read_json(os.path.join(dir_name, file_name))

In [None]:
# The 'omit' results are read from files stamped 2024-05-11.
run_date = '2024-05-11'
first_split = censor_splits[0]
df1 = load_dataframe(first_split, 'omit', ending="_150epochs", hasNaN=True)
df1

In [None]:
# Split the loaded table: df0 keeps the 'omit frac' column (used later as the
# row index), df1 keeps only the two correlation columns.
# NOTE: df1 is overwritten here, so re-running this cell on its own raises a
# KeyError for 'omit frac'; re-run the loading cell above first.
df0, df1 = df1[['omit frac']], df1[['lower corr', 'upper corr']]
df1

In [None]:
# Load the remaining two 'omit' tables and keep only the correlation columns.
corr_columns = ['lower corr', 'upper corr']
df2, df3 = (
    load_dataframe(split, 'omit', ending="_150epochs", hasNaN=True)[corr_columns]
    for split in (censor_splits[1], censor_splits[2])
)

In [None]:
# Place the three per-split correlation tables side by side under a top-level
# column label each, then index the rows by the omitted fraction.
omit_tables = {
    "SensitiveSplit10": df1,
    "SensitiveSplit50": df2,
    "SensitiveSplit90": df3,
}
omit_df = pd.concat(omit_tables, axis=1).set_index(df0['omit frac'])
# Optional CSV export (disabled):
#omit_df.to_csv('gcn_omit_table.csv')
# todo: save using dataframe_image after installing it
omit_df

In [None]:
# The 'ynoise' results are read from files stamped 2024-05-11.
run_date = '2024-05-11'
corr_columns = ['lower corr', 'upper corr']

# The first table also supplies the noise-level column used later as the index.
ynoise_first = load_dataframe(censor_splits[0], 'ynoise')
df0 = ynoise_first[['y noise level']]
df1 = ynoise_first[corr_columns]

df2 = load_dataframe(censor_splits[1], 'ynoise')[corr_columns]
df3 = load_dataframe(censor_splits[2], 'ynoise')[corr_columns]

In [None]:
# Place the three per-split correlation tables side by side under a top-level
# column label each, then index the rows by the y-noise level.
ynoise_tables = {
    "SensitiveSplit10": df1,
    "SensitiveSplit50": df2,
    "SensitiveSplit90": df3,
}
ynoise_df = pd.concat(ynoise_tables, axis=1).set_index(df0['y noise level'])
# Optional CSV export (disabled):
#ynoise_df.to_csv('gcn_ynoise_table.csv')
ynoise_df

In [None]:
# The 'xnoise' results are read from files stamped 2024-05-06.
run_date = '2024-05-06'
corr_columns = ['lower corr', 'upper corr']

# The first table also supplies the similarity-score column used later as the index.
xnoise_first = load_dataframe(censor_splits[0], 'xnoise')
df0 = xnoise_first[['similarity scores']]
df1 = xnoise_first[corr_columns]

df2 = load_dataframe(censor_splits[1], 'xnoise')[corr_columns]
df3 = load_dataframe(censor_splits[2], 'xnoise')[corr_columns]

In [None]:
# Place the three per-split correlation tables side by side under a top-level
# column label each, then index the rows by the similarity score.
xnoise_tables = {
    "SensitiveSplit10": df1,
    "SensitiveSplit50": df2,
    "SensitiveSplit90": df3,
}
xnoise_df = pd.concat(xnoise_tables, axis=1).set_index(df0['similarity scores'])
# Optional CSV export (disabled):
#xnoise_df.to_csv('gcn_xnoise_table.csv')
xnoise_df