In [None]:
import numpy as np
import pandas as pd
import json
from dataclasses import dataclass
from datetime import datetime
import os
import sys
# Put the application folder on the import path (position 1, right after the
# script directory entry) so project-local modules can be imported later on.
sys.path.insert(1, '/path/to/application/app/folder')

# Take a single timestamp and derive both the date stamp (used in output file
# names) and the human-readable time string from it.
run_started = datetime.today()
today_date = run_started.date()
time_now = run_started.ctime()

# packages for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Fetch the IBM Plex Mono font only when it is not already on disk, so that
# re-running the notebook does not hit the network again (and still works
# offline once the file has been downloaded).
font_path = 'IBMPlexMono-Regular.ttf'
if not os.path.exists(font_path):
    urllib.request.urlretrieve(
        'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf',
        font_path,
    )

# Register the font with matplotlib under the short family name 'plexmono'.
fe = font_manager.FontEntry(
    fname=font_path,
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Global figure style applied to every plot in this notebook.
plt.rcParams.update({'axes.facecolor':'#f5f4e9',
            'grid.color' : '#AAAAAA',
            'axes.edgecolor':'#333333',
            'figure.facecolor':'#FFFFFF',
            'axes.grid': False,
            'axes.prop_cycle':   plt.cycler('color', plt.cm.Dark2.colors),
            'font.family': fe.name,
            'figure.figsize': (3.5,3.5 / 1.2),
            'ytick.left': True,
            'xtick.bottom': True   ,
            'figure.dpi': 300
           })

In [None]:
# Get the data.
# Download the Lipophilicity CSV only if there is no local copy yet; later runs
# reuse the cached file instead of fetching it again.
lipo_csv_path = "./lipophilicity.csv"
if not os.path.exists(lipo_csv_path):
    urllib.request.urlretrieve(
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
        lipo_csv_path,
    )
lipodata = pd.read_csv(lipo_csv_path)

# (smiles, exp) pairs, one per row of the CSV.
data = list(zip(lipodata["smiles"], lipodata["exp"]))

In [None]:
# Base random seed for the experiments.
seed = 511

# The boundary between sensitive and non-sensitive data is chosen EITHER as an
# explicit label threshold OR as a fraction of the data (split) -- pick one.
censor_region = 'above'
censor_split = 0.1             # 10% sensitive data, 90% non-sensitive data
censor_threshold = None

In [None]:
# Make sure the application folder is importable, without stacking a duplicate
# sys.path entry every time this cell is re-run.
app_dir = '/path/to/application/app/folder'
if app_dir not in sys.path:
    sys.path.insert(1, app_dir)
from dglgcn import compute_threshold_from_split

labels = lipodata.exp

# Honour an explicitly configured threshold; only derive one from the split
# fraction when none was given (the config offers "threshold OR split").
if censor_threshold is None:
    censor_threshold = compute_threshold_from_split(labels, censor_split, censor_region)
print(f'censor_threshold: {censor_threshold}')

In [None]:
@dataclass
class Config:
    """Model/training settings bundle; an instance is passed to the training
    wrapper as ``model_config``.

    NOTE(review): how each field is consumed is not visible in this notebook;
    the per-field notes below are read off the names and existing comments and
    should be confirmed against the training code.
    """
    lr: float = 0.005           # presumably the optimizer learning rate
    hdim: int = 128             # presumably the hidden layer width
    split: float = 0.1          # 10/10/80 test-val-train
    # batch_size (placeholder only: no such field is defined)
    epochs: int = 100           # presumably the maximum number of training epochs
    patience: int = 10          # presumably early-stopping patience (in epochs)
    min_delta: float = 1e-4     # for early stopping
    loss_func: str = 'mse'      # name of the loss function

# Default hyperparameters for every training run below.
config = Config()

# Output locations: per-run results are grouped by censoring split/region,
# figures go to a shared folder. Create both up front.
fig_dirname = 'OUTPUTS/figures'
dir_name = f'OUTPUTS/all_results/gcn_ynoise_results_split{censor_split}_{censor_region}'
for out_dir in (dir_name, fig_dirname):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
def make_noise_levels(start, stop, step):
    """Return evenly spaced noise levels from ``start`` to ``stop`` inclusive.

    The number of intervals is ``round((stop - start) / step)``. Rounding
    (rather than truncating) matters because the floating-point division can
    land just below an integer, e.g. ``0.3 / 0.1 == 2.9999999999999996``;
    truncation would then silently drop the last level.

    If ``stop - start`` is not a multiple of ``step``, the end points are kept
    and the spacing is adjusted (this is how ``np.linspace`` behaves).

    Raises:
        ValueError: if ``step`` is not positive or ``stop`` is below ``start``.
    """
    if step <= 0:
        raise ValueError(f'step must be positive, got {step}')
    if stop < start:
        raise ValueError(f'stop ({stop}) must not be smaller than start ({start})')
    n_intervals = int(round((stop - start) / step))
    return np.linspace(start, stop, n_intervals + 1)


# Grid of y-noise levels swept by the experiments.
interval_step = 0.2
start_level = 0
end_level = 5
y_noise_levels = make_noise_levels(start_level, end_level, interval_step)

print(y_noise_levels)
len(y_noise_levels)

In [None]:
from dglgcn import ynoise_train_wrapper, set_seeds, omit_train_wrapper

num_trials = 5

tasks = []

# One list per metric; each ends up holding one entry per trial, and each such
# entry is a list with one value per noise level.
(all_overall_rmse, all_lower_rmse, all_upper_rmse,
 all_corr, all_lower_corr, all_upper_corr) = ([] for _ in range(6))

# Positions 0..5 of the wrapper's result feed these lists, in this order.
metric_sinks = (
    all_overall_rmse, all_lower_rmse, all_upper_rmse,
    all_corr, all_lower_corr, all_upper_corr,
)

for trial_idx in range(num_trials):
    trial_seed = seed + trial_idx
    sim_scores = {}
    print(f'\033[46mTrial {trial_idx+1}, Seed: {trial_seed}\033[0m')

    # One row of six metrics per noise level for the current trial.
    level_rows = []
    for noise_level in y_noise_levels:
        task = f"ynoise_{noise_level}"
        print(f'\n\033[34mTask: {task}\033[0m')
        # Re-seed before every run so each noise level starts from the same
        # seeded state within a trial.
        set_seeds(trial_seed)
        # NOTE(review): random_state is the base seed, not trial_seed --
        # confirm this is intended.
        outcome = ynoise_train_wrapper(
            noise_level,
            censor_threshold=censor_threshold,
            censor_region=censor_region,
            censor_split=censor_split,
            model_config=config,
            jobname=f'trial{trial_idx}_{task}',
            dir_name=dir_name,
            random_state=seed,
            verbose=True,
            separate_train_path='lipodata_presplit/train_data_131.csv',
            separate_test_path='lipodata_presplit/test_data_131.csv',
            separate_val_path='lipodata_presplit/val_data_131.csv',
        )
        level_rows.append([outcome[k] for k in range(6)])

    # Transpose: column k of this trial's rows goes to the k-th metric list.
    for k, sink in enumerate(metric_sinks):
        sink.append([row[k] for row in level_rows])
    
# Append this run to the history log as ONE JSON document per line (JSON Lines).
# Appending pretty-printed json.dump output back to back produced a file that
# could no longer be parsed after the second run. Read the log back with:
#     [json.loads(line) for line in open(path) if line.strip()]
time_now = datetime.today().ctime()
msg = f'\nRun from today: {time_now}'
history_record = [
    msg,
    all_overall_rmse, all_lower_rmse, all_upper_rmse,
    all_corr, all_lower_corr, all_upper_corr,
]
with open(f'{dir_name}/history.json','a') as f:
    # default=float converts scalar types the json module rejects
    # (e.g. numpy float32) instead of failing after all training has finished.
    f.write(json.dumps(history_record, default=float) + '\n')

In [None]:
# Per-metric results: one row per trial, one column per noise level.
trials_by_metric = {
    'overall rmse': all_overall_rmse,
    'lower rmse': all_lower_rmse,
    'upper rmse': all_upper_rmse,
    'overall corr': all_corr,
    'lower corr': all_lower_corr,
    'upper corr': all_upper_corr,
}

# For every metric, add its mean and (population) standard deviation across
# trials, as a "<metric>" / "<metric> std" column pair.
summary_columns = {'y noise level': y_noise_levels}
for metric_name, trial_values in trials_by_metric.items():
    summary_columns[metric_name] = np.mean(trial_values, axis=0)
    summary_columns[f'{metric_name} std'] = np.std(trial_values, axis=0)

df_y = pd.DataFrame(summary_columns)

# Persist the summary next to the raw results, stamped with today's date.
df_y.to_json(f'{dir_name}/dataframe_{today_date}.json')
df_y

In [None]:
#df_y = pd.read_json(f'{dir_name}/dataframe_{today_date}.json')

# Which side of the threshold counts as "sensitive" depends on censor_region.
if censor_region=='above':
    lower_rmse_label, upper_rmse_label = 'Non-sensitive Region', 'Sensitive Region'  # s=0, s=1
else:
    lower_rmse_label, upper_rmse_label = 'Sensitive Region', 'Non-sensitive Region'

fig, ax = plt.subplots(figsize=(9,9))
noise_axis = df_y['y noise level']

# Mean RMSE per region with a +/- one standard deviation band around it.
for region, region_label in (('lower', lower_rmse_label), ('upper', upper_rmse_label)):
    mean_rmse = df_y[f'{region} rmse']
    std_rmse = df_y[f'{region} rmse std']
    ax.plot(noise_axis, mean_rmse, label=region_label, marker='o', markersize=2)
    ax.fill_between(noise_axis, mean_rmse - std_rmse, mean_rmse + std_rmse, alpha=0.2)

ax.set_title('Test Errors')
ax.set_xlabel('Y Noise Level')
ax.set_ylabel('RMSE')
ax.set_ylim(0,1.8)
ax.legend()
ax.grid(True)
fig.tight_layout()
#plt.savefig(f'{fig_dirname}/gcn_testerror_ynoise_split{censor_split}_{censor_region}_{today_date}_v2.png', dpi=300)
plt.show()

# later: you want to add omission baseline to the y-noise plot above

In [None]:
# Repeat with correlation: mean per region with a +/- one standard deviation
# band. The region labels are the ones chosen for the RMSE plot.
fig, ax = plt.subplots()
noise_axis = df_y['y noise level']

for region, region_label in (('lower', lower_rmse_label), ('upper', upper_rmse_label)):
    mean_corr = df_y[f'{region} corr']
    std_corr = df_y[f'{region} corr std']
    ax.plot(noise_axis, mean_corr, label=region_label, marker='o', markersize=2)
    ax.fill_between(noise_axis, mean_corr - std_corr, mean_corr + std_corr, alpha=0.2)

ax.set_title('Correlation vs. Y Noise Level')
ax.set_xlabel('Y Noise Level')
ax.set_ylabel('Correlation')
# The curves carry labels, so draw the legend; without it the two regions
# cannot be told apart.
ax.legend()
fig.tight_layout()
plt.show()