In [None]:
import numpy as np
import pandas as pd

# packages for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Download the IBM Plex Mono font file next to the notebook and register it
# with matplotlib's font manager under the alias 'plexmono'.
font_file = 'IBMPlexMono-Regular.ttf'
font_url = 'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf'
urllib.request.urlretrieve(font_url, font_file)

fe = font_manager.FontEntry(fname=font_file, name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Notebook-wide plot styling; individual cells may still override figsize/dpi.
plot_style = {
    'axes.facecolor': '#f5f4e9',
    'grid.color': '#AAAAAA',
    'axes.edgecolor': '#333333',
    'figure.facecolor': '#FFFFFF',
    'axes.grid': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Dark2.colors),
    'font.family': fe.name,          # the alias registered just above
    'figure.figsize': (3.5, 3.5 / 1.2),
    'ytick.left': True,
    'xtick.bottom': True,
    'figure.dpi': 300,
}
plt.rcParams.update(plot_style)

In [None]:
from dataclasses import dataclass

@dataclass
class Config:
    """Settings bundle; an instance is handed to the training wrappers as ``model_config``.

    Field meanings beyond the inline notes are defined by the consuming
    ``dglgcn`` wrappers, which are not part of this notebook.
    """
    lr: float = 5e-3
    hdim: int = 128
    # 10/10/80 test-val-train
    split: float = 0.1
    # NOTE: no batch_size field is defined (placeholder from the original config)
    epochs: int = 60
    patience: int = 5
    # for early stopping
    min_delta: float = 0.0001


# Run-level settings shared by the cells below.
config = Config()
seed = 511                      # base seed; each trial uses seed + trial index
# a numeric censor_threshold (= 2) is currently disabled
censor_region = 'above'
censor_split = 0.1              # 10/90 sensitive/non-sensitive data

dir_name = '../gcn_xnoise_results'

In [None]:
# Fetch the Lipophilicity CSV, load it, and pair every SMILES string with the
# value in its `exp` column.
lipo_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
lipo_csv = "./lipophilicity.csv"
urllib.request.urlretrieve(lipo_url, lipo_csv)

lipodata = pd.read_csv(lipo_csv)
# `data` is a list of (smiles, exp) tuples, in file order.
data = [(smi, val) for smi, val in zip(lipodata["smiles"], lipodata["exp"])]

In [None]:
# check mutations for generating SMILES using STONED
import exmol

# Use the first molecule of the dataset as the origin for sampling.
initial_smiles = lipodata.smiles[0]
print('SMILES sequence: ',initial_smiles)

# Sample a space around the origin once per fixed mutation count.
mutation_counts = [1, 3, 5]
exps = []
spaces = []
for n_mut in mutation_counts:
    stoned_kwargs = dict(
        num_samples=2500,
        alphabet=exmol.get_basic_alphabet(),
        min_mutations=n_mut,   # min == max pins the number of mutations
        max_mutations=n_mut,
    )
    space = exmol.sample_space(
        initial_smiles,
        f=lambda x: 0,         # dummy model: only the sampled molecules matter here
        batched=False,
        preset='medium',
        stoned_kwargs=stoned_kwargs,
        quiet=True,
    )
    spaces.append(space)

    e = exmol.rcf_explain(space, nmols=2)
    if not exps:
        # keep the first returned example once, ahead of any relabelled ones
        exps.append(e[0])
    # first non-origin example whose label mentions "Decrease", if any
    picked = next(
        (ei for ei in e if not ei.is_origin and "Decrease" in ei.label),
        None,
    )
    if picked is not None:
        picked.label = f"Mutations = {n_mut}"
        exps.append(picked)

# plotting: one similarity histogram per mutation count (origin entry skipped)
fig, axs = plt.subplots(1, 3, figsize=(8, 3), dpi=180, squeeze=True, sharey=True)
for ax, n_mut, space in zip(axs, mutation_counts, spaces):
    similarities = [ex.similarity for ex in space[1:]]
    ax.hist(similarities, bins=99, edgecolor="none")
    ax.set_title(f"Mutations = {n_mut}")
    ax.set_xlim(0, 1)
plt.tight_layout()
#plt.savefig("mutation-hist.png", bbox_inches="tight", dpi=180)

In [None]:
import sys
sys.path.insert(1, '/path/to/application/app/folder')

from dglgcn import xnoise_train_wrapper, set_seeds
from dglgcn import no_noise_train_wrapper, omit_train_wrapper


# [low, high] similarity-score ranges, one per task. The first entry is the
# no-noise control; the remaining entries step down towards zero similarity.
control_interval = [1.0, 1.0]  # no noise control
noise_intervals = [
    [0.8, 1.0], [0.7, 0.8], [0.6, 0.7], [0.5, 0.6],
    [0.4, 0.5], [0.3, 0.4], [0.2, 0.3],
    [0, 0.2],  # lower bound stays the int 0: it is formatted into task names as-is
]
similarity_intervals = [control_interval, *noise_intervals]

In [None]:
import json
import os
from datetime import datetime

# Date stamp reused in the output file names written by later cells.
today_date = datetime.today().date()

num_trials = 5

# Fail fast: the run log is only written after every trial has finished, so a
# missing results directory would otherwise surface after all the training.
os.makedirs(dir_name, exist_ok=True)

# One inner list per trial; each inner list has one entry per similarity interval.
all_overall_rmse = []
all_lower_rmse = []
all_upper_rmse = []
for trial in range(num_trials):
    trial_seed = seed + trial
    rmse = []
    tasks = []
    lower_rmse = []
    upper_rmse = []
    print(f'\033[46mTrial {trial+1}, Seed: {trial_seed}\033[0m')
    for i, sim_score in enumerate(similarity_intervals):
        # The first interval is the no-noise control; all others are xnoise runs.
        is_control = i == 0
        task = 'control' if is_control else f"xnoise_score{sim_score[0]}-{sim_score[1]}"
        # End with the reset code (\033[0m) so the blue does not bleed into the
        # training output that follows.
        print(f'\n\033[34mTask: {task}\033[0m')

        # Re-seed before every task so each run starts from the same trial seed.
        set_seeds(trial_seed)
        shared_kwargs = dict(
            model_config=config,
            jobname=f'trial{trial}_{task}',
            dir_name=dir_name,
            random_state=trial_seed,
            verbose=True,
        )
        if is_control:
            result = no_noise_train_wrapper(
                data,
                sensitive_split=censor_split,
                **shared_kwargs,
            )
        else:
            result = xnoise_train_wrapper(
                data,
                sim_score,
                censor_region,
                censor_split,
                **shared_kwargs,
            )

        # The first three items of the wrapper result are recorded as
        # overall / lower / upper RMSE respectively.
        tasks.append(task)
        rmse.append(result[0])
        lower_rmse.append(result[1])
        upper_rmse.append(result[2])

    all_overall_rmse.append(rmse)
    all_lower_rmse.append(lower_rmse)
    all_upper_rmse.append(upper_rmse)

# Append this run to the shared history log: a header line followed by the
# raw per-trial numbers.
time_now = datetime.today().ctime()
with open(f'{dir_name}/history.json','a') as f:
    f.write(f'\nRun from today: {time_now}')
    json.dump([all_overall_rmse, all_lower_rmse, all_upper_rmse],f)

In [None]:
# One x-axis label per similarity interval, formatted as 'low-high'.
sim_scores_strings = [f'{low}-{high}' for low, high in similarity_intervals]

# Aggregate across trials (axis 0): mean and standard deviation per interval.
per_region_trials = {
    'overall': all_overall_rmse,
    'lower': all_lower_rmse,
    'upper': all_upper_rmse,
}
summary_columns = {'similarity scores': sim_scores_strings}
for region, trial_values in per_region_trials.items():
    summary_columns[f'{region} rmse'] = np.mean(trial_values, axis=0)
    summary_columns[f'{region} rmse std'] = np.std(trial_values, axis=0)
df_x = pd.DataFrame(summary_columns)

# Persist the summary table, then display it as the cell output.
df_x.to_json(f'{dir_name}/summary_{today_date}.json')
df_x

In [None]:
# Error-bar plot of the lower/upper RMSE columns of df_x against the
# similarity-score intervals; saved as a PNG in the results directory.
plt.figure(figsize=(9,9))

# Legend labels depend on which side is censored. The setting is
# `censor_region`; the previously used name `sensitive_threshold` is never
# defined in this notebook and raised a NameError.
if censor_region == 'above':
    lower_rmse_label = 'Non-sensitive Region'
    upper_rmse_label = 'Sensitive Region'
else:
    lower_rmse_label = 'Sensitive Region'
    upper_rmse_label = 'Non-sensitive Region'

# Error bars come from the matching '... rmse std' columns.
plt.errorbar(df_x['similarity scores'], df_x['lower rmse'], yerr=df_x['lower rmse std'], capsize=5, fmt='-o', label=lower_rmse_label)
plt.errorbar(df_x['similarity scores'], df_x['upper rmse'], yerr=df_x['upper rmse std'], capsize=5, fmt='-o', label=upper_rmse_label)

plt.title('Test Errors')
plt.xlabel('Similarity Score Intervals')
plt.ylabel('RMSE')
#plt.ylim(0,1.6)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(f'{dir_name}/gcn_testerror_xnoise_{today_date}.png', dpi=300)
plt.show()

In [None]:
# Line plot with a shaded +/- one-std band of the lower/upper RMSE columns of
# df_x; uses the default figure size and is saved with a `_v2` suffix.

# Legend labels depend on which side is censored. The setting is
# `censor_region`; the previously used name `sensitive_threshold` is never
# defined in this notebook and raised a NameError.
if censor_region == 'above':
    lower_rmse_label = 'Non-sensitive Region'
    upper_rmse_label = 'Sensitive Region'
else:
    lower_rmse_label = 'Sensitive Region'
    upper_rmse_label = 'Non-sensitive Region'

plt.plot(df_x['similarity scores'], df_x['lower rmse'], label=lower_rmse_label, marker='o', markersize=2)
plt.fill_between(df_x['similarity scores'], df_x['lower rmse'] - df_x['lower rmse std'], df_x['lower rmse'] + df_x['lower rmse std'], alpha=0.2)

plt.plot(df_x['similarity scores'], df_x['upper rmse'], label=upper_rmse_label, marker='o', markersize=2)
plt.fill_between(df_x['similarity scores'], df_x['upper rmse'] - df_x['upper rmse std'], df_x['upper rmse'] + df_x['upper rmse std'], alpha=0.2)

plt.title('Test Errors')
plt.xlabel('Similarity Score Intervals')
plt.ylabel('RMSE')
#plt.ylim(0,1.6)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(f'{dir_name}/gcn_testerror_xnoise_{today_date}_v2.png', dpi=300)
plt.show()

In [None]:
# OMISSION BASELINE
# Runs the `omit` task once per trial seed (omit_fraction=1), logs the raw
# numbers to the shared history file, and summarises them in a one-row table.
num_trials = 5

# One entry per trial.
omit_overall_rmse = []
omit_lower_rmse = []
omit_upper_rmse = []
for trial in range(num_trials):
    trial_seed = seed + trial
    print(f'\033[46mTrial {trial+1}, Seed: {trial_seed}\033[0m')
    task = 'omit'
    # End with the reset code (\033[0m) so the blue does not bleed into the
    # training output that follows.
    print(f'\n\033[34mTask: {task}\033[0m')
    set_seeds(trial_seed)
    result = omit_train_wrapper(
        data,
        censor_region,
        censor_split=censor_split,
        omit_fraction=1,
        model_config=config,
        jobname=f'trial{trial}_{task}',
        dir_name=dir_name,
        random_state=trial_seed, 
        verbose=True,
    )
    # The first three items of the wrapper result are recorded as
    # overall / lower / upper RMSE respectively.
    omit_overall_rmse.append(result[0])
    omit_lower_rmse.append(result[1])
    omit_upper_rmse.append(result[2])
    
time_now = datetime.today().ctime()
with open(f'{dir_name}/history.json','a') as f:
    f.write(f'\nRun from today: {time_now} -- Omision baseline')
    json.dump([omit_overall_rmse, omit_lower_rmse, omit_upper_rmse],f)

# Every value here is a scalar, and pandas refuses to build a DataFrame from
# all-scalar values without an index ("If using all scalar values, you must
# pass an index"), so give the single summary row an explicit index.
df_omit = pd.DataFrame(
    {
        'omit rmse': np.mean(omit_overall_rmse),
        'omit rmse std': np.std(omit_overall_rmse),
        'lower rmse': np.mean(omit_lower_rmse),
        'lower rmse std': np.std(omit_lower_rmse),
        'upper rmse': np.mean(omit_upper_rmse),
        'upper rmse std': np.std(omit_upper_rmse),
    },
    index=[0],
)
df_omit

In [None]:
# plt.figure(figsize=(9,9))
# df = df_omit

# if censor_region=='above':
#     lower_rmse_label = 'Non-sensitive Region' # s=1
#     upper_rmse_label = 'Sensitive Region' # s=0
# else:
#     lower_rmse_label = 'Sensitive Region' 
#     upper_rmse_label = 'Non-sensitive Region'    

# plt.errorbar(df['omit frac'], df['lower rmse'], yerr=df['lower rmse std'], capsize=5, fmt='-o', label=lower_rmse_label)
# plt.errorbar(df['omit frac'], df['upper rmse'], yerr=df['upper rmse std'], capsize=5, fmt='-o', label=upper_rmse_label)

# plt.title('Test Errors')
# plt.xlabel('% Sensitive Data Omitted')
# plt.ylabel('RMSE')
# #plt.ylim(0,1.6)
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig(f'{dir_name}/gcn_testerror_omit_baseline_{today_date}.png', dpi=300)
# plt.show()