In [None]:
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import torch


sys.path.insert(1,'../../')
from dglgcn import compute_threshold_from_split

# Add the application folder to the module search path.
# NOTE(review): this is a placeholder path; point it at the real folder before running.
sys.path.insert(1, '/path/to/application/app/folder')
# Read the clock once so the date and the time string always describe the same
# instant (two separate datetime.today() calls could straddle midnight).
_run_started = datetime.today()
today_date = _run_started.date()
time_now = _run_started.ctime()

# packages for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Plot styling: register the IBM Plex Mono font and set the figure defaults.
FONT_URL = 'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf'
FONT_FILE = 'IBMPlexMono-Regular.ttf'

# Download the font only when it is not already on disk, so a re-run of the
# notebook neither re-fetches it nor fails when offline.
if not os.path.exists(FONT_FILE):
    urllib.request.urlretrieve(FONT_URL, FONT_FILE)

# Register the font file under the family name 'plexmono' so rcParams can refer to it.
fe = font_manager.FontEntry(
    fname=FONT_FILE,
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)

plt.rcParams.update({
    'axes.facecolor': '#f5f4e9',
    'grid.color': '#AAAAAA',
    'axes.edgecolor': '#333333',
    'figure.facecolor': '#FFFFFF',
    'axes.grid': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Dark2.colors),
    'font.family': fe.name,
    'figure.figsize': (3.5, 3.5 / 1.2),
    'ytick.left': True,
    'xtick.bottom': True,
    'figure.dpi': 300,
})

In [None]:
# Region label: interpolated into the results directory names and passed to
# compute_threshold_from_split by the parity-plot helper.
censor_region = "above"
# Split values the plotting cells loop over; each one is interpolated into a
# results directory name and passed to compute_threshold_from_split.
censor_splits = [0.1, 0.5, 0.9]

In [None]:
# Lipophilicity dataset: download once, then reuse the local CSV on later runs
# so re-executing the notebook does not need network access.
LIPO_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
LIPO_CSV = "./lipophilicity.csv"
if not os.path.exists(LIPO_CSV):
    urllib.request.urlretrieve(LIPO_URL, LIPO_CSV)

lipodata = pd.read_csv(LIPO_CSV)
# (smiles, exp) pairs; the parity-plot helper takes the second element of each
# pair as the label when computing the threshold.
data = list(zip(lipodata.smiles, lipodata.exp))

In [None]:
# Preview of individual training curves for the 'omit' experiment.
#
# This cell used to read `split` and `omit_fractions` from cells further down
# the notebook, so it raised NameError on a fresh kernel. Both are set here.
split = censor_splits[-1]  # the value the later `for split in censor_splits` loops leave behind
omit_fractions = np.linspace(0, 1, 11)  # same grid as the omit plotting cell: 0.0, 0.1, ..., 1.0

task = f'omit_results_split{split}_{censor_region}'
dir_name = f'../all_results/gcn_{task}'

for trial in range(1):  # only trial 0 is previewed here
    for omit in omit_fractions:
        file_name = f"{dir_name}/trainingcurve_trial{trial}_omit_{omit}.json"
        with open(file_name, 'r') as file:
            epochs, train_loss, val_loss = json.load(file)
        # Keep only as many epoch values as there are recorded losses.
        actual_epochs = epochs[:len(train_loss)]

        fig, ax = plt.subplots()
        ax.plot(actual_epochs, train_loss, label='Train Loss', color='C0')
        ax.plot(actual_epochs, val_loss, label='Validation Loss', color='C1')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        ax.set_title(f'Omit Fraction {omit:.1f}')
        ax.legend()  # the labels above are only visible once a legend is drawn
        plt.show()
        plt.close(fig)  # release the figure; nothing is left to do if show() already closed it

In [None]:
def plot_multiple_training_curves(censor_split, censor_type, censor_intervals, results_dir=None, n_trials=5):
    """Draw a grid of train/validation loss curves: one row per trial, one column per censor level.

    For every trial ``t`` in ``range(n_trials)`` and every value ``c`` in
    ``censor_intervals`` the file
    ``{results_dir}/trainingcurve_trial{t}_{censor_type}_{c}.json`` is read. Each
    file must contain a three-element JSON array ``[epochs, train_loss, val_loss]``.

    Parameters
    ----------
    censor_split : value interpolated into the default results directory name
        and shown in the figure title.
    censor_type : str
        Experiment tag used in the file names. ``'omit'`` and ``'xnoise'`` get
        their own column titles; any other value is titled as a Y noise level.
    censor_intervals : sequence
        One entry per column. Entries must be numeric unless ``censor_type`` is
        ``'xnoise'``, because the other titles format them with ``:.1f``.
    results_dir : str, optional
        Directory holding the JSON files. When omitted it is built from
        ``censor_type``, ``censor_split`` and the module-level ``censor_region``.
    n_trials : int, optional
        Number of trials (rows) to plot. Defaults to 5.

    Returns
    -------
    The ``matplotlib.pyplot`` module, whose current figure is the new grid, so
    callers can follow up with ``plt.savefig(...)`` / ``plt.close()``.

    Raises
    ------
    FileNotFoundError
        If any expected JSON file is missing.
    """
    if results_dir is None:
        task = f'{censor_type}_results_split{censor_split}_{censor_region}'
        results_dir = f'../all_results/gcn_{task}'

    ncols = len(censor_intervals)
    # squeeze=False keeps `axs` two-dimensional even with a single row or
    # column, so axs[row, col] indexing always works.
    fig, axs = plt.subplots(
        nrows=n_trials,
        ncols=ncols,
        sharey=True,
        squeeze=False,
        figsize=(ncols * 2 + 5, 12 * n_trials / 5),  # 12 in tall for the default 5 rows
        dpi=300,
    )

    for trial in range(n_trials):
        for col, c in enumerate(censor_intervals):
            ax = axs[trial, col]
            file_name = f"{results_dir}/trainingcurve_trial{trial}_{censor_type}_{c}.json"
            with open(file_name, 'r') as file:
                epochs, train_loss, val_loss = json.load(file)
            # Keep only as many epoch values as there are recorded losses.
            actual_epochs = epochs[:len(train_loss)]
            ax.plot(actual_epochs, train_loss, label='Train Loss', color='C0')
            ax.plot(actual_epochs, val_loss, label='Val Loss', color='C1')

            if trial == 0:
                # Column headers go on the top row only.
                if censor_type == 'omit':
                    ax.set_title(f'Omit Fraction {c:.1f}')
                elif censor_type == 'xnoise':
                    ax.set_title(f'Similarity {c}')
                else:
                    ax.set_title(f'Y Noise level {c:.1f}')
            if col == 0:
                # Row labels go on the first column only.
                ax.set_ylabel(f"Trial {trial+1}")

    # Figure-level title. plt.title() would instead have titled whichever
    # subplot was created last (the bottom-right panel).
    fig.suptitle(f'Sensitive Split {censor_split}')
    fig.tight_layout()
    return plt

def plot_multiple_parity_plots(censor_split, censor_type, censor_intervals, results_dir=None, n_trials=5):
    """Draw a grid of parity (true vs. predicted) scatter plots: one row per trial, one column per censor level.

    For every trial ``t`` in ``range(n_trials)`` and every value ``c`` in
    ``censor_intervals`` the file
    ``{results_dir}/parityplotdata_trial{t}_{censor_type}_{c}.json`` is read. Each
    file must contain an eight-element JSON array
    ``[rmse, lower_rmse, upper_rmse, corr, lower_corr, upper_corr, ytest, yhat]``;
    only the last four entries are used.

    Test points are split at a threshold obtained from
    ``compute_threshold_from_split(labels, censor_split, censor_region)``, where
    ``labels`` are the second elements of the module-level ``data`` pairs and
    ``censor_region`` is the module-level setting. Points with
    ``ytest >= threshold`` are drawn in colour C1 and labelled 'sensitive'; the
    rest are drawn in C0.
    NOTE(review): this colouring assumes the upper region is the sensitive one;
    confirm it still holds if ``censor_region`` is ever not "above".

    Parameters
    ----------
    censor_split : split value passed to ``compute_threshold_from_split``,
        interpolated into the default results directory name and shown in the
        figure title.
    censor_type : str
        Experiment tag used in the file names. ``'omit'`` and ``'xnoise'`` get
        their own column titles; any other value is titled as a Y noise level.
    censor_intervals : sequence
        One entry per column. Entries must be numeric unless ``censor_type`` is
        ``'xnoise'``, because the other titles format them with ``:.1f``.
    results_dir : str, optional
        Directory holding the JSON files. When omitted it is built from
        ``censor_type``, ``censor_split`` and ``censor_region``.
    n_trials : int, optional
        Number of trials (rows) to plot. Defaults to 5.

    Returns
    -------
    The ``matplotlib.pyplot`` module, whose current figure is the new grid, so
    callers can follow up with ``plt.savefig(...)`` / ``plt.close()``.

    Raises
    ------
    FileNotFoundError
        If any expected JSON file is missing.
    """
    if results_dir is None:
        task = f'{censor_type}_results_split{censor_split}_{censor_region}'
        results_dir = f'../all_results/gcn_{task}'

    labels = [label for _, label in data]
    threshold = compute_threshold_from_split(labels, censor_split, censor_region)

    axis_range = [-2, 5]  # shared x/y limits and the extent of the y = x reference line

    ncols = len(censor_intervals)
    # squeeze=False keeps `axs` two-dimensional even with a single row or
    # column, so axs[row, col] indexing always works.
    fig, axs = plt.subplots(
        nrows=n_trials,
        ncols=ncols,
        sharey=True,
        squeeze=False,
        figsize=(ncols * 2 + 5, 12 * n_trials / 5),  # 12 in tall for the default 5 rows
        dpi=300,
    )

    for trial in range(n_trials):
        for col, c in enumerate(censor_intervals):
            ax = axs[trial, col]
            file_name = f'{results_dir}/parityplotdata_trial{trial}_{censor_type}_{c}.json'
            with open(file_name, 'r') as file:
                # The four leading metrics (rmse, lower_rmse, upper_rmse, corr) are not plotted.
                _, _, _, _, lower_corr, upper_corr, ytest, yhat = json.load(file)

            ytest_t, yhat_t = torch.tensor(ytest), torch.tensor(yhat)
            # Build each mask once and reuse it for both the true and predicted values.
            in_upper = ytest_t >= threshold
            in_lower = ytest_t < threshold

            ax.plot(axis_range, axis_range, c='black')
            ax.scatter(ytest_t[in_upper], yhat_t[in_upper], label='sensitive', s=1, c='C1')
            ax.scatter(ytest_t[in_lower], yhat_t[in_lower], s=1, label='non-sensitive', c='C0')
            # Per-region correlations, coloured to match their point clouds.
            ax.text(0, 0, f'corr={upper_corr:.3f}', c='C1')
            ax.text(0, -1, f'corr={lower_corr:.3f}', c='C0')
            ax.set_xlim(*axis_range)
            ax.set_ylim(*axis_range)

            if trial == 0:
                # Column headers go on the top row only.
                if censor_type == 'omit':
                    ax.set_title(f'Omit Fraction {c:.1f}')
                elif censor_type == 'xnoise':
                    ax.set_title(f'Similarity {c}')
                else:
                    ax.set_title(f'Y Noise level {c:.1f}')
            if col == 0:
                # Row labels go on the first column only.
                ax.set_ylabel(f"Trial {trial+1}")

    # Figure-level title. plt.title() would instead have titled whichever
    # subplot was created last (the bottom-right panel).
    fig.suptitle(f'Sensitive Split {censor_split}')
    fig.tight_layout()
    return plt
        
    

In [None]:
# Omit-fraction grid 0.0, 0.1, ..., 1.0. round() is used instead of int() so
# floating-point error in the division can never drop the last grid point.
omit_step = 0.1
omit_fractions = np.linspace(0, 1, round(1 / omit_step) + 1)
print('omit fractions', omit_fractions)
censor_intervals = omit_fractions

# savefig does not create missing folders, so make sure they exist first.
os.makedirs('paper_figs/training_curves', exist_ok=True)
os.makedirs('paper_figs/parity_plots', exist_ok=True)

for split in censor_splits:
    # Each helper leaves its new figure as the current pyplot figure, so
    # plt.savefig / plt.close act on it. Their return value is deliberately not
    # assigned back to `plt`.
    plot_multiple_training_curves(split, 'omit', censor_intervals)
    plt.savefig(f'paper_figs/training_curves/training_curves_omit_split_{split}.svg', dpi=300)
    plt.close()

    plot_multiple_parity_plots(split, 'omit', censor_intervals)
    plt.savefig(f'paper_figs/parity_plots/parity_plots_omit_split_{split}.svg', dpi=300)
    plt.close()



In [None]:
# [low, high] bounds of each similarity bin; every pair becomes a "low-high"
# string that is part of the result file names read by the plotting helpers.
sim_scores = [
    [1.0, 1.0], # no-noise control
    [0.8, 1.0],
    [0.7, 0.8],
    [0.6, 0.7],
    [0.5, 0.6],
    [0.4, 0.5],
    [0.35, 0.4],
    [0.3, 0.35],
    [0.25, 0.3],
    [0.2, 0.25],
    [0.15, 0.2],
    [0.1, 0.15],
    [0.05, 0.1],
    [0, 0.05],
]
censor_intervals = [f"{low}-{high}" for low, high in sim_scores]
censor_type = 'xnoise'

# savefig does not create missing folders, so make sure they exist first.
os.makedirs('paper_figs/training_curves', exist_ok=True)
os.makedirs('paper_figs/parity_plots', exist_ok=True)

for split in censor_splits:
    # Each helper leaves its new figure as the current pyplot figure, which is
    # what plt.savefig writes out.
    # NOTE(review): unlike the omit cell, figures are not closed here, presumably
    # so they render inline. Add plt.close() after each savefig if memory matters.
    plot_multiple_training_curves(split, censor_type, censor_intervals)
    plt.savefig(f'paper_figs/training_curves/training_curves_{censor_type}_split_{split}.png', dpi=300)

    plot_multiple_parity_plots(split, censor_type, censor_intervals)
    plt.savefig(f'paper_figs/parity_plots/parity_plots_{censor_type}_split_{split}.png', dpi=300)

In [None]:
interval_step = 0.2
start_level = 0
end_level = 5

# Number of grid points from start_level to end_level inclusive. round() is used
# instead of int() because the float quotient can land just below the exact
# integer (e.g. 0.3 / 0.1 == 2.9999999999999996), and truncation would then
# drop the last level.
n_levels = round((end_level - start_level) / interval_step) + 1
y_noise_levels = np.linspace(start_level, end_level, n_levels)
censor_intervals = y_noise_levels
censor_type = 'ynoise'

# savefig does not create missing folders, so make sure they exist first.
os.makedirs('paper_figs/training_curves', exist_ok=True)
os.makedirs('paper_figs/parity_plots', exist_ok=True)

for split in censor_splits:
    # Each helper leaves its new figure as the current pyplot figure, which is
    # what plt.savefig writes out.
    # NOTE(review): unlike the omit cell, figures are not closed here, presumably
    # so they render inline. Add plt.close() after each savefig if memory matters.
    plot_multiple_training_curves(split, censor_type, censor_intervals)
    plt.savefig(f'paper_figs/training_curves/training_curves_{censor_type}_split_{split}.png', dpi=300)

    plot_multiple_parity_plots(split, censor_type, censor_intervals)
    plt.savefig(f'paper_figs/parity_plots/parity_plots_{censor_type}_split_{split}.png', dpi=300)