In [None]:
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime

import cairosvg
import numpy as np
import pandas as pd
import skunk
import torch
from matplotlib.offsetbox import AnnotationBbox
from scipy.stats import spearmanr


# Insert a placeholder application folder at position 1 of the module search
# path. NOTE(review): the path is a hard-coded placeholder -- point it at the
# real project folder (ideally via a configurable/relative path) before running.
sys.path.insert(1, '/path/to/application/app/folder')
# Date and human-readable timestamp captured at the moment this cell runs.
today_date = datetime.today().date()
time_now = datetime.today().ctime()

# packages for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Download the IBM Plex Mono font only when it is not already cached next to
# the notebook, so re-running this cell works offline and does not re-fetch it.
font_path = 'IBMPlexMono-Regular.ttf'
if not os.path.exists(font_path):
    # Fetch to a temporary name and rename afterwards: an interrupted download
    # then never leaves a truncated file that would pass the existence check.
    font_partial_path = font_path + '.part'
    urllib.request.urlretrieve('https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf', font_partial_path)
    os.replace(font_partial_path, font_path)

# Register the font file with Matplotlib under the family name 'plexmono'.
fe = font_manager.FontEntry(
    fname=font_path,
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Notebook-wide plot styling; 'font.family' refers to the entry registered above.
plt.rcParams.update({'axes.facecolor':'#f5f4e9',
            'grid.color' : '#AAAAAA',
            'axes.edgecolor':'#333333',
            'figure.facecolor':'#FFFFFF',
            'axes.grid': False,
            'axes.prop_cycle':   plt.cycler('color', plt.cm.Dark2.colors),
            'font.family': fe.name,
            'figure.figsize': (3.5,3.5 / 1.2),
            'ytick.left': True,
            'xtick.bottom': True   ,
            'figure.dpi': 300
           })

In [None]:
# Notebook-wide configuration read by the plotting functions below.
# censor_region: part of the results-folder name; make_parity_plot also uses it
# to decide which side of the censor threshold gets which colour.
censor_region = "above"
# censor_splits: one subplot per value; part of the results-folder name and
# shown in subplot titles as "<split * 100>% sensitive data".
censor_splits = [0.1, 0.5, 0.9]

In [None]:
# helper functions
import re
from scipy.stats import spearmanr

def local_spearman(y, yhat, threshold, above=True):
    """Spearman rank correlation restricted to one side of ``threshold``.

    Pairs ``(y, yhat)`` are selected by their true value ``y``: strictly greater
    than ``threshold`` when ``above`` is truthy, otherwise less than or equal to
    it.

    Returns the correlation coefficient, or ``np.nan`` when fewer than two
    pairs fall on the requested side.
    """
    truth = np.array(y)
    selected = truth > threshold if above else truth <= threshold

    truth_side = truth[selected]
    preds_side = np.array(yhat)[selected]

    # A rank correlation needs at least two points.
    if len(truth_side) <= 1:
        return np.nan

    rho, _ = spearmanr(truth_side, preds_side)
    return rho


def inset_fig_placeholder(i, ax, xy, censor_type, no_noise=True):
    '''
    Add an empty, named skunk box to ``ax`` with an arrow pointing at ``xy``.

    The box is a 45x45 ``skunk.Box`` named ``{censor_type}_no-noise{i}``
    (top-left of the axes) or ``{censor_type}_max-noise{i}`` (top-right),
    depending on ``no_noise``. ``xy`` is given in data coordinates; the box
    position is given in axes-fraction coordinates.

    Workflow this placeholder belongs to:
    1. plot the inset figures and save them as SVG files
    2. create the skunk annotation boxes (this function)
    3. call skunk.insert last, after the legend has been added

    Returns ``ax``.
    '''
    if no_noise:
        tag, connectionstyle, xybox = 'no-noise', "arc3,rad=-0.2", (0.01, 0.97)
    else:
        tag, connectionstyle, xybox = 'max-noise', "arc3,rad=0.2", (0.76, 0.97)

    arrow = dict(
        arrowstyle='->,head_length=0.4,head_width=0.2',
        connectionstyle=connectionstyle,
        fc="w",
    )
    placeholder = skunk.Box(45, 45, f'{censor_type}_{tag}{i}')
    ax.add_artist(
        AnnotationBbox(
            placeholder,
            xy,  # point the arrow targets
            xybox=xybox,  # where the box itself sits
            xycoords='data',
            boxcoords=("axes fraction", "axes fraction"),
            box_alignment=(0, 1),
            arrowprops=arrow,
        )
    )
    return ax
 
    
def make_parity_plot(censor_type, censor_split, censor_region, censor_interval):
    """Save a small parity scatter plot (true vs. predicted) as SVG for use as an inset.

    Data comes from the first trial of the first run stored in
    ``all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json``.

    Parameters
    ----------
    censor_type : str
        'omit', 'xnoise' or 'ynoise'; selects the results folder and how the
        prediction key is built from ``censor_interval``.
    censor_split : float
        Used in the results-folder name and in the output file name.
    censor_region : str
        Used in the results-folder name. With 'above', points at or above the
        threshold are drawn in 'C1' and the rest in 'C0'; any other value
        swaps the two colours.
    censor_interval : float
        Omission fraction / noise level whose predictions are plotted.

    Returns
    -------
    str
        Path of the SVG file that was written (inside ``figures/``).

    Raises
    ------
    KeyError
        If ``censor_type`` is not one of the three known types.
    """
    file_name = f'all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json'
    with open(file_name, 'r') as f:
        content = f.read()
    # The file is split on "Run from today: ...[" header lines; the text after
    # the first header, with its leading '[' restored, is parsed as a JSON list.
    parts = re.split(r'\nRun from today: .*\[\n', content)
    all_trials_results = json.loads('[' + parts[1])  # just get the first run

    if censor_type == 'omit':
        task = f'omit {int(censor_interval*100)}%'
    elif censor_type == 'xnoise':
        task = f'xn_level{censor_interval:0.1f}'
    elif censor_type == 'ynoise':
        task = f'y noise level {censor_interval:0.1f}'
    else:
        raise KeyError(f'Unknown censor_type: {censor_type}')

    result = all_trials_results[0]  # grab the first trial
    threshold = result['censor_threshold']
    ytest = result['y_test']
    yhat = result['pred'][task]

    if censor_region == 'above':
        lower_color, upper_color = 'C0', 'C1'
    else:
        lower_color, upper_color = 'C1', 'C0'

    svg_filename = f'figures/{task}_split{censor_split}_yt{threshold}.svg'
    # Make sure the output folder exists so saving works on a fresh checkout.
    os.makedirs(os.path.dirname(svg_filename), exist_ok=True)

    fig1, ax1 = plt.subplots(figsize=(1, 1))
    try:
        ytest_t, yhat_t = torch.tensor(ytest), torch.tensor(yhat)
        # NOTE(review): the boundary is `>=` / `<` here, whereas local_spearman
        # splits with `>` / `<=`; confirm which side the threshold belongs to.
        is_upper = ytest_t >= threshold
        is_lower = ytest_t < threshold
        ax1.scatter(ytest_t[is_upper], yhat_t[is_upper], s=1, c=upper_color)
        ax1.scatter(ytest_t[is_lower], yhat_t[is_lower], s=1, c=lower_color)

        # Both axes share the range of the true values so the diagonal is y = x.
        min_val = min(ytest)
        max_val = max(ytest)
        ax1.plot([min_val, max_val], [min_val, max_val], c='black', linewidth=0.5)
        ax1.set_xlim(min_val, max_val)
        ax1.set_ylim(min_val, max_val)

        # remove ticks and tick labels to look simple
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Transparent background so the inset blends into the host figure.
        fig1.patch.set_alpha(0.0)
        fig1.tight_layout()
        fig1.savefig(svg_filename, bbox_inches='tight')
    finally:
        # Close exactly this figure (even on failure) instead of whatever
        # pyplot considers current, so the caller's figure is never touched.
        plt.close(fig1)
    return svg_filename

def insert_skunk_figs(censor_types, censor_splits, censor_region, censor_intervals):
    """Generate the inset parity plots and insert them into the skunk placeholders.

    For every censor type and every split, two parity plots are written with
    ``make_parity_plot``: one for the first interval (box ``{type}_no-noise{i}``)
    and one for the last interval (box ``{type}_max-noise{i}``), where ``i`` is
    the index of the split. A single censor type may be passed as a string, in
    which case ``censor_intervals`` is that type's interval sequence.

    Returns whatever ``skunk.insert`` returns for the box-name -> SVG-file mapping.
    """
    # Single-type shorthand: wrap both arguments so the loop below is uniform.
    if isinstance(censor_types, str):
        censor_types, censor_intervals = [censor_types], [censor_intervals]

    replacements = {}
    for type_idx, ctype in enumerate(censor_types):
        levels = censor_intervals[type_idx]
        for split_idx, split in enumerate(censor_splits):
            # First level -> "no-noise" box, last level -> "max-noise" box.
            for tag, level in (('no-noise', levels[0]), ('max-noise', levels[-1])):
                replacements[f'{ctype}_{tag}{split_idx}'] = make_parity_plot(
                    ctype, split, censor_region, level)
    return skunk.insert(replacements)

In [None]:
# Updated plotting function: correlation curves plus placeholders for the inset parity plots
def process_and_plot_mlp_with_inset_figs(i, ax, censor_split, censor_type, censor_intervals, tasks, metric='corr', title=True):
    """Plot mean +/- std Spearman correlation per region and add inset placeholders.

    Results are read from the first run stored in
    ``all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json``
    (``censor_region`` is the notebook-level variable). For every task the
    correlation above and at-or-below each trial's censor threshold is computed
    with ``local_spearman`` and averaged over trials.

    Parameters
    ----------
    i : int
        Index forwarded to ``inset_fig_placeholder`` for naming the boxes.
    ax : matplotlib axes
        Axes to draw on.
    censor_split : float
        Used in the results-folder name and the subplot title.
    censor_type : str
        Used in the results-folder name and the placeholder box names.
    censor_intervals : sequence
        x-values, one per entry of ``tasks``.
    tasks : sequence of str
        Keys into each trial's ``pred`` mapping.
    metric : str
        'corr' sets the y-range to (0, 1.7); anything else to (0, 2.0).
    title : bool
        Whether to set the "<split>% sensitive data" title.

    Returns
    -------
    The ``censor_intervals`` argument, unchanged.
    """
    file_name = f'all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json'
    with open(file_name, 'r') as f:
        content = f.read()
    parts = re.split(r'\nRun from today: .*\[\n', content)
    all_trials_results = json.loads('[' + parts[1])  # just get the first run

    mean_above, std_above = [], []
    mean_below, std_below = [], []
    for task in tasks:
        correlations_above = []
        correlations_below = []
        for result in all_trials_results:
            threshold = result['censor_threshold']
            ytest = result['y_test']
            yhat = result['pred'][task]
            correlations_above.append(local_spearman(ytest, yhat, threshold, above=True))
            correlations_below.append(local_spearman(ytest, yhat, threshold, above=False))

        # NOTE(review): local_spearman may return NaN; np.mean/np.std propagate it.
        mean_above.append(np.mean(correlations_above))
        mean_below.append(np.mean(correlations_below))
        std_above.append(np.std(correlations_above))
        std_below.append(np.std(correlations_below))

    # The sensitive region is the censored side. Follow the same convention as
    # make_parity_plot so that 'C1' / "Sensitive data" stays on the censored
    # region when censor_region is not 'above'.
    if censor_region == 'above':
        sensitive, non_sensitive = (mean_above, std_above), (mean_below, std_below)
    else:
        sensitive, non_sensitive = (mean_below, std_below), (mean_above, std_above)

    for (mean, std), label, marker, color in (
        (sensitive, 'Sensitive data', 'x', 'C1'),
        (non_sensitive, 'Non-sensitive data', '^', 'C0'),
    ):
        ax.plot(censor_intervals, mean, label=label, marker=marker, color=color)
        ax.fill_between(
            censor_intervals, np.subtract(mean, std), np.add(mean, std), alpha=0.2, color=color)

    # Placeholders for inset figures: each box gets two arrows, one to each curve.
    no_noise_xcoord = censor_intervals[0]
    max_noise_xcoord = censor_intervals[-1]

    # Box for the first interval (parity plot without censoring).
    inset_fig_placeholder(i, ax, (no_noise_xcoord, mean_above[0]), censor_type, no_noise=True)
    inset_fig_placeholder(i, ax, (no_noise_xcoord, mean_below[0]), censor_type, no_noise=True)

    # Box for the last interval (parity plot at maximum censoring).
    inset_fig_placeholder(i, ax, (max_noise_xcoord, mean_above[-1]), censor_type, no_noise=False)
    inset_fig_placeholder(i, ax, (max_noise_xcoord, mean_below[-1]), censor_type, no_noise=False)

    ax.autoscale(enable=True, axis='x', tight=True)
    # The y-range extends past the data so the inset boxes have room at the top.
    if metric == 'corr':
        ax.set_ylim(0, 1.7)
    else:
        ax.set_ylim(0, 2.0)
    ax.grid(True)
    if title:
        ax.set_title(f'{censor_split * 100:.0f}% sensitive data')

    return censor_intervals

In [None]:
def process_and_plot_mlp(ax, censor_split, censor_type, censor_intervals, tasks):
    """Plot mean +/- std Spearman correlation for the sensitive and non-sensitive regions.

    Results are read from the first run stored in
    ``all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json``
    (``censor_region`` is the notebook-level variable). For every task the
    correlation above and at-or-below each trial's censor threshold is computed
    with ``local_spearman`` and averaged over trials.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    censor_split : float
        Used in the results-folder name and the subplot title.
    censor_type : str
        Used in the results-folder name.
    censor_intervals : sequence
        x-values, one per entry of ``tasks``.
    tasks : sequence of str
        Keys into each trial's ``pred`` mapping.
    """
    file_name = f'all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json'
    with open(file_name, 'r') as f:
        content = f.read()
    parts = re.split(r'\nRun from today: .*\[\n', content)
    all_trials_results = json.loads('[' + parts[1])  # just get the first run

    mean_above, std_above = [], []
    mean_below, std_below = [], []
    for task in tasks:
        correlations_above = []
        correlations_below = []
        for result in all_trials_results:
            threshold = result['censor_threshold']
            ytest = result['y_test']
            yhat = result['pred'][task]
            correlations_above.append(local_spearman(ytest, yhat, threshold, above=True))
            correlations_below.append(local_spearman(ytest, yhat, threshold, above=False))

        # NOTE(review): local_spearman may return NaN; np.mean/np.std propagate it.
        mean_above.append(np.mean(correlations_above))
        mean_below.append(np.mean(correlations_below))
        std_above.append(np.std(correlations_above))
        std_below.append(np.std(correlations_below))

    # The sensitive region is the censored side. Follow the same convention as
    # make_parity_plot so that 'C1' / "Sensitive data" stays on the censored
    # region when censor_region is not 'above'.
    if censor_region == 'above':
        sensitive, non_sensitive = (mean_above, std_above), (mean_below, std_below)
    else:
        sensitive, non_sensitive = (mean_below, std_below), (mean_above, std_above)

    for (mean, std), label, marker, color in (
        (sensitive, 'Sensitive data', 'x', 'C1'),
        (non_sensitive, 'Non-sensitive data', '^', 'C0'),
    ):
        ax.plot(censor_intervals, mean, label=label, marker=marker, color=color)
        ax.fill_between(
            censor_intervals, np.subtract(mean, std), np.add(mean, std), alpha=0.2, color=color)

    ax.set_ylim(0, 1)
    ax.set_title(f'{censor_split * 100:.0f}% sensitive data')
    
def process_and_plot_mlp_rmse(ax, censor_split, censor_type, censor_intervals, tasks):
    """Plot mean +/- std test error for the sensitive and non-sensitive regions.

    Results are read from the first run stored in
    ``all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json``
    (``censor_region`` is the notebook-level variable). For every task the
    per-trial ``upper_error`` and ``lower_error`` values are averaged.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    censor_split : float
        Used in the results-folder name and the subplot title.
    censor_type : str
        Used in the results-folder name.
    censor_intervals : sequence
        x-values, one per entry of ``tasks``.
    tasks : sequence of str
        Keys into each trial's ``upper_error`` / ``lower_error`` mappings.
    """
    file_name = f'all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json'
    with open(file_name, 'r') as f:
        content = f.read()
    parts = re.split(r'\nRun from today: .*\[\n', content)
    all_trials_results = json.loads('[' + parts[1])  # just get the first run

    mean_above, std_above = [], []
    mean_below, std_below = [], []
    for task in tasks:
        rmse_above = [result['upper_error'][task] for result in all_trials_results]
        rmse_below = [result['lower_error'][task] for result in all_trials_results]

        mean_above.append(np.mean(rmse_above))
        mean_below.append(np.mean(rmse_below))
        std_above.append(np.std(rmse_above))
        std_below.append(np.std(rmse_below))

    # The sensitive region is the censored side. Follow the same convention as
    # make_parity_plot so that 'C1' / "Sensitive data" stays on the censored
    # region when censor_region is not 'above'.
    # NOTE(review): assumes 'upper_error' / 'lower_error' are the errors above /
    # below the censor threshold -- confirm against the code that writes history.json.
    if censor_region == 'above':
        sensitive, non_sensitive = (mean_above, std_above), (mean_below, std_below)
    else:
        sensitive, non_sensitive = (mean_below, std_below), (mean_above, std_above)

    for (mean, std), label, marker, color in (
        (sensitive, 'Sensitive data', 'x', 'C1'),
        (non_sensitive, 'Non-sensitive data', '^', 'C0'),
    ):
        ax.plot(censor_intervals, mean, label=label, marker=marker, color=color)
        ax.fill_between(
            censor_intervals, np.subtract(mean, std), np.add(mean, std), alpha=0.2, color=color)

    ax.set_ylim(0, 1.8)
    ax.set_title(f'{censor_split * 100:.0f}% sensitive data', fontsize=12)
    ax.grid(True)

In [None]:
# combine all MLP results

# todo: try ylim(0,1) and place inset figures outside grid area
def plot_everything():
    """Build the 3x3 Spearman-correlation overview and return it as an SVG string.

    Rows are the three censor types (omission, feature noise, label noise);
    columns are the entries of the notebook-level ``censor_splits``. Each panel
    is drawn with ``process_and_plot_mlp_with_inset_figs``; once the legend and
    layout are in place, ``insert_skunk_figs`` fills the inset placeholders.
    The figure is closed before returning.
    """
    combined_fig, axs = plt.subplots(3, 3, figsize=(10,10), sharey=True) # 3 splits for each of 3 censor types
    ytitle = 'Spearman Correlation'

    omit_fractions = np.linspace(0, 1, int(1/0.1+1))
    x_noise_levels = np.linspace(0, 2, int(2/0.1+1))
    y_noise_levels = np.linspace(0, 10, int(2/0.2+1))

    # One entry per row: (censor type, x-values, result keys, x-axis label).
    rows = (
        ('omit',
         omit_fractions,
         [f'omit {int(frac*100)}%' for frac in omit_fractions],
         'Percentage of Sensitive Data Omitted from Training Data\n '),
        ('xnoise',
         x_noise_levels,
         [f'xn_level{level:0.1f}' for level in x_noise_levels],
         r'Level of Feature Noise ($\delta X$) Applied to Sensitive Data in Training Data' + '\n '),
        ('ynoise',
         y_noise_levels,
         [f'y noise level {level}' for level in y_noise_levels],
         r'Level of Label Noise ($\delta y$) Applied to Sensitive Data in Training Data'),
    )

    all_censor_intervals = []
    for row, (ctype, levels, tasks, xlabel) in enumerate(rows):
        all_censor_intervals.append(levels)
        axs[row, 0].set_ylabel(ytitle, fontsize=12)
        # The x-label is attached to the middle column only.
        axs[row, 1].set_xlabel(xlabel, fontsize=16)
        for i, split in enumerate(censor_splits):
            process_and_plot_mlp_with_inset_figs(i, axs[row, i], split, ctype, levels, tasks)

    plt.subplots_adjust(hspace=1.0)
    legend_handles = [
        plt.Line2D([0], [0], marker=marker, color=color, lw=2, label=label)
        for marker, color, label in (
            ('x', 'C1', 'Sensitive Labels'),
            ('^', 'C0', 'Non-sensitive Labels'),
        )
    ]
    combined_fig.legend(handles=legend_handles, bbox_to_anchor=(0.97, 0.06))

    combined_fig.tight_layout(rect=[0, 0.05, 1, 1])

    # Inset SVGs go in last, after the legend and layout are final.
    svg = insert_skunk_figs(['omit', 'xnoise', 'ynoise'], censor_splits, censor_region, all_censor_intervals)
    plt.close()
    return svg

In [None]:
# Build the combined correlation figure (returned as an SVG string) and show it inline.
svg = plot_everything()
skunk.display(svg)

# Persist the SVG text as-is.
# NOTE(review): the date in both output file names is hard-coded.
with open('mlp_corr_with_all_titles_2024-12-31.svg', 'w') as f:
    f.write(svg)

# Rasterise the same SVG to PNG.
cairosvg.svg2png(bytestring=svg, write_to='mlp_corr_with_all_titles_2024-12-31.png', dpi=300)

In [None]:
# omission overview - RMSE
# One panel per entry of censor_splits, drawn by process_and_plot_mlp_rmse.

censor_type = 'omit'
# 11 omission fractions from 0 to 1; the result keys have the form "omit <percent>%".
omit_fractions = np.linspace(0, 1, int(1/0.1+1))
tasks = [f'omit {int(omit_frac*100)}%' for omit_frac in omit_fractions]
censor_intervals = omit_fractions 

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
# y-label on the first panel only, x-label on the middle panel only.
axs[0].set_ylabel('Test RMSE', fontsize=16)
axs[1].set_xlabel('% sensitive data omitted from training data', labelpad=10, fontsize=16)

for i, split in enumerate(censor_splits):
    process_and_plot_mlp_rmse(axs[i], split, censor_type, censor_intervals, tasks)

# Hand-built legend entries using the same markers/colours as the plotted curves.
legend_handles = [
    plt.Line2D([0], [0], marker='x', color='C1', lw=2, label='Sensitive Labels'),
    plt.Line2D([0], [0], marker='^', color='C0', lw=2, label='Non-sensitive Labels')
]
fig.legend(handles=legend_handles, loc='center', bbox_to_anchor=(0.92, 0.5))

# Keep the panels inside the left 85% of the figure; the legend is anchored to the right of that.
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.savefig('overview_mlp_omission_rmse.png', dpi=300)
plt.show()

In [None]:
# x-noise overview - RMSE
# One panel per entry of censor_splits, drawn by process_and_plot_mlp_rmse.

censor_type = 'xnoise'
# 21 feature-noise levels from 0 to 2; the result keys have the form "xn_level<level, 1 decimal>".
x_noise_levels = np.linspace(0, 2, int(2/0.1+1))
tasks = [f'xn_level{x:0.1f}' for x in x_noise_levels]
censor_intervals = x_noise_levels 

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
# y-label on the first panel only, x-label on the middle panel only.
axs[0].set_ylabel('Test RMSE', fontsize=16)
axs[1].set_xlabel('Feature noise level applied to sensitive data in training data', labelpad=10, fontsize=16)

for i, split in enumerate(censor_splits):
    process_and_plot_mlp_rmse(axs[i], split, censor_type, censor_intervals, tasks)

# Hand-built legend entries using the same markers/colours as the plotted curves.
legend_handles = [
    plt.Line2D([0], [0], marker='x', color='C1', lw=2, label='Sensitive Labels'),
    plt.Line2D([0], [0], marker='^', color='C0', lw=2, label='Non-sensitive Labels')
]
fig.legend(handles=legend_handles, loc='center', bbox_to_anchor=(0.92, 0.5))

# Keep the panels inside the left 85% of the figure; the legend is anchored to the right of that.
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.savefig('overview_mlp_xnoise_rmse.png', dpi=300)
plt.show()

In [None]:
# y-noise overview - RMSE
# One panel per entry of censor_splits, drawn by process_and_plot_mlp_rmse.

censor_type = 'ynoise'
# int(2/0.2+1) evaluates to 11, giving label-noise levels 0, 1, ..., 10.
# The result keys use the default string form of each level ("y noise level 0.0", ...).
y_noise_levels = np.linspace(0, 10, int(2/0.2+1))
tasks = [f'y noise level {y}' for y in y_noise_levels]
censor_intervals = y_noise_levels 

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
# y-label on the first panel only, x-label on the middle panel only.
axs[0].set_ylabel('Test RMSE', fontsize=16)
axs[1].set_xlabel('Label noise level applied to sensitive data in training data', labelpad=10, fontsize=16)

for i, split in enumerate(censor_splits):
    process_and_plot_mlp_rmse(axs[i], split, censor_type, censor_intervals, tasks)

# Hand-built legend entries using the same markers/colours as the plotted curves.
legend_handles = [
    plt.Line2D([0], [0], marker='x', color='C1', lw=2, label='Sensitive Labels'),
    plt.Line2D([0], [0], marker='^', color='C0', lw=2, label='Non-sensitive Labels')
]
fig.legend(handles=legend_handles, loc='center', bbox_to_anchor=(0.92, 0.5))

# Keep the panels inside the left 85% of the figure; the legend is anchored to the right of that.
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.savefig('overview_mlp_ynoise_rmse.png', dpi=300)
plt.show()