In [None]:
import json
import os
import re
import sys
from dataclasses import dataclass
from datetime import datetime

import cairosvg
#import dataframe_image as dfi
import numpy as np
import pandas as pd
import skunk
from scipy.stats import spearmanr

# Register the application folder on the import path at position 1 (directly
# after the first entry). The path is a placeholder and must be pointed at the
# real application folder before anything is imported from it.
sys.path.insert(1, '/path/to/application/app/folder')

# Session stamps: a `date` object and a ctime-formatted string.
today_date = datetime.now().date()
time_now = datetime.now().ctime()

# packages & settings for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Fetch the IBM Plex Mono font once and reuse the local copy afterwards, so a
# fresh "Restart & Run All" does not need network access every time.
FONT_URL = 'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf'
FONT_PATH = 'IBMPlexMono-Regular.ttf'
FONT_TMP_PATH = FONT_PATH + '.part'
if not os.path.exists(FONT_PATH):
    # Download under a temporary name and rename on success: an interrupted
    # transfer then never leaves a truncated file that later runs would
    # mistake for a valid cached font.
    urllib.request.urlretrieve(FONT_URL, FONT_TMP_PATH)
    os.replace(FONT_TMP_PATH, FONT_PATH)

# Register the font with matplotlib under the short family name 'plexmono'.
fe = font_manager.FontEntry(fname=FONT_PATH, name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Global figure style used by every plot in this notebook.
plt.rcParams.update({
    'axes.facecolor': '#f5f4e9',
    'grid.color': '#AAAAAA',
    'axes.edgecolor': '#333333',
    'figure.facecolor': '#FFFFFF',
    'axes.grid': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Dark2.colors),
    'font.family': fe.name,
    'figure.figsize': (3.5, 3.5 / 1.2),
    'ytick.left': True,
    'xtick.bottom': True,
    'figure.dpi': 300,
})

In [None]:
# Region tag that create_dataframe interpolates into the results directory name
# ("all_results/mlp_<type>_results_split<split>_<region>/history.json").
censor_region = "above"
# Split values to tabulate; each one selects its own results directory and is
# reported in the summary tables as SensitiveSplit10 / 50 / 90.
censor_splits = [0.1, 0.5, 0.9]

In [None]:
def local_spearman(y, yhat, threshold, above=True):
    """Spearman correlation restricted to one side of a threshold on ``y``.

    Parameters
    ----------
    y : array-like
        Reference values; these decide which samples are kept.
    yhat : array-like
        Predictions, indexed in parallel with ``y``.
    threshold : scalar
        Cut-off applied to ``y``.
    above : bool, default True
        If True keep samples with ``y > threshold``; otherwise keep samples
        with ``y <= threshold``.

    Returns
    -------
    float
        Spearman rank correlation of the kept ``(y, yhat)`` pairs, or
        ``np.nan`` when fewer than two samples remain.
    """
    targets = np.array(y)
    predictions = np.array(yhat)

    # Boolean selector for the requested side of the threshold.
    keep = targets > threshold if above else targets <= threshold
    kept_y = targets[keep]
    kept_yhat = predictions[keep]

    # A rank correlation needs at least two points.
    if len(kept_y) <= 1:
        return np.nan

    rho, _pvalue = spearmanr(kept_y, kept_yhat)
    return rho
    
def create_dataframe(censor_split, censor_type, censor_intervals, tasks):
    """Tabulate mean local Spearman correlations per task for one results file.

    Reads ``all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json``
    (``censor_region`` is the module-level setting), keeps only the first run
    recorded in that file and, for every task, averages ``local_spearman``
    over all trials of that run, separately for samples at or below and above
    each trial's ``censor_threshold``.

    Parameters
    ----------
    censor_split : scalar
        Split value interpolated into the results directory name.
    censor_type : str
        Experiment tag; used in the directory name and as the name of the
        first output column.
    censor_intervals : array-like
        Values for the first output column, one per entry of ``tasks``.
    tasks : sequence of str
        Keys looked up in each trial's ``result['pred']`` mapping.

    Returns
    -------
    pandas.DataFrame
        One row per task with columns ``censor_type``, ``'lower corr'``
        (mean correlation where ``y <= threshold``) and ``'upper corr'``
        (mean correlation where ``y > threshold``).

    Raises
    ------
    ValueError
        If the file contains no "Run from today: ..." header.
    """
    file_name = f'all_results/mlp_{censor_type}_results_split{censor_split}_{censor_region}/history.json'
    with open(file_name, 'r') as f:
        content = f.read()

    # The file is a log of runs, each introduced by a "Run from today: ...["
    # line; the text after the first header is the JSON array body of run #1.
    parts = re.split(r'\nRun from today: .*\[\n', content)
    if len(parts) < 2:
        raise ValueError(f"no 'Run from today' header found in {file_name}")
    all_trials_results = json.loads('[' + parts[1])  # just get the first run

    mean_above = []
    mean_below = []
    for task in tasks:
        correlations_above = []
        correlations_below = []
        for result in all_trials_results:
            threshold = result['censor_threshold']
            ytest = result['y_test']
            yhat = result['pred'][task]
            correlations_above.append(local_spearman(ytest, yhat, threshold, above=True))
            correlations_below.append(local_spearman(ytest, yhat, threshold, above=False))

        # np.mean propagates NaN: one trial with fewer than two samples in a
        # region makes that task's mean NaN for that region.
        mean_above.append(np.mean(correlations_above))
        mean_below.append(np.mean(correlations_below))

    return pd.DataFrame({
        censor_type: censor_intervals,
        'lower corr': mean_below,
        'upper corr': mean_above,
    })

In [None]:
# Omission experiment: results for the first censor split.
censor_type = 'omit'
# Fractions 0.0, 0.1, ..., 1.0. The point count is derived with round() rather
# than a bare int() so float error in the step division cannot drop a point.
omit_fractions = np.linspace(0, 1, int(round(1 / 0.1)) + 1)
# Keys looked up under result['pred'] (e.g. 'omit 10%'). round() protects the
# percentage against float truncation, which would yield a non-matching key.
tasks = [f'omit {int(round(omit_frac * 100))}%' for omit_frac in omit_fractions]
censor_intervals = omit_fractions

df1 = create_dataframe(censor_splits[0], censor_type, censor_intervals, tasks)
df1

In [None]:
# Rebuild the first-split frame here instead of narrowing the `df1` left by the
# previous cell: this cell overwrites `df1` without its 'omit' column, so
# relying on the earlier value made a second execution fail with KeyError.
df1 = create_dataframe(censor_splits[0], censor_type, censor_intervals, tasks)
df0 = df1[['omit']]  # omitted-fraction column, used later as the table index
df1 = df1[['lower corr', 'upper corr']]
df2 = create_dataframe(censor_splits[1], censor_type, censor_intervals, tasks)
df2 = df2[['lower corr', 'upper corr']]
df3 = create_dataframe(censor_splits[2], censor_type, censor_intervals, tasks)
df3 = df3[['lower corr', 'upper corr']]

In [None]:
# Place the three split frames side by side under a top-level column label per
# split, then index the rows by the omitted fraction.
omit_df = pd.concat(
    {'SensitiveSplit10': df1, 'SensitiveSplit50': df2, 'SensitiveSplit90': df3},
    axis=1,
).set_index(df0['omit'])
#omit_df.to_csv('mlp_omit_table.csv')
omit_df

In [None]:
# y-noise experiment: one frame per censor split.
censor_type = 'ynoise'
# NOTE(review): the point count is written as 2/0.2 + 1 although the range is
# 0-10; it evaluates to 11 (levels 0.0, 1.0, ..., 10.0) - confirm this matches
# the task keys stored in the results files.
y_noise_levels = np.linspace(0, 10, int(2 / 0.2 + 1))
tasks = [f'y noise level {level}' for level in y_noise_levels]
censor_intervals = y_noise_levels

# First split: keep the noise-level column separately (later used as the table
# index) and reduce the frame to its two correlation columns.
df1 = create_dataframe(censor_splits[0], censor_type, censor_intervals, tasks)
df0, df1 = df1[['ynoise']], df1[['lower corr', 'upper corr']]
# Remaining splits: only the correlation columns are needed.
df2 = create_dataframe(censor_splits[1], censor_type, censor_intervals, tasks)[['lower corr', 'upper corr']]
df3 = create_dataframe(censor_splits[2], censor_type, censor_intervals, tasks)[['lower corr', 'upper corr']]

In [None]:
# Place the three split frames side by side under a top-level column label per
# split, then index the rows by the y-noise level.
ynoise_df = pd.concat(
    {'SensitiveSplit10': df1, 'SensitiveSplit50': df2, 'SensitiveSplit90': df3},
    axis=1,
).set_index(df0['ynoise'])
#ynoise_df.to_csv('mlp_ynoise_table.csv')
ynoise_df

In [None]:
# x-noise experiment: one frame per censor split.
censor_type = 'xnoise'
# 21 levels from 0 to 2 in steps of 0.1; task keys carry the level formatted
# to one decimal place (e.g. 'xn_level0.1').
x_noise_levels = np.linspace(0, 2, int(2 / 0.1 + 1))
tasks = [f'xn_level{level:0.1f}' for level in x_noise_levels]
censor_intervals = x_noise_levels

# First split: keep the noise-level column separately (later used as the table
# index) and reduce the frame to its two correlation columns.
df1 = create_dataframe(censor_splits[0], censor_type, censor_intervals, tasks)
df0, df1 = df1[['xnoise']], df1[['lower corr', 'upper corr']]
# Remaining splits: only the correlation columns are needed.
df2 = create_dataframe(censor_splits[1], censor_type, censor_intervals, tasks)[['lower corr', 'upper corr']]
df3 = create_dataframe(censor_splits[2], censor_type, censor_intervals, tasks)[['lower corr', 'upper corr']]

In [None]:
# Place the three split frames side by side under a top-level column label per
# split, then index the rows by the x-noise level.
xnoise_df = pd.concat(
    {'SensitiveSplit10': df1, 'SensitiveSplit50': df2, 'SensitiveSplit90': df3},
    axis=1,
).set_index(df0['xnoise'])
#xnoise_df.to_csv('mlp_xnoise_table.csv')
xnoise_df