In [None]:
import numpy as np
from dataclasses import dataclass
from datetime import datetime
import json
import os
import sys
# Put the application folder on the import path.
# NOTE(review): placeholder path — presumably where mlp_fxns / plot_fxns live; confirm.
sys.path.insert(1, '/path/to/application/app/folder')

# Read the clock once so that the timestamp written to the history log and the
# date embedded in output filenames always describe the same instant (two
# separate today() calls could land on either side of midnight).
_run_started = datetime.today()
time_now = _run_started.ctime()
today_date = _run_started.date()

# packages for plotting
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import urllib.request

# Download IBM Plex Mono once and reuse the local copy on later runs, so this
# cell does not need network access every time it executes.
font_url = 'https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf'
font_file = 'IBMPlexMono-Regular.ttf'
if not os.path.exists(font_file):
    # Fetch under a temporary name and rename afterwards, so an interrupted
    # download is never mistaken for a complete font file on the next run.
    partial_file = font_file + '.part'
    urllib.request.urlretrieve(font_url, partial_file)
    os.replace(partial_file, font_file)

# Register the font with matplotlib under the family name 'plexmono'.
fe = font_manager.FontEntry(
    fname=font_file,
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)

# Notebook-wide figure style; 'font.family' points at the font registered above.
plt.rcParams.update({'axes.facecolor':'#f5f4e9',
            'grid.color' : '#AAAAAA',
            'axes.edgecolor':'#333333',
            'figure.facecolor':'#FFFFFF',
            'axes.grid': False,
            'axes.prop_cycle':   plt.cycler('color', plt.cm.Dark2.colors),
            'font.family': fe.name,
            'figure.figsize': (3.5,3.5 / 1.2),
            'ytick.left': True,
            'xtick.bottom': True   ,
            'figure.dpi': 300
           })

In [None]:
# Censoring setup for this experiment.
# censor_split and censor_region are passed to mlptask_wrapper and are also
# embedded in the output directory and figure filenames.
censor_split: float = 0.1      # 10% sensitive data, 90% non-sensitive data
censor_region: str = 'above'   # NOTE(review): presumably the side of the threshold treated as sensitive — confirm
# Left unset here; later cells read the threshold actually used from each
# trial's results['censor_threshold'].
censor_threshold = None

In [None]:
@dataclass
class Config:
    """Hyperparameters handed to ``mlptask_wrapper`` as ``model_config``.

    Notes marked "presumably" are inferred from the field name only and
    should be confirmed against the training code.
    """
    Din: int = 50               # dimensionality of the input features
    hidden_dim: int = 64        # presumably the hidden-layer width
    batchsize: int = 32
    datasize: int = 6400        # presumably the number of generated samples
    split: float = 0.1          # 10/10/80 test/val/train
    epochs: int = 60
    lr: float = 1e-3
    patience: int = 5           # presumably the early-stopping patience
    min_delta: float = 0.0001   # for early stopping

# Base seed; the trial loop derives one seed per trial from it.
seed = 511
config = Config()

# Output locations: figures go to a shared folder, raw results to a folder
# named after the censoring settings of this run.
fig_dirname = 'OUTPUTS/figures'
dir_name = f'OUTPUTS/all_results/mlp_omit_results_split{censor_split}_{censor_region}'
for out_dir in (dir_name, fig_dirname):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Sweep the omitted fraction from 0 to 1 in steps of 0.1 (11 values).
omit_fractions = np.linspace(0, 1, int(1/0.1+1))

# One task tuple per fraction: (name, 0, 0, True, omit fraction).
# NOTE(review): the two zeros and the flag presumably correspond to
# x_noise_level, y_noise_level and omit — confirm against mlptask_wrapper.
omit_tasks = [
    (f"omit {int(frac*100)}%", 0, 0, True, frac)
    for frac in omit_fractions
]
for task in omit_tasks:
    print(task)

len(omit_tasks)

In [None]:
from mlp_fxns import mlptask_wrapper


# Run the whole omit-task sweep num_trials times, each with its own seed, and
# collect the per-trial result objects. After the loop `results` still refers
# to the last trial, which later cells use for sanity plots.
num_trials = 5
all_trials_results = []
for trial in range(num_trials):
    trial_seed = seed + trial
    print(f'\033[46mTrial {trial+1}, Seed: {trial_seed}\033[0m')
    results = mlptask_wrapper(
        trial_seed,
        omit_tasks,
        censor_region,
        censor_split,
        model_config=config,
        verbose=True,
        sanitycheckplot=False
    )
    all_trials_results.append(results)


def _json_default(obj):
    """Fallback used by json for objects it cannot encode natively.

    Converts NumPy scalars (e.g. np.float32) and arrays to plain Python
    values; anything else raises TypeError exactly as json would.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


# Append this run to the history log. Each entry is a plain-text header line
# followed by a JSON document, so the file as a whole is a log, not one JSON value.
try:
    # Serialize before touching the file: if encoding fails, nothing is
    # written, instead of leaving a header plus a half-written record behind.
    payload = json.dumps(all_trials_results, indent=4, default=_json_default)
    with open(f'{dir_name}/history.json','a') as f:
        f.write(f'\n\nRun from today: {time_now}')
        f.write(payload)
    print(f'all results saved to {dir_name}/history.json')
except Exception as e:
    # Best effort: report the problem but keep the in-memory results usable.
    template = "An exception of type {0} occurred. Arguments:\n{1!r}"
    message = template.format(type(e).__name__, e.args)
    print(message)

In [None]:
# check the results of last trial 
from plot_fxns import plot_trainingcurves, plot_parityplots

# `results` is whatever the trial loop assigned last, i.e. the final trial.
# Only the first few omit tasks are plotted.
n_preview = 4
plot_trainingcurves(omit_tasks[:n_preview], results)
plot_parityplots(
    omit_tasks[:n_preview],
    results,
    threshold=results['censor_threshold'],
)

In [None]:
# Calculate mean & std across all trials, one entry per omit fraction.
# Every '*_mean' / '*_std' list ends up aligned with 'omit_fraction'.
aggregated_results = {
    'x_noise_level': 0,
    'y_noise_level': 0,
    'omit': True,
    'omit_fraction': list(omit_fractions),
    'overall_error_mean': [],
    'overall_error_std': [],
    'lower_error_mean': [],
    'lower_error_std': [],
    'upper_error_mean': [],
    'upper_error_std': [],
    'overall_corr_mean': [],
    'overall_corr_std': [],
    'lower_corr_mean': [],
    'lower_corr_std': [],
    'upper_corr_mean': [],
    'upper_corr_std': [],
}

regions = ('overall', 'lower', 'upper')
# NOTE(review): the '<region>_corr' keys are assumed to exist in each trial
# result and to be keyed by task name, mirroring '<region>_error' — confirm
# against mlp_fxns.mlptask_wrapper. Previously the corr statistics were
# computed from the overall *errors*, so the correlation columns silently
# repeated the RMSE numbers.
metrics = ('error', 'corr')

for omit_frac in omit_fractions:
    # Same naming expression as the one used when the omit tasks were built.
    task_name = f"omit {int(omit_frac*100)}%"
    for region in regions:
        for metric in metrics:
            per_trial = [
                result[f'{region}_{metric}'][task_name]
                for result in all_trials_results
            ]
            aggregated_results[f'{region}_{metric}_mean'].append(np.mean(per_trial))
            aggregated_results[f'{region}_{metric}_std'].append(np.std(per_trial))

In [None]:
# Inspect the censor threshold reported by each trial.
# The generated data differs whenever the seed changes, so one value per trial is listed.
censor_thresholds = [
    trial_result['censor_threshold']
    for trial_result in all_trials_results
]
print(censor_thresholds)

In [None]:
from plot_fxns import create_dataframe

# Turn the aggregated statistics into a table, store a dated JSON copy next to
# the raw results, and show the table as the cell output.
summary_path = f'{dir_name}/summary_{today_date}.json'
df = create_dataframe(aggregated_results)
df.to_json(summary_path)
df

In [None]:
# Test RMSE vs. omission fraction, with error bars taken from the '... std' columns.
plt.figure(figsize=(4,4))

# Sensitive series is drawn first so it takes the first colour of the cycle.
rmse_series = (
    ('s=1 RMSE', 'Sensitive data'),
    ('s=0 RMSE', 'Non-sensitive data'),
)
for column, series_label in rmse_series:
    plt.errorbar(
        df['% omitted'],
        df[column],
        yerr=df[f'{column} std'],
        marker='o',
        label=series_label,
        capsize=5,
    )

plt.title('Test Errors')
plt.xlabel('omission fraction')
plt.ylabel('RMSE')
plt.legend()
plt.ylim(0,1.8)
plt.grid(True)
plt.tight_layout()

errorbar_png = f'{fig_dirname}/mlp_testerror_omission_split{censor_split}_{censor_region}_{today_date}.png'
plt.savefig(errorbar_png,dpi=300)
plt.show()

In [None]:
# Test RMSE vs. omission fraction, drawn as a line with a shaded band of
# one '... std' column value on either side.
ax = plt.gca()
omitted = df['% omitted']

# Sensitive series first, then non-sensitive, each as line + band.
for column, series_label in (('s=1 RMSE', 'Sensitive data'),
                             ('s=0 RMSE', 'Non-sensitive data')):
    center = df[column]
    spread = df[f'{column} std']
    ax.plot(omitted, center, label=series_label, marker='o', markersize=2)
    ax.fill_between(omitted, center - spread, center + spread, alpha=0.2)

ax.set_title('Test Errors')
ax.set_xlabel('omission fraction')
ax.set_ylabel('RMSE')
ax.legend()
ax.set_ylim(0,1.8)
ax.grid(True)
plt.tight_layout()
plt.savefig(f'{fig_dirname}/mlp_testerror_omission_split{censor_split}_{censor_region}_{today_date}_v2.png',dpi=300)
plt.show()

In [None]:
# Test correlation vs. omission fraction, drawn as a line with a shaded band
# of one '... std' column value on either side. The y-range is limited to [0, 1].
omitted = df['% omitted']

# Sensitive series first so it takes the first colour of the cycle.
for column, series_label in (('s=1 corr', 'Sensitive data'),
                             ('s=0 corr', 'Non-sensitive data')):
    center = df[column]
    spread = df[f'{column} std']
    plt.plot(omitted, center, label=series_label, marker='o', markersize=2)
    plt.fill_between(omitted, center - spread, center + spread, alpha=0.2)

# The title used to read 'Test Errors' (copied from the RMSE figures) although
# this figure shows correlation, as the y-label and filename indicate.
plt.title('Test Correlation')
plt.xlabel('omission fraction')
plt.ylabel('spearman correlation')
plt.legend()
plt.ylim(0,1)
plt.grid(True)
plt.tight_layout()
plt.savefig(f'{fig_dirname}/mlp_testcorr_omission_split{censor_split}_{censor_region}_{today_date}.png',dpi=300)
plt.show()

In [None]:
# Second version of the test-correlation figure: same line + shaded-band
# drawing, but the y-range starts at -0.2 so values below zero stay visible,
# and the file is saved with a '_v2' suffix.
omitted = df['% omitted']

# Sensitive series first so it takes the first colour of the cycle.
for column, series_label in (('s=1 corr', 'Sensitive data'),
                             ('s=0 corr', 'Non-sensitive data')):
    center = df[column]
    spread = df[f'{column} std']
    plt.plot(omitted, center, label=series_label, marker='o', markersize=2)
    plt.fill_between(omitted, center - spread, center + spread, alpha=0.2)

# The title used to read 'Test Errors' (copied from the RMSE figures) although
# this figure shows correlation, as the y-label and filename indicate.
plt.title('Test Correlation')
plt.xlabel('omission fraction')
plt.ylabel('spearman correlation')
plt.legend()
plt.ylim(-0.2,1)
plt.grid(True)
plt.tight_layout()
plt.savefig(f'{fig_dirname}/mlp_testcorr_omission_split{censor_split}_{censor_region}_{today_date}_v2.png',dpi=300)
plt.show()