In [None]:
import sys
sys.path.append("../")

import numpy as np
import pandas as pd
import joblib

from rsep_explain.variables import auto_var
from utils import params_to_dataframe
from params import CalcSepExperiments, CalcLinSepExperiments
from IPython.display import display


In [None]:
def display_results(exp, result_file_dir='./results/calc_separation', columns=None):
    """Collect the stored results of an experiment grid into a DataFrame.

    Parameters
    ----------
    exp : callable
        Experiment factory. ``exp()()`` must return a 4-item sequence whose
        third item is the grid parameters handed to ``params_to_dataframe``.
    result_file_dir : str
        Directory passed through to ``params_to_dataframe`` as the location
        of the result files.
    columns : list of str, optional
        Result fields to request. When ``None`` the default set of sample,
        accuracy and tree-size fields listed below is used.

    Returns
    -------
    Whatever ``params_to_dataframe`` returns for the experiment grid.
    """
    requested_columns = columns if columns is not None else [
        'n_samples', 'n_features', 'aug_n_samples', 'trn_rsep', 'aug_rsep',
        'trn_acc', 'tst_acc', 'node_count', 'depth', 'n_leaves',
        'ap_trn_acc', 'ap_tst_acc', 'ap_node_count', 'ap_depth', 'ap_n_leaves',
    ]
    # Only the third item of the experiment description is needed here.
    _first, _second, grid, _last = exp()()
    return params_to_dataframe(
        grid,
        columns=requested_columns,
        result_file_dir=result_file_dir,
        logging_level=0,
    )

In [None]:
# Per-dataset summary of the r-separation experiments: sample count, fraction of
# samples removed, and the smallest value found in `aug_rsep`.
sep_df = display_results(
    CalcSepExperiments,
    result_file_dir='../results/calc_separation',
    columns=['n_samples', 'n_features', 'aug_n_samples', 'aug_rsep'],
)
# Drop the "risk_" marker from the dataset names for display.
sep_df['dataset'] = sep_df['dataset'].map(lambda name: name.replace("risk_", ""))
sep_by_dataset = sep_df.set_index("dataset")
sep_by_dataset['percentage removed'] = (
    (sep_by_dataset['n_samples'] - sep_by_dataset['aug_n_samples'])
    / sep_by_dataset['n_samples']
)
sep_by_dataset['rsep'] = sep_by_dataset['aug_rsep'].map(lambda x: np.min(x))

# .copy() makes rsep_dd an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
rsep_dd = sep_by_dataset[['n_samples', 'percentage removed', 'rsep']].copy()
rsep_dd['n_samples'] = rsep_dd['n_samples'].astype(int)

# to_latex also writes the index, so the tabular spec needs one "l" for the
# index plus one "c" per data column (the previous fixed "lcc" was one short).
print(rsep_dd.to_latex(escape=False,
                       column_format="l" + "c" * rsep_dd.shape[1],
                       float_format="%.6f"))
rsep_dd

In [None]:
# Load the stored linear-separation results and drop the "risk_" marker from
# the dataset names.
df = display_results(
    CalcLinSepExperiments,
    result_file_dir='../results/calc_lin_separation/',
    columns=['n_samples', 'l1svm_results'],
)
df['dataset'] = df['dataset'].map(lambda name: name.replace("risk_", ""))
def _find_l1svm_entry(entries, c):
    """Return the first result entry whose ``'c'`` equals ``c``, or ``None`` if there is none."""
    return next((ent for ent in entries if ent['c'] == c), None)


def _min_positive_margin(X, y, w):
    """Smallest positive value of ``y * (X @ w)`` after scaling ``w`` to unit L1 norm.

    Returns ``-1`` when ``w`` cannot be normalised (zero, NaN or infinite L1
    norm) or when no sample has a positive margin.
    """
    w = np.asarray(w)
    l1_norm = np.linalg.norm(w, ord=1)
    # Check the norm before dividing so an all-zero weight vector does not
    # produce NaNs (and a RuntimeWarning) that have to be detected afterwards.
    if not np.isfinite(l1_norm) or l1_norm == 0:
        return -1
    margins = np.asarray(y).reshape(-1) * np.dot(X, w / l1_norm).reshape(-1)
    positive = margins[margins > 0]
    return positive.min() if positive.size else -1


def get_lin_sep_samples(df):
    """Pick, per row, the L1-SVM result with the lowest error and report its margin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'dataset'`` (name without the ``"risk_"``
        marker), ``'n_samples'`` and ``'l1svm_results'``. Each cell of
        ``'l1svm_results'`` is an iterable of dicts with the keys ``'c'``,
        ``'w'`` and ``'acc'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``'dataset'``, ``'n_samples'``,
        ``'removed'`` (the smallest ``1 - acc`` over the C grid) and ``'wl1'``
        (the smallest positive ``y * (X @ w)`` for that entry with ``w`` scaled
        to unit L1 norm, or ``-1`` when no such margin exists). The input frame
        is left unmodified.

    Raises
    ------
    ValueError
        If a row has no result entry for any value of the C grid.

    Notes
    -----
    A grid value without a matching result entry is skipped for that row; it
    never falls back to an entry belonging to a different ``c`` or dataset.
    """
    cs = [1e-10, 1e-8, 1e-6, 1e-4, 1e-2, 1e-0, 1e2, 1e4, 1e6, 1e8, 1e10]

    error_rows, margin_rows = [], []  # one list of len(cs) per dataframe row
    for _, row in df.iterrows():
        X, y = auto_var.get_var_with_argument("dataset", f"risk_{row['dataset']}",
                                              base_dir="../")
        preprocess_fn = auto_var.get_var_with_argument("preprocessor", "rminmax", X=X)
        X = preprocess_fn(X)

        row_errors, row_margins = [], []
        for c in cs:
            ent = _find_l1svm_entry(row['l1svm_results'], c)
            if ent is None:
                # No result stored for this C: mark it so it is ignored below.
                row_errors.append(np.nan)
                row_margins.append(np.nan)
                continue
            row_errors.append(1. - ent['acc'])
            row_margins.append(_min_positive_margin(X, y, ent['w']))
        error_rows.append(row_errors)
        margin_rows.append(row_margins)

    # Shape (n_rows, len(cs)); the reshape keeps this true for an empty frame.
    errors = np.array(error_rows, dtype=float).reshape(len(df), len(cs))
    margins = np.array(margin_rows, dtype=float).reshape(len(df), len(cs))

    unusable = np.isnan(errors).all(axis=1)
    if unusable.any():
        names = df['dataset'].to_numpy()[unusable].tolist()
        raise ValueError(
            f"no usable l1svm_results entry on the C grid for dataset(s): {names}")

    # Index of the lowest-error C per row, ignoring the grid values that had no entry.
    best = np.nanargmin(errors, axis=1)
    rows = np.arange(len(df))

    result = df[['dataset', 'n_samples']].copy()
    result['removed'] = errors[rows, best]
    result['wl1'] = margins[rows, best]
    return result

# Average the per-run linear-separation results for each dataset.
df = get_lin_sep_samples(df)
dd = df.groupby(["dataset"]).mean()
dd['percentage removed'] = dd['removed']

# .copy() makes linsep_dd an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
linsep_dd = dd[['n_samples', 'percentage removed', 'wl1']].copy()
linsep_dd['n_samples'] = linsep_dd['n_samples'].astype(int)
linsep_dd

In [None]:
# Basic statistics for every dataset in the r-separation experiment grid.
data = {}
for ds in CalcSepExperiments.grid_params[0]['dataset']:
    X, y = auto_var.get_var_with_argument("dataset", ds, base_dir="../")
    # A column counts as binary when it takes exactly two distinct values.
    bin_fets = sum(len(np.unique(X[:, i])) == 2 for i in range(X.shape[1]))
    # NOTE(review): the feature count is reported as X.shape[1] - 1 while the
    # binary count scans every column — presumably one column is not a real
    # feature; confirm.
    data[ds.replace("risk_", "")] = [X.shape[0], X.shape[1] - 1, bin_fets, (y == 1).mean()]

# Raw strings keep the LaTeX "\#" as written; in a normal string "\#" is an
# invalid escape sequence that newer Python versions warn about.
df_stats = pd.DataFrame.from_dict(
    data,
    orient='index',
    columns=[r"\# samples", r"\# features", r"\# binary features",
             "percentage of positive label"],
)
df_stats.index.name = "dataset"
df_stats = df_stats.sort_index()
df_stats

In [None]:
# Combine the dataset statistics with the r-separation and linear-separation
# summaries. The merges join on the index level named "dataset", so the name is
# set before merging rather than after.
df_stats.index.name = "dataset"
ddff = df_stats.merge(rsep_dd[['percentage removed', 'rsep']], on='dataset')
ddff = ddff.merge(linsep_dd[['percentage removed', 'wl1']], on='dataset', suffixes=("", "_lin"))

# NOTE(review): after this flip both columns hold 1 - (value computed earlier)
# while keeping the "percentage removed" label — confirm the label is intended.
ddff['percentage removed'] = 1. - ddff['percentage removed']
ddff['percentage removed_lin'] = 1. - ddff['percentage removed_lin']

# to_latex also writes the index, so the tabular spec needs one "l" for the
# index plus one "c" per data column (the previous fixed "lccccc" was too short).
print(ddff.to_latex(escape=False,
                    column_format="l" + "c" * ddff.shape[1],
                    float_format="%.2f"))
ddff.sort_index()

In [None]:
# NOTE(review): this cell rebuilds the same summary table as the preceding cell;
# consider deleting one of the two.
# The merges join on the index level named "dataset", so the name is set before
# merging rather than after.
df_stats.index.name = "dataset"
ddff = df_stats.merge(rsep_dd[['percentage removed', 'rsep']], on='dataset')
ddff = ddff.merge(linsep_dd[['percentage removed', 'wl1']], on='dataset', suffixes=("", "_lin"))

# NOTE(review): after this flip both columns hold 1 - (value computed earlier)
# while keeping the "percentage removed" label — confirm the label is intended.
ddff['percentage removed'] = 1. - ddff['percentage removed']
ddff['percentage removed_lin'] = 1. - ddff['percentage removed_lin']

# to_latex also writes the index, so the tabular spec needs one "l" for the
# index plus one "c" per data column (the previous fixed "lccccc" was too short).
print(ddff.to_latex(escape=False,
                    column_format="l" + "c" * ddff.shape[1],
                    float_format="%.2f"))
ddff.sort_index()

In [None]:
# Load one stored linear-separation result file for ad-hoc inspection.
# NOTE: joblib.load unpickles the file — only use it on result files you trust.
res = joblib.load(
    "../results/calc_lin_separation/risk_diabetes-rminmax-0.pkl"
)

In [None]:
# Largest 'adv dists' value of the second entry in the loaded 'svm_results'.
second_svm_entry = res['svm_results'][1]
second_svm_entry['adv dists'].max()