In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
from scipy import stats
import measurements_methods as m
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Locations of the per-dataset model-result tables (crime, law school, insurance).
file_path, file_path2, file_path3 = (
    './Data/models_results_crime_final.csv',
    './Data/models_results_law_school_final.csv',
    './Data/models_results_insurance_final.csv',
)

# Read each CSV into its own DataFrame, in the same order as the paths above.
df_crim, df_law, df_insurance = (
    pd.read_csv(csv_path) for csv_path in (file_path, file_path2, file_path3)
)

In [3]:
# Column names of the law-school results table; the entries of this list
# (minus the last two) are later used as JSON file names for each model.
cols = list(df_law.columns)

# Swap the space-separated label for the underscore form and place it at
# position 13.
# NOTE(review): presumably 13 and the underscore spelling match the on-disk
# file naming/order — confirm.
del cols[cols.index('Linear Regression')]
cols.insert(13, 'Linear_Regression')

In [4]:
# Collect, for every dataset folder, a (model x classifier) table of the
# 'indep' value stored under the 'independence' key of each result JSON.
# The resulting list of DataFrames (one per folder, in `folders` order) is
# what the correlation helpers below consume.
folders = ['dict_law','dict_crime','dict_insurance']
classifiers = ['lr' ,'Lasso','Ridge', 'KLR_KLR_gaussian', 'KLR_KLR_polynomial']
metric = 'independence'

# Model names double as JSON file names.
# NOTE(review): the last two entries of `cols` are skipped — presumably they
# are not model columns; confirm against the CSV header.
models = cols[:-2]

df_list_corr = []
for folder in folders:
    # Trailing slash is part of the prefix, so it is not repeated when joining.
    folder_path = f"./Data/{folder}/"

    df_temp = pd.DataFrame(columns=classifiers, index=models)
    for classifier in classifiers:
        indep_list = []

        for model in models:
            path_1 = f'{folder_path}{classifier}/{model}.json'
            with open(path_1, "r") as file:
                data1 = json.load(file)

            # The stored 'indep' value is used as-is (no clipping/re-scaling).
            indep_list.append(data1[metric]['indep'])

        # Positional assignment: indep_list follows the order of `models`.
        df_temp[classifier] = indep_list
    df_list_corr.append(df_temp)

In [8]:
# Spearman correlation between classifiers, computed by the project helper
# from the per-dataset tables in df_list_corr (built in `folders` order:
# law, crime, insurance). The displayed table shows one 3-tuple per
# classifier pair.
# NOTE(review): presumably one coefficient per dataset, with '*' marking
# statistical significance — confirm in measurements_methods.
m.spearman_correlation_with_significance(df_list_corr)

Unnamed: 0,lr,Lasso,Ridge,KLR_KLR_gaussian,KLR_KLR_polynomial
lr,"(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(0.88*, 0.61*, -0.51*)","(0.04, -0.65*, 0.23)"
Lasso,"(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(0.88*, 0.61*, -0.48*)","(0.04, -0.65*, 0.21)"
Ridge,"(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 1.0*)","(0.88*, 0.61*, -0.51*)","(0.04, -0.65*, 0.23)"
KLR_KLR_gaussian,"(0.88*, 0.61*, -0.51*)","(0.88*, 0.61*, -0.48*)","(0.88*, 0.61*, -0.51*)","(1.0*, 1.0*, 1.0*)","(0.1, -0.58*, -0.67*)"
KLR_KLR_polynomial,"(0.04, -0.65*, 0.23)","(0.04, -0.65*, 0.21)","(0.04, -0.65*, 0.23)","(0.1, -0.58*, -0.67*)","(1.0*, 1.0*, 1.0*)"


In [9]:
# Kendall correlation between classifiers, computed by the project helper
# from the same per-dataset tables in df_list_corr (law, crime, insurance).
# The displayed table shows one 3-tuple per classifier pair.
# NOTE(review): presumably one coefficient per dataset, with '*' marking
# statistical significance — confirm in measurements_methods.
m.kendall_correlation_with_significance(df_list_corr)

Unnamed: 0,lr,Lasso,Ridge,KLR_KLR_gaussian,KLR_KLR_polynomial
lr,"(1.0*, 1.0*, 1.0*)","(1.0*, 1.0*, 0.98*)","(1.0*, 0.99*, 1.0*)","(0.77*, 0.45*, -0.38*)","(0.03, -0.46*, 0.12)"
Lasso,"(1.0*, 1.0*, 0.98*)","(1.0*, 1.0*, 1.0*)","(1.0*, 0.99*, 0.98*)","(0.77*, 0.45*, -0.36*)","(0.03, -0.46*, 0.1)"
Ridge,"(1.0*, 0.99*, 1.0*)","(1.0*, 0.99*, 0.98*)","(1.0*, 1.0*, 1.0*)","(0.77*, 0.44*, -0.38*)","(0.03, -0.47*, 0.12)"
KLR_KLR_gaussian,"(0.77*, 0.45*, -0.38*)","(0.77*, 0.45*, -0.36*)","(0.77*, 0.44*, -0.38*)","(1.0*, 1.0*, 1.0*)","(0.07, -0.43*, -0.55*)"
KLR_KLR_polynomial,"(0.03, -0.46*, 0.12)","(0.03, -0.46*, 0.1)","(0.03, -0.47*, 0.12)","(0.07, -0.43*, -0.55*)","(1.0*, 1.0*, 1.0*)"


In [None]:
# synthetic data generation
import numpy as np
import pandas as pd
from scipy.stats import norm

# Two equally sized groups are drawn from normal distributions with the same
# standard deviation. The "protected" group (Target 0) is always centred at 0
# and drawn once; the "privileged" group (Target 1) is re-drawn for each step
# with its mean shifted by 0.1 per step. One shuffled CSV is written per step.
std_dev = 1
size = 500
n_steps = 50   # privileged means 0.0, 0.1, ..., 4.9
seed = 0       # fixed so the generated files are reproducible
rng = np.random.default_rng(seed)

# Loop-invariant pieces: labels and the protected-group sample.
target_values_class_0 = np.zeros(size, dtype=int)
target_values_class_1 = np.ones(size, dtype=int)
target_values = np.concatenate([target_values_class_0, target_values_class_1])
data_protected = norm.rvs(loc=0, scale=std_dev, size=size, random_state=rng)

# Iterate over integer steps instead of accumulating `mean += 0.1`:
# repeated float addition drifts (e.g. 0.7999999999999999), which made
# int(mean*10) truncate to the wrong file index and overwrite a previous file.
for step in range(n_steps):
    mean = step / 10
    data_privileged = norm.rvs(loc=mean, scale=std_dev, size=size, random_state=rng)

    regression_values = np.concatenate([data_protected, data_privileged])

    # Shuffle rows so the two groups are interleaved; the permutation length
    # follows the data instead of a hard-coded 1000.
    indices = rng.permutation(regression_values.size)

    synthetic_data = pd.DataFrame({
        "Regression_Value": regression_values[indices],
        "Target": target_values[indices]
    })
    synthetic_data.to_csv(f'./Data/mean_{step}.csv', index=False)



