## Linear Regression  
A Linear Regression function to find the coefficients based on ratio analysis outputs  
in order to find the trade off between bias (bias1 or bias2) and target value (accuracy or f1 score)  

In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios, acc, f1score, bias1, bias2, newBias

In [2]:
CREATE_INPUT_DATASET = True

In [3]:
iter_per_ratio = 200
classifiers = ["KNN", "LinearSVC", "SVC", "Forest", "LogReg", "Tree", "MLP"]
methods = ["mean_v1", "mean_v2", "similar_v1", "similar_v2", "multi_v1", "multi_v2"]
data_columns_1 = ["iter_number", "random_ratio", "ml_name", "method_name", "bias1", "bias2", "new_bias", "accuracy", "f1_score", "real_accuracy"]
data_columns_2 = ["iter_number", "random_ratio"] + \
                 ["ML_{}".format(x) for x in classifiers] + \
                 ["Imp_{}".format(x) for x in methods] + \
                 ["bias1", "bias2", "new_bias", "accuracy", "f1_score", "real_accuracy"]

### Prepare Dataset

In [4]:
def prepare_dataset(dataset_name, target_name="acc", file_name=None):
    if os.path.exists(file_name):
        return pd.read_csv(file_name)
    global classifiers, methods, data_columns
    data_final = []
    for method in methods:
        if not os.path.exists(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method))):
            raise Exception("Required pkl not found: {}".format(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method))))
        with open(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method)), "rb") as inFile:
            pkl_data = pickle.load(inFile)
        j = 0
        for i in range(0, len(pkl_data), iter_per_ratio):
            i_data = pkl_data[i:(i+iter_per_ratio)]
            for clf in classifiers:
                clf_data = [x[clf] for x in i_data]
                for cf_matrices in clf_data:
                    # [[acc avg], [bias1], [bias2], [f1 score], [real acc], [new bias]], remove -1, [None] cases
                    data_processed = [[], [], [], [], [], []]
                    for mm in cf_matrices:
                        if len(mm) < 1:
                            continue
                        cf_m, acc_m = mm[0], mm[1]
                    try:
                        x = acc(cf_m)
                        y = bias1(cf_m)
                        z = bias2(cf_m)
                        w = f1score(cf_m)
                        k = newBias(cf_m)
                    except Exception as e:
                        continue
                    if (y > 0) and (z > 0) and len(w) == 2:
                        data_processed[0].append(x)
                        data_processed[1].append(y)
                        data_processed[2].append(z)
                        data_processed[3].append(np.mean(w))
                        data_processed[4].append(acc_m)
                        data_processed[5].append(k)
                    row_data = [i // iter_per_ratio, round(random_ratios[j], 2), clf, method, np.mean(data_processed[1]),
                                np.mean(data_processed[2]), np.mean(data_processed[5]),
                                np.mean(data_processed[0]), np.mean(data_processed[3]),
                                np.mean(data_processed[4])]
                    data_final.append(row_data)
            j+=1
    data_final = pd.DataFrame(data_final, columns=data_columns_1)
    if file_name:
        data_final.to_csv(file_name, index=False)
    return data_final

### Analysis

#### Titanic Dataset as Example

In [5]:
if CREATE_INPUT_DATASET:
    data = prepare_dataset("titanic", "acc", os.path.join("ratio_analysis_plots", "d_titanic.csv"))
else:
    data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_titanic.csv"))

In [6]:
# remove empty columns
data.dropna(inplace=True)
data.shape

(138895, 10)

In [7]:
result_acc_bias1 = smf.ols(formula="accuracy ~ bias1 + random_ratio + C(ml_name) + C(method_name)", data=data).fit()
result_acc_bias1.params

Intercept                       0.779175
C(ml_name)[T.KNN]              -0.079200
C(ml_name)[T.LinearSVC]        -0.047520
C(ml_name)[T.LogReg]           -0.045365
C(ml_name)[T.MLP]              -0.051147
C(ml_name)[T.SVC]              -0.038160
C(ml_name)[T.Tree]             -0.024497
C(method_name)[T.mean_v2]      -0.020112
C(method_name)[T.multi_v1]     -0.007877
C(method_name)[T.multi_v2]     -0.020284
C(method_name)[T.similar_v1]    0.008930
C(method_name)[T.similar_v2]   -0.020793
bias1                           0.001392
random_ratio                   -0.235153
dtype: float64

In [8]:
result_acc_bias2 = smf.ols(formula="accuracy ~ bias2 + random_ratio + C(ml_name) + C(method_name)", data=data).fit()
result_acc_bias2.params

Intercept                       0.756522
C(ml_name)[T.KNN]              -0.066091
C(ml_name)[T.LinearSVC]        -0.034647
C(ml_name)[T.LogReg]           -0.032169
C(ml_name)[T.MLP]              -0.038263
C(ml_name)[T.SVC]              -0.026011
C(ml_name)[T.Tree]             -0.021243
C(method_name)[T.mean_v2]      -0.024085
C(method_name)[T.multi_v1]     -0.006074
C(method_name)[T.multi_v2]     -0.018943
C(method_name)[T.similar_v1]    0.010274
C(method_name)[T.similar_v2]   -0.020803
bias2                           0.007972
random_ratio                   -0.227529
dtype: float64

In [9]:
result_acc_bias_new = smf.ols(formula="accuracy ~ new_bias + random_ratio + C(ml_name) + C(method_name)", data=data).fit()
result_acc_bias_new.params

Intercept                       0.756224
C(ml_name)[T.KNN]              -0.072298
C(ml_name)[T.LinearSVC]        -0.038580
C(ml_name)[T.LogReg]           -0.037152
C(ml_name)[T.MLP]              -0.042807
C(ml_name)[T.SVC]              -0.031216
C(ml_name)[T.Tree]             -0.023752
C(method_name)[T.mean_v2]      -0.024325
C(method_name)[T.multi_v1]     -0.009028
C(method_name)[T.multi_v2]     -0.021855
C(method_name)[T.similar_v1]    0.007794
C(method_name)[T.similar_v2]   -0.023385
new_bias                        0.046851
random_ratio                   -0.232149
dtype: float64

#### Compute and collect to csv

In [10]:
datasets = ["titanic", "german", "juvenile"]
target = "acc"
csv_data = []
csv_column = ["data_name", "Intercept", "bias1", "bias2", "new_bias", "random_ratio", "ml_name.KNN",
              "ml_name.LinearSVC", "ml_name.LogReg", "ml_name.MLP", "ml_name.SVC",
              "ml_name.Tree", "method_name.mean_v2", "method_name.multi_v1", "method_name.multi_v2",
              "method_name.similar_v1", "method_name.similar_v2"]
for data_name in datasets:
    if CREATE_INPUT_DATASET:
        data = prepare_dataset(data_name, target, os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    else:
        data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    data.dropna(inplace=True)
    print("Dataset: {}, shape: {}".format(data_name, data.shape))
    result_acc_bias1 = smf.ols(formula="accuracy ~ bias1 + random_ratio + C(ml_name) + C(method_name)", data=data).fit().params.tolist()
    result_acc_bias2 = smf.ols(formula="accuracy ~ bias2 + random_ratio + C(ml_name) + C(method_name)", data=data).fit().params.tolist()
    result_acc_bias_new = smf.ols(formula="accuracy ~ new_bias + random_ratio + C(ml_name) + C(method_name)", data=data).fit().params.tolist()
    new_rows = [
        [data_name, result_acc_bias1[0]] + [result_acc_bias1[12], np.nan, np.nan, result_acc_bias1[13]] + result_acc_bias1[1:12],
        [data_name, result_acc_bias2[0]] + [np.nan, result_acc_bias2[12], np.nan, result_acc_bias2[13]] + result_acc_bias2[1:12],
        [data_name, result_acc_bias_new[0]] + [np.nan, np.nan, result_acc_bias_new[12], result_acc_bias_new[13]] + result_acc_bias_new[1:12],
    ]
    csv_data += new_rows
csv_data = pd.DataFrame(csv_data, columns=csv_column)
csv_data.to_csv(os.path.join("ratio_analysis_plots", "d_collected.csv"), index=False)

Dataset: titanic, shape: (138895, 10)
Dataset: german, shape: (153088, 10)
Dataset: juvenile, shape: (157383, 10)
