## Linear Regression  
A Linear Regression function to find the coefficients based on ratio analysis outputs  
in order to find the trade off between bias (bias1 or bias2) and target value (accuracy or f1 score)  

In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios, acc, f1score, bias1, bias2, newBias

In [2]:
CREATE_INPUT_DATASET = True

In [3]:
iter_per_ratio = 200
classifiers = ["KNN", "LinearSVC", "SVC", "Forest", "LogReg", "Tree", "MLP"]
methods = ["mean_v1", "mean_v2", "similar_v1", "similar_v2", "multi_v1", "multi_v2"]
data_columns_1 = ["iter_number", "random_ratio", "ml_name", "method_name", "bias1", "bias2", "new_bias", "accuracy", "f1_score", "real_accuracy"]
data_columns_2 = ["iter_number", "random_ratio"] + \
                 ["ML_{}".format(x) for x in classifiers] + \
                 ["Imp_{}".format(x) for x in methods] + \
                 ["bias1", "bias2", "new_bias", "accuracy", "f1_score", "real_accuracy"]

### Prepare Dataset

In [4]:
def prepare_dataset(dataset_name, target_name="acc", file_name=None):
    if os.path.exists(file_name):
        return pd.read_csv(file_name)
    global classifiers, methods, data_columns
    data_final = []
    for method in methods:
        if not os.path.exists(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method))):
            raise Exception("Required pkl not found: {}".format(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method))))
        with open(os.path.join("condor_outputs", target_name, dataset_name, "{}.pkl".format(method)), "rb") as inFile:
            pkl_data = pickle.load(inFile)
        j = 0
        for i in range(0, len(pkl_data), iter_per_ratio):
            i_data = pkl_data[i:(i+iter_per_ratio)]
            for clf in classifiers:
                clf_data = [x[clf] for x in i_data]
                for cf_matrices in clf_data:
                    # [[acc avg], [bias1], [bias2], [f1 score], [real acc], [new bias]], remove -1, [None] cases
                    data_processed = [[], [], [], [], [], []]
                    for mm in cf_matrices:
                        if len(mm) < 1:
                            continue
                        cf_m, acc_m = mm[0], mm[1]
                    try:
                        x = acc(cf_m)
                        y = bias1(cf_m)
                        z = bias2(cf_m)
                        w = f1score(cf_m)
                        k = newBias(cf_m)
                    except Exception as e:
                        continue
                    if (y > 0) and (z > 0) and len(w) == 2:
                        data_processed[0].append(x)
                        data_processed[1].append(y)
                        data_processed[2].append(z)
                        data_processed[3].append(np.mean(w))
                        data_processed[4].append(acc_m)
                        data_processed[5].append(k)
                    row_data = [i // iter_per_ratio, round(random_ratios[j], 2), clf, method, np.mean(data_processed[1]),
                                np.mean(data_processed[2]), np.mean(data_processed[5]),
                                np.mean(data_processed[0]), np.mean(data_processed[3]),
                                np.mean(data_processed[4])]
                    data_final.append(row_data)
            j+=1
    data_final = pd.DataFrame(data_final, columns=data_columns_1)
    if file_name:
        data_final.to_csv(file_name, index=False)
    return data_final

### Analysis

#### Titanic Dataset as Example

In [5]:
if CREATE_INPUT_DATASET:
    data = prepare_dataset("titanic", "acc", os.path.join("ratio_analysis_plots", "d_titanic.csv"))
else:
    data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_titanic.csv"))

In [6]:
# remove empty columns
data.dropna(inplace=True)
data.shape

(138895, 10)

In [7]:
result_acc_bias1 = smf.ols(formula="accuracy ~ bias1", data=data).fit()
result_acc_bias1.params

Intercept    0.618392
bias1        0.002460
dtype: float64

In [8]:
result_acc_bias2 = smf.ols(formula="accuracy ~ bias2", data=data).fit()
result_acc_bias2.params

Intercept    0.602350
bias2        0.013663
dtype: float64

In [9]:
result_acc_bias_new = smf.ols(formula="accuracy ~ new_bias", data=data).fit()
result_acc_bias_new.params

Intercept    0.590758
new_bias     0.074345
dtype: float64

#### Compute and collect to csv

In [10]:
datasets = ["titanic", "german", "juvenile", "compas", "adult", "communities"]
target = "acc"
sheet_data1 = []
sheet_data2 = []
sheet_data3 = []
sheet_col1 = ["data_name", "bias1", "ml_name", "method_name", "Intercept"]
sheet_col2 = ["data_name", "bias2", "ml_name", "method_name", "Intercept"]
sheet_col3 = ["data_name", "new_bias", "ml_name", "method_name", "Intercept"]
for data_name in datasets:
    if CREATE_INPUT_DATASET:
        data = prepare_dataset(data_name, target, os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    else:
        data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    data.dropna(inplace=True)
    print("Dataset: {}, shape: {}".format(data_name, data.shape))
    for clf in classifiers:
        for method in methods:
            data_copy = data[data["ml_name"] == clf].copy()
            data_copy = data_copy[data_copy["method_name"] == method].copy()
            result_acc_bias1 = smf.ols(formula="accuracy ~ bias1", data=data_copy).fit().params.tolist()
            result_acc_bias2 = smf.ols(formula="accuracy ~ bias2", data=data_copy).fit().params.tolist()
            result_acc_bias_new = smf.ols(formula="accuracy ~ new_bias", data=data_copy).fit().params.tolist()
            sheet_data1.append([data_name, result_acc_bias1[1], clf, method, result_acc_bias1[0]])
            sheet_data2.append([data_name, result_acc_bias2[1], clf, method, result_acc_bias2[0]])
            sheet_data3.append([data_name, result_acc_bias_new[1], clf, method, result_acc_bias_new[0]])
with pd.ExcelWriter(os.path.join("ratio_analysis_plots", "d_collected_acc.xlsx")) as writer:
    pd.DataFrame(sheet_data1, columns=sheet_col1).to_excel(writer, sheet_name="bias1")
    pd.DataFrame(sheet_data2, columns=sheet_col2).to_excel(writer, sheet_name="bias2")
    pd.DataFrame(sheet_data3, columns=sheet_col3).to_excel(writer, sheet_name="new_bias")

Dataset: titanic, shape: (138895, 10)
Dataset: german, shape: (153088, 10)
Dataset: juvenile, shape: (157383, 10)
Dataset: compas, shape: (167813, 10)
Dataset: adult, shape: (163793, 10)
Dataset: communities, shape: (127403, 10)


In [11]:
datasets = ["titanic", "german", "juvenile", "compas", "adult", "communities"]
target = "acc"
sheet_data1 = []
sheet_data2 = []
sheet_data3 = []
sheet_data4 = []
sheet_col = ["data_name", "random_ratio", "ml_name", "method_name", "Intercept"]
for data_name in datasets:
    if CREATE_INPUT_DATASET:
        data = prepare_dataset(data_name, target, os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    else:
        data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    data.dropna(inplace=True)
    print("Dataset: {}, shape: {}".format(data_name, data.shape))
    for clf in classifiers:
        for method in methods:
            data_copy = data[data["ml_name"] == clf].copy()
            data_copy = data_copy[data_copy["method_name"] == method].copy()
            result_acc = smf.ols(formula="accuracy ~ random_ratio", data=data_copy).fit().params.tolist()
            result_bias1 = smf.ols(formula="bias1 ~ random_ratio", data=data_copy).fit().params.tolist()
            result_bias2 = smf.ols(formula="bias2 ~ random_ratio", data=data_copy).fit().params.tolist()
            result_bias_new = smf.ols(formula="new_bias ~ random_ratio", data=data_copy).fit().params.tolist()
            sheet_data1.append([data_name, result_acc[1], clf, method, result_acc[0]])
            sheet_data2.append([data_name, result_bias1[1], clf, method, result_bias1[0]])
            sheet_data3.append([data_name, result_bias2[1], clf, method, result_bias2[0]])
            sheet_data4.append([data_name, result_bias_new[1], clf, method, result_bias_new[0]])
with pd.ExcelWriter(os.path.join("ratio_analysis_plots", "d_collected_ratio.xlsx")) as writer:
    pd.DataFrame(sheet_data1, columns=sheet_col).to_excel(writer, sheet_name="accuracy")
    pd.DataFrame(sheet_data2, columns=sheet_col).to_excel(writer, sheet_name="bias1")
    pd.DataFrame(sheet_data3, columns=sheet_col).to_excel(writer, sheet_name="bias2")
    pd.DataFrame(sheet_data4, columns=sheet_col).to_excel(writer, sheet_name="new_bias")

Dataset: titanic, shape: (138895, 10)
Dataset: german, shape: (153088, 10)
Dataset: juvenile, shape: (157383, 10)
Dataset: compas, shape: (167813, 10)
Dataset: adult, shape: (163793, 10)
Dataset: communities, shape: (127403, 10)


In [12]:
datasets = ["titanic", "german", "juvenile", "compas", "adult", "communities"]
target = "acc"
sheet_data1 = []
sheet_data2 = []
sheet_data3 = []
sheet_data4 = []
sheet_col = ["data_name", "random_ratio", "random_ratio^2", "ml_name", "method_name", "Intercept"]
for data_name in datasets:
    if CREATE_INPUT_DATASET:
        data = prepare_dataset(data_name, target, os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    else:
        data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_{}.csv".format(data_name)))
    data.dropna(inplace=True)
    print("Dataset: {}, shape: {}".format(data_name, data.shape))
    data["random_ratio2"] = np.square(data["random_ratio"])
    for clf in classifiers:
        for method in methods:
            data_copy = data[data["ml_name"] == clf].copy()
            data_copy = data_copy[data_copy["method_name"] == method].copy()
            result_acc = smf.ols(formula="accuracy ~ random_ratio + random_ratio2", data=data_copy).fit().params.tolist()
            result_bias1 = smf.ols(formula="bias1 ~ random_ratio + random_ratio2", data=data_copy).fit().params.tolist()
            result_bias2 = smf.ols(formula="bias2 ~ random_ratio + random_ratio2", data=data_copy).fit().params.tolist()
            result_bias_new = smf.ols(formula="new_bias ~ random_ratio + random_ratio2", data=data_copy).fit().params.tolist()
            sheet_data1.append([data_name, result_acc[1], result_acc[2], clf, method, result_acc[0]])
            sheet_data2.append([data_name, result_bias1[1], result_bias1[2], clf, method, result_bias1[0]])
            sheet_data3.append([data_name, result_bias2[1], result_bias2[2], clf, method, result_bias2[0]])
            sheet_data4.append([data_name, result_bias_new[1], result_bias_new[2], clf, method, result_bias_new[0]])
with pd.ExcelWriter(os.path.join("ratio_analysis_plots", "d_collected_ratio2.xlsx")) as writer:
    pd.DataFrame(sheet_data1, columns=sheet_col).to_excel(writer, sheet_name="accuracy")
    pd.DataFrame(sheet_data2, columns=sheet_col).to_excel(writer, sheet_name="bias1")
    pd.DataFrame(sheet_data3, columns=sheet_col).to_excel(writer, sheet_name="bias2")
    pd.DataFrame(sheet_data4, columns=sheet_col).to_excel(writer, sheet_name="new_bias")

Dataset: titanic, shape: (138895, 10)
Dataset: german, shape: (153088, 10)
Dataset: juvenile, shape: (157383, 10)
Dataset: compas, shape: (167813, 10)
Dataset: adult, shape: (163793, 10)
Dataset: communities, shape: (127403, 10)


Suppose the linear regression model is 

$$ y_i = \alpha + \beta_1 x_{i1} + \beta_2 x_{i2} + \varepsilon_i $$

for $i \in \{1, \cdots, N\}$.

Let the variance of $\beta_j$ be $\sigma_j^2$, so that the standard error of $\beta_j$ is $\sigma_j$ and the 95% confidence interval for $\beta_j$ is

$$ [\beta_j - \sigma_j \times 1.96, \beta_j + \sigma_j \times 1.96]$$

Denote the covariance between the estimates for $\beta_1$ and $\beta_2$ as $\sigma_{12}$.

Define a new random variable $\gamma = \beta_1 + \beta_2$. The variance of $\gamma$ is

$$ Var(\gamma) = Var(\beta_1 + \beta_2) = \sigma_1^2 + \sigma_2^2 + 2\sigma_{12} $$

And the 95% confidence interval for $\gamma$ is

$$ [(\beta_1 + \beta_2) - (\sqrt{\sigma_1^2 + \sigma_2^2 + 2\sigma_{12}})\times 1.96,
(\beta_1 + \beta_2) - (\sqrt{\sigma_1^2 + \sigma_2^2 + 2\sigma_{12}})\times 1.96] $$