## Linear Regression  
This notebook shows steps to collect "Missing Completely At Random" experiment results, and fit an exponential curve on each combination of `ML Algorithm`, `Imputation Method`, and `Dataset`  
Note that data for $\text{accuracy} < 0.65$ will be discarded

In [1]:
import os
os.chdir("..")
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios, acc, f1score, bias1, bias2, newBias

In [2]:
datasets = ["adult", "compas", "communities", "german", "titanic", "bank"]
ml_names = ["KNN", "LinearSVC", "Forest", "LogReg", "Tree", "MLP"]
methods = ["mean_v1", "mean_v2", "similar_v1", "similar_v2", "multi_v1", "multi_v2"]
iter_per_ratio = 200

In [3]:
def prepareInputDataset(filename, overwrite=False):
    if os.path.exists(filename) and (not overwrite):
        return pd.read_csv(filename)
    global datasets, ml_names, methods, iter_per_ratio
    data = []
    for dataset in datasets:
        assert os.path.exists(os.path.join("condor_outputs", "acc", dataset))
        for method in methods:
            assert os.path.exists(os.path.join("condor_outputs", "acc", dataset, "{}.pkl".format(method)))
            with open(os.path.join("condor_outputs", "acc", dataset, "{}.pkl".format(method)), "rb") as inFile:
                pkl_data = pickle.load(inFile)
            j = 0
            for i in range(0, len(pkl_data), iter_per_ratio):
                i_data = pkl_data[i:(i+iter_per_ratio)]
                for ml_name in ml_names:
                    ml_data = [x[ml_name] for x in i_data]
                    for ml_mat in ml_data:
                        # [[acc avg], [bias1], [bias2], [f1 score], [real acc], [new bias]], remove -1, [None] cases
                        data_processed = [[], [], [], [], [], []]
                        for mm in ml_mat:
                            if len(mm) < 1:
                                continue
                            cf_m, acc_m = mm[0], mm[1]
                        try:
                            x = acc(cf_m)
                            y = bias1(cf_m)
                            z = bias2(cf_m)
                            w = f1score(cf_m)
                            k = newBias(cf_m)
                        except Exception as e:
                            continue
                        if (y > 0) and (z > 0) and len(w) == 2:
                            data_processed[0].append(x)
                            data_processed[1].append(y)
                            data_processed[2].append(z)
                            data_processed[3].append(np.mean(w))
                            data_processed[4].append(acc_m)
                            data_processed[5].append(k)
                        if [] in data_processed: continue
                        row_data = np.array([i // iter_per_ratio, round(random_ratios[j], 2), dataset, ml_name, method,
                                             np.mean(data_processed[1]), np.mean(data_processed[2]),
                                             np.mean(data_processed[5]), np.mean(data_processed[0]),
                                             np.mean(data_processed[3]), np.mean(data_processed[4])])
                        data.append(row_data)
                j += 1
    data = np.array(data)
    data = pd.DataFrame(data, columns=["Id", "Ratio", "Dataset", "ML_Name", "Method", "bias1", "bias2", "bias_new", "accuracy", "f1score", "accuracy_real"])
    data.to_csv(filename, index=False)
    return data

In [4]:
prepareInputDataset(os.path.join("ratio_analysis_plots", "d_collected.csv"))

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.00,adult,KNN,mean_v1,0.381850,1.740150,0.157350,0.792754,0.586099,0.792754
1,0,0.00,adult,KNN,mean_v1,0.334909,1.669381,0.144922,0.808720,0.623683,0.808720
2,0,0.00,adult,KNN,mean_v1,0.413788,1.864143,0.149795,0.806263,0.610466,0.806263
3,0,0.00,adult,KNN,mean_v1,0.361306,1.561695,0.131106,0.804421,0.602566,0.804421
4,0,0.00,adult,KNN,mean_v1,0.374037,1.754310,0.149328,0.792447,0.578183,0.792447
...,...,...,...,...,...,...,...,...,...,...,...
741714,17,0.85,bank,MLP,multi_v2,0.003499,0.134014,0.016121,0.867315,0.057515,0.867315
741715,17,0.85,bank,MLP,multi_v2,0.002195,0.109565,0.008656,0.866210,0.024096,0.866210
741716,17,0.85,bank,MLP,multi_v2,0.000254,0.011024,0.004306,0.865989,0.040120,0.865989
741717,17,0.85,bank,MLP,multi_v2,0.004837,0.252215,0.014215,0.870632,0.017704,0.870632


In [5]:
data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_collected.csv"))
data.head()

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.0,adult,KNN,mean_v1,0.38185,1.74015,0.15735,0.792754,0.586099,0.792754
1,0,0.0,adult,KNN,mean_v1,0.334909,1.669381,0.144922,0.80872,0.623683,0.80872
2,0,0.0,adult,KNN,mean_v1,0.413788,1.864143,0.149795,0.806263,0.610466,0.806263
3,0,0.0,adult,KNN,mean_v1,0.361306,1.561695,0.131106,0.804421,0.602566,0.804421
4,0,0.0,adult,KNN,mean_v1,0.374037,1.75431,0.149328,0.792447,0.578183,0.792447


In [6]:
data.describe()

Unnamed: 0,Id,Ratio,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
count,741719.0,741719.0,741719.0,741719.0,741719.0,741719.0,741719.0,741719.0
mean,9.140577,0.457029,1.13575,1.148304,0.284652,0.678933,0.460578,0.68031
std,5.528026,0.276401,2.838644,2.162087,0.252371,0.112944,0.150399,0.111303
min,0.0,0.0,1.387779e-17,1.110223e-16,6.2e-05,0.134011,0.007445,0.134011
25%,4.0,0.2,0.09320741,0.2333333,0.099363,0.608861,0.365385,0.611111
50%,9.0,0.45,0.3072917,0.5950783,0.227783,0.67,0.467433,0.674121
75%,14.0,0.7,0.8520222,1.310853,0.380252,0.762358,0.557885,0.760516
max,19.0,0.95,303.0294,182.3215,1.921389,0.923077,0.898764,0.925


For every (ML Model, Imputation Method, Dataset)
$$
\begin{aligned}
\text{Bias} &= k \times \exp(\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \log(k) + (\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \lambda \times \text{Accuracy} + m + \varepsilon
\end{aligned}
$$
Fit linear regression, and collect $m$ and $\lambda$, where $e^m = k$  
$\sum\varepsilon^2$

In [7]:
from sklearn.linear_model import LinearRegression
def processInputDataset(data, outputFilename):
    global datasets, ml_names, methods
    output_data = {
        "bias1": [],
        "bias2": [],
        "bias_new": []
    }
    for dataset in datasets:
        for ml_name in ml_names:
            for method in methods:
                for bias_name in output_data.keys():
                    current_data = data[data["Dataset"] == dataset].copy()
                    current_data = current_data[current_data["ML_Name"] == ml_name]
                    current_data = current_data[current_data["Method"] == method]
                    current_data = current_data[current_data["accuracy"] >= 0.65]
                    bias_vector = current_data[bias_name].to_numpy().ravel()
                    acc_vector = current_data["accuracy"].to_numpy().ravel()
                    clf = LinearRegression(fit_intercept=True, normalize=False)
                    clf.fit(acc_vector.reshape((-1, 1)), np.log(bias_vector))
                    prediction = clf.predict(acc_vector.reshape((-1, 1)))
                    residual = np.mean(np.power((prediction - np.log(bias_vector)), 2))
                    output_data[bias_name].append([
                        dataset, ml_name, method, clf.coef_[0], np.exp(clf.intercept_), residual
                    ])
    col_names = ["dataset", "ml_name", "method", "lambda", "k", "Avg.e^2"]
    with pd.ExcelWriter(outputFilename) as writer:
        pd.DataFrame(output_data["bias_new"], columns=col_names).to_excel(writer, sheet_name="bias_new", index=False)
        pd.DataFrame(output_data["bias1"], columns=col_names).to_excel(writer, sheet_name="bias1", index=False)
        pd.DataFrame(output_data["bias2"], columns=col_names).to_excel(writer, sheet_name="bias2", index=False)

In [8]:
processInputDataset(data, os.path.join("ratio_analysis_plots", "d_processed.xlsx"))