## Linear Regression  
This notebook collects the "Missing Completely At Random" (MCAR) experiment results and fits an exponential curve of bias against accuracy for each combination of `ML Algorithm`, `Imputation Method`, and `Dataset`.  
Note that observations with $\text{accuracy} < 0.65$ are discarded before fitting.

In [1]:
import os
# Switch the working directory to the parent folder; the relative paths used
# later ("condor_outputs", "ratio_analysis_plots") are resolved from there.
# NOTE(review): this is not idempotent -- re-running the cell moves up one more
# level each time. Presumably it also has to run before `script_single_task`
# is imported below; confirm before reordering.
os.chdir("..")
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios, acc, f1score, bias1, bias2, newBias

In [2]:
# Dataset names; each one is a sub-directory of condor_outputs/acc.
datasets = [
    "adult",
    "compas",
    "communities",
    "german",
    "titanic",
    "bank",
]

# Classifier names; used as keys into every pickled result entry.
ml_names = [
    "KNN",
    "LinearSVC",
    "Forest",
    "LogReg",
    "Tree",
    "MLP",
]

# Imputation methods; each one is the stem of a "<method>.pkl" result file.
methods = [
    "mean_v1",
    "mean_v2",
    "similar_v1",
    "similar_v2",
    "multi_v1",
    "multi_v2",
]

# Number of consecutive result entries that share one missing-data ratio.
iter_per_ratio = 200

In [3]:
def _summarizeFolds(ml_mat):
    """Average the metrics of one experiment iteration over its usable entries.

    ``ml_mat`` is a sequence whose items hold ``(confusion matrix, real accuracy)``
    at positions 0 and 1 (presumably one item per cross-validation fold --
    TODO confirm against the producer script).  Empty items, items for which a
    metric function raises, and items with non-positive ``bias1``/``bias2`` or an
    f1 result that does not have exactly two values are skipped.

    Returns
    -------
    list or None
        ``[bias1, bias2, bias_new, accuracy, f1score, accuracy_real]`` averaged
        over the usable items, or ``None`` when no item was usable.
    """
    acc_vals, bias1_vals, bias2_vals = [], [], []
    f1_vals, real_acc_vals, new_bias_vals = [], [], []
    for mm in ml_mat:
        if len(mm) < 1:
            continue
        cf_m, acc_m = mm[0], mm[1]
        # The metric computation has to happen per item, inside this loop;
        # otherwise only the last item would be scored and a stale matrix from
        # a previous iteration could be reused.
        try:
            x = acc(cf_m)
            y = bias1(cf_m)
            z = bias2(cf_m)
            w = f1score(cf_m)
            k = newBias(cf_m)
        except Exception:
            # Best effort: an item whose metrics cannot be computed is dropped.
            continue
        if (y > 0) and (z > 0) and len(w) == 2:
            acc_vals.append(x)
            bias1_vals.append(y)
            bias2_vals.append(z)
            f1_vals.append(np.mean(w))
            real_acc_vals.append(acc_m)
            new_bias_vals.append(k)
    if not acc_vals:
        return None
    return [
        np.mean(bias1_vals), np.mean(bias2_vals), np.mean(new_bias_vals),
        np.mean(acc_vals), np.mean(f1_vals), np.mean(real_acc_vals),
    ]


def prepareInputDataset(filename, overwrite=False):
    """Collect the pickled experiment results into one table and cache it as CSV.

    For every dataset in ``datasets`` and every method in ``methods`` the file
    ``condor_outputs/acc/<dataset>/<method>.pkl`` is loaded and cut into
    consecutive chunks of ``iter_per_ratio`` entries; chunk number ``j`` is
    labelled with ``round(random_ratios[j], 2)``.  Every entry is looked up by
    classifier name (``ml_names``) and reduced to one row of averaged metrics.
    Iterations without any usable item produce no row.

    Parameters
    ----------
    filename : str
        Path of the CSV cache.
    overwrite : bool, default False
        When False and ``filename`` exists, the CSV is read and returned
        without touching the pickles.  When True the table is rebuilt and the
        file is rewritten.

    Returns
    -------
    pandas.DataFrame
        Columns: Id, Ratio, Dataset, ML_Name, Method, bias1, bias2, bias_new,
        accuracy, f1score, accuracy_real.

    Raises
    ------
    AssertionError
        If an expected result directory or pickle file is missing.
    """
    if os.path.exists(filename) and (not overwrite):
        return pd.read_csv(filename)

    columns = ["Id", "Ratio", "Dataset", "ML_Name", "Method", "bias1", "bias2",
               "bias_new", "accuracy", "f1score", "accuracy_real"]
    rows = []
    for dataset in datasets:
        dataset_dir = os.path.join("condor_outputs", "acc", dataset)
        assert os.path.exists(dataset_dir)
        for method in methods:
            pkl_path = os.path.join(dataset_dir, "{}.pkl".format(method))
            assert os.path.exists(pkl_path)
            # NOTE: pickle.load executes arbitrary code from the file; only use
            # it on result files produced by this project.
            with open(pkl_path, "rb") as inFile:
                pkl_data = pickle.load(inFile)
            chunk_starts = range(0, len(pkl_data), iter_per_ratio)
            for ratio_id, start in enumerate(chunk_starts):
                i_data = pkl_data[start:(start + iter_per_ratio)]
                for ml_name in ml_names:
                    for ml_mat in [x[ml_name] for x in i_data]:
                        summary = _summarizeFolds(ml_mat)
                        if summary is None:
                            continue
                        rows.append([ratio_id, round(random_ratios[ratio_id], 2),
                                     dataset, ml_name, method] + summary)
    # Build the frame from plain records so numeric columns keep numeric dtypes,
    # matching what the cached (read_csv) path returns.
    data = pd.DataFrame(rows, columns=columns)
    data.to_csv(filename, index=False)
    return data

In [4]:
# Build the collected-results CSV, or load it when the file already exists;
# the returned DataFrame is displayed as this cell's output.
prepareInputDataset(os.path.join("ratio_analysis_plots", "d_collected.csv"))

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.0,adult,KNN,mean_v1,0.43205111336914115,1.7888108828011546,0.1525437999579876,0.7962239583333334,0.5990512762593178,0.7962239583333334
1,0,0.0,adult,KNN,mean_v1,0.3967816993094079,1.796803353908621,0.1421383158955286,0.7962239583333334,0.6015212106955227,0.7962239583333334
2,0,0.0,adult,KNN,mean_v1,0.2839703095375258,1.0242612473602515,0.13881473983175935,0.8050130208333334,0.5882564335809624,0.8050130208333334
3,0,0.0,adult,KNN,mean_v1,0.4756232042828499,2.044317800784309,0.23263645559156979,0.8033854166666666,0.5975819422106372,0.8033854166666666
4,0,0.0,adult,KNN,mean_v1,0.3328738751197699,1.277517948856952,0.1506994502368188,0.7819010416666666,0.561186124998998,0.7819010416666666
...,...,...,...,...,...,...,...,...,...,...,...
684795,16,0.8,bank,MLP,multi_v2,0.000778658389105966,0.054828621690047985,0.007624086840919765,0.8726227333038479,0.038421524663677126,0.8726227333038479
684796,16,0.8,bank,MLP,multi_v2,0.042628378831607694,0.7239305834482898,0.11262468503397807,0.8586908447589562,0.0758930938787054,0.8586908447589562
684797,16,0.8,bank,MLP,multi_v2,0.004669567403280731,0.3065151751168057,0.005998585036326009,0.8730650154798761,0.010071008723215733,0.8730650154798761
684798,16,0.8,bank,MLP,multi_v2,0.00923362043790291,0.29959904301346607,0.015210717911756422,0.8657673595754091,0.05692969755469755,0.8657673595754091


In [5]:
# Load the collected results from the cached CSV and preview the first rows.
data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_collected.csv"))
data.head()

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.0,adult,KNN,mean_v1,0.432051,1.788811,0.152544,0.796224,0.599051,0.796224
1,0,0.0,adult,KNN,mean_v1,0.396782,1.796803,0.142138,0.796224,0.601521,0.796224
2,0,0.0,adult,KNN,mean_v1,0.28397,1.024261,0.138815,0.805013,0.588256,0.805013
3,0,0.0,adult,KNN,mean_v1,0.475623,2.044318,0.232636,0.803385,0.597582,0.803385
4,0,0.0,adult,KNN,mean_v1,0.332874,1.277518,0.150699,0.781901,0.561186,0.781901


In [6]:
# Summary statistics of the numeric columns of the collected results.
data.describe()

Unnamed: 0,Id,Ratio,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
count,684800.0,684800.0,684800.0,684800.0,684800.0,684800.0,684800.0,684800.0
mean,8.678952,0.433948,1.258585,1.156257,0.287343,0.678297,0.459011,0.679798
std,5.368392,0.26842,3.25949,2.809824,0.263863,0.114506,0.160393,0.113022
min,0.0,0.0,2.775558e-17,5.5511150000000004e-17,8.6e-05,0.125,0.00725,0.125
25%,4.0,0.2,0.08493615,0.2166667,0.093197,0.606599,0.358613,0.61
50%,9.0,0.45,0.3014691,0.5685443,0.226316,0.67,0.466667,0.670927
75%,13.0,0.65,0.8825006,1.275,0.386494,0.762044,0.558421,0.76
max,19.0,0.95,313.171,715.7586,1.922733,0.923913,0.90538,0.92


For every (ML Model, Imputation Method, Dataset) combination, bias is modelled as an exponential function of accuracy:
$$
\begin{aligned}
\text{Bias} &= k \times \exp(\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \log(k) + (\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \lambda \times \text{Accuracy} + m + \varepsilon
\end{aligned}
$$
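Fit a linear regression of $\log(\text{Bias})$ on $\text{Accuracy}$ and collect $m$ and $\lambda$, where $k = e^m$.  
The quality of each fit is reported as the mean squared residual, $\frac{1}{n}\sum\varepsilon^2$.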

In [7]:
from sklearn.linear_model import LinearRegression
def _fitLogLinear(acc_vector, bias_vector):
    """Least-squares fit of ``log(bias) = lambda * accuracy + m``.

    Only observations with finite accuracy and finite, strictly positive bias
    are used, because ``log`` is undefined otherwise.

    Returns
    -------
    tuple
        ``(lambda, k, mean squared residual)`` with ``k = exp(m)``; three NaNs
        when fewer than two usable observations remain.
    """
    acc_vector = np.asarray(acc_vector, dtype=float).ravel()
    bias_vector = np.asarray(bias_vector, dtype=float).ravel()
    usable = np.isfinite(acc_vector) & np.isfinite(bias_vector) & (bias_vector > 0)
    if usable.sum() < 2:
        return np.nan, np.nan, np.nan
    features = acc_vector[usable].reshape((-1, 1))
    log_bias = np.log(bias_vector[usable])
    # `normalize` is intentionally not passed: False was its default and the
    # parameter no longer exists in scikit-learn >= 1.2.
    clf = LinearRegression(fit_intercept=True)
    clf.fit(features, log_bias)
    residual = np.mean(np.power(clf.predict(features) - log_bias, 2))
    return clf.coef_[0], np.exp(clf.intercept_), residual


def processInputDataset(data, outputFilename, min_accuracy=0.65):
    """Fit the bias/accuracy curve per combination and save the coefficients to Excel.

    For every combination of ``datasets``, ``ml_names`` and ``methods`` and for
    each bias column (``bias1``, ``bias2``, ``bias_new``) the rows of ``data``
    with ``accuracy >= min_accuracy`` are used to fit
    ``log(bias) = lambda * accuracy + m``.  Combinations without enough usable
    rows are kept in the output with NaN coefficients.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns Dataset, ML_Name, Method, accuracy, bias1, bias2 and
        bias_new, with numeric accuracy/bias values.
    outputFilename : str
        Path of the Excel workbook to write; one sheet per bias column, each
        with the columns dataset, ml_name, method, lambda, k, Avg.e^2.
    min_accuracy : float, default 0.65
        Rows with a lower accuracy are discarded before fitting.

    Returns
    -------
    None
    """
    bias_names = ["bias1", "bias2", "bias_new"]
    output_data = {bias_name: [] for bias_name in bias_names}
    for dataset in datasets:
        for ml_name in ml_names:
            for method in methods:
                # The row selection does not depend on the bias column, so it
                # is done once per combination.
                current_data = data[
                    (data["Dataset"] == dataset)
                    & (data["ML_Name"] == ml_name)
                    & (data["Method"] == method)
                    & (data["accuracy"] >= min_accuracy)
                ]
                acc_vector = current_data["accuracy"].to_numpy()
                for bias_name in bias_names:
                    slope, k, residual = _fitLogLinear(
                        acc_vector, current_data[bias_name].to_numpy())
                    output_data[bias_name].append(
                        [dataset, ml_name, method, slope, k, residual])
    col_names = ["dataset", "ml_name", "method", "lambda", "k", "Avg.e^2"]
    with pd.ExcelWriter(outputFilename) as writer:
        for sheet_name in ("bias_new", "bias1", "bias2"):
            pd.DataFrame(output_data[sheet_name], columns=col_names).to_excel(
                writer, sheet_name=sheet_name, index=False)

In [8]:
# Fit the regressions for every combination and write the coefficients to an
# Excel workbook (one sheet per bias measure).
processInputDataset(data, os.path.join("ratio_analysis_plots", "d_processed.xlsx"))