## Linear Regression  
This notebook shows steps to collect "Missing Completely At Random" experiment results, and fit an exponential curve on each combination of `ML Algorithm`, `Imputation Method`, and `Dataset`  
Note that data for $\text{accuracy} < 0.65$ will be discarded

In [1]:
import os
os.chdir("..")
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios, acc, f1score, bias1, bias2, newBias

In [2]:
datasets = ["adult", "compas", "communities", "german", "titanic", "bank"]
ml_names = ["KNN", "LinearSVC", "Forest", "LogReg", "Tree", "MLP"]
methods = ["mean_v1", "mean_v2", "similar_v1", "similar_v2", "multi_v1", "multi_v2"]
iter_per_ratio = 200

In [3]:
def prepareInputDataset(filename, overwrite=False):
    if os.path.exists(filename) and (not overwrite):
        return pd.read_csv(filename)
    global datasets, ml_names, methods, iter_per_ratio
    data = []
    for dataset in datasets:
        assert os.path.exists(os.path.join("condor_outputs", "acc", dataset))
        for method in methods:
            assert os.path.exists(os.path.join("condor_outputs", "acc", dataset, "{}.pkl".format(method)))
            with open(os.path.join("condor_outputs", "acc", dataset, "{}.pkl".format(method)), "rb") as inFile:
                pkl_data = pickle.load(inFile)
            j = 0
            for i in range(0, len(pkl_data), iter_per_ratio):
                i_data = pkl_data[i:(i+iter_per_ratio)]
                for ml_name in ml_names:
                    ml_data = [x[ml_name] for x in i_data]
                    for ml_mat in ml_data:
                        # [[acc avg], [bias1], [bias2], [f1 score], [real acc], [new bias]], remove -1, [None] cases
                        data_processed = [[], [], [], [], [], []]
                        for mm in ml_mat:
                            if len(mm) < 1:
                                continue
                            cf_m, acc_m = mm[0], mm[1]
                        try:
                            x = acc(cf_m)
                            y = bias1(cf_m)
                            z = bias2(cf_m)
                            w = f1score(cf_m)
                            k = newBias(cf_m)
                        except Exception as e:
                            continue
                        if (y > 0) and (z > 0) and len(w) == 2:
                            data_processed[0].append(x)
                            data_processed[1].append(y)
                            data_processed[2].append(z)
                            data_processed[3].append(np.mean(w))
                            data_processed[4].append(acc_m)
                            data_processed[5].append(k)
                        if [] in data_processed: continue
                        row_data = np.array([i // iter_per_ratio, round(random_ratios[j], 2), dataset, ml_name, method,
                                             np.mean(data_processed[1]), np.mean(data_processed[2]),
                                             np.mean(data_processed[5]), np.mean(data_processed[0]),
                                             np.mean(data_processed[3]), np.mean(data_processed[4])])
                        data.append(row_data)
                j += 1
    data = np.array(data)
    data = pd.DataFrame(data, columns=["Id", "Ratio", "Dataset", "ML_Name", "Method", "bias1", "bias2", "bias_new", "accuracy", "f1score", "accuracy_real"])
    data.to_csv(filename, index=False)
    return data

In [4]:
prepareInputDataset(os.path.join("ratio_analysis_plots", "d_collected.csv"))

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.0,adult,KNN,mean_v1,0.4785280917010637,1.7920218454629138,0.2247547465482332,0.7864583333333334,0.5736991616434559,0.7864583333333334
1,0,0.0,adult,KNN,mean_v1,0.4084988239752301,1.7816534868195109,0.15631258732159448,0.7932942708333334,0.5913190361647139,0.7932942708333334
2,0,0.0,adult,KNN,mean_v1,0.3863147723009045,1.3593051180257727,0.1481923822599727,0.798828125,0.5971236499889796,0.798828125
3,0,0.0,adult,KNN,mean_v1,0.36880605269041145,1.343092598808553,0.1434921292877055,0.7981770833333334,0.5923739616688188,0.7981770833333334
4,0,0.0,adult,KNN,mean_v1,0.3410212357402043,1.3034698593702174,0.15942308116346404,0.7916666666666666,0.560093463690586,0.7916666666666666
...,...,...,...,...,...,...,...,...,...,...,...
695911,17,0.85,bank,MLP,multi_v2,0.003498631223895335,0.1340144914270993,0.016120808554629325,0.8673153471915082,0.05751540867841546,0.8673153471915082
695912,17,0.85,bank,MLP,multi_v2,0.0021947473362233516,0.10956510466559188,0.008655577434475665,0.8662096417514374,0.024095553714443524,0.8662096417514374
695913,17,0.85,bank,MLP,multi_v2,0.00025395577110869094,0.011023564338407565,0.004305703952553391,0.8659885006634233,0.04011992671145412,0.8659885006634233
695914,17,0.85,bank,MLP,multi_v2,0.004836991131202929,0.2522153794156361,0.014214730778846833,0.8706324635117205,0.01770399411908846,0.8706324635117205


In [5]:
data = pd.read_csv(os.path.join("ratio_analysis_plots", "d_collected.csv"))
data.head()

Unnamed: 0,Id,Ratio,Dataset,ML_Name,Method,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
0,0,0.0,adult,KNN,mean_v1,0.478528,1.792022,0.224755,0.786458,0.573699,0.786458
1,0,0.0,adult,KNN,mean_v1,0.408499,1.781653,0.156313,0.793294,0.591319,0.793294
2,0,0.0,adult,KNN,mean_v1,0.386315,1.359305,0.148192,0.798828,0.597124,0.798828
3,0,0.0,adult,KNN,mean_v1,0.368806,1.343093,0.143492,0.798177,0.592374,0.798177
4,0,0.0,adult,KNN,mean_v1,0.341021,1.30347,0.159423,0.791667,0.560093,0.791667


In [6]:
data.describe()

Unnamed: 0,Id,Ratio,bias1,bias2,bias_new,accuracy,f1score,accuracy_real
count,695916.0,695916.0,695916.0,695916.0,695916.0,695916.0,695916.0,695916.0
mean,8.723099,0.436155,1.177428,1.127942,0.28926,0.674831,0.460701,0.676299
std,5.433094,0.271655,2.936399,2.323797,0.258973,0.114086,0.151931,0.112408
min,0.0,0.0,1.387779e-17,1.110223e-16,6.2e-05,0.134011,0.007445,0.134011
25%,4.0,0.2,0.0890041,0.2222222,0.096389,0.602792,0.364069,0.609159
50%,9.0,0.45,0.2994382,0.5672255,0.228674,0.666667,0.467177,0.668797
75%,13.0,0.65,0.8615167,1.258412,0.390659,0.75651,0.557752,0.755
max,19.0,0.95,303.0294,579.428,1.921389,0.923077,0.898764,0.925


For every (ML Model, Imputation Method, Dataset)
$$
\begin{aligned}
\text{Bias} &= k \times \exp(\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \log(k) + (\lambda \times \text{Accuracy})\\
\log(\text{Bias}) &= \lambda \times \text{Accuracy} + m + \varepsilon
\end{aligned}
$$
Fit linear regression, and collect $m$ and $\lambda$, where $e^m = k$  
$\sum\varepsilon^2$

In [7]:
from sklearn.linear_model import LinearRegression
def processInputDataset(data, outputFilename):
    global datasets, ml_names, methods
    output_data = {
        "bias1": [],
        "bias2": [],
        "bias_new": []
    }
    for dataset in datasets:
        for ml_name in ml_names:
            for method in methods:
                for bias_name in output_data.keys():
                    current_data = data[data["Dataset"] == dataset].copy()
                    current_data = current_data[current_data["ML_Name"] == ml_name]
                    current_data = current_data[current_data["Method"] == method]
                    current_data = current_data[current_data["accuracy"] >= 0.65]
                    bias_vector = current_data[bias_name].to_numpy().ravel()
                    acc_vector = current_data["accuracy"].to_numpy().ravel()
                    clf = LinearRegression(fit_intercept=True, normalize=False)
                    clf.fit(acc_vector.reshape((-1, 1)), np.log(bias_vector))
                    prediction = clf.predict(acc_vector.reshape((-1, 1)))
                    residual = np.mean(np.power((prediction - np.log(bias_vector)), 2))
                    output_data[bias_name].append([
                        dataset, ml_name, method, clf.coef_[0], np.exp(clf.intercept_), residual
                    ])
    col_names = ["dataset", "ml_name", "method", "lambda", "k", "Avg.e^2"]
    with pd.ExcelWriter(outputFilename) as writer:
        pd.DataFrame(output_data["bias_new"], columns=col_names).to_excel(writer, sheet_name="bias_new", index=False)
        pd.DataFrame(output_data["bias1"], columns=col_names).to_excel(writer, sheet_name="bias1", index=False)
        pd.DataFrame(output_data["bias2"], columns=col_names).to_excel(writer, sheet_name="bias2", index=False)

In [8]:
processInputDataset(data, os.path.join("ratio_analysis_plots", "d_processed.xlsx"))