## Linear Regression  
A Linear Regression function to find the coefficients based on ratio analysis outputs  
in order to find the trade off between bias (bias1 or bias2) and target value (accuracy or f1 score)  

In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from script_single_task import random_ratios

In [2]:
CREATE_INPUT_DATASET = False

In [3]:
iter_per_ratio = 300
classifiers = ["KNN", "LinearSVC", "SVC", "Forest", "LogReg", "Tree", "MLP"]
methods = ["mean_v1", "mean_v2", "similar_v1", "similar_v2", "multi_v1", "multi_v2"]
data_columns_1 = ["iter_number", "random_ratio", "ml_name", "method_name", "bias1", "bias2", "accuracy", "f1_score"]
data_columns_2 = ["iter_number", "random_ratio"] + \
                 ["ML_{}".format(x) for x in classifiers] + \
                 ["Imp_{}".format(x) for x in methods] + \
                 ["bias1", "bias2", "accuracy", "f1_score"]

### Prepare Dataset

In [4]:
def prepare_dataset(file_name_1=None, file_name_2=None):
    global classifiers, methods, data_columns
    data_1 = []
    data_2 = []
    for method in methods:
        if not os.path.exists("{}.pkl".format(method)):
            raise Exception("Required pkl not found: {}.pkl".format(method))
        with open("{}.pkl".format(method), "rb") as inFile:
            pkl_data = pickle.load(inFile)
        d_acc   = [x[0] for x in pkl_data]
        d_bias1 = [x[1] for x in pkl_data]
        d_bias2 = [x[2] for x in pkl_data]
        d_f1    = [x[3] for x in pkl_data]
        for i in range(iter_per_ratio):
            for j in range(len(random_ratios)):
                i_acc   = d_acc[i + j * iter_per_ratio]
                i_bias1 = d_bias1[i + j * iter_per_ratio]
                i_bias2 = d_bias2[i + j * iter_per_ratio]
                i_f1    = d_f1[i + j * iter_per_ratio]
                for clf in classifiers:
                    data_processed = [[], [], [], []] # [[acc avg], [bias1], [bias2], [f1 score]], remove -1, [None] cases
                    for x,y,z,w in zip(i_acc[clf], i_bias1[clf], i_bias2[clf], i_f1[clf]):
                        if (y > 0) and (z > 0) and len(w) == 2:
                            data_processed[0].append(x)
                            data_processed[1].append(y)
                            data_processed[2].append(z)
                            data_processed[3].append(np.mean(w))
                    row_data = [i, round(random_ratios[j], 2), clf, method, np.mean(data_processed[1]), np.mean(data_processed[2]), np.mean(data_processed[0]), np.mean(data_processed[3])]
                    data_1.append(row_data)
                    row_ML_data = [0 for _ in range(len(classifiers))]
                    row_Imp_data = [0 for _ in range(len(methods))]
                    row_ML_data[classifiers.index(clf)] = 1
                    row_Imp_data[methods.index(method)] = 1
                    row_data = [i, round(random_ratios[j], 2)] + \
                        row_ML_data + row_Imp_data + \
                        [np.mean(data_processed[1]), np.mean(data_processed[2]), np.mean(data_processed[0]), np.mean(data_processed[3])]
                    data_2.append(row_data)
    data_1 = pd.DataFrame(data_1, columns=data_columns_1)
    data_2 = pd.DataFrame(data_2, columns=data_columns_2)
    if file_name_1:
        data_1.to_csv(file_name_1, index=False)
    if file_name_2:
        data_2.to_csv(file_name_2, index=False)
    return (data_1, data_2)

In [5]:
if CREATE_INPUT_DATASET:
    prepare_dataset(os.path.join("ratio_analysis_plots", "collected_dataset.csv"),
                    os.path.join("ratio_analysis_plots", "collected_dataset_cat.csv"))

### Run Linear Regression

In [6]:
data = pd.read_csv(os.path.join("ratio_analysis_plots", "collected_dataset_cat.csv"))
data.drop(columns=["iter_number"], inplace=True)
rndid_clf = random.randint(0, len(classifiers)-1)
rndid_imp = random.randint(0, len(methods)-1)
data.drop(columns=["ML_{}".format(classifiers[rndid_clf]), "Imp_{}".format(methods[rndid_imp])], inplace=True)
print("Randomly Drop Column: ML_{}\tImp_{}".format(classifiers[rndid_clf], methods[rndid_imp]))

Randomly Drop Column: ML_KNN	Imp_similar_v2


In [7]:
data.head(5)

Unnamed: 0,random_ratio,ML_LinearSVC,ML_SVC,ML_Forest,ML_LogReg,ML_Tree,ML_MLP,Imp_mean_v1,Imp_mean_v2,Imp_similar_v1,Imp_multi_v1,Imp_multi_v2,bias1,bias2,accuracy,f1_score
0,0.0,0,0,0,0,0,0,1,0,0,0,0,0.097095,0.415398,0.62607,0.354117
1,0.0,1,0,0,0,0,0,1,0,0,0,0,1.136285,1.447717,0.65793,0.532717
2,0.0,0,1,0,0,0,0,1,0,0,0,0,0.891545,0.730072,0.60156,0.521637
3,0.0,0,0,1,0,0,0,1,0,0,0,0,0.459219,1.407112,0.689559,0.507812
4,0.0,0,0,0,1,0,0,1,0,0,0,0,0.989057,1.406028,0.665793,0.53884


In [8]:
cols = data.columns.tolist()[:-4]
X_train_bias1 = data[["bias1"] + cols]
X_train_bias2 = data[["bias2"] + cols]
y_train_acc = data[["accuracy"]].to_numpy()
y_train_f1 = data[["f1_score"]].to_numpy()

In [9]:
X_train_bias1.head(5)

Unnamed: 0,bias1,random_ratio,ML_LinearSVC,ML_SVC,ML_Forest,ML_LogReg,ML_Tree,ML_MLP,Imp_mean_v1,Imp_mean_v2,Imp_similar_v1,Imp_multi_v1,Imp_multi_v2
0,0.097095,0.0,0,0,0,0,0,0,1,0,0,0,0
1,1.136285,0.0,1,0,0,0,0,0,1,0,0,0,0
2,0.891545,0.0,0,1,0,0,0,0,1,0,0,0,0
3,0.459219,0.0,0,0,1,0,0,0,1,0,0,0,0
4,0.989057,0.0,0,0,0,1,0,0,1,0,0,0,0


In [10]:
X_train_bias2.head(5)

Unnamed: 0,bias2,random_ratio,ML_LinearSVC,ML_SVC,ML_Forest,ML_LogReg,ML_Tree,ML_MLP,Imp_mean_v1,Imp_mean_v2,Imp_similar_v1,Imp_multi_v1,Imp_multi_v2
0,0.415398,0.0,0,0,0,0,0,0,1,0,0,0,0
1,1.447717,0.0,1,0,0,0,0,0,1,0,0,0,0
2,0.730072,0.0,0,1,0,0,0,0,1,0,0,0,0
3,1.407112,0.0,0,0,1,0,0,0,1,0,0,0,0
4,1.406028,0.0,0,0,0,1,0,0,1,0,0,0,0


In [11]:
result_bias1_acc = sm.OLS(endog=y_train_acc, exog=sm.add_constant(X_train_bias1)).fit()
result_bias1_f1 = sm.OLS(endog=y_train_f1, exog=sm.add_constant(X_train_bias1)).fit()
result_bias2_acc = sm.OLS(endog=y_train_acc, exog=sm.add_constant(X_train_bias2)).fit()
result_bias2_f1 = sm.OLS(endog=y_train_f1, exog=sm.add_constant(X_train_bias2)).fit()

In [12]:
result_bias1_acc.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.737
Model:,OLS,Adj. R-squared:,0.737
Method:,Least Squares,F-statistic:,54320.0
Date:,"Sat, 27 Jun 2020",Prob (F-statistic):,0.0
Time:,12:26:50,Log-Likelihood:,543490.0
No. Observations:,252000,AIC:,-1087000.0
Df Residuals:,251986,BIC:,-1087000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6673,0.000,3099.355,0.000,0.667,0.668
bias1,-0.0056,2.4e-05,-232.102,0.000,-0.006,-0.006
random_ratio,-0.1309,0.000,-663.552,0.000,-0.131,-0.131
ML_LinearSVC,0.0087,0.000,41.437,0.000,0.008,0.009
ML_SVC,-0.0415,0.000,-198.295,0.000,-0.042,-0.041
ML_Forest,-0.0007,0.000,-3.474,0.001,-0.001,-0.000
ML_LogReg,0.0113,0.000,54.128,0.000,0.011,0.012
ML_Tree,-0.0131,0.000,-62.382,0.000,-0.013,-0.013
ML_MLP,-0.0074,0.000,-35.051,0.000,-0.008,-0.007

0,1,2,3
Omnibus:,43445.612,Durbin-Watson:,2.318
Prob(Omnibus):,0.0,Jarque-Bera (JB):,202234.353
Skew:,-0.773,Prob(JB):,0.0
Kurtosis:,7.107,Cond. No.,21.1


In [13]:
result_bias2_acc.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.685
Method:,Least Squares,F-statistic:,42220.0
Date:,"Sat, 27 Jun 2020",Prob (F-statistic):,0.0
Time:,12:26:50,Log-Likelihood:,520900.0
No. Observations:,252000,AIC:,-1042000.0
Df Residuals:,251986,BIC:,-1042000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6718,0.000,2861.877,0.000,0.671,0.672
bias2,0.0034,5.62e-05,60.584,0.000,0.003,0.004
random_ratio,-0.1403,0.000,-662.873,0.000,-0.141,-0.140
ML_LinearSVC,0.0046,0.000,19.986,0.000,0.004,0.005
ML_SVC,-0.0464,0.000,-202.123,0.000,-0.047,-0.046
ML_Forest,-0.0116,0.000,-50.978,0.000,-0.012,-0.011
ML_LogReg,0.0075,0.000,32.701,0.000,0.007,0.008
ML_Tree,-0.0179,0.000,-78.110,0.000,-0.018,-0.017
ML_MLP,-0.0138,0.000,-60.462,0.000,-0.014,-0.013

0,1,2,3
Omnibus:,53223.199,Durbin-Watson:,2.285
Prob(Omnibus):,0.0,Jarque-Bera (JB):,155776.812
Skew:,-1.107,Prob(JB):,0.0
Kurtosis:,6.152,Cond. No.,12.4


In [14]:
result_bias1_f1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.856
Model:,OLS,Adj. R-squared:,0.856
Method:,Least Squares,F-statistic:,115600.0
Date:,"Sat, 27 Jun 2020",Prob (F-statistic):,0.0
Time:,12:26:50,Log-Likelihood:,538890.0
No. Observations:,252000,AIC:,-1078000.0
Df Residuals:,251986,BIC:,-1078000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3639,0.000,1659.677,0.000,0.363,0.364
bias1,0.0034,2.44e-05,141.254,0.000,0.003,0.003
random_ratio,-0.1466,0.000,-729.298,0.000,-0.147,-0.146
ML_LinearSVC,0.1605,0.000,753.826,0.000,0.160,0.161
ML_SVC,0.1553,0.000,728.676,0.000,0.155,0.156
ML_Forest,0.1491,0.000,686.022,0.000,0.149,0.149
ML_LogReg,0.1590,0.000,746.814,0.000,0.159,0.159
ML_Tree,0.1480,0.000,693.782,0.000,0.148,0.148
ML_MLP,0.1592,0.000,744.005,0.000,0.159,0.160

0,1,2,3
Omnibus:,32849.777,Durbin-Watson:,2.08
Prob(Omnibus):,0.0,Jarque-Bera (JB):,130014.486
Skew:,-0.612,Prob(JB):,0.0
Kurtosis:,6.299,Cond. No.,21.1


In [15]:
result_bias2_f1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.853
Model:,OLS,Adj. R-squared:,0.853
Method:,Least Squares,F-statistic:,112800.0
Date:,"Sat, 27 Jun 2020",Prob (F-statistic):,0.0
Time:,12:26:50,Log-Likelihood:,536310.0
No. Observations:,252000,AIC:,-1073000.0
Df Residuals:,251986,BIC:,-1072000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3625,0.000,1641.792,0.000,0.362,0.363
bias2,-0.0063,5.28e-05,-120.162,0.000,-0.006,-0.006
random_ratio,-0.1403,0.000,-704.533,0.000,-0.141,-0.140
ML_LinearSVC,0.1645,0.000,763.348,0.000,0.164,0.165
ML_SVC,0.1600,0.000,741.549,0.000,0.160,0.160
ML_Forest,0.1566,0.000,728.511,0.000,0.156,0.157
ML_LogReg,0.1628,0.000,755.428,0.000,0.162,0.163
ML_Tree,0.1518,0.000,706.125,0.000,0.151,0.152
ML_MLP,0.1641,0.000,763.270,0.000,0.164,0.165

0,1,2,3
Omnibus:,20343.863,Durbin-Watson:,2.287
Prob(Omnibus):,0.0,Jarque-Bera (JB):,115075.582
Skew:,-0.152,Prob(JB):,0.0
Kurtosis:,6.297,Cond. No.,12.4
