# Compas Data Imputation Analysis

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from utils.data import create_compas_dataset, Dataset
from utils.generator import gen_complete_random
from utils.completer import complete_by_mean_col, complete_by_multi, complete_by_similar_row

In [4]:
data = create_compas_dataset()

In [5]:
data_compas_complete = data.copy()
tmp_concat = pd.concat([data_compas_complete.X, pd.DataFrame(data_compas_complete.y, columns=["_TARGET_"])], axis=1)
tmp_concat.dropna(inplace=True)
tmp_concat.reset_index(drop=True, inplace=True)
data_compas_complete.X = tmp_concat.drop(columns=["_TARGET_"]).copy()
data_compas_complete.y = tmp_concat["_TARGET_"].copy().to_numpy().ravel()

In [6]:
X = data_compas_complete.X.drop(columns=data_compas_complete.protected).copy().to_numpy()
y = data_compas_complete.y.copy()

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [8]:
from imblearn.over_sampling import SVMSMOTE

### Parameter Searching

In [9]:
from sklearn.model_selection import GridSearchCV

def grid_search(X, y, model, params, smote):
    print("Model: {}".format(model.__class__.__name__))
    X_res, y_res = smote.fit_resample(X, y) # enlarge dataset
    search = GridSearchCV(model, param_grid=params, cv=10, n_jobs=-1)
    search.fit(X_res, y_res)
    print("Best parameter: {}".format(search.best_params_))
    print("Acc best: {:.4f}".format(search.best_score_))
    print("Acc on input data: {:.4f}".format(search.best_estimator_.score(X, y)))
    print("Acc on enlarged data: {:.4f}".format(search.best_estimator_.score(X_res, y_res)))
    return search.best_params_

In [10]:
all_params = {
    "KNN": None,
    "LinearSVC": None,
    "SVC": None,
    "Forest": None,
    "LogReg": None,
    "Tree": None,
    "MLP": None,
}

In [11]:
tmp_params = {
    "n_neighbors": [2, 5, 10, 50, 100, 200, 500],
    "weights": ['uniform', 'distance'],
    "leaf_size": [10, 30, 100],
}
all_params["KNN"] = grid_search(X, y, KNeighborsClassifier(), tmp_params, SVMSMOTE(random_state=22))

Model: KNeighborsClassifier
Best parameter: {'leaf_size': 10, 'n_neighbors': 2, 'weights': 'distance'}
Acc best: 0.7460
Acc on input data: 1.0000
Acc on enlarged data: 1.0000


In [12]:
tmp_params = {
    "tol": [1e-5, 1e-4, 1e-3],
    "C": [0.001, 0.01, 0.1, 1, 10],
    "max_iter": [1000, 5000, 10000],
}
all_params["LinearSVC"] = grid_search(X, y, LinearSVC(dual=False), tmp_params, SVMSMOTE(random_state=22))

Model: LinearSVC
Best parameter: {'C': 0.1, 'max_iter': 1000, 'tol': 0.001}
Acc best: 0.6728
Acc on input data: 0.6641
Acc on enlarged data: 0.6725


In [13]:
tmp_params = {
    "tol": [1e-5, 1e-4, 1e-3],
    "C": [0.001, 0.01, 0.1, 1, 10],
    "max_iter": [1000, 5000, 10000, -1],
}
all_params["SVC"] = grid_search(X, y, SVC(), tmp_params, SVMSMOTE(random_state=22))

Model: SVC
Best parameter: {'C': 10, 'max_iter': -1, 'tol': 0.0001}
Acc best: 0.6452
Acc on input data: 0.6285
Acc on enlarged data: 0.6489


In [14]:
tmp_params = {
    "n_estimators": [50, 100, 200, 500],
    "max_depth": [None, 10, 50, 100],
    "min_samples_leaf": [1, 5, 10],
}
all_params["Forest"] = grid_search(X, y, RandomForestClassifier(), tmp_params, SVMSMOTE(random_state=22))

Model: RandomForestClassifier
Best parameter: {'max_depth': 100, 'min_samples_leaf': 5, 'n_estimators': 100}
Acc best: 0.7676
Acc on input data: 0.8210
Acc on enlarged data: 0.8597


In [15]:
tmp_params = {
    "tol": [1e-5, 1e-4, 1e-3],
    "C": [1e-2, 1e-1, 1, 1e1, 1e2],
    "max_iter": [100, 500, 1000, 2000],
}
all_params["LogReg"] = grid_search(X, y, LogisticRegression(), tmp_params, SVMSMOTE(random_state=22))

Model: LogisticRegression
Best parameter: {'C': 10.0, 'max_iter': 100, 'tol': 1e-05}
Acc best: 0.6755
Acc on input data: 0.6767
Acc on enlarged data: 0.6762


In [16]:
tmp_params = {
    "max_depth": [None, 10, 50, 100, 200],
    "max_leaf_nodes": [None, 10, 100, 1000],
    "min_samples_leaf": [1, 5, 10],
}
all_params["Tree"] = grid_search(X, y, DecisionTreeClassifier(), tmp_params, SVMSMOTE(random_state=22))

Model: DecisionTreeClassifier
Best parameter: {'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 1}
Acc best: 0.7495
Acc on input data: 0.7267
Acc on enlarged data: 0.7765


In [17]:
tmp_params = {
    "alpha": [1e-5, 1e-4, 1e-3],
    "learning_rate_init": [1e-4, 1e-3, 1e-2],
    "max_iter": [200, 500, 1000],
}
all_params["MLP"] = grid_search(X, y, MLPClassifier(), tmp_params, SVMSMOTE(random_state=22))

Model: MLPClassifier
Best parameter: {'alpha': 0.001, 'learning_rate_init': 0.01, 'max_iter': 200}
Acc best: 0.6864
Acc on input data: 0.6837
Acc on enlarged data: 0.6932


### Data Imputation

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

def helper_freq(array):
    """simple helper function to return the most frequent number in an array"""
    count = np.bincount(array)
    return array[np.argmax(count)]

def average_cv(cv_data):
    # compute average for the confusion matrix data for each fold
    result = {}
    for name, data in cv_data.items():
        new_data = {
            "African-American": np.array([m["African-American"] for m in data]).mean(axis=0).tolist(),
            "Caucasian": np.array([m["Caucasian"] for m in data]).mean(axis=0).tolist()
        }
        result[name] = new_data
    return result

def compute_confusion_matrix(X_train, y_train, X_test, y_test, clf, protected_features, multi=False):
    # X are pandas dataframe
    # y are numpy array
    # clf is a sklearn classifier
    # protected_features is list
    smote = SVMSMOTE(random_state=22)
    if not multi:
        X_train = X_train.drop(columns=protected_features).copy().to_numpy()
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        clf.fit(X_train_res, y_train_res)
        print("Acc: {:.4f}".format(clf.score(X_test.drop(columns=protected_features).copy().to_numpy(), y_test)), end=" ")
        X_test_AA = X_test[X_test["race"] == "African-American"].drop(columns=protected_features).to_numpy()
        X_test_C = X_test[X_test["race"] == "Caucasian"].drop(columns=protected_features).to_numpy()
        y_test_AA = y_test[X_test[X_test["race"] == "African-American"].index.tolist()]
        y_test_C = y_test[X_test[X_test["race"] == "Caucasian"].index.tolist()]
        matrix_AA = confusion_matrix(y_test_AA, clf.predict(X_test_AA))
        matrix_C = confusion_matrix(y_test_C, clf.predict(X_test_C))
    else:
        prediction_AA = []
        prediction_C = []
        X_test_first = X_test[0]
        y_test_AA = y_test[X_test_first[X_test_first["race"] == "African-American"].index.tolist()]
        y_test_C = y_test[X_test_first[X_test_first["race"] == "Caucasian"].index.tolist()]
        scores = [0, 0]
        for X_train_m in X_train:
            X_train_m = X_train_m.drop(columns=protected_features).copy().to_numpy()
            X_train_res, y_train_res = smote.fit_resample(X_train_m, y_train)
            clf.fit(X_train_res, y_train_res)
            for X_test_m in X_test:
                X_test_AA = X_test_m[X_test_m["race"] == "African-American"].drop(columns=protected_features).to_numpy()
                X_test_C = X_test_m[X_test_m["race"] == "Caucasian"].drop(columns=protected_features).to_numpy()
                prediction_AA.append(clf.predict(X_test_AA))
                prediction_C.append(clf.predict(X_test_C))
                scores[0] += clf.score(X_test_m.drop(columns=protected_features).copy().to_numpy(), y_test)
                scores[1] += 1
        print("Acc: {:.4f}".format(scores[0] / scores[1]), end=" ")
        # compute final predictions by voting
        prediction_AA = np.apply_along_axis(helper_freq, 0, np.array(prediction_AA))
        prediction_C = np.apply_along_axis(helper_freq, 0, np.array(prediction_C))
        matrix_AA = confusion_matrix(y_test_AA, prediction_AA)
        matrix_C = confusion_matrix(y_test_C, prediction_C)
    result = {
        "African-American": matrix_AA.ravel().tolist(), # [tn, fp, fn, tp]
        "Caucasian": matrix_C.ravel().tolist()
    }
    return result

def test_imputation(X, y, protected_features, completer_func=None, multi=False):
    # X is pandas dataframe
    # y is numpy array,
    # protected_features is list
    # completer func is the imputation function
    global all_params
    clfs = { # define all the classifiers with best parameters
        "KNN": KNeighborsClassifier(n_neighbors=all_params["KNN"]["n_neighbors"], weights=all_params["KNN"]["weights"], leaf_size=all_params["KNN"]["leaf_size"]),
        "LinearSVC": LinearSVC(dual=False, tol=all_params["LinearSVC"]["tol"], C=all_params["LinearSVC"]["C"], max_iter=all_params["LinearSVC"]["max_iter"]),
        "SVC": SVC(tol=all_params["SVC"]["tol"], C=all_params["SVC"]["C"], max_iter=all_params["SVC"]["max_iter"]),
        "Forest": RandomForestClassifier(n_estimators=all_params["Forest"]["n_estimators"], max_depth=all_params["Forest"]["max_depth"], min_samples_leaf=all_params["Forest"]["min_samples_leaf"]),
        "LogReg": LogisticRegression(tol=all_params["LogReg"]["tol"], C=all_params["LogReg"]["C"], max_iter=all_params["LogReg"]["max_iter"]),
        "Tree": DecisionTreeClassifier(max_depth=all_params["Tree"]["max_depth"], max_leaf_nodes=all_params["Tree"]["max_leaf_nodes"], min_samples_leaf=all_params["Tree"]["min_samples_leaf"]),
        "MLP": MLPClassifier(alpha=all_params["MLP"]["alpha"], learning_rate_init=all_params["MLP"]["learning_rate_init"], max_iter=all_params["MLP"]["max_iter"]),
    }
    data_cv = { # save each cv output
        "KNN": [],
        "LinearSVC": [],
        "SVC": [],
        "Forest": [],
        "LogReg": [],
        "Tree": [],
        "MLP": [],
    }
    kf = KFold(n_splits=10)
    fold = 1
    for train_idx, test_idx in kf.split(X):
        print("Fold {}".format(fold), end=" ")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        X_test = X_test.reset_index(drop=True)
        X_train = X_train.reset_index(drop=True)
        if completer_func:
        # do imputations on training set and test set individually
            data_incomplete = Dataset("tmp", X_train, y_train, auto_convert=False, protected_features=protected_features)
            data_complete = completer_func(data_incomplete)
            X_train = [m.X.copy() for m in data_complete] if multi else data_complete.X.copy() 
            y_train = data_complete[0].y.copy() if multi else data_complete.y.copy()
            data_incomplete = Dataset("tmp", X_test, y_test, auto_convert=False, protected_features=protected_features)
            data_complete = completer_func(data_incomplete)
            X_test = [m.X.copy() for m in data_complete] if multi else data_complete.X.copy()
            y_test = data_complete[0].y.copy() if multi else data_complete.y.copy()
        # get result for each classifier
        for clf_name, clf in clfs.items():
            print("{}".format(clf_name), end=" ")
            result = compute_confusion_matrix(X_train, y_train, X_test, y_test, clf, protected_features, multi=multi)
            data_cv[clf_name].append(result)
        print()
        fold += 1
    print("Result:\n{}".format(data_cv))
    return data_cv

In [19]:
# try on original data
result_original = test_imputation(data_compas_complete.X.copy(), data_compas_complete.y.copy(), data_compas_complete.protected)

Fold 1 KNN Acc: 0.5985 LinearSVC Acc: 0.6848 SVC Acc: 0.6326 Forest Acc: 0.7220 LogReg Acc: 0.6816 Tree Acc: 0.7220 MLP Acc: 0.6443 
Fold 2 KNN Acc: 0.6337 LinearSVC Acc: 0.6763 SVC Acc: 0.6102 Forest Acc: 0.7167 LogReg Acc: 0.6954 Tree Acc: 0.7050 MLP Acc: 0.6986 
Fold 3 KNN Acc: 0.6251 LinearSVC Acc: 0.6741 SVC Acc: 0.6124 Forest Acc: 0.7167 LogReg Acc: 0.6784 Tree Acc: 0.6933 MLP Acc: 0.6592 
Fold 4 KNN Acc: 0.5911 LinearSVC Acc: 0.6709 SVC Acc: 0.6038 Forest Acc: 0.6965 LogReg Acc: 0.6645 Tree Acc: 0.6656 MLP Acc: 0.6006 
Fold 5 KNN Acc: 0.5896 LinearSVC Acc: 0.6557 SVC Acc: 0.6034 Forest Acc: 0.6823 LogReg Acc: 0.6780 Tree Acc: 0.6450 MLP Acc: 0.6727 
Fold 6 KNN Acc: 0.5864 LinearSVC Acc: 0.6493 SVC Acc: 0.6077 Forest Acc: 0.6823 LogReg Acc: 0.6525 Tree Acc: 0.6727 MLP Acc: 0.6471 
Fold 7 KNN Acc: 0.5672 LinearSVC Acc: 0.6706 SVC Acc: 0.5970 Forest Acc: 0.6983 LogReg Acc: 0.6738 Tree Acc: 0.6791 MLP Acc: 0.6567 
Fold 8 KNN Acc: 0.6130 LinearSVC Acc: 0.6684 SVC Acc: 0.6215 Forest A

In [20]:
# fill by mean on original
result_mean_original = test_imputation(data.X.copy(), data.y.copy(), data.protected, complete_by_mean_col)

Fold 1 KNN Acc: 0.6002 LinearSVC Acc: 0.6724 SVC Acc: 0.5819 Forest Acc: 0.7345 LogReg Acc: 0.6755 Tree Acc: 0.7314 MLP Acc: 0.6887 
Fold 2 KNN Acc: 0.6277 LinearSVC Acc: 0.6806 SVC Acc: 0.5738 Forest Acc: 0.7172 LogReg Acc: 0.6958 Tree Acc: 0.7060 MLP Acc: 0.6490 
Fold 3 KNN Acc: 0.6256 LinearSVC Acc: 0.6704 SVC Acc: 0.5910 Forest Acc: 0.7050 LogReg Acc: 0.6928 Tree Acc: 0.7019 MLP Acc: 0.6806 
Fold 4 KNN Acc: 0.5835 LinearSVC Acc: 0.6548 SVC Acc: 0.5642 Forest Acc: 0.6935 LogReg Acc: 0.6701 Tree Acc: 0.6711 MLP Acc: 0.5733 
Fold 5 KNN Acc: 0.5784 LinearSVC Acc: 0.6548 SVC Acc: 0.5774 Forest Acc: 0.6874 LogReg Acc: 0.6548 Tree Acc: 0.6752 MLP Acc: 0.6599 
Fold 6 KNN Acc: 0.5815 LinearSVC Acc: 0.6527 SVC Acc: 0.5692 Forest Acc: 0.6894 LogReg Acc: 0.6599 Tree Acc: 0.6874 MLP Acc: 0.6426 
Fold 7 KNN Acc: 0.5530 LinearSVC Acc: 0.6609 SVC Acc: 0.5743 Forest Acc: 0.7098 LogReg Acc: 0.6660 Tree Acc: 0.7037 MLP Acc: 0.6415 
Fold 8 KNN Acc: 0.5998 LinearSVC Acc: 0.6650 SVC Acc: 0.5601 Forest A

In [21]:
# fill by similar on original
result_similar_original = test_imputation(data.X.copy(), data.y.copy(), data.protected, complete_by_similar_row)

Fold 1 KNN Acc: 0.6104 LinearSVC Acc: 0.6755 SVC Acc: 0.6002 Forest Acc: 0.7426 LogReg Acc: 0.6907 Tree Acc: 0.7335 MLP Acc: 0.6918 
Fold 2 KNN Acc: 0.6378 LinearSVC Acc: 0.6826 SVC Acc: 0.6012 Forest Acc: 0.7121 LogReg Acc: 0.6928 Tree Acc: 0.7141 MLP Acc: 0.6867 
Fold 3 KNN Acc: 0.6246 LinearSVC Acc: 0.6714 SVC Acc: 0.6053 Forest Acc: 0.7172 LogReg Acc: 0.6796 Tree Acc: 0.7040 MLP Acc: 0.6297 
Fold 4 KNN Acc: 0.5998 LinearSVC Acc: 0.6538 SVC Acc: 0.5631 Forest Acc: 0.6965 LogReg Acc: 0.6690 Tree Acc: 0.6853 MLP Acc: 0.6477 
Fold 5 KNN Acc: 0.5855 LinearSVC Acc: 0.6507 SVC Acc: 0.5804 Forest Acc: 0.6884 LogReg Acc: 0.6589 Tree Acc: 0.6741 MLP Acc: 0.6293 
Fold 6 KNN Acc: 0.5845 LinearSVC Acc: 0.6517 SVC Acc: 0.5825 Forest Acc: 0.6782 LogReg Acc: 0.6497 Tree Acc: 0.6843 MLP Acc: 0.6426 
Fold 7 KNN Acc: 0.5682 LinearSVC Acc: 0.6599 SVC Acc: 0.5866 Forest Acc: 0.7047 LogReg Acc: 0.6690 Tree Acc: 0.6782 MLP Acc: 0.6762 
Fold 8 KNN Acc: 0.6039 LinearSVC Acc: 0.6670 SVC Acc: 0.5855 Forest A

In [22]:
# fill by multiple imputation on original
result_multi_original = test_imputation(data.X.copy(), data.y.copy(), data.protected, complete_by_multi, multi=True)

Fold 1 KNN Acc: 0.6044 LinearSVC Acc: 0.6764 SVC Acc: 0.5975 Forest Acc: 0.7390 LogReg Acc: 0.6839 Tree Acc: 0.7245 MLP Acc: 0.7031 
Fold 2 KNN Acc: 0.6364 LinearSVC Acc: 0.6802 SVC Acc: 0.5919 Forest Acc: 0.7186 LogReg Acc: 0.6850 Tree Acc: 0.7094 MLP Acc: 0.6843 
Fold 3 KNN Acc: 0.6219 LinearSVC Acc: 0.6721 SVC Acc: 0.6045 Forest Acc: 0.7183 LogReg Acc: 0.6768 Tree Acc: 0.6948 MLP Acc: 0.6443 
Fold 4 KNN Acc: 0.5987 LinearSVC Acc: 0.6539 SVC Acc: 0.5793 Forest Acc: 0.7007 LogReg Acc: 0.6589 Tree Acc: 0.6734 MLP Acc: 0.6580 
Fold 5 KNN Acc: 0.5996 LinearSVC Acc: 0.6567 SVC Acc: 0.5944 Forest Acc: 0.6924 LogReg Acc: 0.6599 Tree Acc: 0.6712 MLP Acc: 0.6587 
Fold 6 KNN Acc: 0.5863 LinearSVC Acc: 0.6525 SVC Acc: 0.5911 Forest Acc: 0.6882 LogReg Acc: 0.6545 Tree Acc: 0.6789 MLP Acc: 0.6458 
Fold 7 KNN Acc: 0.5712 LinearSVC Acc: 0.6644 SVC Acc: 0.5763 Forest Acc: 0.7068 LogReg Acc: 0.6730 Tree Acc: 0.6823 MLP Acc: 0.6698 
Fold 8 KNN Acc: 0.6140 LinearSVC Acc: 0.6679 SVC Acc: 0.5926 Forest A

In [23]:
# generate simulated missing data
data_sim = gen_complete_random(data_compas_complete, random_ratio=0.2)
data_sim.X.isnull().sum(axis=0)

gen_complete_random: 20980 NaN values have been inserted


age                        1735
age_cat                    1736
c_charge_degree            1797
priors_count               1771
juv_misd_count             1734
juv_fel_count              1737
juv_other_count            1750
c_charge_desc              1709
days_b_screening_arrest    1799
sex                        1732
race                       1764
length_of_stay             1716
dtype: int64

In [24]:
# fill by mean on simulated
result_mean_sim = test_imputation(data_sim.X.copy(), data_sim.y.copy(), data_sim.protected, complete_by_mean_col)

Fold 1 KNN Acc: 0.5453 LinearSVC Acc: 0.6794 SVC Acc: 0.5751 Forest Acc: 0.6209 LogReg Acc: 0.6741 Tree Acc: 0.6347 MLP Acc: 0.5538 
Fold 2 KNN Acc: 0.5857 LinearSVC Acc: 0.6635 SVC Acc: 0.5729 Forest Acc: 0.5761 LogReg Acc: 0.6731 Tree Acc: 0.6539 MLP Acc: 0.5687 
Fold 3 KNN Acc: 0.5985 LinearSVC Acc: 0.6613 SVC Acc: 0.5868 Forest Acc: 0.5580 LogReg Acc: 0.6688 Tree Acc: 0.6092 MLP Acc: 0.6741 
Fold 4 KNN Acc: 0.5591 LinearSVC Acc: 0.6496 SVC Acc: 0.5570 Forest Acc: 0.5793 LogReg Acc: 0.6400 Tree Acc: 0.6102 MLP Acc: 0.5857 
Fold 5 KNN Acc: 0.5480 LinearSVC Acc: 0.6354 SVC Acc: 0.5586 Forest Acc: 0.5256 LogReg Acc: 0.6439 Tree Acc: 0.6034 MLP Acc: 0.6610 
Fold 6 KNN Acc: 0.5640 LinearSVC Acc: 0.6482 SVC Acc: 0.5693 Forest Acc: 0.6119 LogReg Acc: 0.6599 Tree Acc: 0.6151 MLP Acc: 0.5949 
Fold 7 KNN Acc: 0.5362 LinearSVC Acc: 0.6599 SVC Acc: 0.5512 Forest Acc: 0.5224 LogReg Acc: 0.6557 Tree Acc: 0.5789 MLP Acc: 0.6461 
Fold 8 KNN Acc: 0.5938 LinearSVC Acc: 0.6695 SVC Acc: 0.5490 Forest A

In [25]:
# fill by similar on simulated
result_similar_sim = test_imputation(data_sim.X.copy(), data_sim.y.copy(), data_sim.protected, complete_by_similar_row)

Fold 1 KNN Acc: 0.5911 LinearSVC Acc: 0.6741 SVC Acc: 0.6049 Forest Acc: 0.7007 LogReg Acc: 0.6709 Tree Acc: 0.6688 MLP Acc: 0.6613 
Fold 2 KNN Acc: 0.6006 LinearSVC Acc: 0.6528 SVC Acc: 0.5666 Forest Acc: 0.6773 LogReg Acc: 0.6624 Tree Acc: 0.6454 MLP Acc: 0.6358 
Fold 3 KNN Acc: 0.6113 LinearSVC Acc: 0.6496 SVC Acc: 0.5879 Forest Acc: 0.6784 LogReg Acc: 0.6731 Tree Acc: 0.6379 MLP Acc: 0.6358 
Fold 4 KNN Acc: 0.5772 LinearSVC Acc: 0.6305 SVC Acc: 0.5431 Forest Acc: 0.6187 LogReg Acc: 0.6432 Tree Acc: 0.5804 MLP Acc: 0.5921 
Fold 5 KNN Acc: 0.5906 LinearSVC Acc: 0.6343 SVC Acc: 0.5629 Forest Acc: 0.6397 LogReg Acc: 0.6450 Tree Acc: 0.6439 MLP Acc: 0.6333 
Fold 6 KNN Acc: 0.5821 LinearSVC Acc: 0.6610 SVC Acc: 0.5949 Forest Acc: 0.6663 LogReg Acc: 0.6652 Tree Acc: 0.6631 MLP Acc: 0.6183 
Fold 7 KNN Acc: 0.5512 LinearSVC Acc: 0.6599 SVC Acc: 0.5554 Forest Acc: 0.6514 LogReg Acc: 0.6663 Tree Acc: 0.6226 MLP Acc: 0.6002 
Fold 8 KNN Acc: 0.5991 LinearSVC Acc: 0.6567 SVC Acc: 0.5917 Forest A

In [26]:
# fill by multiple imputation on simulated
result_multi_sim = test_imputation(data_sim.X.copy(), data_sim.y.copy(), data_sim.protected, complete_by_multi, multi=True)

Fold 1 KNN Acc: 0.5862 LinearSVC Acc: 0.6587 SVC Acc: 0.5842 Forest Acc: 0.6814 LogReg Acc: 0.6625 Tree Acc: 0.6574 MLP Acc: 0.6233 
Fold 2 KNN Acc: 0.6082 LinearSVC Acc: 0.6482 SVC Acc: 0.5752 Forest Acc: 0.6604 LogReg Acc: 0.6561 Tree Acc: 0.6363 MLP Acc: 0.6192 
Fold 3 KNN Acc: 0.5916 LinearSVC Acc: 0.6497 SVC Acc: 0.5826 Forest Acc: 0.6707 LogReg Acc: 0.6584 Tree Acc: 0.6415 MLP Acc: 0.6215 
Fold 4 KNN Acc: 0.5729 LinearSVC Acc: 0.6357 SVC Acc: 0.5636 Forest Acc: 0.6264 LogReg Acc: 0.6388 Tree Acc: 0.6169 MLP Acc: 0.6204 
Fold 5 KNN Acc: 0.5677 LinearSVC Acc: 0.6226 SVC Acc: 0.5496 Forest Acc: 0.6423 LogReg Acc: 0.6285 Tree Acc: 0.6253 MLP Acc: 0.6229 
Fold 6 KNN Acc: 0.5863 LinearSVC Acc: 0.6365 SVC Acc: 0.5871 Forest Acc: 0.6540 LogReg Acc: 0.6424 Tree Acc: 0.6374 MLP Acc: 0.6235 
Fold 7 KNN Acc: 0.5383 LinearSVC Acc: 0.6345 SVC Acc: 0.5471 Forest Acc: 0.6299 LogReg Acc: 0.6396 Tree Acc: 0.6088 MLP Acc: 0.5954 
Fold 8 KNN Acc: 0.5960 LinearSVC Acc: 0.6399 SVC Acc: 0.5720 Forest A

In [28]:
import json

json_data = {
    "Original": result_original,
    "Fill by Mean (Original)": result_mean_original,
    "Fill by Similar (Original)": result_similar_original,
    "Fill by Multiple Imputation (Original)": result_multi_original,
    "Fill by Mean (Simulated)": result_mean_sim,
    "Fill by Similar (Simulated)": result_similar_sim,
    "Fill by Multiple Imputation (Simulated)": result_multi_sim
}
with open("raw_confusion_matrix.json", "w") as outFile:
    json.dump(json_data, outFile)

In [29]:
def average_cv_mat(cv_data):
    result = []
    result.append(np.array([m["African-American"] for m in cv_data["KNN"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["KNN"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["LinearSVC"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["LinearSVC"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["SVC"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["SVC"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["Forest"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["Forest"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["LogReg"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["LogReg"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["Tree"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["Tree"]]).mean(axis=0).tolist())
    result.append(np.array([m["African-American"] for m in cv_data["MLP"]]).mean(axis=0).tolist())
    result.append(np.array([m["Caucasian"] for m in cv_data["MLP"]]).mean(axis=0).tolist())
    result = np.array(result).ravel().tolist()
    return result

csv_data = []
csv_data.append(average_cv_mat(result_original))
csv_data.append(average_cv_mat(result_mean_original))
csv_data.append(average_cv_mat(result_similar_original))
csv_data.append(average_cv_mat(result_multi_original))
csv_data.append(average_cv_mat(result_mean_sim))
csv_data.append(average_cv_mat(result_similar_sim))
csv_data.append(average_cv_mat(result_multi_sim))

columns = [x + " " + y for y in 
           ["(KNN)", "(LinearSVC)", "(SVC)", "(Forest)", "(Log Reg)", "(Tree)", "(MLP)"] for x in
           ["TN_AA", "FP_AA", "FN_AA", "TP_AA", "TN_C", "FP_C", "FN_C", "TP_C"]]
df = pd.DataFrame(csv_data, columns=columns,
                  index=["Original (No Imputation)",
                         "Fill by Mean (Original)",
                         "Fill by Similar (Original)",
                         "Fill by Multiple Imputation (Original)",
                         "Fill by Mean (Simulated)",
                         "Fill by Similar (Simulated)",
                         "Fill by Multiple Imputation (Simulated)"])
df

Unnamed: 0,TN_AA (KNN),FP_AA (KNN),FN_AA (KNN),TP_AA (KNN),TN_C (KNN),FP_C (KNN),FN_C (KNN),TP_C (KNN),TN_AA (LinearSVC),FP_AA (LinearSVC),...,FN_C (Tree),TP_C (Tree),TN_AA (MLP),FP_AA (MLP),FN_AA (MLP),TP_AA (MLP),TN_C (MLP),FP_C (MLP),FN_C (MLP),TP_C (MLP)
Original (No Imputation),177.2,100.7,99.7,89.4,164.9,67.2,56.4,36.6,160.5,117.4,...,58.9,34.1,153.5,124.4,49.6,139.5,169.5,62.6,43.7,49.3
Fill by Mean (Original),186.2,112.5,100.5,91.9,170.3,72.3,56.6,36.9,171.3,127.4,...,61.0,32.5,162.3,136.4,48.5,143.9,179.5,63.1,45.5,48.0
Fill by Similar (Original),192.0,106.7,103.2,89.2,172.4,70.2,56.9,36.6,172.6,126.1,...,63.2,30.3,169.7,129.0,54.0,138.4,181.2,61.4,44.9,48.6
Fill by Multiple Imputation (Original),193.3,105.4,101.6,90.8,172.4,70.2,56.9,36.6,174.5,124.2,...,61.6,31.9,168.1,130.6,52.0,140.4,180.2,62.4,45.5,48.0
Fill by Mean (Simulated),135.7,89.2,81.8,72.9,125.9,61.6,45.7,29.2,133.4,91.5,...,40.6,34.3,113.6,111.3,43.0,111.7,125.3,62.2,34.8,40.1
Fill by Similar (Simulated),143.4,81.5,85.1,69.6,131.2,56.3,46.6,28.3,131.0,93.9,...,45.6,29.3,112.7,112.2,38.9,115.8,125.3,62.2,33.7,41.2
Fill by Multiple Imputation (Simulated),139.8,85.1,88.4,66.3,127.6,59.9,45.9,29.0,131.5,93.4,...,43.0,31.9,120.9,104.0,44.3,110.4,127.0,60.5,34.1,40.8


In [30]:
df.to_csv("compas_analysis.csv")