In [1]:
import numpy as np
import pandas as pd

import shap
import pickle

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import Pipeline

In [2]:
TARGET_COL = "Two_yr_Recidivism"
df = pd.read_csv('data/propublica_data_for_fairml.csv')

In [3]:
print(df.shape)
display(df.columns)
df.head()

(6172, 12)


Index(['Two_yr_Recidivism', 'Number_of_Priors', 'score_factor',
       'Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'African_American',
       'Asian', 'Hispanic', 'Native_American', 'Other', 'Female',
       'Misdemeanor'],
      dtype='object')

Unnamed: 0,Two_yr_Recidivism,Number_of_Priors,score_factor,Age_Above_FourtyFive,Age_Below_TwentyFive,African_American,Asian,Hispanic,Native_American,Other,Female,Misdemeanor
0,0,0,0,1,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0
2,1,4,0,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,1
4,1,14,1,0,0,0,0,0,0,0,0,0


In [5]:
y = df[TARGET_COL]
x = df.drop(columns=[TARGET_COL])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, stratify=y, random_state=42)

x_train_lgbm, x_val_lgbm, y_train_lgbm, y_val_lgbm = train_test_split(x_train, y_train, test_size=0.25, shuffle=True, stratify=y_train, random_state=42)

In [6]:
names = [
    "nearest_neighbors",
    "linear_svm",
    "gaussian_process",
    "decision_tree",
    "random_forest",
    "mlp",
    "adaboost",
]

In [12]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
]

In [13]:
#pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
scores = {}
for name, model in zip(names, classifiers):
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    scores[name] = score
    # save the model to disk
    filename = f'models/{name}.pk'
    pickle.dump(model, open(filename, 'wb'))
    



In [14]:
print(scores)

{'nearest_neighbors': 0.6338302009073234, 'linear_svm': 0.659753726506805, 'gaussian_process': 0.6902138690861958, 'decision_tree': 0.661049902786779, 'random_forest': 0.6701231367465975, 'mlp': 0.6908619572261827, 'adaboost': 0.6804925469863902}


In [19]:
x_test.iloc[0]

Number_of_Priors        3
score_factor            0
Age_Above_FourtyFive    0
Age_Below_TwentyFive    0
African_American        1
Asian                   0
Hispanic                0
Native_American         0
Other                   0
Female                  0
Misdemeanor             0
Name: 5352, dtype: int64

In [22]:
x_test.iloc[0].to_numpy()

array([3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [23]:
classifiers[3].predict(x_test.iloc[0].to_numpy().reshape(1, -1))

X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names


array([1])

In [24]:
y_test.iloc[0]

1

In [32]:
x_test.to_csv('data/x_test.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

In [33]:
aaa = pd.read_csv('data/x_test.csv')
bbb = pd.read_csv('data/y_test.csv')

In [36]:
aaa.head()

Unnamed: 0,Number_of_Priors,score_factor,Age_Above_FourtyFive,Age_Below_TwentyFive,African_American,Asian,Hispanic,Native_American,Other,Female,Misdemeanor
0,3,0,0,0,1,0,0,0,0,0,0
1,3,1,0,1,1,0,0,0,0,0,0
2,8,1,0,0,1,0,0,0,0,0,0
3,2,0,1,0,0,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,0,1


In [37]:
bbb.head()

Unnamed: 0,Two_yr_Recidivism
0,1
1,1
2,1
3,0
4,0
