## Study Compas Dataset  
[Link](https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb)

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
DATASET_PATH = os.path.join("dataset", "compas", "compas-scores.csv")

In [3]:
data = pd.read_csv(DATASET_PATH)
data.head(5)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,vr_offense_date,vr_charge_desc,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,type_of_assessment,decile_score.1,score_text,screening_date
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,,,Risk of Violence,1,Low,2013-08-14,Risk of Recidivism,1,Low,2013-08-14
1,2,michael ryan,michael,ryan,2014-12-31,Male,1985-02-06,31,25 - 45,Caucasian,...,,,Risk of Violence,2,Low,2014-12-31,Risk of Recidivism,5,Medium,2014-12-31
2,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,2013-07-05,Felony Battery (Dom Strang),Risk of Violence,1,Low,2013-01-27,Risk of Recidivism,3,Low,2013-01-27
3,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,,,Risk of Violence,3,Low,2013-04-14,Risk of Recidivism,4,Low,2013-04-14
4,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,,,Risk of Violence,6,Medium,2013-01-13,Risk of Recidivism,8,High,2013-01-13


In [4]:
data.shape

(11757, 47)

In [5]:
data.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'num_r_cases',
       'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date'],
      dtype='object')

In [6]:
def check_data(df, columns):
    for col in columns:
        print(col + ":")
        if not col in df.columns:
            print("Nothing found")
        else:
            print(df[col].unique())
        print()

In [7]:
check_data(data, ['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 
                    'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']
          )

age:
[69 31 34 24 23 43 44 41 39 20 26 21 27 37 22 47 25 64 42 32 54 49 36 33
 30 63 55 29 38 53 51 28 35 48 46 62 56 45 40 59 50 66 52 71 19 61 58 78
 57 68 18 70 60 65 83 67 75 73 72 77 76 74 96 80 79 86]

c_charge_degree:
['F' 'O' 'M']

race:
['Other' 'Caucasian' 'African-American' 'Hispanic' 'Asian'
 'Native American']

age_cat:
['Greater than 45' '25 - 45' 'Less than 25']

score_text:
['Low' 'Medium' 'High' nan]

sex:
['Male' 'Female']

priors_count:
[ 0  4  1  2 14  3  7  6  5 13  8  9 15 19 21 22 20 10 12 28 11 23 25 24
 36 18 29 16 17 33 30 43 27 26 38 37 35 31 39]

days_b_screening_arrest:
[-1.000e+00        nan  0.000e+00  4.280e+02 -2.000e+01  2.200e+01
 -2.000e+00 -1.320e+02  5.300e+01  8.100e+01 -5.900e+01 -2.400e+01
  3.020e+02 -2.190e+02 -1.300e+01 -3.800e+01  7.000e+01 -3.250e+02
 -1.500e+01 -7.800e+01 -4.300e+01 -3.890e+02 -3.180e+02 -5.000e+00
  1.370e+02 -1.810e+02 -2.390e+02 -2.100e+01 -6.400e+01 -1.000e+01
 -1.370e+02  1.780e+02 -3.000e+01 -3.000e+00  6.700e+01  1.

In [8]:
# select data and preprocess
data_processed = data[data['days_b_screening_arrest'] <= 30]
data_processed = data[data['days_b_screening_arrest'] >= -30] # select within 30 days
data_processed.shape

(9778, 47)

In [9]:
data_processed = data_processed[data_processed['is_recid'] != -1]
data_processed = data_processed[data_processed['c_charge_degree'] != 'O']
data_processed.dropna(subset=["score_text"], inplace=True)
data_processed.shape

(9760, 47)

In [10]:
data_processed = data_processed.reset_index(drop=True)

In [11]:
data_train = data_processed[["age_cat", "c_charge_degree", "race", "sex", "priors_count"]].copy()
data_compare = data_processed[["score_text", "decile_score"]].copy()
data_target = data_processed[["is_recid"]].copy()

In [12]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9760 entries, 0 to 9759
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age_cat          9760 non-null   object
 1   c_charge_degree  9760 non-null   object
 2   race             9760 non-null   object
 3   sex              9760 non-null   object
 4   priors_count     9760 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 381.4+ KB


In [13]:
data_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9760 entries, 0 to 9759
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   is_recid  9760 non-null   int64
dtypes: int64(1)
memory usage: 76.4 KB


In [14]:
data_target["is_recid"].value_counts()

0    6360
1    3400
Name: is_recid, dtype: int64

In [15]:
data_train["age_cat"].value_counts()

25 - 45            5578
Less than 25       2091
Greater than 45    2091
Name: age_cat, dtype: int64

In [16]:
data_train["c_charge_degree"].value_counts()

F    6365
M    3395
Name: c_charge_degree, dtype: int64

In [17]:
data_train["race"].value_counts()

African-American    4879
Caucasian           3367
Hispanic             855
Other                581
Asian                 48
Native American       30
Name: race, dtype: int64

In [18]:
data_train["sex"].value_counts()

Male      7763
Female    1997
Name: sex, dtype: int64

In [19]:
data_train["priors_count"].value_counts()

0     3395
1     1819
2     1074
3      718
4      497
5      404
6      305
7      275
8      222
9      175
10     137
11     113
13      96
12      83
14      70
15      55
16      53
17      44
19      35
18      33
20      25
21      23
23      20
22      19
24      15
25      11
27      11
26       9
28       6
29       5
33       4
31       2
30       2
35       1
36       1
37       1
38       1
43       1
Name: priors_count, dtype: int64

In [20]:
data_compare.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9760 entries, 0 to 9759
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   score_text    9760 non-null   object
 1   decile_score  9760 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 152.6+ KB


In [21]:
data_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9760 entries, 0 to 9759
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   is_recid  9760 non-null   int64
dtypes: int64(1)
memory usage: 76.4 KB


In [22]:
# preprocessing
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in ['age_cat', 'c_charge_degree', 'race', 'sex']:
    encoder = LabelEncoder()
    data_train[col] = encoder.fit_transform(data_train[col])
    encoders[col] = encoder
data_train.head(5)

Unnamed: 0,age_cat,c_charge_degree,race,sex,priors_count
0,1,0,5,1,0
1,0,0,0,1,0
2,2,0,0,1,4
3,0,1,5,1,0
4,0,0,2,1,14


In [23]:
data_train.describe()

Unnamed: 0,age_cat,c_charge_degree,race,sex,priors_count
count,9760.0,9760.0,9760.0,9760.0,9760.0
mean,0.642725,0.347848,1.267623,0.795389,3.128586
std,0.811283,0.476312,1.450518,0.403437,4.670708
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,1.0,1.0,1.0
75%,1.0,1.0,2.0,1.0,4.0
max,2.0,1.0,5.0,1.0,43.0


## Build Model  
The goal here is to build several models to predict the possibility of a person's is_recid in future  
and compare it with the decile_score and score_text output  

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [25]:
result = cross_validate(estimator=KNeighborsClassifier(),
                        X=data_train.to_numpy(), y=data_target.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_neighbor = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_neighbor['avg_score'])

0.6420081967213115


In [26]:
result = cross_validate(estimator=LinearSVC(dual=False),
                        X=data_train.to_numpy(), y=data_target.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_linearSVC = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_linearSVC['avg_score'])

0.6762295081967213


In [27]:
result = cross_validate(estimator=SGDClassifier(),
                        X=data_train.to_numpy(), y=data_target.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_SGD = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_SGD['avg_score'])

0.6648565573770491


In [28]:
result = cross_validate(estimator=LogisticRegression(),
                        X=data_train.to_numpy(), y=data_target.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_logreg = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_logreg['avg_score'])

0.6774590163934426


In [29]:
result = cross_validate(estimator=RandomForestClassifier(),
                        X=data_train.to_numpy(), y=data_target.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_forest = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_forest['avg_score'])

0.6700819672131149


## Improvement

In [30]:
# select balanced rows
idx0 = data_target[data_target["is_recid"] == 0].copy().index
idx1 = data_target[data_target["is_recid"] == 1].copy().index
idx0.shape, idx1.shape

((6360,), (3400,))

In [31]:
rnd_idx0 = idx0.to_numpy()[np.random.permutation(idx0.shape[0])][:idx1.shape[0]]
final_index = rnd_idx0.tolist() + idx1.tolist()
len(final_index)

6800

In [32]:
data_target_selected = data_target.iloc[final_index].copy()
data_target_selected.shape

(6800, 1)

In [33]:
data_target_selected[data_target_selected["is_recid"] == 0].shape

(3400, 1)

In [34]:
data_target_selected[data_target_selected["is_recid"] == 1].shape

(3400, 1)

In [35]:
data_train_selected = data_train.iloc[final_index].copy()

In [36]:
result = cross_validate(estimator=KNeighborsClassifier(),
                        X=data_train_selected.to_numpy(), y=data_target_selected.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_neighbor_selected = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_neighbor_selected['avg_score'])

0.6075000000000002


In [37]:
result = cross_validate(estimator=LinearSVC(dual=False),
                        X=data_train_selected.to_numpy(), y=data_target_selected.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_linearSVC_selected = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_linearSVC_selected['avg_score'])

0.6426470588235293


In [38]:
result = cross_validate(estimator=SGDClassifier(),
                        X=data_train_selected.to_numpy(), y=data_target_selected.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_SGD_selected = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_SGD_selected['avg_score'])

0.5833823529411765


In [39]:
result = cross_validate(estimator=LogisticRegression(),
                        X=data_train_selected.to_numpy(), y=data_target_selected.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_logreg_selected = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_logreg_selected['avg_score'])

0.6447058823529412


In [40]:
result = cross_validate(estimator=RandomForestClassifier(),
                        X=data_train_selected.to_numpy(), y=data_target_selected.to_numpy().ravel(), cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_forest_selected = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_forest_selected['avg_score'])

0.6430882352941177


In [41]:
# now try SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE()
data_train_res, data_target_res = sm.fit_resample(data_train.to_numpy(), data_target.to_numpy().ravel())
data_train_res.shape, data_target_res.shape

((12720, 5), (12720,))

In [42]:
data_train.shape

(9760, 5)

In [45]:
result = cross_validate(estimator=KNeighborsClassifier(),
                        X=data_train_res, y=data_target_res, cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_neighbor_res = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_neighbor_res['avg_score'])

0.6330188679245283


In [46]:
result = cross_validate(estimator=LinearSVC(dual=False),
                        X=data_train_res, y=data_target_res, cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_linearSVC_res = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_linearSVC_res['avg_score'])

0.6405660377358491


In [47]:
result = cross_validate(estimator=SGDClassifier(),
                        X=data_train_res, y=data_target_res, cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_SGD_res = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_SGD_res['avg_score'])

0.6047955974842767


In [48]:
result = cross_validate(estimator=LogisticRegression(),
                        X=data_train_res, y=data_target_res, cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_logreg_res = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_logreg_res['avg_score'])

0.6434748427672956


In [49]:
result = cross_validate(estimator=RandomForestClassifier(),
                        X=data_train_res, y=data_target_res, cv=5,
                        return_estimator=True)
result_best_ids = np.argmax(result["test_score"])
model_forest_res = {
    'avg_score': result['test_score'].mean(),
    'best_estimator': result['estimator'][result_best_ids]
}
print(model_forest_res['avg_score'])

0.6607704402515724


In [54]:
def test_smote(smote_obj, data_train, data_target):
    data_train_res, data_target_res = smote_obj.fit_resample(data_train, data_target)
    print("KNN: {}".format(cross_validate(estimator=KNeighborsClassifier(), X=data_train_res, y=data_target_res, cv=5)["test_score"].mean()))
    print("SVC: {}".format(cross_validate(estimator=LinearSVC(), X=data_train_res, y=data_target_res, cv=5)["test_score"].mean()))
    print("SGD: {}".format(cross_validate(estimator=SGDClassifier(), X=data_train_res, y=data_target_res, cv=5)["test_score"].mean()))
    print("LogReg: {}".format(cross_validate(estimator=LogisticRegression(), X=data_train_res, y=data_target_res, cv=5)["test_score"].mean()))
    print("Forest: {}".format(cross_validate(estimator=RandomForestClassifier(), X=data_train_res, y=data_target_res, cv=5)["test_score"].mean()))

In [61]:
from imblearn.over_sampling import BorderlineSMOTE, KMeansSMOTE, SVMSMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore') # ignore sklearn warnings

In [57]:
test_smote(BorderlineSMOTE(), data_train.to_numpy(), data_target.to_numpy())

KNN: 0.5944182389937106
SVC: 0.6043238993710691
SGD: 0.5839622641509433
LogReg: 0.6060534591194968
Forest: 0.6187106918238994


In [58]:
test_smote(KMeansSMOTE(), data_train.to_numpy(), data_target.to_numpy())

KNN: 0.7240876487378137
SVC: 0.7403625001544526
SGD: 0.7490067464877488
LogReg: 0.7423276000543673
Forest: 0.7324995366423249


In [60]:
test_smote(SVMSMOTE(), data_train.to_numpy(), data_target.to_numpy())

KNN: 0.625314465408805
SVC: 0.6492138364779874
SGD: 0.6496069182389939
LogReg: 0.6493710691823898
Forest: 0.6676100628930818


In [62]:
test_smote(ADASYN(), data_train.to_numpy(), data_target.to_numpy())

KNN: 0.5843039891171243
SVC: 0.6196459074860272
SGD: 0.5792122899304811
LogReg: 0.621575098512896
Forest: 0.6336908446140724
