# Multinomial regression

## Initial classification

**Notebook Setup**

In [1]:
import sys
sys.path.append("../src")
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [3]:
plt.style.use('ggplot')

In [4]:
%matplotlib inline

from project import rf_models, preprocessing

df = pd.read_csv("../data/initial_custody_2017_gsprs.csv", low_memory=False)

#df['high_ic_instit_adj'] = np.where(df['ic_institut_adj']>2, 1, 0)
#df['ic_override_up'] = np.where(df['ic_ovride_cust_lvl']-df['ic_custdy_level']>0,1,0)
#df['high_ic'] = np.where(df['ic_custdy_level']>3, 1, 0)


# preprocessing
data = preprocessing.preprocess_input_vars(df)
data = data[data.ic_custdy_level > 1]
data = data[
    [
        "gender_female",
        "age_gt_45",
        "age_lt_25",
        "race_B",
        "race_A",
        "race_H",
        "race_I",
        "race_O",
        "off_1_prs_max",
        "off_1_gs_max",
        "ic_custdy_level",
        "prior_commits",
        "ic_institut_adj",
        "escape_hist_1",
        #"escape_hist_2",
        #"escape_hist_3",
        #"escape_hist_4",
        "escape_hist_5",
        "mrt_stat_DIV",
        "mrt_stat_SEP",
        "mrt_stat_MAR",
        "mrt_stat_WID",
        "employed",
        #"high_ic_instit_adj",
        #"high_ic"
    ]
]
data = data.dropna()

df_ic_all = data

#### Multinomial

In [5]:
X = df_ic_all.drop("ic_custdy_level", axis=1)
y = df_ic_all["ic_custdy_level"]
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.2, random_state=1)
model = LogisticRegression(multi_class="multinomial", max_iter=10_000)
model.fit(X_train, Y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X_test, Y_test, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Mean Accuracy: 0.772 (0.011)


In [7]:
model.coef_

array([[ 9.45366089e-01,  1.32697981e+00, -1.35227195e+00,
        -7.61379257e-01, -1.48722442e-01, -1.99960007e-01,
        -9.41225456e-02, -1.88991453e-01, -4.90122256e-01,
        -5.73972280e-01, -2.18661719e-01, -1.14357995e+00,
         1.36935526e+00, -6.74878609e-01,  1.96726545e+00,
         1.89798759e+00,  1.90235272e+00,  1.44611041e+00,
         8.06675193e-01],
       [-1.19255729e-01,  2.56929169e-01, -2.31747133e-01,
         1.62365286e-01,  4.86575829e-01,  1.80671958e-01,
        -1.49862137e-01,  2.36247082e-02, -1.52595785e-01,
        -3.70023558e-01, -2.58003711e-02, -5.15920908e-01,
         5.24294545e-01, -9.94631180e-01,  3.10534589e-01,
         3.60910689e-01,  5.61153855e-02, -2.90173546e-01,
         2.71477560e-01],
       [-6.10461031e-01, -1.01932573e+00,  6.88172594e-01,
         2.52715464e-01, -3.25992644e-01,  1.00374904e-01,
         2.45080684e-01,  1.67520739e-01,  2.25145230e-01,
        -1.03636713e-01,  5.91245261e-02,  7.16175334e-01,
    

In [38]:
np.exp(model.coef_)

array([[2.55032921, 3.78123441, 0.25807213, 0.46751328, 0.86119173,
        0.82094573, 0.90848852, 0.83186223, 0.61212444, 0.56326397,
        0.80342801, 0.31890363, 3.92946757, 0.49630633, 7.12750585,
        6.63981256, 6.68216363, 4.24406331, 2.23795295],
       [0.88186853, 1.29866519, 0.79240551, 1.17683539, 1.63041391,
        1.20061604, 0.86189373, 1.02083758, 0.85773488, 0.6908533 ,
        0.97440912, 0.59729353, 1.68700526, 0.37247144, 1.36004621,
        1.42938388, 1.05635068, 0.74616376, 1.31056831],
       [0.53874968, 0.36202239, 1.98992413, 1.28848841, 0.72046968,
        1.10860623, 1.27899284, 1.17932683, 1.25215346, 0.90192462,
        1.06094581, 2.04861795, 0.85849543, 6.23459349, 0.23930786,
        0.21119664, 0.25565838, 0.36345578, 0.59628303],
       [0.82530198, 0.56251456, 2.45739457, 1.41061925, 0.98852252,
        0.91517489, 0.99852449, 0.99852229, 1.52107157, 2.84925912,
        1.20397784, 2.56266454, 0.17571623, 0.86765899, 0.43107397,
        0.498

### Feature selection

In [39]:
from sklearn.feature_selection import RFE

data_final_vars=df_ic_all.columns.values.tolist()
yvars = ['ic_custdy_level']
Xvars = [i for i in data_final_vars if i not in yvars]

rfe = RFE(model, n_features_to_select=4, step=1)
rfe = rfe.fit(X, y.values.ravel())

zz= list(zip(Xvars,list(rfe.support_)))
ll = [a for (a,b) in zz if b]
ll

['age_lt_25', 'escape_hist_5', 'mrt_stat_DIV', 'mrt_stat_SEP']

#### Class imbalance

In [40]:
count_2 = len(df_ic_all[df_ic_all['ic_custdy_level']==2])
count_3 = len(df_ic_all[df_ic_all['ic_custdy_level']==3])
count_4 = len(df_ic_all[df_ic_all['ic_custdy_level']==4])
count_5 = len(df_ic_all[df_ic_all['ic_custdy_level']==5])

tot = count_2 + count_3 + count_4 + count_5

pct_2 = count_2/tot
print("percentage of lev 1 is", pct_2*100)
pct_3 = count_3/tot
print("percentage of lev 1 is", pct_3*100)
pct_4 = count_4/tot
print("percentage of lev 1 is", pct_4*100)
pct_5 = count_5/tot
print("percentage of lev 1 is", pct_5*100)


pct_2 +pct_3 + pct_4 +pct_5 
print(count_2,count_3,count_4,count_5,count_2+count_3+count_4+count_5)

percentage of lev 1 is 9.583785740137532
percentage of lev 1 is 34.31777053926891
percentage of lev 1 is 55.72927976836771
percentage of lev 1 is 0.3691639522258415
1324 4741 7699 51 13815


#### Oversampling to fix

In [8]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)

X, y = os.fit_resample(X, y)

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.2, random_state=1)
model = LogisticRegression(multi_class="multinomial", max_iter=10_000)
model.fit(X_train, Y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [43]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X_test, Y_test, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Mean Accuracy: 0.678 (0.009)


In [44]:
np.exp(model.coef_)

array([[ 2.13066577,  4.40448155,  0.10124403,  0.291185  ,  0.41872623,
         0.68737375,  0.61844444,  0.61796751,  0.6853197 ,  0.75550368,
         0.86554388,  0.26505451,  2.93421788,  0.17593156,  7.48962171,
         5.15211257,  9.58336416,  1.54281008,  4.19074125],
       [ 1.5645998 ,  1.60363264,  1.05123428,  1.2234713 ,  0.89188043,
         1.457986  ,  1.05187855,  1.08091995,  0.89806976,  0.88113592,
         1.03184466,  0.59807166,  2.11002938,  1.21107011,  2.55265688,
         2.61108532,  2.52238615,  0.96694274,  2.45712923],
       [ 1.88186724,  0.58781152,  3.05249295,  2.78889814,  3.92800236,
         2.38785672,  2.02616705,  2.44192558,  1.29234711,  1.11049595,
         1.10098153,  2.19965653,  1.21627785, 13.39777959,  0.73220734,
         0.75602257,  0.85175168,  1.95662238,  1.42831084],
       [ 0.15940151,  0.24085864,  3.07805523,  1.00647864,  0.68169711,
         0.41787409,  0.7586797 ,  0.61306765,  1.2572381 ,  1.35270636,
         1.016

In [45]:
model.classes_

array([2., 3., 4., 5.])

In [46]:
# For CL 2
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[0])
list(zip(features, coeffs))

[('gender_female', 2.1306657677712715),
 ('age_gt_45', 4.404481549784025),
 ('age_lt_25', 0.10124402608437318),
 ('race_B', 0.2911850000018083),
 ('race_A', 0.41872623090653205),
 ('race_H', 0.6873737507628503),
 ('race_I', 0.6184444399949701),
 ('race_O', 0.6179675104573243),
 ('off_1_prs_max', 0.685319702406421),
 ('off_1_gs_max', 0.7555036810814972),
 ('prior_commits', 0.8655438829101658),
 ('ic_institut_adj', 0.26505450575176054),
 ('escape_hist_1', 2.934217882716126),
 ('escape_hist_5', 0.17593156068491284),
 ('mrt_stat_DIV', 7.489621711936725),
 ('mrt_stat_SEP', 5.152112572979169),
 ('mrt_stat_MAR', 9.583364157799284),
 ('mrt_stat_WID', 1.5428100823077298),
 ('employed', 4.190741247082359)]

In [47]:
# For CL 3
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[1])
list(zip(features, coeffs))

[('gender_female', 1.5645998006924493),
 ('age_gt_45', 1.6036326401257435),
 ('age_lt_25', 1.051234279884506),
 ('race_B', 1.2234712996365984),
 ('race_A', 0.8918804347840003),
 ('race_H', 1.457985996135163),
 ('race_I', 1.0518785483572728),
 ('race_O', 1.0809199485372003),
 ('off_1_prs_max', 0.898069761772171),
 ('off_1_gs_max', 0.8811359158494946),
 ('prior_commits', 1.031844661994937),
 ('ic_institut_adj', 0.5980716642989681),
 ('escape_hist_1', 2.1100293767681446),
 ('escape_hist_5', 1.2110701051971025),
 ('mrt_stat_DIV', 2.5526568778154797),
 ('mrt_stat_SEP', 2.6110853175241417),
 ('mrt_stat_MAR', 2.5223861502919322),
 ('mrt_stat_WID', 0.9669427434631649),
 ('employed', 2.4571292280294657)]

In [48]:
# For CL 4
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[2])
list(zip(features, coeffs))

[('gender_female', 1.8818672401169523),
 ('age_gt_45', 0.5878115150094636),
 ('age_lt_25', 3.0524929547513513),
 ('race_B', 2.7888981379734674),
 ('race_A', 3.928002362413417),
 ('race_H', 2.387856722204119),
 ('race_I', 2.026167046202765),
 ('race_O', 2.4419255755611013),
 ('off_1_prs_max', 1.2923471116617713),
 ('off_1_gs_max', 1.110495950300301),
 ('prior_commits', 1.100981525039836),
 ('ic_institut_adj', 2.1996565289146686),
 ('escape_hist_1', 1.2162778513157915),
 ('escape_hist_5', 13.397779587496688),
 ('mrt_stat_DIV', 0.7322073350101205),
 ('mrt_stat_SEP', 0.7560225660448033),
 ('mrt_stat_MAR', 0.8517516801329712),
 ('mrt_stat_WID', 1.9566223761447437),
 ('employed', 1.4283108428081206)]

In [49]:
# For CL 5
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[3])
list(zip(features, coeffs))

[('gender_female', 0.15940150555143648),
 ('age_gt_45', 0.24085863910972877),
 ('age_lt_25', 3.0780552306648845),
 ('race_B', 1.0064786399181076),
 ('race_A', 0.6816971131898661),
 ('race_H', 0.41787408587951747),
 ('race_I', 0.7586797013146075),
 ('race_O', 0.6130676525028547),
 ('off_1_prs_max', 1.2572380953020987),
 ('off_1_gs_max', 1.3527063612331396),
 ('prior_commits', 1.0169897022775576),
 ('ic_institut_adj', 2.867851833246515),
 ('escape_hist_1', 0.13279641179705792),
 ('escape_hist_5', 0.35031128827025243),
 ('mrt_stat_DIV', 0.07143541961856172),
 ('mrt_stat_SEP', 0.09832384079125706),
 ('mrt_stat_MAR', 0.04856880708292741),
 ('mrt_stat_WID', 0.34259400360706777),
 ('employed', 0.06799209489002236)]

## Feature selection

In [50]:
rfe = RFE(model, n_features_to_select=4, step=1)
rfe = rfe.fit(X, y.values.ravel())


zz= list(zip(Xvars,list(rfe.support_)))
ll = [a for (a,b) in zz if b]
ll

['escape_hist_5', 'mrt_stat_DIV', 'mrt_stat_SEP', 'mrt_stat_MAR']

## SVM

In [18]:
#We ommitted escape histories 2, 3 and 4 because they were highly correlated with escape history 1
#and because we want to know if they're frequent escapees or not, and not necessarily how many times

from sklearn.metrics import accuracy_score





#applying Support Vector Classifier 
#fitting kernel SVM to training dataset
from sklearn.svm import SVC
classifier_df = SVC(kernel = 'linear' , random_state = 0)
classifier_df.fit(X_train,Y_train)

#predicting test data result
y_pred = classifier_df.predict(X_test)


#setting up accuracy score

acc = accuracy_score(Y_test,y_pred) *100
print("Accuracy for our dataset in predicting test data is : {:.2f}%".format(acc))


from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))


Accuracy for our dataset in predicting test data is : 66.81%
              precision    recall  f1-score   support

         2.0       0.75      0.76      0.76      6187
         3.0       0.56      0.62      0.59      6157
         4.0       0.68      0.50      0.57      6161
         5.0       0.70      0.79      0.74      6132

    accuracy                           0.67     24637
   macro avg       0.67      0.67      0.66     24637
weighted avg       0.67      0.67      0.66     24637



## Neural network

In [19]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', 
                    alpha=1e-5,
                    hidden_layer_sizes=(14,), 
                    random_state=1, 
                    max_iter=10000)

clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

acc = accuracy_score(Y_test,Y_pred) *100
print("Accuracy for our dataset in predicting test data is : {:.2f}%".format(acc))

from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

Accuracy for our dataset in predicting test data is : 71.24%
              precision    recall  f1-score   support

         2.0       0.75      0.76      0.76      6187
         3.0       0.56      0.62      0.59      6157
         4.0       0.68      0.50      0.57      6161
         5.0       0.70      0.79      0.74      6132

    accuracy                           0.67     24637
   macro avg       0.67      0.67      0.66     24637
weighted avg       0.67      0.67      0.66     24637



## Random forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 0)
clf.fit(X_train, Y_train)

from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)

acc = accuracy_score(Y_test,y_pred) *100
print("Accuracy for our dataset in predicting test data is : {:.2f}%".format(acc))

from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))

Accuracy for our dataset in predicting test data is : 83.96%
              precision    recall  f1-score   support

         2.0       0.84      0.83      0.84      6187
         3.0       0.71      0.73      0.72      6157
         4.0       0.84      0.81      0.83      6161
         5.0       0.97      0.98      0.97      6132

    accuracy                           0.84     24637
   macro avg       0.84      0.84      0.84     24637
weighted avg       0.84      0.84      0.84     24637

