# Multinomial regression

## Reclassification

**Notebook Setup**

In [1]:
import sys
sys.path.append("../src")
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [3]:
plt.style.use('ggplot')

In [4]:
%matplotlib inline

from project import rf_models, preprocessing

df = pd.read_csv("../data/re_custody_2017_gsprs.csv", low_memory=False)



# preprocessing
data = preprocessing.preprocess_input_vars_re(df)
data = data[data.ic_custdy_level > 1]
data['high_re_discip_reports'] = np.where(data['re_discip_reports']>2, 1, 0)

data['re_override_up'] = np.where(data['re_ovride_cust_lvl']-data['re_custody_level']>0,1,0)
data['high_re'] = np.where(data['re_custody_level']>3, 1, 0)
print(data.columns)
data = data[
    [
        "gender_female",
        "age_gt_45",
        "age_lt_25",
        "race_B",
        "race_A",
        "race_H",
        "race_I",
        "race_O",
        "off_1_prs_max",
        "off_1_gs_max",
        #"ic_custdy_level",
        "prior_commits",
        "re_discip_reports",
        "re_escp_hist_1",
        #"re_escp_hist_2",
        #"re_escp_hist_3",
        #"re_escp_hist_4",
        "re_escp_hist_5",
        "mrt_stat_DIV",
        "mrt_stat_SEP",
        "mrt_stat_MAR",
        "mrt_stat_WID",
        "employed",
        #"high_re_discip_reports",
        #"high_re"
        "re_custody_level"
    ]
]
data = data.dropna()


df_re_all = data


Index(['re_curr_off_cd_1', 're_curr_off_cd_2', 're_curr_off_cd_3',
       're_prev_off_cd_1', 're_prev_off_cd_2', 're_prev_off_cd_3',
       're_escp_hist_1', 're_escp_hist_2', 're_escp_hist_3', 're_escp_hist_4',
       're_escp_hist_5', 're_discip_reports', 're_age_for_class',
       're_instit_violence', 'ic_prior_commits', 'race', 'sex',
       'ethnic_identity', 'citizenship', 'religion', 'legal_zip_code',
       'ic_employ_ind', 'date_of_birth', 're_custody_level', 'ic_custdy_level',
       'control_number', 're_ovride_cust_lvl', 're_de_year', 'off_1_gs_max',
       'off_1_gs_min', 'off_2_gs_max', 'off_2_gs_min', 'off_3_gs_max',
       'off_3_gs_min', 'off_1_prs_max', 'off_1_prs_min', 'off_2_prs_max',
       'off_2_prs_min', 'off_3_prs_max', 'off_3_prs_min', 'marital_status',
       'ic_mrtl_stat_fr_cl', 'affilatns_ind', 'affilatn_code_1',
       'affilatn_code_2', 'affilatn_code_3', 'affilatn_code_4',
       'affilatn_code_5', 'affilatn_code_6', 'affilatn_code_7',
       'affilat

#### Multinomial

In [5]:
X = df_re_all.drop("re_custody_level", axis=1)
y = df_re_all["re_custody_level"]
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.2, random_state=1)
model = LogisticRegression(multi_class="multinomial", max_iter=10_000)
model.fit(X_train, Y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X_test, Y_test, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Mean Accuracy: 0.776 (0.009)


In [7]:
model.coef_

array([[ 9.17804440e-01,  6.65755349e-01, -1.87830798e-01,
        -4.65658077e-01, -4.98887383e-01, -1.06809926e-01,
        -4.01235233e-01, -1.14859922e+00, -1.27872830e-01,
        -9.32573582e-02,  4.74646855e-02, -2.16143393e+00,
         8.28841169e-02, -5.21582233e-01,  6.69550087e-01,
         2.95560075e-01,  3.54585410e-01,  5.34471938e-02,
         4.15974798e-01],
       [ 4.06399582e-01,  1.44778954e-01, -1.06503077e-01,
         5.31963549e-02,  8.12135483e-01, -6.12034993e-02,
         5.95439143e-02,  5.49446158e-01,  1.42115474e-02,
        -7.70948383e-03, -2.56128719e-02,  4.78275971e-02,
         2.87270376e-02,  6.34391804e-02, -2.79312869e-01,
         3.46922941e-01,  6.77814231e-02,  4.75397291e-02,
        -1.86558167e-02],
       [-3.97302824e-01, -4.17080498e-01,  3.53047094e-01,
         2.55899388e-01, -2.06936342e-01, -1.28765500e-01,
         5.58298411e-01,  7.14436122e-01,  1.23627877e-01,
         1.02715204e-01, -1.51826792e-02,  1.34118339e+00,
    

In [8]:
np.exp(model.coef_)

array([[2.50378714, 1.94595985, 0.82875492, 0.62772188, 0.60720587,
        0.89869648, 0.66949256, 0.31708062, 0.87996528, 0.91095903,
        1.04860917, 0.11515987, 1.0864159 , 0.59358062, 1.95335828,
        1.34387882, 1.4255895 , 1.05490128, 1.51584767],
       [1.50140237, 1.15578406, 0.89897228, 1.05463671, 2.25271349,
        0.9406318 , 1.06135237, 1.73229334, 1.01431301, 0.99232016,
        0.97471236, 1.04898979, 1.02914364, 1.06549468, 0.75630324,
        1.41470771, 1.07013138, 1.04868786, 0.98151713],
       [0.67213046, 0.65896787, 1.42339818, 1.29162277, 0.81307141,
        0.87918011, 1.74769611, 2.04303434, 1.1315947 , 1.10817576,
        0.984932  , 3.82356559, 0.78048826, 0.88763451, 0.62375888,
        0.63919676, 0.66401623, 0.48485354, 0.93371601],
       [0.39577825, 0.67472249, 0.94297716, 1.16948366, 0.89914429,
        1.34551781, 0.80524629, 0.89111386, 0.99008291, 0.99825317,
        0.99335305, 2.16500559, 1.14593904, 1.78129078, 1.08518901,
        0.822

In [9]:
# For CL 2
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[0])
list(zip(features, coeffs))

[('gender_female', 2.5037871363371296),
 ('age_gt_45', 1.9459598453182136),
 ('age_lt_25', 0.828754922644935),
 ('race_B', 0.6277218799255938),
 ('race_A', 0.6072058712942865),
 ('race_H', 0.8986964757762457),
 ('race_I', 0.6694925560349174),
 ('race_O', 0.31708062018041705),
 ('off_1_prs_max', 0.8799652771251462),
 ('off_1_gs_max', 0.910959027321151),
 ('prior_commits', 1.0486091693879847),
 ('re_discip_reports', 0.1151598717843118),
 ('re_escp_hist_1', 1.0864159040085224),
 ('re_escp_hist_5', 0.5935806215268788),
 ('mrt_stat_DIV', 1.9533582822159643),
 ('mrt_stat_SEP', 1.343878820614645),
 ('mrt_stat_MAR', 1.4255894967141525),
 ('mrt_stat_WID', 1.054901284981055),
 ('employed', 1.515847665994266)]

In [10]:
# For CL 3
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[1])
list(zip(features, coeffs))

[('gender_female', 1.501402366252697),
 ('age_gt_45', 1.1557840607759005),
 ('age_lt_25', 0.8989722822376531),
 ('race_B', 1.0546367078587178),
 ('race_A', 2.252713485427167),
 ('race_H', 0.9406318023682558),
 ('race_I', 1.0613523684670025),
 ('race_O', 1.732293335677388),
 ('off_1_prs_max', 1.0143130115390369),
 ('off_1_gs_max', 0.9923201580207428),
 ('prior_commits', 0.9747123551012784),
 ('re_discip_reports', 1.0489897908706034),
 ('re_escp_hist_1', 1.029143638659561),
 ('re_escp_hist_5', 1.0654946808791885),
 ('mrt_stat_DIV', 0.7563032425274437),
 ('mrt_stat_SEP', 1.4147077052223924),
 ('mrt_stat_MAR', 1.0701313769113963),
 ('mrt_stat_WID', 1.0486878637633852),
 ('employed', 0.9815171258726776)]

In [11]:
# For CL 4
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[2])
list(zip(features, coeffs))

[('gender_female', 0.6721304578968723),
 ('age_gt_45', 0.6589678724681564),
 ('age_lt_25', 1.423398175691561),
 ('race_B', 1.291622769201147),
 ('race_A', 0.8130714068796918),
 ('race_H', 0.8791801088306046),
 ('race_I', 1.747696108260385),
 ('race_O', 2.0430343356113023),
 ('off_1_prs_max', 1.1315946999912243),
 ('off_1_gs_max', 1.1081757604158935),
 ('prior_commits', 0.98493199657566),
 ('re_discip_reports', 3.8235655874185897),
 ('re_escp_hist_1', 0.7804882619020769),
 ('re_escp_hist_5', 0.8876345125784378),
 ('mrt_stat_DIV', 0.6237588840823309),
 ('mrt_stat_SEP', 0.6391967579711528),
 ('mrt_stat_MAR', 0.6640162305423297),
 ('mrt_stat_WID', 0.4848535418876657),
 ('employed', 0.9337160056305428)]

In [13]:
# For CL 5
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[3])
list(zip(features, coeffs))

[('gender_female', 0.39577825019166357),
 ('age_gt_45', 0.6747224852368022),
 ('age_lt_25', 0.9429771575498141),
 ('race_B', 1.1694836582428287),
 ('race_A', 0.8991442892457416),
 ('race_H', 1.345517806793521),
 ('race_I', 0.805246294328737),
 ('race_O', 0.8911138648270196),
 ('off_1_prs_max', 0.9900829078005837),
 ('off_1_gs_max', 0.9982531652740093),
 ('prior_commits', 0.9933530549170289),
 ('re_discip_reports', 2.165005587693428),
 ('re_escp_hist_1', 1.1459390407332926),
 ('re_escp_hist_5', 1.781290779590312),
 ('mrt_stat_DIV', 1.0851890055319298),
 ('mrt_stat_SEP', 0.8228839808530729),
 ('mrt_stat_MAR', 0.9871649341614837),
 ('mrt_stat_WID', 1.8643668152186061),
 ('employed', 0.7198330047146084)]

### Feature selection

In [14]:
from sklearn.feature_selection import RFE

data_final_vars=df_re_all.columns.values.tolist()
yvars = ['re_custody_level']
Xvars = [i for i in data_final_vars if i not in yvars]

rfe = RFE(model, n_features_to_select=8, step=1)
rfe = rfe.fit(X, y.values.ravel())

zz= list(zip(Xvars,list(rfe.support_)))
ll = [a for (a,b) in zz if b]
ll

['gender_female',
 'age_gt_45',
 'race_A',
 're_discip_reports',
 'mrt_stat_DIV',
 'mrt_stat_SEP',
 'mrt_stat_MAR',
 'employed']

#### Class imbalance

In [15]:
count_2 = len(df_re_all[df_re_all['re_custody_level']==2])
count_3 = len(df_re_all[df_re_all['re_custody_level']==3])
count_4 = len(df_re_all[df_re_all['re_custody_level']==4])
count_5 = len(df_re_all[df_re_all['re_custody_level']==5])

tot = count_2 + count_3 + count_4 + count_5

pct_2 = count_2/tot
print("percentage of lev 2 is", pct_2*100)
pct_3 = count_3/tot
print("percentage of lev 3 is", pct_3*100)
pct_4 = count_4/tot
print("percentage of lev 4 is", pct_4*100)
pct_5 = count_5/tot
print("percentage of lev 5 is", pct_5*100)


pct_2 +pct_3 + pct_4 +pct_5 

percentage of lev 2 is 60.156464452374415
percentage of lev 3 is 22.36480922316772
percentage of lev 4 is 12.242657150699973
percentage of lev 5 is 5.236069173757891


1.0

#### Oversampling to fix

In [16]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)

X, y = os.fit_resample(X, y)

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.2, random_state=1)
model = LogisticRegression(multi_class="multinomial", max_iter=10_000)
model.fit(X_train, Y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [18]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X_test, Y_test, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Mean Accuracy: 0.573 (0.007)


In [19]:
np.exp(model.coef_)

array([[5.78411467, 4.37038786, 1.11419576, 1.39118567, 1.26689318,
        2.30928592, 0.69388357, 1.89758085, 0.89575868, 0.90602791,
        1.06800092, 0.12312354, 1.9579385 , 1.84297649, 3.58743955,
        3.59140873, 3.84262969, 1.68862098, 2.8103826 ],
       [1.81427057, 1.49665559, 0.67045756, 0.94500785, 0.80613115,
        0.93140017, 1.9658766 , 1.04320326, 1.04023151, 1.01550026,
        0.9989254 , 1.12237368, 1.02184092, 0.88933676, 0.86831491,
        1.48769178, 0.9359293 , 0.6815824 , 1.30849395],
       [0.35315728, 0.46472802, 1.56162424, 1.16407307, 0.99524094,
        0.90609033, 0.89689664, 0.98302215, 1.06755322, 1.0817903 ,
        0.95019302, 3.50585355, 0.696055  , 0.87882621, 0.59492718,
        0.86932481, 0.65370415, 0.36999303, 0.63079354],
       [0.26983165, 0.32897227, 0.85721678, 0.65343027, 0.9838436 ,
        0.51311473, 0.81736257, 0.51388681, 1.00528553, 1.00469745,
        0.98646909, 2.06408542, 0.71808211, 0.69424212, 0.53960298,
        0.215

In [26]:
# For CL 2
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[0])
list(zip(features, coeffs))

[('gender_female', 5.784114673411529),
 ('age_gt_45', 4.370387858800114),
 ('age_lt_25', 1.1141957573483274),
 ('race_B', 1.3911856714008854),
 ('race_A', 1.266893179962617),
 ('race_H', 2.309285921361442),
 ('race_I', 0.6938835707509523),
 ('race_O', 1.89758085216362),
 ('off_1_prs_max', 0.8957586832016886),
 ('off_1_gs_max', 0.9060279094585155),
 ('prior_commits', 1.068000918047938),
 ('re_discip_reports', 0.1231235390031406),
 ('re_escp_hist_1', 1.9579384986882713),
 ('re_escp_hist_5', 1.8429764868516205),
 ('mrt_stat_DIV', 3.587439548528901),
 ('mrt_stat_SEP', 3.5914087315800787),
 ('mrt_stat_MAR', 3.8426296913612554),
 ('mrt_stat_WID', 1.6886209791198246),
 ('employed', 2.810382598118759)]

In [27]:
# For CL 3
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[1])
list(zip(features, coeffs))

[('gender_female', 1.814270568959974),
 ('age_gt_45', 1.4966555859531965),
 ('age_lt_25', 0.6704575632597396),
 ('race_B', 0.9450078502145366),
 ('race_A', 0.8061311517721217),
 ('race_H', 0.9314001666882661),
 ('race_I', 1.9658766034594075),
 ('race_O', 1.0432032576940136),
 ('off_1_prs_max', 1.0402315149253774),
 ('off_1_gs_max', 1.0155002632923493),
 ('prior_commits', 0.9989253995473749),
 ('re_discip_reports', 1.12237367918668),
 ('re_escp_hist_1', 1.0218409214736333),
 ('re_escp_hist_5', 0.8893367599122296),
 ('mrt_stat_DIV', 0.8683149102319874),
 ('mrt_stat_SEP', 1.4876917756764316),
 ('mrt_stat_MAR', 0.9359292990757966),
 ('mrt_stat_WID', 0.6815824030619526),
 ('employed', 1.3084939508902862)]

In [28]:
# For CL 4
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[2])
list(zip(features, coeffs))

[('gender_female', 0.3531572827629946),
 ('age_gt_45', 0.4647280213862168),
 ('age_lt_25', 1.5616242438223409),
 ('race_B', 1.164073065141239),
 ('race_A', 0.995240937359671),
 ('race_H', 0.9060903314028289),
 ('race_I', 0.8968966375476176),
 ('race_O', 0.9830221465618765),
 ('off_1_prs_max', 1.0675532155358953),
 ('off_1_gs_max', 1.0817903046011734),
 ('prior_commits', 0.95019301948615),
 ('re_discip_reports', 3.5058535500548698),
 ('re_escp_hist_1', 0.6960549957380078),
 ('re_escp_hist_5', 0.878826213604372),
 ('mrt_stat_DIV', 0.5949271827651591),
 ('mrt_stat_SEP', 0.8693248090934237),
 ('mrt_stat_MAR', 0.6537041530285963),
 ('mrt_stat_WID', 0.36999302785121396),
 ('employed', 0.6307935357653568)]

In [29]:
# For CL 5
features = list(X_train.columns.values)
coeffs = list(np.exp(model.coef_)[3])
list(zip(features, coeffs))

[('gender_female', 0.2698316464463032),
 ('age_gt_45', 0.3289722676105704),
 ('age_lt_25', 0.8572167777033057),
 ('race_B', 0.6534302672355999),
 ('race_A', 0.9838436026743212),
 ('race_H', 0.5131147328948413),
 ('race_I', 0.8173625690469667),
 ('race_O', 0.5138868136584845),
 ('off_1_prs_max', 1.0052855340936062),
 ('off_1_gs_max', 1.0046974474051054),
 ('prior_commits', 0.9864690917178893),
 ('re_discip_reports', 2.0640854192854072),
 ('re_escp_hist_1', 0.718082112277222),
 ('re_escp_hist_5', 0.6942421202893079),
 ('mrt_stat_DIV', 0.5396029772151589),
 ('mrt_stat_SEP', 0.21529806660015455),
 ('mrt_stat_MAR', 0.42535072636512355),
 ('mrt_stat_WID', 2.3483126075983405),
 ('employed', 0.43109759395890834)]

In [30]:
rfe = RFE(model, n_features_to_select=4, step=1)
rfe = rfe.fit(X, y.values.ravel())


zz= list(zip(Xvars,list(rfe.support_)))
ll = [a for (a,b) in zz if b]
ll

['gender_female', 'mrt_stat_DIV', 'mrt_stat_SEP', 'mrt_stat_MAR']