File contents:
    usual data cleaning
    random forest models using a pipeline:
        nulls replaced by mode with boolean tracking
        nulls replaced by distribution with boolean tracking

In [1]:
# load in entire data file
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

# imblearn packages
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

# pipeline
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

# sklearn packages
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split   # For Data Partitioning
from sklearn.feature_selection import RFE              # To implement RFE
from sklearn.model_selection import StratifiedKFold, KFold              # For creating folds
from sklearn.model_selection import cross_val_score    # For implementing Cross Validation experiments
from sklearn.model_selection import GridSearchCV       # To implement GridSearch CV
from sklearn.model_selection import RandomizedSearchCV # To implement Randomized Search CV
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

filename = 'mh-cld-all_years.pkl'
df = pd.read_pickle(filename)
df

Unnamed: 0,YEAR,AGE,EDUC,ETHNIC,RACE,GENDER,SPHSERVICE,CMPSERVICE,OPISERVICE,RTCSERVICE,...,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,DIVISION,REGION,CASEID
0,2013,6,3,-9,5,1,2,1,2,2,...,0,0,0,0,0,0,1,6,3,20130000001
1,2013,6,4,-9,6,1,2,1,2,2,...,0,0,0,1,0,1,1,6,3,20130000002
2,2013,11,3,3,6,1,1,1,2,2,...,0,0,0,1,0,0,1,6,3,20130000003
3,2013,8,2,4,2,1,1,2,2,2,...,0,0,0,1,0,0,1,6,3,20130000004
4,2013,9,5,3,-9,1,1,2,2,2,...,0,0,0,1,0,0,1,6,3,20130000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362039,2019,5,-9,4,5,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362040
6362040,2019,4,4,4,6,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362041
6362041,2019,8,1,4,2,1,2,1,2,2,...,0,0,0,1,0,0,99,0,0,20196362042
6362042,2019,11,4,4,4,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362043


In [2]:
# remove rows that may not be relevant or will sway data

cols = ['AGE', 'MH1', 'EDUC', 'ETHNIC', 'RACE', 'GENDER', 'MARSTAT', 'SAP', 'EMPLOY', 'DETNLF', 'LIVARAG', 'NUMMHS', 'STATEFIP']
df_relevant = df[cols]
df_relevant

Unnamed: 0,AGE,MH1,EDUC,ETHNIC,RACE,GENDER,MARSTAT,SAP,EMPLOY,DETNLF,LIVARAG,NUMMHS,STATEFIP
0,6,7,3,-9,5,1,1,2,-9,-9,-9,1,1
1,6,11,4,-9,6,1,1,2,-9,-9,2,2,1
2,11,11,3,3,6,1,1,2,5,5,-9,1,1
3,8,11,2,4,2,1,1,2,-9,-9,2,1,1
4,9,11,5,3,-9,1,1,2,-9,-9,-9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362039,5,6,-9,4,5,1,-9,-9,-9,-9,-9,1,99
6362040,4,2,4,4,6,1,1,-9,2,-9,2,1,99
6362041,8,11,1,4,2,1,1,-9,4,-9,2,1,99
6362042,11,7,4,4,4,1,2,-9,5,1,2,1,99


# Counting null values in each column

# Merging EMPLOY and DETNLF

In [3]:
df_relevant = df_relevant.sample(n = 200000)

In [4]:
# merging EMPLOY and DETNLF
df_relevant['EMPLOY'] = df_relevant.apply(lambda row: row['DETNLF'] if row['EMPLOY'] == 5 else row['EMPLOY'], axis = 1)

In [5]:
df_relevant = df_relevant.drop(columns = 'DETNLF')

# Changing columns to categorical

In [6]:
# import excel keys for categorizing
group_ages = df_relevant['AGE'] 

# series of counts of each state type
ser = (group_ages.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_age = pd.read_csv('age_key.csv')
AGE_dict = dict(zip(df_temp.value, df_age.age))
AGE_dict
df_relevant = df_relevant.replace({'AGE': AGE_dict})

In [7]:
group_educ = df_relevant['EDUC'] 

# series of counts of each state type
ser = (group_educ.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_educ = pd.read_csv('educ_key.csv')
EDUC_dict = dict(zip(df_temp.value, df_educ.educ))
EDUC_dict
df_relevant = df_relevant.replace({'EDUC': EDUC_dict})

In [8]:
group_ethnic = df_relevant['ETHNIC'] 

# series of counts of each state type
ser = (group_ethnic.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_ethnic = pd.read_csv('ethnic_key.csv')
ETHNIC_dict = dict(zip(df_temp.value, df_ethnic.ethnic))
ETHNIC_dict
df_relevant = df_relevant.replace({'ETHNIC': ETHNIC_dict})

In [9]:
group_race = df_relevant['RACE'] 

# series of counts of each state type
ser = (group_race.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_race = pd.read_csv('race_key.csv')
RACE_dict = dict(zip(df_temp.value, df_race.race))
RACE_dict
df_relevant = df_relevant.replace({'RACE': RACE_dict})

In [10]:
group_gender = df_relevant['GENDER'] 

# series of counts of each state type
ser = (group_gender.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_gender = pd.read_csv('gender_key.csv')
GENDER_dict = dict(zip(df_temp.value, df_gender.gender))
GENDER_dict
df_relevant = df_relevant.replace({'GENDER': GENDER_dict})

In [11]:
group_marstat = df_relevant['MARSTAT'] 

# series of counts of each state type
ser = (group_marstat.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_marstat = pd.read_csv('marstat_key.csv')
MARSTAT_dict = dict(zip(df_temp.value, df_marstat.marstat))
MARSTAT_dict
df_relevant = df_relevant.replace({'MARSTAT': MARSTAT_dict})

In [12]:
group_sap = df_relevant['SAP'] 

# series of counts of each state type
ser = (group_sap.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_sap = pd.read_csv('sap_key.csv')
SAP_dict = dict(zip(df_temp.value, df_sap.sap))
SAP_dict
df_relevant = df_relevant.replace({'SAP': SAP_dict})

In [13]:
group_employ = df_relevant['EMPLOY'] 

# series of counts of each state type
ser = (group_employ.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_employ = pd.read_csv('employ_key.csv')
EMPLOY_dict = dict(zip(df_temp.value, df_employ.employ))
EMPLOY_dict
df_relevant = df_relevant.replace({'EMPLOY': EMPLOY_dict})

In [14]:
group_livarag = df_relevant['LIVARAG'] 

# series of counts of each state type
ser = (group_livarag.value_counts())
ser
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
df_temp
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)

# map age categories to number values
df_livarag = pd.read_csv('livarag_key.csv')
LIVARAG_dict = dict(zip(df_temp.value, df_livarag.livarag))
LIVARAG_dict
df_relevant = df_relevant.replace({'LIVARAG': LIVARAG_dict})

In [15]:
df_relevant['NUMMHS'] = df_relevant['NUMMHS'].astype(str)

In [16]:
df_disorders = pd.read_csv('Disorders_Key.csv')

disorder_dict = dict(zip(df_disorders.ID, df_disorders.DISORDER))
df_relevant = df_relevant.replace({'MH1': disorder_dict})

In [17]:
df_states = pd.read_csv('States_ID.csv')
state_dict = dict(zip(df_states.STATEFID, df_states.STATE))
df_relevant = df_relevant.replace({'STATEFIP': state_dict})

# remove states with not enough data to count
filt4 = df_relevant['STATEFIP'] != 4
filt19 = df_relevant['STATEFIP'] != 19
filt20 = df_relevant['STATEFIP'] != 20
filt23 = df_relevant['STATEFIP'] != 23
filt54 = df_relevant['STATEFIP'] != 54
filt99 = df_relevant['STATEFIP'] != 99
df_relevant = df_relevant[filt4 & filt19 & filt20 & filt23 & filt54 & filt99]
df_relevant['STATEFIP'].value_counts()

CA    23712
PA    17424
OH    15408
TX    12542
NJ     8427
MN     7648
FL     7634
MD     6587
WA     6336
NC     5680
KY     5459
IN     4692
NM     4599
OR     4547
CO     4404
VA     4202
TN     3847
AL     3426
CT     3251
OK     3217
MI     3179
IL     3159
MS     2878
SC     2863
WI     2725
MO     2616
AR     2538
NY     1952
UT     1838
MT     1758
LA     1419
RI     1116
DC     1066
MA     1025
VT     1016
NV      840
NE      802
WY      555
SD      533
ID      509
ND      501
PR      479
DE      358
HI      355
AK       53
Name: STATEFIP, dtype: int64

In [18]:
# filtering for most common disorders
depression_filt = df_relevant['MH1'] == 'Depression'
trauma_filt = df_relevant['MH1'] == 'Bipolar'
bipolar_filt = df_relevant['MH1'] == 'Trauma-related'

df_relevant = df_relevant[(depression_filt | trauma_filt | bipolar_filt)]

# Pipeline - Nulls Replaced by Mode

In [24]:
df_modes = df_relevant.copy()
def modes(column):
    new_col = column + '_replaced'
    df_modes[new_col] = False
    mode = df_modes[column].mode()
    print(mode[0])
    df_modes[column] = df_modes.apply(lambda row: mode[0] if pd.isna(row[column]) else row[column], axis = 1)
    df_modes[new_col] = df_modes.apply(lambda row: True if pd.isna(row[column]) else False, axis = 1)

In [25]:
# replace each row with its mode
modes('AGE')
modes('EDUC')
modes('ETHNIC')
modes('RACE')
modes('GENDER')
modes('MARSTAT')
modes('SAP')
modes('EMPLOY')
modes('LIVARAG')
modes('NUMMHS')
modes('STATEFIP')

0-11
12 or GED
Not of Hispanic or Latino origin
White
Female
Never married
No
Full time
Private residence
1
CA


In [26]:
# defining folds and classifiers
kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)

# defining test_x and test_y
x = pd.get_dummies(df_modes.drop(columns = 'MH1'), drop_first = True)
y = df_modes['MH1']

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.4, random_state = 1)

In [None]:
pipe = imbpipeline([
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier())
])

parameters = {
    'smote__k_neighbors': [5, 25, 75],
    'clf__max_depth': [10, 60 , 100],
    'clf__min_samples_split': [10, 20, 40],
    'clf__min_samples_leaf': [45, 55, 65]
}

pip_model = GridSearchCV(pipe, param_grid = parameters, cv = kfold, scoring = 'f1_weighted', n_jobs = -1, verbose = 1)
pip_model.fit(train_x, train_y)

print("Best parameters: ", pip_model.best_params_)
print("Best score: ", pip_model.best_score_)
y_pred = pip_model.predict(test_x)
print(classification_report(test_y, y_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
bestTree = pip_model.best_params_

In [None]:
# Displaying feature importance
importance = bestTree.feature_importances_
feature_imp = pd.DataFrame(list(zip(train_x.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

feature_imp.head()

In [None]:
# Specifying figure size
fig, ax = plt.subplots(figsize=(6, 6)) 

# Generating confusion matrix
plot_confusion_matrix(bestTree, test_x, test_y,
                      cmap = plt.cm.Blues,
                      values_format = '',
                      ax = ax);

# Pipeline - Nulls Replaced by Distribution

In [None]:
df_dist = df_relevant.copy()

In [None]:
def dist(column):
    new_col = column + '_replaced'
    df_dist[new_col] = False
    
    total_len = len(df_dist)
    dist_ser = df_dist[column].value_counts()
    dist_df = dist_ser.to_frame()
    dist_df['cat'] = dist_df.index.tolist()
    numb_lst = list(range(0, len(dist_df.index.tolist())))
    dist_df.set_index([pd.Index(numb_lst)], inplace = True)
    prop_lst = []

    # make proportions of nulls in each column a dataframe
    for numb in dist_df[column]:
        prop = numb / total_len
        prop_lst.append(prop)

    dist_df['proportion'] = prop_lst

    # counting the number of nulls
    null_filt = df_dist[column].isna()
    len_nulls = len(df_dist[null_filt])
    index = 0

    def repeater(n, row, item):
        for x in range(int(n)):
            if pd.isna(row[column]):
                row([new_col]) = True
                ret = item
            else:
                ret = row[column]
            return ret

    for item in dist_df['cat']:
        changes = len_nulls * prop_lst[index]
        df_dist[column] = df_dist.apply(lambda row: repeater(changes, row, item), axis = 1)
        index = index + 1

In [None]:
dist('AGE')
dist('EDUC')
dist('ETHNIC')
dist('RACE')
dist('GENDER')
dist('MARSTAT')
dist('SAP')
dist('EMPLOY')
dist('LIVARAG')

In [None]:
# defining folds and classifiers
kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)

# defining test_x and test_y
x = pd.get_dummies(df_dist.drop(columns = 'MH1'), drop_first = True)
y = df_dist['MH1']

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.4, random_state = 1)

In [None]:
pipe = imbpipeline([
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier())
])

parameters = {
    'smote__k_neighbors': [5, 25, 50, 75],
    'clf__max_depth': [10, 60 , 100],
    'clf__min_samples_split': [10, 20, 40],
    'clf__min_samples_leaf': [45, 55, 65]
}

pip_model = GridSearchCV(pipe, param_grid = parameters, cv = kfold, scoring = 'f1_weighted', n_jobs = -1, verbose = 1)
pip_model.fit(train_x, train_y)

print("Best parameters: ", pip_model.best_params_)
print("Best score: ", pip_model.best_score_)
y_pred = pip_model.predict(test_x)
print(classification_report(test_y, y_pred))

In [None]:
bestTree = pip_model.best_params_

In [None]:
# Displaying feature importance
importance = bestTree.feature_importances_
feature_imp = pd.DataFrame(list(zip(train_x.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

feature_imp.head()

In [None]:
# Specifying figure size
fig, ax = plt.subplots(figsize=(6, 6)) 

# Generating confusion matrix
plot_confusion_matrix(bestTree, test_x, test_y,
                      cmap = plt.cm.Blues,
                      values_format = '',
                      ax = ax);