In [2]:
# load in entire data file
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import pickle
import csv

# imblearn packages
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

# pipeline
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

# sklearn packages
import sklearn
from sklearn.model_selection import train_test_split   # For Data Partitioning
from sklearn.feature_selection import RFE              # To implement RFE
from sklearn.model_selection import StratifiedKFold, KFold              # For creating folds
from sklearn.model_selection import cross_val_score    # For implementing Cross Validation experiments
from sklearn.model_selection import GridSearchCV       # To implement GridSearch CV
from sklearn.model_selection import RandomizedSearchCV # To implement Randomized Search CV
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import make_blobs              # To create artificial data 
from sklearn.cluster import KMeans                   # To implement K-means clustering
from sklearn.metrics import silhouette_score         # To compute Silhouette Score
from collections import Counter                      # To count items

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

filename = 'mh-cld-all_years.pkl'
df = pd.read_pickle(filename)
df

Unnamed: 0,YEAR,AGE,EDUC,ETHNIC,RACE,GENDER,SPHSERVICE,CMPSERVICE,OPISERVICE,RTCSERVICE,...,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,DIVISION,REGION,CASEID
0,2013,6,3,-9,5,1,2,1,2,2,...,0,0,0,0,0,0,1,6,3,20130000001
1,2013,6,4,-9,6,1,2,1,2,2,...,0,0,0,1,0,1,1,6,3,20130000002
2,2013,11,3,3,6,1,1,1,2,2,...,0,0,0,1,0,0,1,6,3,20130000003
3,2013,8,2,4,2,1,1,2,2,2,...,0,0,0,1,0,0,1,6,3,20130000004
4,2013,9,5,3,-9,1,1,2,2,2,...,0,0,0,1,0,0,1,6,3,20130000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362039,2019,5,-9,4,5,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362040
6362040,2019,4,4,4,6,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362041
6362041,2019,8,1,4,2,1,2,1,2,2,...,0,0,0,1,0,0,99,0,0,20196362042
6362042,2019,11,4,4,4,1,2,1,2,2,...,0,0,0,0,0,0,99,0,0,20196362043


In [3]:
# looking at one year for now
# remove rows that may not be relevant or will sway data
filt2019 = df['YEAR'] == 2019
df = df[filt2019]
cols = ['AGE', 'MH1', 'EDUC', 'ETHNIC', 'RACE', 'GENDER', 'MARSTAT', 'SAP', 'EMPLOY', 'DETNLF', 'LIVARAG', 'NUMMHS', 'STATEFIP']
df_relevant = df[cols]


# merging EMPLOY and DETNLF
df_relevant['EMPLOY'] = df_relevant.apply(lambda row: row['DETNLF'] if row['EMPLOY'] == 5 else row['EMPLOY'], axis = 1)
df_relevant = df_relevant.drop(columns = 'DETNLF')

# making columns categorical

# age
# import excel keys for categorizing
group_ages = df_relevant['AGE'] 
# series of counts of each state type
ser = (group_ages.value_counts())
# make the series a df
df_temp = ser.to_frame()
# replace the index to be numbers and age value is a column
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
# map age categories to number values
df_age = pd.read_csv('age_key.csv')
AGE_dict = dict(zip(df_temp.value, df_age.age))
df_relevant = df_relevant.replace({'AGE': AGE_dict})

# education
group_educ = df_relevant['EDUC'] 
ser = (group_educ.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_educ = pd.read_csv('educ_key.csv')
EDUC_dict = dict(zip(df_temp.value, df_educ.educ))
df_relevant = df_relevant.replace({'EDUC': EDUC_dict})

# ethnicity
group_ethnic = df_relevant['ETHNIC'] 
ser = (group_ethnic.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_ethnic = pd.read_csv('ethnic_key.csv')
ETHNIC_dict = dict(zip(df_temp.value, df_ethnic.ethnic))
df_relevant = df_relevant.replace({'ETHNIC': ETHNIC_dict})

# race
group_race = df_relevant['RACE'] 
ser = (group_race.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_race = pd.read_csv('race_key.csv')
RACE_dict = dict(zip(df_temp.value, df_race.race))
df_relevant = df_relevant.replace({'RACE': RACE_dict})

# gender
group_gender = df_relevant['GENDER'] 
ser = (group_gender.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_gender = pd.read_csv('gender_key.csv')
GENDER_dict = dict(zip(df_temp.value, df_gender.gender))
df_relevant = df_relevant.replace({'GENDER': GENDER_dict})

# marstat
group_marstat = df_relevant['MARSTAT'] 
ser = (group_marstat.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_marstat = pd.read_csv('marstat_key.csv')
MARSTAT_dict = dict(zip(df_temp.value, df_marstat.marstat))
df_relevant = df_relevant.replace({'MARSTAT': MARSTAT_dict})

# sap
group_sap = df_relevant['SAP'] 
ser = (group_sap.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_sap = pd.read_csv('sap_key.csv')
SAP_dict = dict(zip(df_temp.value, df_sap.sap))
df_relevant = df_relevant.replace({'SAP': SAP_dict})

# employ
group_employ = df_relevant['EMPLOY'] 
ser = (group_employ.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_employ = pd.read_csv('employ_key.csv')
EMPLOY_dict = dict(zip(df_temp.value, df_employ.employ))
df_relevant = df_relevant.replace({'EMPLOY': EMPLOY_dict})

# livarag
group_livarag = df_relevant['LIVARAG'] 
ser = (group_livarag.value_counts())
df_temp = ser.to_frame()
df_temp['value'] = df_temp.index.tolist()
df_temp.sort_values(by = 'value', inplace = True)
null_row = df_temp.iloc[0]
df_temp.loc[len(df_temp.index)] = null_row
df_temp = df_temp.iloc[1:]
numb_lst = list(range(0, len(df_temp.index.tolist())))
df_temp.set_index([pd.Index(numb_lst)], inplace = True)
df_livarag = pd.read_csv('livarag_key.csv')
LIVARAG_dict = dict(zip(df_temp.value, df_livarag.livarag))
df_relevant = df_relevant.replace({'LIVARAG': LIVARAG_dict})

# nummhs
df_relevant['NUMMHS'] = df_relevant['NUMMHS'].astype(str)

# disorders
df_disorders = pd.read_csv('Disorders_Key.csv')
disorder_dict = dict(zip(df_disorders.ID, df_disorders.DISORDER))
df_relevant = df_relevant.replace({'MH1': disorder_dict})

# states
df_states = pd.read_csv('States_ID.csv')
state_dict = dict(zip(df_states.STATEFID, df_states.STATE))
df_relevant = df_relevant.replace({'STATEFIP': state_dict})
# remove states with not enough data to count
filt4 = df_relevant['STATEFIP'] != 4
filt19 = df_relevant['STATEFIP'] != 19
filt20 = df_relevant['STATEFIP'] != 20
filt23 = df_relevant['STATEFIP'] != 23
filt54 = df_relevant['STATEFIP'] != 54
filt99 = df_relevant['STATEFIP'] != 99
df_relevant = df_relevant[filt4 & filt19 & filt20 & filt23 & filt54 & filt99]
df_relevant['STATEFIP'].value_counts()

CA    742725
PA    586913
OH    579276
NJ    425727
TX    416338
MN    287670
MI    244168
WA    240139
MD    226003
FL    204076
NM    203037
KY    165999
OR    142930
IN    139126
CO    137290
VA    126089
TN    112426
NC    107091
OK    106638
AL    103203
SC     99514
CT     94199
MS     87406
MO     80770
AR     76139
WI     74279
MT     71770
NY     57356
UT     56224
IL     45670
DC     45189
LA     36271
RI     32557
VT     31333
NE     26280
MA     26275
NV     18931
ID     17465
WY     16710
SD     16207
ND     15143
DE     14032
AK     11082
HI      9905
PR      3947
Name: STATEFIP, dtype: int64

In [3]:
temp = df_relevant.sample(n = 500)
temp.to_csv('mh_assignment_sample.csv')

In [3]:
new_frame = df_relevant.copy()
new_frame.dropna(inplace = True)
#new_frame.drop(columns = 'YEAR', inplace = True)
new_frame.to_csv('data_cleaned_2019.csv')

In [4]:
new_frame = pd.get_dummies(new_frame)

y_cols = (new_frame.iloc[:, 12:25]).columns

x_frame = new_frame.drop(columns = y_cols)

x_cols = x_frame.columns

frame = pd.DataFrame(columns = [x_cols])

for disorder in y_cols:
    lst = []
    for demographic in x_cols:
        lst.append(new_frame[disorder].corr(new_frame[demographic]))
    #print(len(lst))
    frame.loc[len(frame)] = lst

frame.set_index(y_cols, inplace = True)
frame

Unnamed: 0,AGE_15-17,AGE_18-20,AGE_21-24,AGE_25-29,AGE_30-34,AGE_35-39,AGE_40-44,AGE_45-49,AGE_50-54,AGE_55-59,...,STATEFIP_OR,STATEFIP_PA,STATEFIP_PR,STATEFIP_SC,STATEFIP_SD,STATEFIP_TN,STATEFIP_TX,STATEFIP_UT,STATEFIP_WA,STATEFIP_WY
MH1_ADHD,0.142843,0.07022,0.023184,0.01364,0.000664,-0.015009,-0.023794,-0.029101,-0.034494,-0.036098,...,-0.031061,0.001021,0.001466,-0.015323,-0.00027,-0.005221,-0.045306,-0.009553,-0.002335,0.00427
MH1_Anxiety,0.036266,0.02567,0.015987,0.017941,0.013502,0.005133,-0.004488,-0.012916,-0.021386,-0.026371,...,0.076188,-0.001145,-0.007314,-0.013008,-0.000621,0.002667,-0.108147,0.021274,0.034345,0.025259
MH1_Bipolar,-0.078071,-0.037318,-0.011042,0.00286,0.01445,0.022979,0.028129,0.02687,0.019359,0.009366,...,-0.041885,0.000699,-0.009113,0.015267,-0.000847,0.006861,0.173027,-0.010592,-0.037805,-0.011143
MH1_Conduct disorder,0.082584,0.02621,0.000724,-0.001668,-0.001852,-0.000241,-0.005936,-0.013042,-0.016726,-0.017276,...,-0.014345,-0.000228,0.006744,-0.007219,-0.000124,0.001631,-0.022305,-0.00738,0.001764,0.00337
"MH1_Delirium, dementia",-0.009835,-0.010051,-0.011827,-0.014405,-0.013783,-0.013271,-0.011167,-0.009189,-0.007387,-0.004154,...,-0.005717,-0.000187,-0.001044,-0.00066,-0.000102,-0.000428,-0.017365,0.016211,0.001586,0.000567
MH1_Depression,-0.003457,0.01433,0.001452,-0.017903,-0.0243,-0.022862,-0.011645,0.005076,0.016207,0.021047,...,-0.043447,-0.001969,-0.008087,-0.005419,0.000342,0.002782,0.056527,0.015525,-0.002241,0.002554
MH1_Oppositional defiant disorder,0.115161,0.026407,0.003739,0.000986,0.000192,-0.002583,-0.013081,-0.019762,-0.021545,-0.021315,...,-0.015426,-0.000245,-0.00167,-0.009992,-0.000133,-0.003845,-0.024304,-0.008343,-0.007021,-0.000828
MH1_Other,0.03375,0.026689,0.017195,0.010879,-0.002752,-0.013062,-0.011165,-0.013238,-0.015322,-0.013739,...,0.001703,0.000662,-0.000341,-0.027295,0.00098,-0.006365,-0.091859,0.00034,-0.020308,0.004425
MH1_Personality disorder,-0.015263,-0.001856,0.008172,0.006724,0.005258,0.002905,0.003184,0.001631,-0.001965,-0.00428,...,0.005244,0.002063,-0.001613,0.020134,-0.00017,0.000892,-0.029845,0.011316,0.01182,0.014441
MH1_Pervasive developmental disorder,0.035012,0.051431,0.045134,0.022224,-0.004744,-0.014792,-0.016397,-0.01878,-0.021224,-0.020742,...,-0.018478,-0.000297,-0.002886,-0.010779,-0.000161,-0.00259,-0.025291,-0.005492,-0.015951,-0.000695


# Clustering

In [5]:
# adding y variable
frame['Disorder'] = y_cols
frame

Unnamed: 0,AGE_15-17,AGE_18-20,AGE_21-24,AGE_25-29,AGE_30-34,AGE_35-39,AGE_40-44,AGE_45-49,AGE_50-54,AGE_55-59,...,STATEFIP_PA,STATEFIP_PR,STATEFIP_SC,STATEFIP_SD,STATEFIP_TN,STATEFIP_TX,STATEFIP_UT,STATEFIP_WA,STATEFIP_WY,Disorder
MH1_ADHD,0.142843,0.07022,0.023184,0.01364,0.000664,-0.015009,-0.023794,-0.029101,-0.034494,-0.036098,...,0.001021,0.001466,-0.015323,-0.00027,-0.005221,-0.045306,-0.009553,-0.002335,0.00427,MH1_ADHD
MH1_Anxiety,0.036266,0.02567,0.015987,0.017941,0.013502,0.005133,-0.004488,-0.012916,-0.021386,-0.026371,...,-0.001145,-0.007314,-0.013008,-0.000621,0.002667,-0.108147,0.021274,0.034345,0.025259,MH1_Anxiety
MH1_Bipolar,-0.078071,-0.037318,-0.011042,0.00286,0.01445,0.022979,0.028129,0.02687,0.019359,0.009366,...,0.000699,-0.009113,0.015267,-0.000847,0.006861,0.173027,-0.010592,-0.037805,-0.011143,MH1_Bipolar
MH1_Conduct disorder,0.082584,0.02621,0.000724,-0.001668,-0.001852,-0.000241,-0.005936,-0.013042,-0.016726,-0.017276,...,-0.000228,0.006744,-0.007219,-0.000124,0.001631,-0.022305,-0.00738,0.001764,0.00337,MH1_Conduct disorder
"MH1_Delirium, dementia",-0.009835,-0.010051,-0.011827,-0.014405,-0.013783,-0.013271,-0.011167,-0.009189,-0.007387,-0.004154,...,-0.000187,-0.001044,-0.00066,-0.000102,-0.000428,-0.017365,0.016211,0.001586,0.000567,"MH1_Delirium, dementia"
MH1_Depression,-0.003457,0.01433,0.001452,-0.017903,-0.0243,-0.022862,-0.011645,0.005076,0.016207,0.021047,...,-0.001969,-0.008087,-0.005419,0.000342,0.002782,0.056527,0.015525,-0.002241,0.002554,MH1_Depression
MH1_Oppositional defiant disorder,0.115161,0.026407,0.003739,0.000986,0.000192,-0.002583,-0.013081,-0.019762,-0.021545,-0.021315,...,-0.000245,-0.00167,-0.009992,-0.000133,-0.003845,-0.024304,-0.008343,-0.007021,-0.000828,MH1_Oppositional defiant disorder
MH1_Other,0.03375,0.026689,0.017195,0.010879,-0.002752,-0.013062,-0.011165,-0.013238,-0.015322,-0.013739,...,0.000662,-0.000341,-0.027295,0.00098,-0.006365,-0.091859,0.00034,-0.020308,0.004425,MH1_Other
MH1_Personality disorder,-0.015263,-0.001856,0.008172,0.006724,0.005258,0.002905,0.003184,0.001631,-0.001965,-0.00428,...,0.002063,-0.001613,0.020134,-0.00017,0.000892,-0.029845,0.011316,0.01182,0.014441,MH1_Personality disorder
MH1_Pervasive developmental disorder,0.035012,0.051431,0.045134,0.022224,-0.004744,-0.014792,-0.016397,-0.01878,-0.021224,-0.020742,...,-0.000297,-0.002886,-0.010779,-0.000161,-0.00259,-0.025291,-0.005492,-0.015951,-0.000695,MH1_Pervasive developmental disorder


In [6]:
x = pd.get_dummies(frame.drop(columns = 'Disorder'))

In [7]:
for i in [3, 4, 5]:

    # Define K-means model
    kmeans = KMeans(n_clusters = i, random_state = 42)

    # Fit the model on input features
    kmeans.fit(x)

    # Predict the cluster for each instance
    y_kmeans = kmeans.predict(x)

    # Cluster labels returned by K-means
    y_kmeans
    score = silhouette_score(x, kmeans.labels_, metric = 'euclidean')
    print('For %d clusters, Silhouetter Score: %.3f' % (i, score)) 

For 3 clusters, Silhouetter Score: 0.243
For 4 clusters, Silhouetter Score: 0.209
For 5 clusters, Silhouetter Score: 0.212


In [8]:
# Define K-means model
kmeans = KMeans(n_clusters = 3, random_state = 42)

# Fit the model on input features
kmeans.fit(x)

# Predict the cluster for each instance
y_kmeans = kmeans.predict(x)

# Cluster labels returned by K-means
y_kmeans

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0], dtype=int32)

In [9]:
# assign cluster to each row
pickle.dump(kmeans, open('clustering_model.sav', 'wb'))

frame['CLUSTER'] = y_kmeans
frame.to_csv('correlation_cluster_frame.csv')
frame

Unnamed: 0,AGE_15-17,AGE_18-20,AGE_21-24,AGE_25-29,AGE_30-34,AGE_35-39,AGE_40-44,AGE_45-49,AGE_50-54,AGE_55-59,...,STATEFIP_PR,STATEFIP_SC,STATEFIP_SD,STATEFIP_TN,STATEFIP_TX,STATEFIP_UT,STATEFIP_WA,STATEFIP_WY,Disorder,CLUSTER
MH1_ADHD,0.142843,0.07022,0.023184,0.01364,0.000664,-0.015009,-0.023794,-0.029101,-0.034494,-0.036098,...,0.001466,-0.015323,-0.00027,-0.005221,-0.045306,-0.009553,-0.002335,0.00427,MH1_ADHD,0
MH1_Anxiety,0.036266,0.02567,0.015987,0.017941,0.013502,0.005133,-0.004488,-0.012916,-0.021386,-0.026371,...,-0.007314,-0.013008,-0.000621,0.002667,-0.108147,0.021274,0.034345,0.025259,MH1_Anxiety,0
MH1_Bipolar,-0.078071,-0.037318,-0.011042,0.00286,0.01445,0.022979,0.028129,0.02687,0.019359,0.009366,...,-0.009113,0.015267,-0.000847,0.006861,0.173027,-0.010592,-0.037805,-0.011143,MH1_Bipolar,1
MH1_Conduct disorder,0.082584,0.02621,0.000724,-0.001668,-0.001852,-0.000241,-0.005936,-0.013042,-0.016726,-0.017276,...,0.006744,-0.007219,-0.000124,0.001631,-0.022305,-0.00738,0.001764,0.00337,MH1_Conduct disorder,0
"MH1_Delirium, dementia",-0.009835,-0.010051,-0.011827,-0.014405,-0.013783,-0.013271,-0.011167,-0.009189,-0.007387,-0.004154,...,-0.001044,-0.00066,-0.000102,-0.000428,-0.017365,0.016211,0.001586,0.000567,"MH1_Delirium, dementia",0
MH1_Depression,-0.003457,0.01433,0.001452,-0.017903,-0.0243,-0.022862,-0.011645,0.005076,0.016207,0.021047,...,-0.008087,-0.005419,0.000342,0.002782,0.056527,0.015525,-0.002241,0.002554,MH1_Depression,1
MH1_Oppositional defiant disorder,0.115161,0.026407,0.003739,0.000986,0.000192,-0.002583,-0.013081,-0.019762,-0.021545,-0.021315,...,-0.00167,-0.009992,-0.000133,-0.003845,-0.024304,-0.008343,-0.007021,-0.000828,MH1_Oppositional defiant disorder,0
MH1_Other,0.03375,0.026689,0.017195,0.010879,-0.002752,-0.013062,-0.011165,-0.013238,-0.015322,-0.013739,...,-0.000341,-0.027295,0.00098,-0.006365,-0.091859,0.00034,-0.020308,0.004425,MH1_Other,0
MH1_Personality disorder,-0.015263,-0.001856,0.008172,0.006724,0.005258,0.002905,0.003184,0.001631,-0.001965,-0.00428,...,-0.001613,0.020134,-0.00017,0.000892,-0.029845,0.011316,0.01182,0.014441,MH1_Personality disorder,0
MH1_Pervasive developmental disorder,0.035012,0.051431,0.045134,0.022224,-0.004744,-0.014792,-0.016397,-0.01878,-0.021224,-0.020742,...,-0.002886,-0.010779,-0.000161,-0.00259,-0.025291,-0.005492,-0.015951,-0.000695,MH1_Pervasive developmental disorder,0


In [10]:
score = silhouette_score(x, kmeans.labels_, metric = 'euclidean')
print('Silhouetter Score: %.3f' % score)

Silhouetter Score: 0.243


# Classify main df into clusters

In [4]:
# inititalize column
df_relevant['CLUSTER'] = df_relevant['MH1']

group_cluster = df_relevant['CLUSTER'] 
ser = (group_cluster.value_counts())
df_temp = ser.to_frame()
df_temp['MH'] = df_temp.index.tolist()
df_temp.sort_values(by = 'MH', inplace = True)

df_cluster = pd.read_csv('cluster_key.csv')
CLUSTER_dict = dict(zip(df_temp.MH, df_cluster.cluster))
CLUSTER_dict
df_relevant = df_relevant.replace({'CLUSTER': CLUSTER_dict})

#filt2019 = df_relevant['YEAR'] == 2019
#df_relevant = df_relevant[filt2019]
#df_relevant.drop(columns = 'YEAR', inplace = True)
#df_relevant.to_csv('data_clustered2019.csv')

# Cluster Values Map

In [20]:
def findModes(cluster):
    lst = []
    clusterFilt = df_relevant['CLUSTER'] == cluster
    df = df_relevant[clusterFilt]
    for col in df.columns:
        mode = df[col].mode().values[0]
        lst.append(mode)
        
    return lst

lst0 = pd.Series(findModes(0))
lst1 = pd.Series(findModes(1))    
lst2 = pd.Series(findModes(2))

cluster_frame = pd.DataFrame(columns = ['Cluster 0', 'Cluster 1', 'Cluster 2'])
cluster_frame['Cluster 0'] = lst0
cluster_frame['Cluster 1'] = lst1
cluster_frame['Cluster 2'] = lst2
cluster_frame = cluster_frame.transpose()
cluster_frame.columns = ['AGE', 'MH', 'EDUC', 'ETHNIC', 'RACE', 'GENDER', 'MARSTAT', 'SAP', 'EMPLOY', 'LIVARAG',
                        'NUMMHS', 'STATEFIP', 'CLUSTER']

cluster_frame.drop(columns = 'CLUSTER', inplace = True)
cluster_frame.to_csv('cluster_value_map.csv')

# DT No Nulls

In [13]:
df_nonulls = df_relevant.dropna()
x = pd.get_dummies(df_nonulls.drop(columns = ['MH1', 'CLUSTER']), drop_first = True)
y = df_nonulls['CLUSTER']

df_out = x.copy()
df_out['CLUSTER'] = y
df_out = df_out.iloc[:1]
df_out.to_csv('nonull_cc2019_dummies.csv')

#split into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 1) 

#defining the folds
folds = KFold(n_splits = 5, shuffle = True, random_state = 1)

clf = DecisionTreeClassifier(random_state = 42)

hyper_params = {
    'max_depth': [10, 60 , 100], 
    'min_samples_split': [10, 20, 40], 
    'min_samples_leaf': [45, 55, 65]
}

# Call GridSearchCV()
model_clf = GridSearchCV(estimator = clf, 
                        param_grid = hyper_params, 
                        scoring = 'f1_weighted', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1) # Will utilize all available CPUs 

model_clf.fit(train_x, train_y)  
print('Score: ', model_clf.best_score_)
print('Parameters: ', model_clf.best_params_)
y_pred = model_clf.predict(test_x)
print(classification_report(test_y, y_pred))

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

In [None]:
bestTree = model_clf.best_estimator_
# Save the model to local machine

filename = 'dt_clustered_nonulls.sav'
#pickle.dump(bestTree, open(filename, 'wb'))
# Displaying feature importance
importance = bestTree.feature_importances_
feature_imp = pd.DataFrame(list(zip(train_x.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

feature_imp.head()

In [None]:
# Specifying figure size
fig, ax = plt.subplots(figsize=(6, 6)) 

# Generating confusion matrix
plot_confusion_matrix(bestTree, test_x, test_y,
                      cmap = plt.cm.Blues,
                      values_format = '',
                      ax = ax);

# DT Modes

In [None]:
df_relevant

In [5]:
df_modes = df_relevant.copy()
def modes(column):
    new_col = column + '_replaced'
    df_modes[new_col] = False
    mode = df_modes[column].mode()
    #print(mode[0])
    df_modes[column] = df_modes.apply(lambda row: mode[0] if pd.isna(row[column]) else row[column], axis = 1)
    df_modes[new_col] = df_modes.apply(lambda row: True if pd.isna(row[column]) else False, axis = 1)
    
# replace each row with its mode
modes('AGE')
modes('EDUC')
modes('ETHNIC')
modes('RACE')
modes('GENDER')
modes('MARSTAT')
modes('SAP')
modes('EMPLOY')
modes('LIVARAG')
modes('NUMMHS')
modes('STATEFIP')
df_modes.isna().sum()

AGE                       0
MH1                  726973
EDUC                      0
ETHNIC                    0
RACE                      0
GENDER                    0
MARSTAT                   0
SAP                       0
EMPLOY                    0
LIVARAG                   0
NUMMHS                    0
STATEFIP                  0
CLUSTER              726973
AGE_replaced              0
EDUC_replaced             0
ETHNIC_replaced           0
RACE_replaced             0
GENDER_replaced           0
MARSTAT_replaced          0
SAP_replaced              0
EMPLOY_replaced           0
LIVARAG_replaced          0
NUMMHS_replaced           0
STATEFIP_replaced         0
dtype: int64

In [1]:
df_modes.dropna(inplace = True)
#df_modes = df_modes.sample(n = 1000000)
x = pd.get_dummies(df_modes.drop(columns = ['MH1', 'CLUSTER']), drop_first = True)
y = df_modes['CLUSTER']

x.to_csv('modes_x_dummies.csv')

# df_out = x.copy()
# df_out['CLUSTER'] = y
# df_out = df_out.iloc[:1]
# df_out.to_csv('modes_cc2019_dummies.csv')

#split into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 1) 

#defining the folds
folds = KFold(n_splits = 5, shuffle = True, random_state = 1)

clf = DecisionTreeClassifier(random_state = 42)

hyper_params = {
    'max_depth': [10, 60 , 100], 
    'min_samples_split': [10, 20, 40], 
    'min_samples_leaf': [45, 55, 65]
}

# Call GridSearchCV()
model_clf = GridSearchCV(estimator = clf, 
                        param_grid = hyper_params, 
                        scoring = 'f1_weighted', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1) # Will utilize all available CPUs 

model_clf.fit(train_x, train_y)  
print('Score: ', model_clf.best_score_)
print('Parameters: ', model_clf.best_params_)
y_pred = model_clf.predict(test_x)
print(classification_report(test_y, y_pred))

NameError: name 'df_modes' is not defined

In [None]:
bestTree2 = model_clf.best_estimator_
# Save the model to local machine

filename = 'dt_clustered_modes.sav'
pickle.dump(bestTree2, open(filename, 'wb'))
# Displaying feature importance
importance = bestTree2.feature_importances_
feature_imp = pd.DataFrame(list(zip(train_x.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)
feature_imp.to_csv('feature_imp_modes.csv')

feature_imp.head()

In [None]:
# Specifying figure size
fig, ax = plt.subplots(figsize=(6, 6)) 

# Generating confusion matrix
plot_confusion_matrix(bestTree2, test_x, test_y,
                      cmap = plt.cm.Blues,
                      values_format = '',
                      ax = ax);

# Tree Figure

In [6]:
import pickle
import matplotlib.pyplot as plt
from sklearn import tree
filename = 'dt_clustered_modes.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [7]:
# Visualizing Decision Tree
fig = plt.figure(figsize=(25,20))
a = tree.plot_tree(decision_tree = bestTree2,
                   feature_names = train_x.columns,
                   filled = True)

NameError: name 'train_X' is not defined

<Figure size 1800x1440 with 0 Axes>

In [None]:
# Saving Decision Tree
fig.savefig("modes_decision_tree.pdf", bbox_inches='tight')

# DT Distribution

In [None]:
df_dist = df_relevant.copy()

# Get names of columns with missing values
cols_with_missing = [col for col in df_dist.columns
                     if df_dist[col].isnull().any()]

# For each column set the row to Ture/False if that row had a missing
for col in cols_with_missing:
    df_dist[col + '_was_missing'] = df_dist[col].isnull()
    
def dist(column):
    total_len = len(df_dist)
    dist_ser = df_dist[column].value_counts()
    dist_df = dist_ser.to_frame()
    dist_df['cat'] = dist_df.index.tolist()
    numb_lst = list(range(0, len(dist_df.index.tolist())))
    dist_df.set_index([pd.Index(numb_lst)], inplace = True)
    prop_lst = []

    # make proportions of nulls in each column a dataframe
    for numb in dist_df[column]:
        prop = numb / total_len
        prop_lst.append(prop)

    dist_df['proportion'] = prop_lst

    # counting the number of nulls
    null_filt = df_dist[column].isna()
    len_nulls = len(df_dist[null_filt])
    index = 0

    def repeater(n, row, item):
        for x in range(int(n)):
            if pd.isna(row[column]):
                ret = item
            else:
                ret = row[column]
            return ret

    for item in dist_df['cat']:
        changes = len_nulls * prop_lst[index]
        df_dist[column] = df_dist.apply(lambda row: repeater(changes, row, item), axis = 1)
        index = index + 1
        
dist('AGE')
dist('EDUC')
dist('ETHNIC')
dist('RACE')
dist('GENDER')
dist('MARSTAT')
dist('SAP')
dist('EMPLOY')
dist('LIVARAG')
df_dist.isna().sum()

In [None]:
df_dist.dropna(inplace = True)
x = pd.get_dummies(df_dist.drop(columns = ['MH1', 'CLUSTER']), drop_first = True)
y = df_dist['CLUSTER']

# df_out = x.copy()
# df_out['CLUSTER'] = y
# df_out = df_out.iloc[:1]
# df_out.to_csv('dist_cc2019_dummies.csv')

#split into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 1) 

#defining the folds
folds = KFold(n_splits = 5, shuffle = True, random_state = 1)

clf = DecisionTreeClassifier(random_state = 42)

hyper_params = {
    'max_depth': [10, 60 , 100], 
    'min_samples_split': [10, 20, 40], 
    'min_samples_leaf': [45, 55, 65]
}

# Call GridSearchCV()
model_clf = GridSearchCV(estimator = clf, 
                        param_grid = hyper_params, 
                        scoring = 'f1_weighted', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1) # Will utilize all available CPUs 

model_clf.fit(train_x, train_y)  
print('Score: ', model_clf.best_score_)
print('Parameters: ', model_clf.best_params_)
y_pred = model_clf.predict(test_x)
print(classification_report(test_y, y_pred))

In [None]:
bestTree3 = model_clf.best_estimator_
# Save the model to local machine

filename = 'dt_clustered_dist.sav'
pickle.dump(bestTree3, open(filename, 'wb'))
# Displaying feature importance
importance = bestTree3.feature_importances_
feature_imp = pd.DataFrame(list(zip(train_x.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

feature_imp.head()

In [None]:
# Specifying figure size
fig, ax = plt.subplots(figsize=(6, 6)) 

# Generating confusion matrix
plot_confusion_matrix(bestTree3, test_x, test_y,
                      cmap = plt.cm.Blues,
                      values_format = '',
                      ax = ax);