In [None]:
# Importing Libraries 

In [None]:
# imports required 
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
import warnings
# Ignore warnings for the rest of the session; no category is passed, so the
# filter is not restricted to one kind of warning.
# NOTE(review): this also hides deprecation/future warnings from pandas,
# seaborn and scikit-learn -- consider narrowing it.
warnings.simplefilter(action = "ignore") 

In [None]:
# Load the dataset CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook only
# runs elsewhere if the CSV is placed at exactly this location.
DATA_PATH = "/Users/alexahoynacke/Desktop/CWRUBootcamp/Final Project/2011_Updated.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Data Analysis 

In [None]:
# Preview the dataset: its first five rows.
df.head(n=5)

In [None]:
# Shape of the data as a (rows, columns) tuple
df.shape

In [None]:
# Per-column overview of the frame (column names, non-null counts, dtypes)
df.info()

In [None]:
# Descriptive statistics, transposed so that each variable is one row.
# Besides the defaults, the upper tail is reported at the 90th/95th/99th percentile.
PERCENTILES = [0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
summary_stats = df.describe(percentiles=PERCENTILES)
summary_stats.T

In [None]:
# Boolean missing-value mask for the first ten rows (True marks a null cell)
df.head(10).isna()

In [None]:
# Number of null values in each column
null_mask = df.isna()
null_mask.sum()

In [None]:
# Data Visualization

In [None]:
# Grid of histograms for the columns of df on one 20x20-inch figure;
# the returned axes are kept in `p`, which also suppresses the cell's repr output.
p = df.hist(figsize = (20,20))

In [None]:
# Histogram and density curve for eight selected variables on a 4x2 grid.
# `sns.histplot(..., kde=True, stat="density")` is used because `sns.distplot`
# is deprecated in seaborn and scheduled for removal; `kde_kws={"cut": 3}`
# extends the density curve past the data range the way distplot did.
DIST_COLUMNS = [
    "GENHLTH", "_AGE_G",
    "HTM4", "WTKG3",
    "_BMI5", "_FRUTSUM",
    "_VEGESUM", "EXERANY2",
]

fig, ax = plt.subplots(4, 2, figsize=(16, 16))
# `ax.flat` walks the grid row by row, so the columns land in the order
# listed above: left-to-right, top-to-bottom.
for column, axis in zip(DIST_COLUMNS, ax.flat):
    sns.histplot(
        df[column],
        bins=20,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=axis,
    )
fig.tight_layout()


 

In [None]:
# Imputing NaN Values (filled with each column's mean or median)

In [None]:
# Impute missing values column by column; each column is filled with either
# its own mean or its own median, as grouped below.
#
# The fill goes through DataFrame.fillna with a {column: value} mapping rather
# than `df[col].fillna(..., inplace=True)`: the latter acts on a selected
# column, a chained in-place pattern that pandas warns about and that no
# longer updates `df` once copy-on-write is active.
#
# NOTE(review): DIABETE3 is the column later used as the classification target;
# filling it with a mean can introduce a non-integer label if it has gaps --
# confirm whether such rows should be dropped instead.
MEAN_FILLED = [
    "GENHLTH", "PHYSHLTH", "CVDINFR4", "ASTHNOW", "CHCKIDNY", "DIABETE3",
    "EDUCA", "EMPLOY", "ALCDAY5", "BPHI2MR", "ADENERGY", "ADEAT1",
    "MISTMNT", "ADANXEV", "HTM4", "WTKG3",
]
MEDIAN_FILLED = [
    "POORHLTH", "BPHIGH4", "TOLDHI2", "CHCCOPD", "HAVARTH3", "ADDEPEV2",
    "SMOKDAY2", "USENOW3", "MARITAL", "INCOME2", "RENTHOM1", "EXERANY2",
    "ADPLEASR", "ADDOWN", "ADSLEEP", "ADFAIL", "ADTHINK", "ADMOVE",
    "SCNTMONY", "SCNTMEAL", "RACE2", "_BMI5", "_FRUTSUM", "_VEGESUM",
]

# Every statistic is taken from the data as it is before any filling; each
# column only depends on its own values, so the order of filling is irrelevant.
fill_values = {
    **df[MEAN_FILLED].mean().to_dict(),
    **df[MEDIAN_FILLED].median().to_dict(),
}
df = df.fillna(value=fill_values)

In [None]:
# Plotting the distributions after imputing NaN values

In [None]:
# Same histogram grid as before, drawn from df in its current state
p = df.hist(figsize = (20,20))

In [None]:
# Correlation between all the features 

In [None]:
# Annotated heatmap of the pairwise correlations between all columns,
# drawn on a dedicated (very large) figure so the cell labels stay legible.
corr_fig, corr_ax = plt.subplots(figsize=(53, 50))
corr_matrix = df.corr()
p = sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn", ax=corr_ax)

In [None]:
# First five rows of df in its current state
df.head(n=5)

In [None]:
# Check the Diabetes column - how balanced it is 

In [None]:
# Class balance of the DIABETE3 column: print the count per value and draw
# the same counts as a bar chart.
color_wheel = {1: "#0392cf", 2: "#7bc043"}


def _colour_for(value):
    """Look up the colour registered for ``value + 1`` (None when absent)."""
    return color_wheel.get(value + 1)


# NOTE(review): `colors` is not passed to the plot below and no other visible
# cell reads it -- confirm whether it is still needed.
colors = df["DIABETE3"].map(_colour_for)

diabetes_counts = df["DIABETE3"].value_counts()
print(diabetes_counts)
p = diabetes_counts.plot(kind="bar")

In [None]:
# Scaling the Data 

In [None]:
# Standardise every feature column so all values are on the same scale;
# the DIABETE3 column is left out of the scaled frame.
sc_X = StandardScaler()
features = df.drop(columns=["DIABETE3"])

# Column labels are taken from the frame that was actually scaled. A
# hand-maintained list is fragile: if it still names the dropped DIABETE3
# column it has one label more than the scaled array has columns, and the
# DataFrame constructor raises on the length mismatch.
X = pd.DataFrame(
    sc_X.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
X.head()

In [None]:
# Target column (DIABETE3), shown for inspection
y = df["DIABETE3"]
y

In [None]:
# Model Building 

In [None]:
# Target vector and feature matrix (every column except the target).
# NOTE(review): this rebinds X to the raw columns of df, replacing whatever X
# was assigned earlier in the notebook.
TARGET = "DIABETE3"
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [None]:
# Split into training (67%) and testing (33%) data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=7
)

# Standardise the features the models are trained on. The X built just above
# holds the raw columns of df, so without this step the classifiers --
# in particular the SVM -- would be fitted on unscaled data.
# The scaler is fitted on the training rows only and then applied to both
# parts, so no statistics from the test rows influence training.
feature_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    feature_scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Random Forest 

In [None]:
from sklearn import preprocessing, utils

# Encode the original target values as consecutive integer class codes
# (0 .. n_classes - 1).
# NOTE(review): y_transformed is only printed here; the models in the visible
# cells are fitted on y_train rather than on this encoded array -- confirm intent.
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit(y).transform(y)

# Show the encoded labels
print(y_transformed)



In [None]:
# Random forest with 200 trees, fitted on the training split.
# A fixed random_state makes the fitted forest -- and therefore the scores and
# feature importances derived from it -- repeatable from run to run; 7 matches
# the seed used for the train/test split.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Accuracy of the forest on the same rows it was fitted on
rfc_train = rfc.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, rfc_train)
print(f"Accuracy_Score = {train_accuracy}")

In [None]:
from sklearn import metrics

# Accuracy of the forest on the held-out test rows
predictions = rfc.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, predictions)
print(f"Accuracy_Score = {test_accuracy}")

In [None]:
# Confusion matrix, then the per-class precision / recall / F1 report,
# both computed from `predictions` against the test labels.
cm = confusion_matrix(y_test, predictions)
print(cm)
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Decision Tree

In [None]:
# Single decision tree with default settings, fitted on the training split.
# A fixed random_state keeps the fitted tree identical across runs; 7 matches
# the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=7)
dtree.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Accuracy of the decision tree on the held-out test rows.
# `predictions` is rebound here and read again by the report that follows.
predictions = dtree.predict(X_test)
tree_accuracy = metrics.accuracy_score(y_test, predictions)
print(f"Accuracy Score = {tree_accuracy}")

In [None]:
# Confusion matrix, then the per-class precision / recall / F1 report,
# both computed from `predictions` against the test labels.
cm = confusion_matrix(y_test, predictions)
print(cm)
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Support Vector Machine (SVM)

In [None]:
# Support vector classifier with default settings, fitted on the training split.
# NOTE(review): SVC is sensitive to feature scale -- check that X_train is
# standardised before relying on this model's scores.
svc_model = SVC().fit(X_train, y_train)
svc_model

In [None]:
# Predicted labels from the support vector classifier for the test rows
svc_pred = svc_model.predict(X_test)

In [None]:
from sklearn import metrics

# Accuracy of the support vector classifier on the held-out test rows
svc_accuracy = metrics.accuracy_score(y_test, svc_pred)
print(f"Accuracy Score = {svc_accuracy}")

In [None]:
# Confusion matrix, then the per-class precision / recall / F1 report,
# for the support vector classifier's test predictions.
svc_cm = confusion_matrix(y_test, svc_pred)
print(svc_cm)
svc_report = classification_report(y_test, svc_pred)
print(svc_report)

In [None]:
# Feature Importance 

In [None]:
# Importance score of each feature in the fitted random forest `rfc`
rfc.feature_importances_

In [None]:
# Horizontal bar chart of the random forest's importance score per feature,
# labelled with the column names of X.
feature_importance = pd.Series(rfc.feature_importances_, index=X.columns)
feature_importance.plot(kind="barh")