In [None]:
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import IPython
import sklearn
import random
import time
import warnings
# Silence every warning raised from here on.
# NOTE(review): this also hides deprecation / future warnings emitted by the
# libraries used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

# Configure visualization defaults for every plot drawn below.
# `%matplotlib inline` renders figures directly in the notebook output area.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet, then seaborn's 'white' style on top of it.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size (width, height) used when a plot does not set its own.
pylab.rcParams['figure.figsize'] = 12,8

In [None]:
# Load the raw CSV and work on a deep copy so `train_data` stays untouched.
train_data = pd.read_csv('./input/handouts_fhs.csv')
train_data_copy = train_data.copy(deep = True)

# Display every column, and allow as many rows as the dataset contains.
pd.set_option('display.max_columns', None)
pd.options.display.max_rows = train_data_copy.shape[0]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
train_data_copy.info()
train_data_copy.sample(10)

In [None]:
# Count the missing entries per column, then show summary statistics for all columns.
missing_per_column = train_data_copy.isnull().sum()
print('Train columns with missing values:\n', missing_per_column)
train_data_copy.describe(include='all')

In [None]:
# Impute missing values, then drop the columns that are not used further.
#
# The filled column is assigned back explicitly. The previous form,
# `train_data_copy[var].fillna(..., inplace=True)`, mutates a temporary column
# selection: it is deprecated in pandas 2.x and has no effect on the frame once
# Copy-on-Write is enabled, which would silently leave the NaNs in place.

# Quantitative columns: fill gaps with the column median.
missing_quantitative_vars = ['totchol1', 'cigpday1', 'bmi1', 'heartrte1', 'glucose1', 'totchol2', 'age2', 'sysbp2', 'diabp2', 'cigpday2', 'bmi2', 'heartrte2', 'glucose2', 'totchol3', 'age3', 'sysbp3', 'diabp3', 'cigpday3', 'bmi3', 'heartrte3', 'glucose3', 'hdlc3', 'ldlc3', 'bmidiff']
for var in missing_quantitative_vars:
    train_data_copy[var] = train_data_copy[var].fillna(train_data_copy[var].median())

# Qualitative columns: fill gaps with the most frequent value (first mode).
missing_qualitative_vars = ['bpmeds1', 'sex2', 'cursmoke2', 'diabetes2', 'bpmeds2', 'prevchd2', 'prevap2', 'prevmi2', 'prevstrk2', 'prevhyp2', 'sex3', 'cursmoke3', 'diabetes3', 'bpmeds3', 'prevchd3', 'prevap3', 'prevmi3', 'prevstrk3', 'prevhyp3']
for var in missing_qualitative_vars:
    train_data_copy[var] = train_data_copy[var].fillna(train_data_copy[var].mode()[0])

# Remove the identifier column and the HDL/LDL columns of exams 1 and 2.
train_data_copy = train_data_copy.drop(['randid', 'hdlc1', 'ldlc1', 'hdlc2', 'ldlc2'], axis = 1)

# Sanity check: every count printed here is expected to be zero.
print(train_data_copy.isnull().sum())

In [None]:
# Bin the columns into equal-width intervals, stored as '<column>Bin':
# columns whose name starts with 'time' get 12 bins, every other
# float64/int64 column gets 6. Anything else is left alone.
for source_col in train_data_copy:
    column_values = train_data_copy[source_col]
    if column_values.name[:4] == 'time':
        n_bins = 12
    elif column_values.dtype in ('float64', 'int64'):
        n_bins = 6
    else:
        continue
    train_data_copy[source_col + 'Bin'] = pd.cut(column_values, n_bins)

# Label-encode every object / categorical column (including the bins created
# above) into an integer '<column>_code' column.
label = LabelEncoder()
for source_col in train_data_copy:
    if train_data_copy[source_col].dtype.name in ('object', 'category'):
        train_data_copy[source_col + '_code'] = label.fit_transform(train_data_copy[source_col])

# Disabled experiment (kept for reference): derive a combined heart-disease flag.
#train_data_copy['ProbableHeartDisease'] = 0
#train_data_copy['ProbableHeartDisease'].loc[(train_data_copy['prevchd1'] == 1) |
#                                            (train_data_copy['mi_fchd'] == 1) |
#                                            (train_data_copy['anychd'] == 1) |
#                                            (train_data_copy['cvd'] == 1)] = 1

train_data_copy.info()
train_data_copy.sample(10)

In [None]:
# Re-check for nulls and column dtypes after binning / encoding.
print('Train columns with null values:\n', train_data_copy.isnull().sum())
print('-'*10)
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(train_data_copy.info()) produced.
train_data_copy.info()

In [None]:
# Response column - presumably the label-encoded version of `anychd`
# (the '_code' suffix is the one produced by the encoding step); confirm.
Target = ['anychd_code']

# Original variables: everything that is neither a '*_code' column nor categorical.
train_data_copy_x = [
    colName for colName in train_data_copy.columns
    if not colName.endswith('_code')
    and train_data_copy[colName].dtype.name != 'category'
    and colName != 'anychd_code'
]

# Numerical and '_code' variables, without the encoded bins ('*Bin_code').
train_data_copy_x_calc = [
    colName for colName in train_data_copy.columns
    if train_data_copy[colName].dtype != 'object'
    and train_data_copy[colName].dtype.name != 'category'
    and not colName.endswith('Bin_code')
    and colName != 'anychd_code'
]

train_data_copy_xy = Target + train_data_copy_x
print ('Original X Y: ', train_data_copy_xy, '\n')

# Numerical, '_code' and 'Bin_code' variables.
train_data_copy_x_bin = [
    colName for colName in train_data_copy.columns
    if train_data_copy[colName].dtype != 'object'
    and train_data_copy[colName].dtype.name != 'category'
    and colName != 'anychd_code'
]

train_data_copy_xy_bin = Target + train_data_copy_x_bin
print ('Bin X Y: ', train_data_copy_xy_bin, '\n')

# One-hot encode the original variables.
train_data_copy_dummy = pd.get_dummies(train_data_copy[train_data_copy_x])
train_data_copy_x_dummy = train_data_copy_dummy.columns.tolist()
train_data_copy_xy_dummy = Target + train_data_copy_x_dummy
print ('Dummy X Y: ', train_data_copy_xy_dummy, '\n')

train_data_copy_dummy.head()

In [None]:
# Final data-quality check before modelling: null counts, dtypes, summary statistics.
print('Train columns with null values: \n', train_data_copy.isnull().sum())
print("-"*10)
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(train_data_copy.info()) produced.
train_data_copy.info()
print("-"*10)

train_data_copy.describe(include = 'all')

In [None]:
# Hold-out split of each feature set (calc / bin / dummy) against the same
# target, all with random_state = 0.
target_frame = train_data_copy[Target]

train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    train_data_copy[train_data_copy_x_calc], target_frame, random_state=0)

train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(
    train_data_copy[train_data_copy_x_bin], target_frame, random_state=0)

train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = model_selection.train_test_split(
    train_data_copy_dummy[train_data_copy_x_dummy], target_frame, random_state=0)

# Report the sizes of the full frame and of the first split.
for caption, frame in (
    ("train_data_copy Shape: {}", train_data_copy),
    ("Train1 Shape: {}", train1_x),
    ("Test1 Shape: {}", test1_x),
):
    print(caption.format(frame.shape))

train1_x_bin.head()

In [None]:
# For every non-float64 original variable, print the mean of the target within
# each distinct value of that variable.
for feature in train_data_copy_x:
    if train_data_copy[feature].dtype == 'float64':
        continue  # float64 columns are skipped
    print('anychd_code Correlation by:', feature)
    target_mean_by_level = (
        train_data_copy[[feature, Target[0]]]
        .groupby(feature, as_index=False)
        .mean()
    )
    print(target_mean_by_level)
    print('-'*10, '\n')

In [None]:
#correlation heatmap of dataset
def correlation_heatmap(df):
    """Draw an annotated Pearson-correlation heatmap of ``df``'s numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Non-numeric columns (object, categorical, ...) are
        ignored.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 20x15 figure.
    """
    # DataFrame.corr() raises on object / categorical columns in pandas >= 2.0,
    # whereas older versions silently dropped them. Selecting the numeric and
    # boolean columns up front gives the same matrix on every version.
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    _, ax = plt.subplots(figsize=(20, 15))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        numeric_df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12},
    )

    # Set the title on the heatmap axes explicitly rather than on whichever
    # axes happens to be current.
    ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

# Plot the correlation heatmap for the fully prepared training frame.
correlation_heatmap(train_data_copy)

In [None]:
# Compare the classification algorithms

# Machine Learning Algorithm (MLA) selection and initialization.
# Every estimator is created with its default hyper-parameters.
MLA = [
    # Ensemble methods from scikit-learn
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier(),
]



#split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#columns of the table used to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']

#create table to compare MLA predictions
#.copy() makes this an independent frame, so adding prediction columns below
#never writes into (or warns about) a selection of train_data_copy
MLA_predict = train_data_copy[Target].copy()

#feature matrix and 1-D target shared by every algorithm
#(a Series rather than a one-column DataFrame, which is the shape estimators expect for y)
mla_features = train_data_copy[train_data_copy_x_bin]
mla_target = train_data_copy[Target[0]]

#index through MLA and collect one record of metrics per algorithm
MLA_records = []
for row_index, alg in enumerate(MLA):

    #set name and parameters
    MLA_name = alg.__class__.__name__

    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    #return_train_score must be requested explicitly: it defaults to False since
    #scikit-learn 0.21, in which case cv_results has no 'train_score' entry
    cv_results = model_selection.cross_validate(alg, mla_features, mla_target, cv = cv_split, return_train_score = True)

    MLA_records.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std()*3,   #let's know the worst that can happen!
        'MLA Time': cv_results['fit_time'].mean(),
    })

    #save MLA predictions (refit on the full data set, then predict on it)
    alg.fit(mla_features, mla_target)
    MLA_predict[MLA_name] = alg.predict(mla_features)

#build the comparison table once instead of growing it row by row
MLA_compare = pd.DataFrame(MLA_records, columns = MLA_columns)

#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare = MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)
MLA_compare
#MLA_predict

In [None]:
# Horizontal bar chart of mean test accuracy per algorithm
# (seaborn barplot: https://seaborn.pydata.org/generated/seaborn.barplot.html)
accuracy_ax = sns.barplot(
    data=MLA_compare,
    x='MLA Test Accuracy Mean',
    y='MLA Name',
    color='m',
)

# Title and axis labels, set on the Axes object returned by seaborn.
accuracy_ax.set_title('Machine Learning Algorithm Accuracy Score \n')
accuracy_ax.set_xlabel('Accuracy Score (%)')
accuracy_ax.set_ylabel('Algorithm')