In [2]:
# This is the library import session.
import sys # system parameter
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd # database processing package similar to SQL

# Common Machine Learning Algorithms
from sklearn import svm, tree, linear_model, neighbors, \
naive_bayes, ensemble, discriminant_analysis, gaussian_process
#from xgboost import XGBClassifier missing wait for add
#Common Model Helper package
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Data import: read the train and test/validation CSVs from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Work on a deep copy so the raw training frame (train_df) stays untouched.
train_test = train_df.copy(deep=True)
# Both frames in one list so the cleaning / feature steps can loop over them.
data_cleaner = [train_test, test_df]

In [None]:
# Data preview and exploration.
# Only the last expression of a cell is rendered automatically, so the
# intermediate tables are passed to display() (provided by the IPython kernel)
# instead of being silently discarded.
train_df.info()
display(train_df.sample(10))
display(train_df.groupby('Survived').count())
print('Train Columns with null:\n', train_test.isnull().sum())
print("-" * 10)

print('Test/Validation columns with null:\n', test_df.isnull().sum())
print("-" * 10)

train_df.describe(include = 'all')

In [None]:
# Data cleaning: impute missing values in both frames, then drop the columns
# that are not used as features from the training copy.
for dataset in data_cleaner:
    # Assign the filled column back instead of `dataset[col].fillna(..., inplace=True)`:
    # the in-place call operates on an intermediate Series and does not reliably
    # write through to the frame (it has no effect under pandas Copy-on-Write).

    # fill missing ages with the median age of the same frame
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # fill missing embarkation ports with the mode (most frequent value)
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # fill missing fares with the median fare of the same frame
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

drop_column = ['PassengerId', 'Cabin', 'Ticket']
# The drop stays in place on purpose: `data_cleaner` holds a reference to this
# exact frame. errors='ignore' keeps the cell re-runnable once the columns are gone.
train_test.drop(drop_column, axis=1, inplace=True, errors='ignore')

print(train_test.isnull().sum())
print("-" * 10)
print(test_df.isnull().sum())

In [None]:
### Feature engineering for train and test/validation dataset
for dataset in data_cleaner:
    # total number of family members aboard, counting the passenger
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # 1 denotes travelling alone, 0 denotes travelling with family
    dataset['IsAlone'] = 1
    # single .loc assignment on the frame; the chained form
    # `dataset['IsAlone'].loc[mask] = 0` may write to a temporary copy
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # extract the title (Mr, Mrs, Miss, ...) between ", " and "." in Name
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1]\
    .str.split(".", expand = True)[0]

    # fare quartiles (0%, 25%, 50%, 75%, 100%)
    # NOTE(review): qcut/cut edges are computed per frame, so the bin ranges
    # of train and test may differ.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    # age split into 5 equal-width ranges
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

# Titles seen fewer than `stat_min` times in the training data are grouped as 'Misc'.
stat_min = 10
title_names = (train_test['Title'].value_counts() < stat_min)  # True -> rare title
common_titles = title_names[~title_names].index

# Apply the same grouping to both frames so that train and test share one set
# of Title categories. Titles that never occur in the training data are also
# mapped to 'Misc'. For the training frame the result is the same as looking
# each title up in `title_names`.
for dataset in data_cleaner:
    dataset['Title'] = dataset['Title'].where(dataset['Title'].isin(common_titles), 'Misc')

print(train_test['Title'].value_counts())
print("-" * 10)

train_test.info()
test_df.info()
# last expression of the cell, so the sample is actually rendered
train_test.sample(10)

In [None]:
# Convert object / binned columns to integer codes with LabelEncoder for the
# train and test/validation frames, then define the feature-name lists used
# by the modelling cells.

label = LabelEncoder()
# The encoder is re-fitted for every column of every frame, so each code is
# derived from that frame's own set of values.
for dataset in data_cleaner:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])


# target variable
Target = ['Survived']

# input variables (feature selection)
train_test_x = ['Sex', 'Pclass','Embarked', 'Title','SibSp'
                , 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'] # original column names, used in charts
train_test_x_code = ['Sex_Code','Pclass', 'Embarked_Code',
                   'Title_Code','SibSp', 'Parch', 'Age',
                   'Fare'] # coded column names for algorithms
train_test_xy = Target + train_test_x
print('Original X Y: ', train_test_xy, '\n')

# x variables with binned features in place of the continuous Age / Fare columns
train_test_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']
train_test_xy_bin = Target + train_test_x_bin
print('Bin X Y: ', train_test_xy_bin, '\n')

# x and y variables for the one-hot (dummy) encoding of the original columns
train_test_dummy = pd.get_dummies(train_test[train_test_x])
train_test_x_dummy = train_test_dummy.columns.tolist()
train_test_xy_dummy = Target + train_test_x_dummy
print('Dummy X Y: ', train_test_xy_dummy, '\n')

# Only the last expression of a cell is rendered automatically; display()
# (provided by the IPython kernel) keeps the dummy preview from being discarded.
display(train_test_dummy.head())
train_test.head()


In [None]:
# Double check the cleaned data
print('Train columns with null values: \n', train_test.isnull().sum())
print('-' * 10)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train_test.info()
print("-" * 10)
print('Test/Validation columns with null values: \n', test_df.isnull().sum())
print("-"*10)
test_df.info()
print("-"*10)

# summary statistics of the raw (uncleaned) training frame
train_df.describe(include = 'all')

In [None]:
# Hold-out splits of the training frame for each of the three feature sets.
# All three calls share random_state = 0 and use train_test_split's default
# split proportions.

# coded (label-encoded) features
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    train_test[train_test_x_code],
    train_test[Target],
    random_state=0,
)

# binned features
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(
    train_test[train_test_x_bin],
    train_test[Target],
    random_state=0,
)

# one-hot (dummy) features
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = model_selection.train_test_split(
    train_test_dummy[train_test_x_dummy],
    train_test[Target],
    random_state=0,
)

# report the sizes of the full frame and of the coded-feature split
print("train_test shape:{}".format(train_test.shape))
print("train1 Shape: {}".format(train1_x.shape))
print("test1 Shape: {}".format(test1_x.shape))

# Perform Exploratory Analysis with Statistics

In [None]:
# Mean survival rate per value of every non-float feature, via group by
# (aka pivot table): https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
for x in train_test_x:
    # float64 columns are skipped; only the remaining columns are grouped
    if train_test[x].dtype == 'float64':
        continue
    print('Survival Correlation by:', x)
    grouped_by_feature = train_test[[x, Target[0]]].groupby(x, as_index=False)
    print(grouped_by_feature.mean())
    print('-' * 10, '\n')



In [None]:
# Graph the distribution of the quantitative features on a 2x3 grid:
# top row = box plots, bottom row = stacked histograms split by survival.
# The six panels are generated from one table instead of six copy-pasted
# blocks, so a label or style change only has to be made once.
plt.figure(figsize=[16,12])

# (column in train_test, human-readable name, axis label with unit)
quantitative_features = [
    ('Fare', 'Fare', 'Fare ($)'),
    ('Age', 'Age', 'Age (Years)'),
    ('FamilySize', 'Family Size', 'Family Size (#)'),
]

# top row (subplot positions 1-3): box plots with the mean drawn as a line
for position, (column, feature_name, axis_label) in enumerate(quantitative_features, start=1):
    plt.subplot(2, 3, position)
    plt.boxplot(x = train_test[column], showmeans = True, meanline = True)
    plt.title(feature_name + ' Boxplot')
    plt.ylabel(axis_label)

# bottom row (subplot positions 4-6): survivors stacked on non-survivors
for position, (column, feature_name, axis_label) in enumerate(quantitative_features, start=4):
    plt.subplot(2, 3, position)
    plt.hist(x = [train_test[train_test['Survived'] == 1][column],
                  train_test[train_test['Survived'] == 0][column]],
             stacked = True, color = ['g', 'r'],
             label = ['Survived', 'Dead'])
    plt.title(feature_name + ' Histogram by Survival')
    plt.xlabel(axis_label)
    plt.ylabel('# of Passengers')
    plt.legend()

In [None]:
# Graph individual features against the survival rate on a 2x3 grid.
fig, saxis = plt.subplots(nrows=2, ncols=3, figsize=(16, 12))

# top row: bar plots of the categorical features
sns.barplot(data=train_test, x='Embarked', y='Survived', ax=saxis[0, 0])
sns.barplot(data=train_test, x='Pclass', y='Survived', order=[1, 2, 3], ax=saxis[0, 1])
sns.barplot(data=train_test, x='IsAlone', y='Survived', order=[1, 0], ax=saxis[0, 2])

# bottom row: point plots of the binned / count features
for binned_feature, panel in zip(('FareBin', 'AgeBin'), (saxis[1, 0], saxis[1, 1])):
    sns.pointplot(data=train_test, x=binned_feature, y='Survived', ax=panel)
sns.pointplot(data=train_test, x='FamilySize', y='Survived', ax=saxis[1, 2])

In [None]:
# Graph distribution of qualitative data: Pclass.
# Class is compared against a second feature (Fare, Age, FamilySize), with
# each plot split by Survived.
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(14, 12))

# each seaborn call draws on the axis passed in; the title is then set on it
sns.boxplot(data=train_test, x='Pclass', y='Fare', hue='Survived', ax=axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')

sns.violinplot(
    data=train_test, x='Pclass', y='Age', hue='Survived',
    split=True, ax=axis2,
)
axis2.set_title('Pclass vs Age Survival Comparison')

sns.boxplot(data=train_test, x='Pclass', y='FamilySize', hue='Survived', ax=axis3)
axis3.set_title('Pclass vs Family Size Survival Comparison')

In [None]:
# Graph distribution of qualitative data: Sex.
# Sex is compared against a second feature (Embarked, Pclass, IsAlone).
fig, qaxis = plt.subplots(1,3,figsize=(14,12))

# Each title is set on the axis the plot was drawn on. `axis1` is not defined
# in this cell, so it must not be used here: if it is left over from another
# cell the titles land on a different figure, and on a fresh kernel it is a
# NameError.
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Embarked', data=train_test, ax = qaxis[0])
qaxis[0].set_title('Sex vs Embarked Survival Comparison')

sns.barplot(x = 'Sex', y = 'Survived', hue = 'Pclass', data=train_test, ax  = qaxis[1])
qaxis[1].set_title('Sex vs Pclass Survival Comparison')

sns.barplot(x = 'Sex', y = 'Survived', hue = 'IsAlone', data=train_test, ax  = qaxis[2])
qaxis[2].set_title('Sex vs IsAlone Survival Comparison')

In [None]:
# Two side-by-side point plots of survival rate split by Sex.
fig, (maxis1, maxis2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 12))

# styling shared by both panels
sex_plot_options = dict(
    y='Survived',
    hue='Sex',
    data=train_test,
    palette={"male": "blue", "female": "pink"},
    markers=["*", "o"],
    linestyles=["-", "--"],
)

# how does family size factor with sex & survival compare
sns.pointplot(x='FamilySize', ax=maxis1, **sex_plot_options)

# how does class factor with sex & survival compare
sns.pointplot(x='Pclass', ax=maxis2, **sex_plot_options)

In [None]:
# How does embark port factor with class, sex, and survival compare.
# FacetGrid.map calls the plotting function once per facet on that facet's
# subset of rows. Without an explicit `order` / `hue_order`, pointplot infers
# the category order from each subset separately, so x positions and colours
# can be assigned differently from one facet to the next. Fixing both orders
# keeps every facet consistent with the shared legend.
e = sns.FacetGrid(train_test, col = 'Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
      order = [1, 2, 3], hue_order = ['male', 'female'],
      ci = 95.0, palette = 'deep') # ci is the confidence interval
e.add_legend()

In [None]:
# Shaded kernel-density curves of Age, one curve per value of Survived.
oldest_age = train_test['Age'].max()

a = sns.FacetGrid(data=train_test, hue='Survived', aspect=4)
a.map(sns.kdeplot, 'Age', shade=True)
# x axis runs from 0 to the largest Age in the frame
a.set(xlim=(0, oldest_age))
a.add_legend()

In [None]:
# Age histograms on a Sex (rows) x Pclass (columns) grid, coloured by Survived.
h = sns.FacetGrid(
    data=train_test,
    row='Sex',
    col='Pclass',
    hue='Survived',
)
# alpha keeps the overlapping Survived histograms visible
h.map(plt.hist, 'Age', alpha=.75)
h.add_legend()

In [None]:
# Pair plot of the training frame, coloured by Survived, with shaded KDE
# curves on the diagonal and small scatter markers elsewhere.
pairplot_options = {
    'hue': 'Survived',
    'palette': 'deep',
    'size': 1.2,
    'diag_kind': 'kde',
    'diag_kws': {'shade': True},
    'plot_kws': {'s': 10},
}
pp = sns.pairplot(train_test, **pairplot_options)
# drop the x tick labels on every panel
pp.set(xticklabels=[])

In [None]:
def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise Pearson correlations of `df`.

    Only numeric and boolean columns are correlated. They are selected
    explicitly because `DataFrame.corr()` raises on string columns in
    pandas >= 2.0, whereas older versions dropped them silently; selecting
    them up front gives the same matrix on both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 14x12 inch matplotlib figure.
    """
    numeric_df = df.select_dtypes(include = ['number', 'bool'])

    _, ax = plt.subplots(figsize = (14, 12))
    colormap = sns.diverging_palette(200, 10, as_cmap = True)

    _ = sns.heatmap(
        numeric_df.corr(),
        cmap = colormap,
        square = True,
        cbar_kws = {'shrink':.9 },
        ax = ax,
        annot = True,
        linewidths = 0.1, vmax = 1.0, linecolor = 'White',
        annot_kws = {'fontsize':12}
        )

    plt.title('Pearson Correlation of Features', y = 1.05, size = 15)

correlation_heatmap(train_test)

In [None]:
# Candidate machine learning algorithms (MLA) to compare.
# Every estimator is built with its constructor defaults; the only explicit
# argument is probability=True on the two SVC variants. No random_state is
# fixed here.
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
    
    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #Generalized Linear Models (GLM)
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #Support Vector Machines (SVM)
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
]
# Split the dataset for cross-validation with this splitter class:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
# Note: this is an alternative to train_test_split.
# The model is run 10x with a 60/30 split, intentionally leaving out 10%.
cv_split = model_selection.ShuffleSplit(n_splits=10,
                                       test_size=.3,
                                       train_size=.6,
                                       random_state=0)

# columns of the table used to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters',
               'MLA Train Accuracy Mean',
               'MLA Test Accuracy Mean',
               'MLA Test Accuracy 3*STD' ,
               'MLA Time']

# Table of per-algorithm predictions. An explicit copy is taken so that adding
# prediction columns below does not write into a slice of `train_test`.
MLA_predict = train_test[Target].copy()

# Feature matrix and 1-D label vector shared by every algorithm. The labels
# are flattened because the estimators expect y with shape (n_samples,), not
# an (n_samples, 1) column.
mla_features = train_test[train_test_x_bin]
mla_labels = train_test[Target].values.ravel()

# Collect one record per algorithm and build the comparison table once,
# instead of growing a DataFrame cell by cell.
mla_records = []
for alg in MLA:
    MLA_name = alg.__class__.__name__  # the class name is used as the MLA name

    # Score the model with cross validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    # return_train_score=True is required: train scores are not returned by
    # default in current scikit-learn, and 'train_score' is read below.
    cv_results = model_selection.cross_validate(alg,
                                                mla_features,
                                                mla_labels,
                                                cv = cv_split,
                                                return_train_score = True)

    mla_records.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        # If this is an unbiased random sample, +/-3 standard deviations from
        # the mean should capture 99.7% of the subsets: the worst case to expect.
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std()*3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

    # Save the predictions of a model fitted on the full training frame.
    alg.fit(mla_features, mla_labels)
    MLA_predict[MLA_name] = alg.predict(mla_features)

# Build and sort the table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare = pd.DataFrame(mla_records, columns = MLA_columns)
MLA_compare = MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)
MLA_compare

In [None]:
# Horizontal bar chart of the mean cross-validated test accuracy per algorithm.
sns.barplot(
    data=MLA_compare,
    x='MLA Test Accuracy Mean',
    y='MLA Name',
    color='m',
)

# title and axis labels on the current axes
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')

# Evaluate Model Performance and Tune the Model with Hyper-Parameter Tuning

In [None]:
# Baseline decision tree, scored with the shared cross-validation splitter.
# return_train_score=True is required: train scores are not returned by
# default in current scikit-learn, and 'train_score' is read below.
dtree = tree.DecisionTreeClassifier(random_state = 0)
base_results = model_selection.cross_validate(dtree, train_test[train_test_x_bin], train_test[Target],
                                              cv = cv_split, return_train_score = True)
dtree.fit(train_test[train_test_x_bin], train_test[Target])

print('BEFORE DT Parameters: ', dtree.get_params())
print("BEFORE DT Training w/bin score mean: {:.2f}". format(base_results['train_score'].mean()*100))
print("BEFORE DT Test w/bin score mean: {:.2f}". format(base_results['test_score'].mean()*100))
print("BEFORE DT Test w/bin score 3*std: +/- {:.2f}". format(base_results['test_score'].std()*100*3))
print('-'*10)

#tune hyper-parameters: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
param_grid = {'criterion': ['gini', 'entropy'],  #scoring methodology; two supported formulas for calculating information gain - default is gini
              #'splitter': ['best', 'random'], #splitting methodology; two supported strategies - default is best
              'max_depth': [2,4,6,8,10,None], #max depth tree can grow; default is none
              #'min_samples_split': [2,5,10,.03,.05], #minimum subset size BEFORE new split (fraction is % of total); default is 2
              #'min_samples_leaf': [1,5,10,.03,.05], #minimum subset size AFTER new split (fraction is % of total); default is 1
              #'max_features': [None, 'auto'], #max features to consider when performing split; default none or all
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

#choose best model with grid_search: http://scikit-learn.org/stable/modules/grid_search.html#grid-search
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
# NOTE(review): the grid search is scored with 'roc_auc' while the baseline
# above uses the estimator's default score, so the BEFORE and AFTER numbers
# are not the same metric.
# return_train_score=True is required for cv_results_['mean_train_score'].
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid,
                                          scoring = 'roc_auc', cv = cv_split,
                                          return_train_score = True)
tune_model.fit(train_test[train_test_x_bin], train_test[Target])

print('AFTER DT Parameters: ', tune_model.best_params_)
print("AFTER DT Training w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100))
print("AFTER DT Test w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))
print("AFTER DT Test w/bin score 3*std: +/- {:.2f}". format(tune_model.cv_results_['std_test_score'][tune_model.best_index_]*100*3))
print('-'*10)

In [None]:
# Tune Model with Feature Selection

#base model (results computed in the decision-tree tuning cell)
print('BEFORE DT RFE Training Shape Old: ', train_test[train_test_x_bin].shape)
print('BEFORE DT RFE Training Columns Old: ', train_test[train_test_x_bin].columns.values)

print("BEFORE DT RFE Training w/bin score mean: {:.2f}". format(base_results['train_score'].mean()*100))
print("BEFORE DT RFE Test w/bin score mean: {:.2f}". format(base_results['test_score'].mean()*100))
print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}". format(base_results['test_score'].std()*100*3))
print('-'*10)

#feature selection by recursive feature elimination (RFE) with cross-validation
dtree_rfe = feature_selection.RFECV(dtree, step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(train_test[train_test_x_bin], train_test[Target])

#reduce x to the selected features and re-score the model
#alternative: can use pipeline to reduce fit and transform steps: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
X_rfe = train_test[train_test_x_bin].columns.values[dtree_rfe.get_support()] # names of the features RFECV kept
# return_train_score=True is required: train scores are not returned by
# default in current scikit-learn, and 'train_score' is read below.
rfe_results = model_selection.cross_validate(dtree, train_test[X_rfe], train_test[Target],
                                             cv = cv_split, return_train_score = True)

print('AFTER DT RFE Training Shape New: ', train_test[X_rfe].shape)
print('AFTER DT RFE Training Columns New: ', X_rfe)

print("AFTER DT RFE Training w/bin score mean: {:.2f}". format(rfe_results['train_score'].mean()*100))
print("AFTER DT RFE Test w/bin score mean: {:.2f}". format(rfe_results['test_score'].mean()*100))
print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}". format(rfe_results['test_score'].std()*100*3))
print('-'*10)

#tune rfe model
# return_train_score=True is required for cv_results_['mean_train_score'].
rfe_tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid,
                                              scoring = 'roc_auc', cv = cv_split,
                                              return_train_score = True)
rfe_tune_model.fit(train_test[X_rfe], train_test[Target])

# The scores must be looked up with this search's own best_index_; the best
# index of the earlier `tune_model` search can point at a different parameter
# combination.
rfe_best = rfe_tune_model.best_index_
print('AFTER DT RFE Tuned Parameters: ', rfe_tune_model.best_params_)
print("AFTER DT RFE Tuned Training w/bin score mean: {:.2f}". format(rfe_tune_model.cv_results_['mean_train_score'][rfe_best]*100))
print("AFTER DT RFE Tuned Test w/bin score mean: {:.2f}". format(rfe_tune_model.cv_results_['mean_test_score'][rfe_best]*100))
print("AFTER DT RFE Tuned Test w/bin score 3*std: +/- {:.2f}". format(rfe_tune_model.cv_results_['std_test_score'][rfe_best]*100*3))
print('-'*10)

In [None]:
#Graph MLA version of Decision Tree: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
import graphviz
# `dtree` was fitted on train_test[train_test_x_bin], so that same list
# supplies the feature names. (`data1_x_bin` is not defined anywhere in this
# notebook and raised a NameError.)
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names = train_test_x_bin, class_names = True,
                                filled = True, rounded = True)
graph = graphviz.Source(dot_data)
graph