In [1]:
# load python libraries

import pandas as pd  # data tools
import numpy as np  # maths
import seaborn as sns # visualizations
# import missingno as msno # for NaN visualization
import matplotlib.pyplot as plt # for data visualization, graph plotting
from sklearn.model_selection import train_test_split
import sweetviz as viz
#from random import randint
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier
#from scipy.stats import uniform
#import sweetviz as viz
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score






In [2]:
# define the data set variables and their locations.

data_folder = './data/titanic.csv'

data = pd.read_csv(data_folder)
print(data.shape)


(1309, 10)


In [3]:
features = data.drop(['survived'], axis=1)  # create the features 'x' dataset
target = data['survived']  # create the target 'y' dataset
features = pd.get_dummies(features) # create the feature columns for all of the feature values

In [4]:
features.head()

Unnamed: 0.1,Unnamed: 0,pclass,sibsp,parch,companions,sex_female,sex_male,embarked_Cherbourg,embarked_Queenstown,embarked_Southampton,...,deck_E,deck_F,deck_G,deck_T,age cohort_adult,age cohort_child,age cohort_senior,age cohort_teenager,age cohort_toddler,age cohort_young adult
0,0,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,1,1,1,2,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2,1,1,2,3,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,3,1,1,2,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,4,1,1,2,3,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# split up the data into test and training datasets

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.2, random_state=15)
print("features train and test: ", X_train.shape, X_test.shape)
print('targets train and test: ', y_train.shape, y_test.shape)


features train and test:  (1047, 24) (262, 24)
targets train and test:  (1047,) (262,)


In [6]:
# create an HTML report on the data
report = viz.compare([X_train,"train"], [X_test, "test"],)
report.show_html("rpt_train.html") # Not providing a filename will default to SWEETVIZ_REPORT.html


Done! Use 'show' commands to display/save.   |██████████| [100%]   00:01 -> (00:00 left)


Report rpt_train.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [7]:
#  cross validation using several classifiers

random_state=15

# Scale features such that the mean is 0 and standard deviation is 1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # calculate the mean and variance of each of the features
X_test = scaler.transform(X_test) # transform each feature using the learned mean and variance

# Number of cross-validation folds
k_folds = 10

# Number of estimators for tree-based ensembles
n_estimators = 100

# Create a dictionary containing the instance of the models, scores, mean accuracy and standard deviation
classifiers = {
    'name': ['DecisionTree', 'RandomForest', 'ExtraTrees', 'AdaBoost', 'LogReg', 'KNN', 'SVC',
             'XGBoost', 'GradientBoost'],
    'models': [DecisionTreeClassifier(random_state=random_state),
               RandomForestClassifier(random_state=random_state, n_estimators=n_estimators),
               ExtraTreesClassifier(random_state=random_state, n_estimators=n_estimators),
               AdaBoostClassifier(random_state=random_state, n_estimators=n_estimators),
               LogisticRegression(random_state=random_state),
               KNeighborsClassifier(),
               SVC(random_state=random_state),
               XGBClassifier(random_state=random_state, n_estimators=n_estimators),
               GradientBoostingClassifier(random_state=random_state, n_estimators=n_estimators)], 
    'scores': [],
    'acc_mean': [],
    'acc_std': []
}

# Run cross-validation and store the scores
for model in classifiers['models']:
    score = cross_val_score(model, X_train, y_train, cv=k_folds, n_jobs=4)
    classifiers['scores'].append(score)
    classifiers['acc_mean'].append(score.mean())
    classifiers['acc_std'].append(score.std())    

# send the results to a table
classifiers_df = pd.DataFrame({
    'Model Name': classifiers['name'],
    'Accuracy': classifiers['acc_mean'],
    'Std': classifiers['acc_std']
}, columns=['Model Name', 'Accuracy', 'Std']).set_index('Model Name')

classifiers_df.sort_values('Accuracy', ascending=False)

Unnamed: 0_level_0,Accuracy,Std
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoost,0.815513,0.043893
SVC,0.806896,0.045217
LogReg,0.801181,0.043467
AdaBoost,0.797335,0.048807
XGBoost,0.79076,0.030253
RandomForest,0.765925,0.027716
KNN,0.76206,0.037454
ExtraTrees,0.752555,0.026469
DecisionTree,0.737262,0.023654


## we will be tuning the top three models
1) GradientBoost 
2) SVC (support vector machine)
3) LogReg (logistical regression)

In [8]:
## for grid search - remove
#gbc = GradientBoostingClassifier()

# parameters = {
#     "n_estimators":[5,50,250,500],
#     "max_depth":[1,3,5,7,9],
#     "learning_rate":[0.01,0.1,1,10,100]
# }
#cv = GridSearchCV(gbc,parameters,cv=5)

# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
#     }


In [9]:
## for grid search - remove

# clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

# clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))
# print(clf.best_params_)



## tuning the boost algorithm

In [57]:

    
parameters_gbc = {
    "loss":["deviance"],
    "learning_rate": sp_randFloat(),
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": sp_randInt(4, 10),
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample": sp_randFloat(),
    "n_estimators": sp_randInt(100, 1000)
    }


In [58]:
rand_cv= RandomizedSearchCV(GradientBoostingClassifier(), parameters_gbc, cv=2, n_jobs=-1)

result = rand_cv.fit(X_train, y_train)
print(rand_cv.score(X_train, y_train))
print(rand_cv.best_params_)

df_gridsearch = pd.DataFrame(result.cv_results_)
df_gridsearch.insert(0,'model', 'GradientBoostingClassifier')

best_estimator_boost = result.best_estimator_


0.7946513849092646
{'criterion': 'mae', 'learning_rate': 0.3817283830887682, 'loss': 'deviance', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.17272727272727273, 'min_samples_split': 0.2090909090909091, 'n_estimators': 194, 'subsample': 0.6376445486152407}


In [59]:

best_pred_y = best_estimator_boost.predict(X_test)
print("Accuracy: {}%".format(accuracy_score(y_test, best_pred_y)*100))
print("Confusion Matrix:")
print("{}".format(confusion_matrix(y_test, best_pred_y)))

Accuracy: 77.09923664122137%
Confusion Matrix:
[[144  23]
 [ 37  58]]


## tuning the SVM

In [60]:
# defining parameter range
params_svm = {'C': [0.1, 1, 10, 50, 100, 1000],
              'gamma': ['scale'],
              'kernel':  ['poly', 'rbf', 'sigmoid']}
 

In [61]:

grid_cv = GridSearchCV(SVC(), param_grid=params_svm, n_jobs=-1, scoring='accuracy',error_score=0)
result = grid_cv.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stds = result.cv_results_['std_test_score']
params = result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

dftmp = pd.DataFrame(result.cv_results_)
dftmp.insert(0,'model', 'svc')
df_gridsearch = pd.concat([df_gridsearch, dftmp])

best_estimator_svc = result.best_estimator_

Best: 0.804151 using {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [62]:
best_pred_y = best_estimator_svc.predict(X_test)
print("Accuracy: {}%".format(accuracy_score(y_test, best_pred_y)*100))
print("Confusion Matrix:")
print("{}".format(confusion_matrix(y_test, best_pred_y)))

Accuracy: 78.62595419847328%
Confusion Matrix:
[[144  23]
 [ 33  62]]


## tuning logistic regression

In [63]:
params_lr = {'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty' : ['l2'],
        'C' : [100, 10, 1.0, 0.1, 0.01]}


In [64]:
model = LogisticRegression()
              

# define grid search
# grid = dict(solver=solvers,penalty=penalty,C=c_values)

rsf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=params_lr, n_jobs=-1, cv=rsf, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

dftmp = pd.DataFrame(result.cv_results_)
dftmp.insert(0,'model', 'svc')
df_gridsearch = pd.concat([df_gridsearch, dftmp])

best_estimator_lr = result.best_estimator_

Best: 0.797839 using {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [65]:
#df_gridsearch.head()

In [66]:


best_pred_y = best_estimator_lr.predict(X_test)
print("Accuracy: {}%".format(accuracy_score(y_test, best_pred_y)*100))
print("Confusion Matrix:")
print("{}".format(confusion_matrix(y_test, best_pred_y)))

Accuracy: 78.62595419847328%
Confusion Matrix:
[[144  23]
 [ 33  62]]
