In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import math
import seaborn as sns
from six.moves import cPickle as pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

%matplotlib inline



## From previous result

From previous result, we decide to use the following models to do submission. They are:
1. Multinominal Naive bayes Accuracy: 84.33%
2. XGboost Accuracy: 83.58%
3. Logistic Accuracy: 83.21%

In this section, we will run a grid search to find the best parameter for the validation set.

In [2]:
train_ds_file = 'train_dataset.pickle'
train_lb_file = 'train_label.pickle'
test_ds_file = 'test_dataset.pickle'

with open(train_ds_file, 'rb') as f:
    train_dataset = pickle.load(f)
    
with open(train_lb_file, 'rb') as f:
    train_label = pickle.load(f)
    
with open(test_ds_file, 'rb') as f:
    test_dataset = pickle.load(f)
    
columns = ["Pclass", "Sex", "SibSp", "Parch", "FamilyMember", "Embarked", "Salutation", "CabinArea", "AgeDiscrete", "FareDiscrete"]

full_dataset = pandas.concat([train_dataset, test_dataset])

full_datasett_onehot = pandas.get_dummies(full_dataset, sparse=True, columns=columns)

train_dataset_onehot = full_datasett_onehot[:len(train_dataset)]
test_dataset_onehot = full_datasett_onehot[len(train_dataset):]

display(train_dataset_onehot[0:10])

Unnamed: 0_level_0,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,...,AgeDiscrete_8,AgeDiscrete_9,FareDiscrete_0,FareDiscrete_1,FareDiscrete_2,FareDiscrete_3,FareDiscrete_4,FareDiscrete_5,FareDiscrete_6,FareDiscrete_7
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,1,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,1,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10,0,1,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [3]:
def get_train_test_set(test_size):
    X_train, X_test, y_train, y_test = \
        train_test_split(train_dataset_onehot, train_label, test_size=test_size)
    
    return X_train, X_test, y_train, y_test

## Tuning logistic regression

The model parameter to optimized are
1. C - The regularization term

In [41]:
from sklearn.linear_model import LogisticRegression

parameters = {'C':[0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5]}
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'C': 0.25}


In [43]:
for i in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    lr = LogisticRegression(C = 0.25)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("Logistic regression, C:0.3", accuracy * 100.0))

Logistic regression, C:0.3 Accuracy: 80.60%
Logistic regression, C:0.3 Accuracy: 83.58%
Logistic regression, C:0.3 Accuracy: 82.46%
Logistic regression, C:0.3 Accuracy: 81.72%
Logistic regression, C:0.3 Accuracy: 80.97%
Logistic regression, C:0.3 Accuracy: 79.85%
Logistic regression, C:0.3 Accuracy: 81.34%
Logistic regression, C:0.3 Accuracy: 83.21%
Logistic regression, C:0.3 Accuracy: 79.10%
Logistic regression, C:0.3 Accuracy: 80.60%


# Tuning Multinominal Naive bayes

The model parameter to optimized are

1. alpha - The smoothing term

In [38]:
from sklearn.naive_bayes import MultinomialNB

parameters = {'alpha':[0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5]}
mnb = MultinomialNB()
clf = GridSearchCV(mnb, parameters)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'alpha': 0.75}


In [40]:
for i in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    lr = MultinomialNB(alpha = 0.75)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("Multinominal Naive bayes regression, alpha:0.001", accuracy * 100.0))

Multinominal Naive bayes regression, alpha:0.001 Accuracy: 78.36%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 80.97%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 80.22%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 81.72%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 79.85%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 76.12%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 80.60%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 76.12%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 83.96%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 82.46%


## Tuning XGBoost

The model will use tree instead of linear

The parameter to be tuned are:
1. learning_rate
2. max_depth
3. min_child_weight
4. gamma
5. subsample
6. colsample_bytree
7. objective
8. learning_rate

In [8]:
parameters = {
    'min_child_weight':range(2,6,1),
    'max_depth':range(3,7,1),
    'gamma':[i/10.0 for i in range(0,5)],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate':[0.01, 0.1, 1]
}

xgb = XGBClassifier()
clf = RandomizedSearchCV(xgb, parameters, n_jobs=8, n_iter=5000)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'subsample': 0.8, 'reg_alpha': 0.1, 'colsample_bytree': 0.6, 'gamma': 0.2, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.1}


In [30]:
for i in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    lr = XGBClassifier(learning_rate=0.1, subsample=0.8, colsample_bytree=0.6, gamma=0.2,
                       max_depth=3, reg_alpha=0.1, min_child_weight=2, objective= 'binary:logistic')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("XGBoost", accuracy * 100.0))

XGBoost Accuracy: 79.10%
XGBoost Accuracy: 85.45%
XGBoost Accuracy: 82.46%
XGBoost Accuracy: 80.97%
XGBoost Accuracy: 83.58%
XGBoost Accuracy: 85.45%
XGBoost Accuracy: 82.09%
XGBoost Accuracy: 79.85%
XGBoost Accuracy: 82.84%
XGBoost Accuracy: 81.34%


## Tuning Random Forest

The parameter to be tuned are:
1. max_depth
2. max_features
3. min_samples_split
4. min_samples_leaf
5. bootstrap
6. criterion

In [61]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint

param_dist = {"n_estimators" : sp_randint(3, 20),
              "max_depth": [1, 2, 3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 5000
# build a classifier
clf = RandomForestClassifier()
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs=8)
random_search.fit(train_dataset_onehot, train_label)

print(random_search.best_params_)

{'max_features': 6, 'bootstrap': False, 'criterion': 'gini', 'n_estimators': 8, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}


In [58]:
for i in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    lr = RandomForestClassifier(max_features=8, bootstrap=True, min_samples_split=9, n_estimators=14, criterion='entropy',
                       min_samples_leaf=4, max_depth=None)  
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("Random forest", accuracy * 100.0))

Random forest Accuracy: 79.48%
Random forest Accuracy: 79.85%
Random forest Accuracy: 86.19%
Random forest Accuracy: 80.60%
Random forest Accuracy: 82.46%
Random forest Accuracy: 82.46%
Random forest Accuracy: 82.09%
Random forest Accuracy: 79.48%
Random forest Accuracy: 80.60%
Random forest Accuracy: 83.96%


# Submission, select the best model for Kaggle

In [59]:
# Submission score is 0.77990 better as than gender classifier 0.76555

clf1 = RandomForestClassifier(max_features=8, bootstrap=True, min_samples_split=9, n_estimators=14, criterion='entropy',
                       min_samples_leaf=4, max_depth=None)  
clf1.fit(train_dataset_onehot, train_label)
r_pred = clf1.predict(test_dataset_onehot)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=test_dataset.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_best_rf.csv", sep=',')

In [54]:
# Submission score is 0.77033 better than gender classifier 0.76555

clf2 = XGBClassifier(learning_rate=0.1, subsample=0.8, colsample_bytree=0.6, gamma=0.2,
                       max_depth=3, reg_alpha=0.1, min_child_weight=2, objective= 'binary:logistic')   
clf2.fit(train_dataset_onehot, train_label)
r_pred = clf2.predict(test_dataset_onehot)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=test_dataset.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_best_xg.csv", sep=',')

In [53]:
# Submission score is 0.76077 worser than gender classifier 0.76555

clf3 = MultinomialNB(alpha = 0.75)
clf3.fit(train_dataset_onehot, train_label)
r_pred = clf3.predict(test_dataset_onehot)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=test_dataset.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_best_mn.csv", sep=',')

In [49]:
# Submission score is 0.77033 worser than gender classifier 0.76555

clf4 = LogisticRegression(C = 0.25)
clf4.fit(train_dataset_onehot, train_label)
r_pred = clf4.predict(test_dataset_onehot)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=test_dataset.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_best_lr.csv", sep=',')

In [60]:
# Submission score is 0.77990 better than gender classifier 0.76555

from sklearn.ensemble import VotingClassifier

eclf2 = VotingClassifier(estimators=[('rf', clf1), ('xgb', clf2), ('nb', clf3), ('lr', clf4)], voting='hard')
eclf2.fit(train_dataset_onehot, train_label) 
r_pred = eclf2.predict(test_dataset_onehot)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=test_dataset.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_best_voting.csv", sep=',')