In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import math
import seaborn as sns
from six.moves import cPickle as pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

%matplotlib inline



## From previous result

Based on the previous results, we decided to consider the following models for submission:
1. Polynomial SVM Accuracy: 83.96%
2. XGBoost Accuracy: 83.58%
3. RBF SVM Accuracy: 82.84%
4. Bernoulli Naive bayes Accuracy: 82.46%
5. Logistic Accuracy: 81.72%
6. Random forest Accuracy: 81.34%
7. Linear SVM Accuracy: 81.34%
8. Neural Network Accuracy: 81.34%
9. Extra tree Accuracy: 79.85%
10. Gaussian Naive Bayes Accuracy: 60.45%

In this section, we run cross-validated grid and randomized searches to find the best hyperparameters for each model, and then re-check the chosen values on random hold-out splits.

In [2]:
# Paths of the pickled training dataset, training labels and test dataset.
train_ds_file = 'train_dataset.pickle'
train_lb_file = 'train_label.pickle'
test_ds_file = 'test_dataset.pickle'


def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as f:
        return pickle.load(f)


train_dataset = _load_pickle(train_ds_file)
train_label = _load_pickle(train_lb_file)
test_dataset = _load_pickle(test_ds_file)
    
def transform_ds_to_input(dataset):
    """One-hot encode the categorical features of ``dataset`` and standardise the result.

    Nine columns are selected from ``dataset``; four of them (``Pclass``,
    ``Embarked_enc``, ``Salutation_enc`` and ``CabinArea_enc``) are expanded
    into dummy columns. A ``StandardScaler`` is then fitted on the expanded
    frame and applied to that same frame.

    Args:
        dataset: Frame containing at least the nine selected columns.

    Returns:
        The scaled feature matrix produced by ``StandardScaler.transform``.
    """
    categorical = ["Pclass", "Embarked_enc", "Salutation_enc", "CabinArea_enc"]
    selected = ["Pclass", "Sex_enc", "SibSp", "Parch", "Fare", "CabinArea_enc",
                "Embarked_enc", "Salutation_enc", "FamilyMember"]

    encoded = pandas.get_dummies(dataset[selected], sparse=True, columns=categorical)

    # The scaler is fitted on exactly the rows it is then asked to transform.
    return StandardScaler().fit(encoded).transform(encoded)

# Encode and scale train and test rows together, then cut the result back
# apart by row position: the first len(train_dataset) rows are the training set.
full_dataset = pandas.concat([train_dataset, test_dataset])
full_dataset_onehot = transform_ds_to_input(full_dataset)

n_train = len(train_dataset)
train_dataset_onehot, test_dataset_onehot = (
    full_dataset_onehot[:n_train],
    full_dataset_onehot[n_train:],
)

# Preview the first ten encoded training rows.
display(pandas.DataFrame(train_dataset_onehot[:10]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,0.743497,0.481288,-0.445,-0.503371,0.073352,-0.572351,-0.518084,0.919925,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
1,-1.344995,0.481288,-0.445,0.734691,0.073352,1.747178,-0.518084,-1.087045,1.952562,-0.32204,...,-0.02765,-0.130744,-0.228584,3.59521,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,-1.853992
2,-1.344995,-0.479087,-0.445,-0.49032,-0.558346,-0.572351,-0.518084,0.919925,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
3,-1.344995,0.481288,-0.445,0.383123,0.073352,1.747178,-0.518084,-1.087045,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,3.59521,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,-1.853992
4,0.743497,-0.479087,-0.445,-0.487904,-0.558346,-0.572351,-0.518084,0.919925,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
5,0.743497,-0.479087,-0.445,-0.480009,-0.558346,-0.572351,-0.518084,0.919925,-0.512148,3.105202,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
6,0.743497,-0.479087,-0.445,0.359196,-0.558346,1.747178,-0.518084,-1.087045,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,5.56119,-0.127688,-0.061922,-0.02765,-1.853992
7,0.743497,2.402037,0.710763,-0.23607,1.968447,-0.572351,-0.518084,0.919925,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
8,-1.344995,-0.479087,1.866526,-0.428289,0.705051,-0.572351,-0.518084,0.919925,-0.512148,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377
9,-1.344995,0.481288,-0.445,-0.062139,0.073352,-0.572351,1.93019,-1.087045,1.952562,-0.32204,...,-0.02765,-0.130744,-0.228584,-0.278148,-0.190843,-0.179818,-0.127688,-0.061922,-0.02765,0.539377


In [3]:
def get_train_test_set(test_size, random_state=None):
    """Randomly split the encoded training data into a train part and a hold-out part.

    Splits the notebook-level ``train_dataset_onehot`` / ``train_label`` pair
    with ``train_test_split``.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Optional seed forwarded to ``train_test_split`` so a
            split can be reproduced. The default ``None`` keeps the previous
            behaviour of drawing a different split on every call.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        train_dataset_onehot, train_label,
        test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

## Tuning logistic regression

The model parameters to be optimized are:
1. C - The regularization term

In [4]:
from sklearn.linear_model import LogisticRegression

# Candidate values of C to be compared by the grid search.
parameters = {
    'C': [0.001, 0.0025, 0.005, 0.0075,
          0.01, 0.025, 0.05, 0.075,
          0.1, 0.25, 0.5, 0.75,
          1, 2.5, 5, 7.5],
}

# Exhaustive search over every candidate, fitted on the whole training set.
lr = LogisticRegression()
clf = GridSearchCV(estimator=lr, param_grid=parameters)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'C': 0.075}


In [5]:
# Re-check the tuned logistic regression (C=0.075) on ten independent random
# splits, each holding out 30% of the training data.
for _run in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    lr = LogisticRegression(C=0.075)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    # The label reports the C actually used; 0.3 is the hold-out fraction, not C.
    print("%s Accuracy: %.2f%%" % ("Logistic regression, C:0.075", accuracy * 100.0))

Logistic regression, C:0.3 Accuracy: 83.58%
Logistic regression, C:0.3 Accuracy: 82.84%
Logistic regression, C:0.3 Accuracy: 85.45%
Logistic regression, C:0.3 Accuracy: 81.72%
Logistic regression, C:0.3 Accuracy: 82.84%
Logistic regression, C:0.3 Accuracy: 80.60%
Logistic regression, C:0.3 Accuracy: 81.34%
Logistic regression, C:0.3 Accuracy: 83.96%
Logistic regression, C:0.3 Accuracy: 83.96%
Logistic regression, C:0.3 Accuracy: 82.46%


## Tuning RBF SVM

The model parameters to be optimized are:

1. C - regularization term
2. gamma - how far the influence of a single training example reaches

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Search grid: 16 values of C crossed with 16 values of gamma.
parameters = {
    'C': [0.001, 0.0025, 0.005, 0.0075,
          0.01, 0.025, 0.05, 0.075,
          0.1, 0.25, 0.5, 0.75,
          1, 2.5, 5, 7.5],
    'gamma': [0.00001, 0.000025, 0.00005, 0.000075,
              0.0001, 0.00025, 0.0005, 0.00075,
              0.001, 0.0025, 0.005, 0.0075,
              0.01, 0.025, 0.05, 0.075],
}

# Exhaustive search over the grid, run as 8 parallel jobs on the whole training set.
rbf = SVC()
clf = GridSearchCV(estimator=rbf, param_grid=parameters, n_jobs=8)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'C': 0.75, 'gamma': 0.05}


In [7]:
# Re-check the tuned SVC (C=0.75, gamma=0.05) on ten independent random
# splits, each holding out 30% of the training data.
for _run in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    svc = SVC(gamma=0.05, C=0.75)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    # The label names the model and parameters actually evaluated; the previous
    # text described a Naive Bayes model that this cell never builds.
    print("%s Accuracy: %.2f%%" % ("RBF SVM, C:0.75, gamma:0.05", accuracy * 100.0))

Multinominal Naive bayes regression, alpha:0.001 Accuracy: 84.33%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 83.21%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 82.46%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 81.34%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 81.34%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 83.21%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 80.22%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 84.33%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 83.21%
Multinominal Naive bayes regression, alpha:0.001 Accuracy: 81.34%


## Tuning XGBoost

The model uses tree-based boosters rather than linear ones.

The parameters to be tuned are:
1. learning_rate
2. max_depth
3. min_child_weight
4. gamma
5. subsample
6. colsample_bytree
7. objective (fixed to `binary:logistic`; it is not part of the search grid)
8. reg_alpha

In [8]:
# Discrete candidate values for each XGBoost hyperparameter being searched.
parameters = {
    'min_child_weight': range(2, 6),
    'max_depth': range(3, 7),
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate': [0.01, 0.1, 1],
}

# Sample 5000 parameter combinations, evaluated as 8 parallel jobs.
xgb = XGBClassifier()
clf = RandomizedSearchCV(estimator=xgb, param_distributions=parameters,
                         n_iter=5000, n_jobs=8)
clf.fit(train_dataset_onehot, train_label)

print(clf.best_params_)

{'max_depth': 5, 'reg_alpha': 0.01, 'gamma': 0.2, 'subsample': 0.9, 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'min_child_weight': 3}


In [15]:
# Settings reported by the randomized search, plus the fixed binary objective.
xgb_params = dict(learning_rate=0.1, subsample=0.9, colsample_bytree=0.8, gamma=0.2,
                  max_depth=5, reg_alpha=0.01, min_child_weight=3,
                  objective='binary:logistic')

# Score the tuned model on ten independent random splits (30% held out each time).
for _run in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    booster = XGBClassifier(**xgb_params)
    booster.fit(X_train, y_train)
    predictions = [round(label) for label in booster.predict(X_test)]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("XGBoost", accuracy * 100.0))

XGBoost Accuracy: 79.48%
XGBoost Accuracy: 82.09%
XGBoost Accuracy: 82.84%
XGBoost Accuracy: 81.72%
XGBoost Accuracy: 83.21%
XGBoost Accuracy: 80.97%
XGBoost Accuracy: 82.09%
XGBoost Accuracy: 85.07%
XGBoost Accuracy: 75.75%
XGBoost Accuracy: 82.46%


## Tuning Random Forest

The parameters to be tuned are:
1. max_depth
2. max_features
3. min_samples_split
4. min_samples_leaf
5. bootstrap
6. criterion
7. n_estimators

In [10]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint

# Search space: integer distributions for the size-like parameters and
# explicit choice lists for the remaining ones.
param_dist = {
    "n_estimators": sp_randint(3, 20),
    "max_depth": [1, 2, 3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Number of parameter settings sampled by the randomized search.
n_iter_search = 5000

# Base estimator handed to the search.
clf = RandomForestClassifier()
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=8,
)
random_search.fit(train_dataset_onehot, train_label)

print(random_search.best_params_)

{'max_depth': None, 'criterion': 'gini', 'min_samples_split': 9, 'max_features': 9, 'n_estimators': 16, 'min_samples_leaf': 4, 'bootstrap': True}


In [16]:
# Settings reported by the randomized search.
rf_params = dict(max_features=9, bootstrap=True, min_samples_split=9, n_estimators=16,
                 criterion='gini', min_samples_leaf=4, max_depth=None)

# Score the tuned forest on ten independent random splits (30% held out each time).
for _run in range(10):
    X_train, X_test, y_train, y_test = get_train_test_set(0.3)
    forest = RandomForestClassifier(**rf_params)
    forest.fit(X_train, y_train)
    predictions = [round(label) for label in forest.predict(X_test)]
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % ("Random forest", accuracy * 100.0))

Random forest Accuracy: 82.46%
Random forest Accuracy: 80.60%
Random forest Accuracy: 83.21%
Random forest Accuracy: 85.45%
Random forest Accuracy: 82.09%
Random forest Accuracy: 82.09%
Random forest Accuracy: 82.46%
Random forest Accuracy: 83.21%
Random forest Accuracy: 82.46%
Random forest Accuracy: 79.85%


# Submission, select the best model for Kaggle

In [17]:
# Submission score is 0.77990, better than gender classifier 0.76555

# Refit the tuned random forest on every training row, then predict the test rows.
clf1 = RandomForestClassifier(n_estimators=16, criterion='gini', max_depth=None,
                              min_samples_split=9, min_samples_leaf=4,
                              max_features=9, bootstrap=True)
clf1.fit(train_dataset_onehot, train_label)
r_pred = clf1.predict(test_dataset_onehot)
r_predictions = [int(round(label)) for label in r_pred]

# One "Survived" column, indexed like the test dataset.
submission_df = pandas.DataFrame({"Survived": r_predictions}, index=test_dataset.index)
submission_df.to_csv("submission_best_rf.csv", sep=',')

In [18]:
# Submission score is 0.78469, better than gender classifier 0.76555

# Refit the tuned XGBoost model on every training row, then predict the test rows.
clf2 = XGBClassifier(
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    gamma=0.2,
    max_depth=5,
    reg_alpha=0.01,
    min_child_weight=3,
    objective='binary:logistic',
)
clf2.fit(train_dataset_onehot, train_label)
r_pred = clf2.predict(test_dataset_onehot)
r_predictions = [int(round(label)) for label in r_pred]

# One "Survived" column, indexed like the test dataset.
submission_df = pandas.DataFrame({"Survived": r_predictions}, index=test_dataset.index)
submission_df.to_csv("submission_best_xg.csv", sep=',')

In [20]:
# Submission score is 0.78469, better than gender classifier 0.76555

# Refit the tuned SVC on every training row, then predict the test rows.
clf3 = SVC(C=0.75, gamma=0.05)
clf3.fit(train_dataset_onehot, train_label)
r_pred = clf3.predict(test_dataset_onehot)
r_predictions = [int(round(label)) for label in r_pred]

# One "Survived" column, indexed like the test dataset.
submission_df = pandas.DataFrame({"Survived": r_predictions}, index=test_dataset.index)
submission_df.to_csv("submission_best_svc.csv", sep=',')

In [21]:
# Submission score is 0.77512, better than gender classifier 0.76555

# Refit the tuned logistic regression on every training row, then predict the test rows.
clf4 = LogisticRegression(C=0.075)
clf4.fit(train_dataset_onehot, train_label)
r_pred = clf4.predict(test_dataset_onehot)
r_predictions = [int(round(label)) for label in r_pred]

# One "Survived" column, indexed like the test dataset.
submission_df = pandas.DataFrame({"Survived": r_predictions}, index=test_dataset.index)
submission_df.to_csv("submission_best_lr.csv", sep=',')

In [22]:
# Submission score is 0.81818, better than gender classifier 0.76555

from sklearn.ensemble import VotingClassifier

# Combine the four tuned models from the cells above under a 'hard' voting scheme.
voters = [('rf', clf1), ('xgb', clf2), ('svm', clf3), ('lr', clf4)]
eclf2 = VotingClassifier(estimators=voters, voting='hard')
eclf2.fit(train_dataset_onehot, train_label)
r_pred = eclf2.predict(test_dataset_onehot)
r_predictions = [int(round(label)) for label in r_pred]

# One "Survived" column, indexed like the test dataset.
submission_df = pandas.DataFrame({"Survived": r_predictions}, index=test_dataset.index)
submission_df.to_csv("submission_best_voting.csv", sep=',')