# Build the final model and generate the predictions for submission 

Exploratory data analysis has already been done, and several models have been built and tested in earlier notebooks. In this notebook we combine the best of those models into an ensemble and use it to predict on the project test set. Finally, the predictions are exported to a CSV file for submission.

## Import the libraries

In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler

Set the random number generator (RNG) seed so that the model fits are reproducible.

In [2]:
# Shared seed: passed as `random_state` to every classifier built in this notebook.
seed = 17

## Import data

In [3]:
# Read the preprocessed training and test sets from the data folder.
df_train = pd.read_csv('../data/train_prepd_3.csv')
df_test = pd.read_csv('../data/test_prepd_3.csv')

# Show the first five rows of each frame under its own caption.
for caption, frame in (('Training set:', df_train), ('Testing set:', df_test)):
    print(caption)
    display(frame.head(5))

Training set:


Unnamed: 0,Survived,Pclass,Age,Fare,FamSize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_M,Deck_T
0,0.0,3,22.0,7.25,1,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
1,1.0,1,38.0,71.2833,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,1.0,3,26.0,7.925,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0
3,1.0,1,35.0,53.1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0
4,0.0,3,35.0,8.05,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0


Testing set:


Unnamed: 0,Pclass,Age,Fare,FamSize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_M,Deck_T
0,3,34.5,7.8292,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0
1,3,47.0,7.0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,2,62.0,9.6875,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0
3,3,27.0,8.6625,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
4,3,22.0,12.2875,2,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


## Create dependent and independent variables 

In [4]:
# Select the target by name rather than by position, so the split does not
# silently pick the wrong column if the CSV column order ever changes.
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

# The test set has no target column; every column is a feature.
X_test = df_test

# The models are fit on X_train and applied to X_test, so both frames must
# expose the same feature columns in the same order.
assert list(X_train.columns) == list(X_test.columns), 'train/test feature columns differ'

In [5]:
# Preview each split: caption, first five rows, then its shape.
# A blank line is printed between consecutive previews (not after the last).
previews = (('X_train: ', X_train), ('y_train: ', y_train), ('X_test: ', X_test))
for position, (caption, data) in enumerate(previews):
    if position > 0:
        print()
    print(caption)
    display(data.head(5))
    print('dims: ', data.shape)

X_train: 


Unnamed: 0,Pclass,Age,Fare,FamSize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_M,Deck_T
0,3,22.0,7.25,1,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
1,1,38.0,71.2833,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,3,26.0,7.925,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0
3,1,35.0,53.1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0
4,3,35.0,8.05,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0


dims:  (891, 19)

y_train: 


0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

dims:  (891,)

X_test: 


Unnamed: 0,Pclass,Age,Fare,FamSize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_M,Deck_T
0,3,34.5,7.8292,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0
1,3,47.0,7.0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,2,62.0,9.6875,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0
3,3,27.0,8.6625,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
4,3,22.0,12.2875,2,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


dims:  (418, 19)


## Build and fit the model 

For model details, please see the corresponding notebooks inside this repo.

### Random Forest Classifier

In [15]:
# Random forest fitted on the unscaled features, then used to predict the test set.
clf_rf = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    max_depth=8,
    max_features='sqrt',
    random_state=seed,
).fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

### Support Vector Classifier

In [23]:
# Only these columns are standardised; all other columns are left as they are.
cols_to_scale = ['Age', 'Fare', 'FamSize', 'Pclass']

# Work on copies so the unscaled X_train / X_test remain available to the other models.
X_train_sc = X_train.copy()
X_test_sc = X_test.copy()

# Fit the scaler on the training data only, then apply the same transform to the test data.
standardizer = StandardScaler()
X_train_sc[cols_to_scale] = standardizer.fit_transform(X_train_sc[cols_to_scale])
X_test_sc[cols_to_scale] = standardizer.transform(X_test_sc[cols_to_scale])

# RBF-kernel support vector classifier on the partially standardised features.
clf_svc = SVC(C=0.75, kernel='rbf', gamma=0.4, random_state=seed).fit(X_train_sc, y_train)
y_pred_svc = clf_svc.predict(X_test_sc)

### XGBoost

In [29]:
# Hyperparameters for the gradient-boosted model, collected in one place.
xgb_params = {
    'colsample_bytree': 0.5,
    'gamma': 0.25,
    'learning_rate': 0.3,
    'max_depth': 10,
    'min_child_weight': 5,
    'reg_lambda': 0,
    'scale_pos_weight': 1,
    'use_label_encoder': False,
    'random_state': seed,
}

# Fit on the unscaled features and predict the test set.
clf_xgb = xgb.XGBClassifier(**xgb_params)
clf_xgb.fit(X_train, y_train)
y_pred_xgb = clf_xgb.predict(X_test)



### Logistic Regression

In [34]:
# Standardise every feature column: the scaler is fitted on the training data
# and the same fitted scaler is applied to both the training and test sets.
scaler = StandardScaler().fit(X_train)
X_train_sc2 = scaler.transform(X_train)
X_test_sc2 = scaler.transform(X_test)

# Logistic regression on the fully standardised features.
clf_lr = LogisticRegression(random_state=seed).fit(X_train_sc2, y_train)
y_pred_lr = clf_lr.predict(X_test_sc2)

### Decision Tree

In [40]:
# Cost-complexity-pruned decision tree fitted on the unscaled features.
clf_dt = DecisionTreeClassifier(
    criterion='gini',
    ccp_alpha=0.002364,
    random_state=seed,
).fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)

### Ensemble Classifier

In [44]:
# Average the five models' predictions per passenger and round to the nearest
# label; with 0/1 predictions this amounts to a majority vote.
member_preds = [y_pred_rf, y_pred_svc, y_pred_xgb, y_pred_lr, y_pred_dt]
y_pred_ens = (sum(member_preds) / len(member_preds)).round()

## Export to .csv 

In [45]:
# Start from the sample submission file provided by Kaggle and overwrite its
# 'Survived' column with the ensemble predictions, cast to integers.
submission = pd.read_csv('gender_submission.csv').assign(
    Survived=y_pred_ens.astype(int)
)

In [46]:
# Write the submission to disk; index=False keeps the DataFrame index out of the file.
submission.to_csv('submission_ens3.csv', index=False)