In [66]:
import warnings
warnings.filterwarnings('ignore')

# numerical libraries
import numpy as np
import pandas as pd

# divide train and test (preproc)
from sklearn.cross_validation import train_test_split

# import different models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# feature optimisation
from sklearn.feature_selection import SelectFromModel

# model optimisation
from sklearn.grid_search import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV

# model evaluation
from sklearn import metrics
from sklearn import cross_validation

# standarisation of features
from sklearn import preprocessing

In [3]:
cd Dropbox/Portfolio/DataScience-Portfolio/Titanic

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/Titanic


In [4]:
# load data and test set
titanic = pd.read_csv('titanic_train_ready2.csv')

# generate X and Y for preditions
Y = np.ravel(titanic.Survived)  # to flatten array
X = titanic.drop('Survived', axis = 1)

In [5]:
# separate train and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((623, 7), (268, 7), (623,), (268,))

In [6]:
# baseline accuracy (predicting that it did not survive)
baseline = round(1-np.mean(Y), 2)
baseline

0.62

## Feature Scaling

In [2]:
def standarisation(train, test):
    scaler = preprocessing.StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train, test

In [7]:
X_train, X_test = standarisation(X_train, X_test)

### Random Forests

In [11]:
# Random Forests Model Std features
rf_model = RandomForestClassifier(n_estimators = 1250, max_features = 6, criterion = 'gini', bootstrap = True,
                                  min_samples_split = 2, min_samples_leaf = 10, max_depth = None,
                                  random_state = 1)
rf_model.fit(X_train, Y_train)
pred_rf = rf_model.predict(X_test)
metrics.accuracy_score(Y_test, pred_rf)

0.76865671641791045

### Nearest Neighbours

In [13]:
# K Nearest Neighbours
for neighbour in range(1,20,2):
    knn_model = KNeighborsClassifier(n_neighbors = neighbour)
    knn_model.fit(X_train, Y_train)
    predicted = knn_model.predict(X_test)
    print(neighbour, metrics.accuracy_score(Y_test, predicted))

1 0.705223880597
3 0.776119402985
5 0.783582089552
7 0.779850746269
9 0.779850746269
11 0.776119402985
13 0.768656716418
15 0.772388059701
17 0.768656716418
19 0.757462686567


In [15]:
knn_model = KNeighborsClassifier(n_neighbors = 5)
knn_model.fit(X_train, Y_train)
pred_knn = knn_model.predict(X_test)
metrics.accuracy_score(Y_test, pred_knn)

0.78358208955223885

### XGBoost

In [16]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train,Y_train)

predicted = xgb_model.predict(X_test)       
metrics.accuracy_score(Y_test, predicted)

0.77985074626865669

In [17]:
# grid search for parameter optimisation
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200],
                   'gamma' : [0,1,10], 'learning_rate' : [0.01, 0.03, 0.1, 0.3]},
                   verbose=1)

clf.fit(X_train, Y_train)
pred_xgb = clf.predict(X_test)       
metrics.accuracy_score(Y_test, pred_xgb)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.0s


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed:    6.7s finished


0.77238805970149249

## SVM

In [26]:
# SVM regularization parameter
for C in [.1, 1.0, 10]:
    # SVC with a Linear Kernel  (our original example)
    svc = svm.SVC(kernel='linear', C=C).fit(X_train, Y_train)
    predicted = svc.predict(X_test)
    print(C, metrics.accuracy_score(Y_test, predicted))

0.1 0.753731343284
1.0 0.753731343284
10 0.753731343284


In [28]:
# SVM regularization parameter
for C in [.1, 1.0, 10]:
    for degree in range(1,7):
        # SVC with a Linear Kernel  (our original example)
        svc = svm.SVC(kernel='poly', degree = degree, C=C).fit(X_train, Y_train)
        predicted = svc.predict(X_test)
        print(C, degree, metrics.accuracy_score(Y_test, predicted))

# BETS IS C = 1, DEGREE = 2

0.1 1 0.753731343284
0.1 2 0.652985074627
0.1 3 0.716417910448
0.1 4 0.675373134328
0.1 5 0.667910447761
0.1 6 0.652985074627
1.0 1 0.753731343284
1.0 2 0.791044776119
1.0 3 0.776119402985
1.0 4 0.720149253731
1.0 5 0.694029850746
1.0 6 0.708955223881
10 1 0.753731343284
10 2 0.787313432836
10 3 0.764925373134
10 4 0.757462686567
10 5 0.772388059701
10 6 0.727611940299


In [30]:
svc = svm.SVC(kernel='poly', degree = 2, C=1).fit(X_train, Y_train)
pred_SVM = svc.predict(X_test)
metrics.accuracy_score(Y_test, pred_SVM)

0.79104477611940294

In [34]:
# Gaussian Radial Bassis Function
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1).fit(X_train, Y_train)
pred_SVMrbf = svc.predict(X_test)
metrics.accuracy_score(Y_test, pred_SVMrbf)

0.79104477611940294

### Logistic Regression

In [67]:
logReg = LogisticRegression()  
logReg.fit(X_train, Y_train)

# predict survivors on test set
pred_logit = logReg.predict(X_test)         
metrics.accuracy_score(Y_test, pred_logit)

0.79104477611940294

## Combination of models

In [62]:
pred1 = (pred_rf + pred_knn + pred_SVMrbf)/3
pred1 = np.where(pred1 > 0.5,1,0)

In [63]:
metrics.accuracy_score(Y_test, pred1)

0.79104477611940294

In [68]:
pred2 = (pred_logit + pred_knn + pred_SVMrbf) / 3
pred2 = np.where(pred2 > 0.5,1,0)
metrics.accuracy_score(Y_test, pred2)

0.79850746268656714

In [69]:
pred3 = (pred_logit + pred_rf + pred_SVMrbf) / 3
pred3 = np.where(pred3 > 0.5,1,0)
metrics.accuracy_score(Y_test, pred3)

0.79850746268656714

## Prepare submission

In [71]:
# load data
titanic_train = X
titanic_test = pd.read_csv('titanic_test_ready2.csv')

# standarise
titanic_train, titanic_test = standarisation(titanic_train, titanic_test)

In [72]:
# logit
logReg = LogisticRegression()  
logReg.fit(titanic_train, Y)

# predict survivors on test set
pred_logit = logReg.predict(titanic_test)         

In [73]:
# Gaussian Radial Bassis Function
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1).fit(titanic_train, Y)
pred_SVMrbf = svc.predict(titanic_test)

In [74]:
# Random Forests
rf_model = RandomForestClassifier(n_estimators = 1250, max_features = 6, criterion = 'gini', bootstrap = True,
                                  min_samples_split = 2, min_samples_leaf = 10, max_depth = None,
                                  random_state = 1)
rf_model.fit(titanic_train, Y)
pred_rf = rf_model.predict(titanic_test)

In [75]:
# Combination of the 3 models
pred3 = (pred_logit + pred_rf + pred_SVMrbf) / 3
pred3 = np.where(pred3 > 0.5,1,0)

In [76]:
test_df = pd.read_csv('test.csv')

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": pred3
    })

submission.to_csv('titanic_submission_Combined.csv', index=False)