In [1]:
import warnings
warnings.filterwarnings('ignore')

# numerical libraries
import numpy as np
import pandas as pd

# divide train and test (preproc)
from sklearn.cross_validation import train_test_split

# import different models
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# feature optimisation
from sklearn.feature_selection import SelectFromModel

# model evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation

# additional model xgboost
import xgboost as xgb



In [3]:
# confusion matrix function
def confusion_mtx(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.2f' % error_rate)
    return cm

In [4]:
# load data and test set
titanic = pd.read_csv('titanic_train_ready2.csv')

# generate X and Y for preditions
Y = np.ravel(titanic.Survived)  # to flatten array
X = titanic.drop('Survived', axis = 1)

In [5]:
# separate train and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((623, 7), (268, 7), (623,), (268,))

In [6]:
# baseline accuracy (predicting that it did not survive)
baseline = round(1-np.mean(Y), 2)
baseline

0.62

In [7]:
# multi-colinearity
# make correlation matrix and select those with correlation above 0.7
corr_matrix = titanic.corr()
corr_matrix[corr_matrix > 0.7] = 1
corr_matrix[corr_matrix < -0.7] = 1
corr_matrix[corr_matrix != 1] = 0
corr_matrix

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1,0,0,0,0,0,0,0
Pclass,0,1,0,0,0,0,0,0
Sex,0,0,1,0,0,0,0,0
Age,0,0,0,1,0,0,0,0
SibSp,0,0,0,0,1,0,0,0
Parch,0,0,0,0,0,1,0,0
Fare,0,0,0,0,0,0,1,0
Embarked,0,0,0,0,0,0,0,1


In [8]:
# Logistic Regression Model
logReg = LogisticRegression()  # make the model object
logReg.fit(X_train, Y_train)   # fit to the training set

predicted = logReg.predict(X_test)         # predict affairs on test set
metrics.accuracy_score(Y_test,predicted) 

0.78731343283582089

In [9]:
# Linear Discriminant Analysis
lda_model = LDA()   # make model object
lda_model.fit(X_train, Y_train)  # fit to train set

predicted = logReg.predict(X_test)         # predict affairs on test set
metrics.accuracy_score(Y_test,predicted)

0.78731343283582089

In [10]:
# K Nearest Neighbours
for neighbour in range(1,16,2):
    knn_model = KNeighborsClassifier(n_neighbors = neighbour)
    knn_model.fit(X_train, Y_train)
    predicted = knn_model.predict(X_test)
    print(metrics.accuracy_score(Y_test, predicted))

0.652985074627
0.701492537313
0.690298507463
0.694029850746
0.682835820896
0.690298507463
0.690298507463
0.690298507463


In [11]:
# Random Forests
rf_model = RandomForestClassifier(n_estimators = 500, random_state = 1)
rf_model.fit(X_train, Y_train)
predicted = rf_model.predict(X_test)
metrics.accuracy_score(Y_test, predicted)

0.7574626865671642

In [12]:
# XGBoost
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, Y_train)
predicted = gbm.predict(X_test)
metrics.accuracy_score(Y_test, predicted)

0.77985074626865669

In [13]:
# SVM regularization parameter
C = 1.0  

# SVC with a Linear Kernel  (our original example)
svc = svm.SVC(kernel='linear', C=C).fit(X_train, Y_train)
predicted = svc.predict(X_test)
metrics.accuracy_score(Y_test, predicted)

0.75373134328358204

In [14]:
# Gaussian Radial Bassis Function
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X_train, Y_train)
predicted = rbf_svc.predict(X_test)
metrics.accuracy_score(Y_test, predicted)

0.60447761194029848

In [15]:
# SVC Linear
lin_svc = svm.LinearSVC(C=C).fit(X_train,Y_train)
predicted = lin_svc.predict(X_test)
metrics.accuracy_score(Y_test, predicted)

0.63432835820895528