In [1]:
import warnings
warnings.filterwarnings('ignore')

# numerical libraries
import numpy as np
import pandas as pd

# divide train and test (preproc)
from sklearn.cross_validation import train_test_split

# import different models
from sklearn.linear_model import LogisticRegression

# feature optimisation
from sklearn.feature_selection import SelectFromModel

# model evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation

# standarisation of features
from sklearn import preprocessing

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/Titanic

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/Titanic


In [3]:
# load data and test set
titanic = pd.read_csv('titanic_train_ready2.csv')

# generate X and Y for preditions
Y = np.ravel(titanic.Survived)  # to flatten array
X = titanic.drop('Survived', axis = 1)

In [4]:
# separate train and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((623, 7), (268, 7), (623,), (268,))

In [5]:
# baseline accuracy (predicting that it did not survive)
baseline = round(1-np.mean(Y), 2)
baseline

0.62

In [6]:
# multi-colinearity
# make correlation matrix and select those with correlation above 0.7
corr_matrix = titanic.corr()
corr_matrix[corr_matrix > 0.7] = 1
corr_matrix[corr_matrix < -0.7] = 1
corr_matrix[corr_matrix != 1] = 0
corr_matrix

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1,0,0,0,0,0,0,0
Pclass,0,1,0,0,0,0,0,0
Sex,0,0,1,0,0,0,0,0
Age,0,0,0,1,0,0,0,0
SibSp,0,0,0,0,1,0,0,0
Parch,0,0,0,0,0,1,0,0
Fare,0,0,0,0,0,0,1,0
Embarked,0,0,0,0,0,0,0,1


In [7]:
# Logistic Regression Model
logReg = LogisticRegression()  # make the model object
logReg.fit(X_train, Y_train)   # fit to the training set

predicted = logReg.predict(X_test)         # predict affairs on test set
metrics.accuracy_score(Y_test,predicted) 

0.78731343283582089

### Recursive feature elimination

In [8]:
# Recursive Feature Selection (RFS)
logReg_RFS = SelectFromModel(logReg, prefit=True)
X_train_RFS = logReg_RFS.transform(X_train)
X_train_RFS.shape

(623, 2)

In [9]:
# which features were selected?
non_removedFeatures = logReg_RFS.inverse_transform(X_train_RFS)
non_removedFeatures = non_removedFeatures.sum(axis = 0)
non_removedFeatures[non_removedFeatures > 0 ] = 1

# make a dataframe with the features and whether they were selected (1)
coef_df = pd.DataFrame(X.columns)
coef_df.columns = ['Features']

coef_df['Coefficients'] = pd.Series(non_removedFeatures)
coef_df

Unnamed: 0,Features,Coefficients
0,Pclass,1
1,Sex,1
2,Age,0
3,SibSp,0
4,Parch,0
5,Fare,0
6,Embarked,0


In [10]:
# select same features in test set
X_test_RFS = logReg_RFS.transform(X_test)
X_test_RFS.shape

(268, 2)

In [11]:
# re fit the logistic regression with the 2 features only
logReg_RFS = LogisticRegression()
logReg_RFS.fit(X_train_RFS, Y_train)

predicted = logReg_RFS.predict(X_test_RFS)  # predict affairs on test set
metrics.accuracy_score(Y_test,predicted)    # check accuracy

0.75373134328358204

In [12]:
# re fit the logistic regression with the 2 features only
logReg_3F = LogisticRegression()
logReg_3F.fit(X_train[['Pclass', 'Sex', 'Age']], Y_train)

predicted = logReg_3F.predict(X_test[['Pclass', 'Sex', 'Age']])  # predict affairs on test set
metrics.accuracy_score(Y_test,predicted)

0.77238805970149249

In [13]:
# cross validation
scores = cross_validation.cross_val_score(logReg, X_train, Y_train, cv=5)
scores.mean()

0.80595494111623156

In [14]:
# cross validation
scores = cross_validation.cross_val_score(logReg_RFS, X_train_RFS, Y_train, cv=5)
scores.mean()

0.80111623143881217

In [15]:
# cross validation
scores = cross_validation.cross_val_score(logReg_3F, X_train[['Pclass', 'Sex', 'Age']], Y_train, cv=5)
scores.mean()

0.79951623143881212

### Feature Scaling

In [16]:
def standarisation(train, test):
    scaler = preprocessing.StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train, test

In [17]:
X_train, X_test = standarisation(X_train, X_test)
X_train_RFS, X_test_RFS = standarisation(X_train, X_test)

In [18]:
# Logistic Regression Model
logReg_Std = LogisticRegression()  # make the model object
logReg_Std.fit(X_train, Y_train)   # fit to the training set

predicted = logReg_Std.predict(X_test)         # predict affairs on test set
metrics.accuracy_score(Y_test,predicted)

0.79104477611940294

In [19]:
# re fit the logistic regression with the 2 features only
logReg_RFS_Std = LogisticRegression()
logReg_RFS_Std.fit(X_train_RFS, Y_train)

predicted = logReg_RFS_Std.predict(X_test_RFS)  # predict affairs on test set
metrics.accuracy_score(Y_test,predicted)

0.79104477611940294

In [20]:
# cross validation
scores = cross_validation.cross_val_score(logReg_Std, X_train, Y_train, cv=5)
scores.mean()

0.80749103942652334

In [21]:
# cross validation
scores = cross_validation.cross_val_score(logReg_RFS_Std, X_train_RFS, Y_train, cv=5)
scores.mean()

0.80749103942652334

In [22]:
titanic_test = pd.read_csv('titanic_test_ready2.csv')
titanic_test = titanic_test[['Pclass', 'Sex']]
titanic_test.head()

Unnamed: 0,Pclass,Sex
0,3,1
1,3,0
2,2,1
3,3,1
4,3,0


In [23]:
X = X[['Pclass', 'Sex']]
# Y

In [24]:
# standarise the datasets
titanic_train, titanic_test = standarisation(X, titanic_test)

In [25]:
# Logistic Regression Model
logReg_Final = LogisticRegression()  # make the model object
logReg_Final.fit(titanic_train, Y)   # fit to the training set

# cross validation
scores = cross_validation.cross_val_score(logReg_Final, titanic_train, Y, cv=5)
scores.mean()

0.78671502492918788

In [26]:
titanic_train.shape, titanic_test.shape

((891, 2), (418, 2))

In [27]:
predicted = logReg_Final.predict(titanic_test)

In [28]:
test_df = pd.read_csv('test.csv')

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": predicted
    })

submission.to_csv('titanic_submission.csv', index=False)