In [1]:
import warnings
warnings.filterwarnings('ignore')

# numerical libraries
import numpy as np
import pandas as pd

# divide train and test (preproc)
from sklearn.cross_validation import train_test_split

# import different models
from sklearn.linear_model import LogisticRegression

# feature optimisation
from sklearn.feature_selection import SelectFromModel

# model evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation

# standarisation of features
from sklearn import preprocessing

In [2]:
# Move into the project folder so the relative CSV paths used in later cells resolve.
# NOTE(review): the path is relative, so it depends on the directory the kernel was started in — confirm before re-running.
cd Dropbox/Portfolio/DataScience-Portfolio/Titanic

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/Titanic


In [3]:
# Load the pre-processed training data (path is relative to the working directory).
titanic = pd.read_csv('titanic_train_ready3.csv')

# Split into predictors and target for the predictions:
# X holds every column except 'Survived'; Y is the 'Survived' column as a flat 1-D array.
X = titanic.drop('Survived', axis=1)
Y = np.ravel(titanic['Survived'])

In [4]:
# Remove columns that are highly correlated with other columns (known from the
# previous analysis; see the Titanic_LogisticRegression notebook).
X = X.drop(['Mr', 'Master', 'Unknown', 'Child', 'Age_mean'], axis=1)

In [5]:
# Hold out 30% of the rows as a test set; random_state=1 fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((623, 25), (268, 25), (623,), (268,))

In [6]:
# Baseline accuracy: the score obtained by always predicting "did not survive".
# Computed as 1 - mean(Y), rounded to 2 decimals (assumes Y is coded 0/1 — TODO confirm).
baseline = round(1 - Y.mean(), 2)
baseline

0.62

### Logistic regression including all features

In [7]:
# feature scaling
def standarisation(train, test):
    """Standardise two datasets with a scaler fitted on the first one.

    A ``StandardScaler`` is fitted on ``train`` only and then applied to both
    inputs, so ``test`` is transformed with the parameters learned from
    ``train``.

    Parameters
    ----------
    train : data accepted by ``StandardScaler.fit`` / ``transform``
        Dataset the scaler is fitted on.
    test : data accepted by ``StandardScaler.transform``
        Dataset transformed with the scaler fitted on ``train``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled)`` as returned by ``scaler.transform``.
    """
    scaler = preprocessing.StandardScaler()
    scaler.fit(train)
    scaled_train = scaler.transform(train)
    scaled_test = scaler.transform(test)
    return scaled_train, scaled_test

In [8]:
# Standardise the features of the split; X_train is passed as the set the
# scaler is fitted on, X_test as the set that is only transformed.
X_train, X_test = standarisation(train=X_train, test=X_test)

In [19]:
# Compare L1- and L2-penalised logistic regression over several values of the
# regularisation parameter C. For each C the cell reports the training
# accuracy, the held-out test accuracy and the mean 5-fold cross-validation
# accuracy on the training set.
for C in (500, 200, 100, 1, 0.01):
    # tol=0.01 loosens the stopping tolerance to shorten training time.
    # The L1 model names its solver explicitly: liblinear supports the L1
    # penalty, so the fit no longer depends on the library's default solver.
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='newton-cg')
    clf_l1_LR.fit(X_train, Y_train)
    clf_l2_LR.fit(X_train, Y_train)

    # predictions on the held-out test set
    pred_l1 = clf_l1_LR.predict(X_test)
    pred_l2 = clf_l2_LR.predict(X_test)

    # 5-fold cross-validation on the training set only
    scores_l1 = cross_validation.cross_val_score(clf_l1_LR, X_train, Y_train, cv=5)
    scores_l2 = cross_validation.cross_val_score(clf_l2_LR, X_train, Y_train, cv=5)

    print("C=%.2f" % C)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X_train, Y_train))
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X_train, Y_train))
    print("")
    print("test score with L1 penalty: %.4f" % metrics.accuracy_score(Y_test, pred_l1))
    print("test score with L2 penalty: %.4f" % metrics.accuracy_score(Y_test, pred_l2))
    print("")
    print("crossval score with L1 penalty: %.4f" % scores_l1.mean())
    print("crossval score with L2 penalty: %.4f" % scores_l2.mean())
    print("")
    print("======")

C=500.00
score with L1 penalty: 0.8459
score with L2 penalty: 0.8443

test score with L1 penalty: 0.7985
test score with L2 penalty: 0.7948

crossval score with L1 penalty: 0.8236
crossval score with L2 penalty: 0.8220

C=200.00
score with L1 penalty: 0.8427
score with L2 penalty: 0.8443

test score with L1 penalty: 0.7910
test score with L2 penalty: 0.7948

crossval score with L1 penalty: 0.8220
crossval score with L2 penalty: 0.8220

C=100.00
score with L1 penalty: 0.8427
score with L2 penalty: 0.8443

test score with L1 penalty: 0.7948
test score with L2 penalty: 0.7948

crossval score with L1 penalty: 0.8220
crossval score with L2 penalty: 0.8220

C=1.00
score with L1 penalty: 0.8363
score with L2 penalty: 0.8363

test score with L1 penalty: 0.7910
test score with L2 penalty: 0.7948

crossval score with L1 penalty: 0.8220
crossval score with L2 penalty: 0.8156

C=0.01
score with L1 penalty: 0.6822
score with L2 penalty: 0.7673

test score with L1 penalty: 0.6194
test score with L2 

# Prepare submission

In [20]:
# Load the pre-processed test features and drop the same five columns that
# were removed from the training features.
titanic_test = pd.read_csv('titanic_test_ready3.csv').drop(
    ['Mr', 'Master', 'Unknown', 'Child', 'Age_mean'], axis=1
)
titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Age_Unknown,Minor,Alone,man_w_spouse,...,B,C.1,D,E,F,G,Miss,Mrs,Other,low_fare
0,3,1,34.5,0,0,7.8292,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0,47.0,1,0,7.0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
2,2,1,62.0,0,0,9.6875,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3,1,27.0,0,0,8.6625,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1


In [22]:
# Use the full feature matrix X (all training rows) as the training set for the
# final model. This binds a second name to the same object; no copy is made.
titanic_train = X
# Preview the first rows to compare the columns with the test features.
titanic_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Age_Unknown,Minor,Alone,man_w_spouse,...,B,C.1,D,E,F,G,Miss,Mrs,Other,low_fare
0,3,1,22,1,0,7.25,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,1,0,38,1,0,71.2833,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
2,3,0,26,0,0,7.925,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,1,0,35,1,0,53.1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
4,3,1,35,0,0,8.05,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Standardise both datasets: titanic_train is the set the scaler is fitted on,
# titanic_test is only transformed. Both names are rebound to the scaled results.
titanic_train, titanic_test = standarisation(train=titanic_train, test=titanic_test)

In [24]:
# Final logistic regression model: L1 penalty, C=500, loosened tolerance.
# The solver is named explicitly: liblinear supports the L1 penalty, so the
# fit no longer depends on the library's default solver.
logReg_Final = LogisticRegression(C=500, penalty='l1', tol=0.01, solver='liblinear')
logReg_Final.fit(titanic_train, Y)   # fit to the full training set

# Mean accuracy over a 5-fold cross validation of the same estimator.
# NOTE(review): titanic_train was standardised on all rows before this step,
# so each fold's scaling has seen its validation rows — confirm this is acceptable.
scores = cross_validation.cross_val_score(logReg_Final, titanic_train, Y, cv=5)
scores.mean()

0.81037415237392041

In [25]:
# Sanity check: display the shapes of the training and test feature matrices.
titanic_train.shape, titanic_test.shape

((891, 25), (418, 25))

In [26]:
# Predict the labels for the test features with the fitted final model.
predicted = logReg_Final.predict(titanic_test)

In [27]:
# Read the original test file; only its PassengerId column is used here.
test_df = pd.read_csv('test.csv')

# Two-column submission table: PassengerId alongside the predicted label.
submission = test_df[["PassengerId"]].assign(Survived=predicted)

# Write the submission file without the DataFrame index.
submission.to_csv('titanic_submission_regLogit.csv', index=False)