In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Read the preprocessed test set and keep its passenger ids at hand.
# NOTE: `y_final` is rebound further down by the train/test split.
test = pd.read_csv('preprocessed_test.csv')
y_final = test.loc[:, 'PassengerId']

## Pre-processing

In [46]:
# Load the preprocessed table; `raw_data` stays untouched so PassengerId can
# be recovered later, while `data` is the working copy without the id column.
# NOTE(review): this reads 'preprocessed_test.csv', yet the next cell needs a
# 'Survived' column and the modelling cells report 643 training rows while the
# shape shown below is (418, 30) -- confirm which file belongs here.
raw_data = pd.read_csv('preprocessed_test.csv')
data = raw_data.drop(columns='PassengerId')

# Recode levels ahead of dummy encoding: Sex becomes a 0/1 Is_Female flag and
# the baseline levels (3rd class, unknown deck) become 0 -- presumably so that
# they are the levels removed by drop_first=True.
data['Sex'] = data['Sex'].map({'female':1, 'male':0})
data = data.rename(columns={'Sex': 'Is_Female'})
data.loc[data['Pclass'] == 3, 'Pclass'] = 0
data.loc[data['Deck'] == 'Unknown', 'Deck'] = 0

# Every column is treated as categorical (converted column by column).
data = data.astype('category')

In [47]:
# Separate the label from the predictors, then one-hot encode the predictors,
# dropping the first level of each variable as the reference category.
targets = data['Survived']
inputs = pd.get_dummies(data.drop(columns='Survived'), drop_first=True)

In [48]:
# Sanity check: (number of rows, number of dummy-encoded feature columns).
inputs.shape

(418, 30)

In [5]:
# Both splits share the same proportions and seed.
split_opts = dict(train_size=.85, random_state=33)

# Hold out 15% of all rows as the final test set - do not touch it until
# model selection is finished.
X, X_final, Y, y_final = train_test_split(inputs, targets, **split_opts)

# Of the remaining 85%, set aside another 15% (about 12.75% of all rows) for
# cross-validation.
X_train, X_cv, y_train, y_cv = train_test_split(X, Y, **split_opts)

## Logistic Regression

#### Base Model

Regularization is applied by default (penalty='l2', C=1.0). C is the inverse of the regularization strength, so setting it to a very large number effectively turns regularization off.

In [6]:
# Baseline: a very large C makes the L2 penalty negligible.
model = LogisticRegression(C=10000, solver='liblinear')

# Prepend a constant column to both design matrices.
# NOTE(review): LogisticRegression fits its own intercept by default, so this
# column is redundant with it -- confirm that this is intended.
X_train = sm.add_constant(X_train)
X_cv = sm.add_constant(X_cv)
print(X_train.shape)

model.fit(X_train, y_train)

# Accuracy on the training split, then on the CV split; the gap between the
# two is read as some evidence of overfitting.
for features, labels in ((X_train, y_train), (X_cv, y_cv)):
    print(model.score(features, labels))
pred = model.predict(X_cv)

(643, 31)
0.8506998444790047
0.8245614035087719


#### confusion matrix

In [7]:
# Confusion matrix on the CV split: rows are actual classes, columns are
# predicted classes.
cm = confusion_matrix(y_cv, pred)
cm_df = pd.DataFrame(
    cm,
    index=['Actual No', 'Actual Yes'],
    columns=['Pred No', 'Pred Yes'],
)
cm_df

Unnamed: 0,Pred No,Pred Yes
Actual No,55,5
Actual Yes,15,39


#### classification report

In [8]:
# Per-class precision / recall / F1 on the CV split, to three decimals.
cv_report = classification_report(y_true=y_cv, y_pred=pred, digits=3)
print(cv_report)

              precision    recall  f1-score   support

           0      0.786     0.917     0.846        60
           1      0.886     0.722     0.796        54

    accuracy                          0.825       114
   macro avg      0.836     0.819     0.821       114
weighted avg      0.833     0.825     0.822       114



precision = 0.886 (of all the 1's we assigned, 88.6% did survive)
recall = 0.722 (of all those who did survive, we only got 72.2% correct)
f1 = 0.796
accuracy = 0.825

#### Grid search cv

In [9]:
# Hyperparameter search over the inverse regularization strength C and the
# penalty type, scored by 5-fold cross-validated accuracy on the training
# split. The liblinear solver is used because it supports both penalties.
params_grid = {
    "C": [0.1, 0.3, 0.5, 0.7, 0.9, 1, 1.5],
    "penalty": ["l1", "l2"],  # l1 = lasso, l2 = ridge
}

logreg = LogisticRegression(solver = 'liblinear')
logreg_cv = GridSearchCV(logreg, params_grid, cv=5, scoring = 'accuracy')
logreg_cv.fit(X_train,y_train)

# best_score_ is the mean cross-validated accuracy of the winning combination.
print(f"tuned hyperparameters: {logreg_cv.best_params_}")
print(f"accuracy: {logreg_cv.best_score_}")

tuned hpyerparameters: {'C': 0.5, 'penalty': 'l1'}
accuracy: 0.8243217054263565


In [10]:
# Refit with the hyperparameters chosen by the grid search. They are taken
# from `logreg_cv.best_params_` rather than re-typed by hand, so this cell
# cannot drift out of sync when the search is re-run on different data.
best_params = logreg_cv.best_params_
model = LogisticRegression(solver = 'liblinear', **best_params)

# add_constant leaves a frame unchanged when it already holds a constant
# column, so re-running these two lines is harmless.
X_train = sm.add_constant(X_train)
X_cv = sm.add_constant(X_cv)

print(X_train.shape)
model.fit(X_train, y_train)

# Accuracy on the training split, then on the CV split.
print(model.score(X_train, y_train))
print(model.score(X_cv, y_cv))
pred = model.predict(X_cv)

(643, 31)
0.8289269051321928
0.8245614035087719


In [11]:
# Confusion matrix of the tuned model on the CV split (rows = actual,
# columns = predicted).
row_labels = ['Actual No', 'Actual Yes']
col_labels = ['Pred No', 'Pred Yes']
cm = confusion_matrix(y_cv, pred)
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Pred No,Pred Yes
Actual No,55,5
Actual Yes,15,39


In [12]:
# Precision / recall / F1 of the tuned model on the CV split.
tuned_report = classification_report(y_cv, pred, digits=3)
print(tuned_report)

              precision    recall  f1-score   support

           0      0.786     0.917     0.846        60
           1      0.886     0.722     0.796        54

    accuracy                          0.825       114
   macro avg      0.836     0.819     0.821       114
weighted avg      0.833     0.825     0.822       114



#### classification report

precision = 0.886 (of all the 1's we assigned, 88.6% did survive)
recall = 0.722 (of all those who did survive, we only got 72.2% correct)
f1 = 0.796
accuracy = 0.825

In [13]:
#features interpretation
# Names are taken from the matrix the model was fitted on (X_train), not from
# `inputs`, so this cell keeps working after `inputs` is modified elsewhere.
fitted_columns = np.asarray(X_train.columns, dtype=object)
coefs = model.coef_.ravel().astype(float)  # astype returns a copy
intercept = float(np.ravel(model.intercept_)[0])

# The design matrix carries an explicit 'const' column AND the estimator fits
# its own intercept; since the column is all ones, the effective intercept is
# the sum of the two. Report that sum in the 'Intercept' row.
const_mask = fitted_columns == 'const'
if const_mask.any():
    coefs[const_mask] += intercept
    feature_names = np.where(const_mask, 'Intercept', fitted_columns)
else:
    # No explicit constant column: add the estimator's intercept as a row.
    feature_names = np.append(['Intercept'], fitted_columns)
    coefs = np.append([intercept], coefs)

summary = pd.DataFrame({'Features': feature_names, 'Coef': coefs})
# exp(coefficient) = multiplicative change in the odds versus the reference level
summary['Odds_ratio'] = np.exp(summary['Coef'])
summary.sort_values('Coef', ascending=False)

Unnamed: 0,Features,Coef,Odds_ratio
1,Pclass_1,1.65553,5.235852
7,Is_Female_1,1.480432,4.394843
3,Title_Master,1.398088,4.047454
2,Pclass_2,0.79229,2.208448
27,Deck_D,0.4388,1.550844
28,Deck_E,0.293123,1.340607
11,Age_Group_30-39,0.243496,1.275701
20,Fare_Group_7-8,0.145056,1.156104
6,Title_Mrs,0.066853,1.069138
19,Fare_Group_38+,0.0,1.0


Interpretation of Odds_ratio:
holding the other variables constant, how does this feature change the odds of survival compared to the default (reference) category?

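Deck_E: holding everything else constant, the odds of survival are almost 5x those of Unknown Deck 
Deck_G: holding everything else constant, the odds of survival are almost 50% worse than those of Unknown Deck 
Is_Female: 4.4x better odds than male 
As the odds_ratio approaches 1, the feature has less explanatory power -- 
Deck_A and Title_miss are close to 1 
Lastly, family_size_2 (5+) means the odds of survival are almost 90% worse than when traveling Alone 
(Note: some of these figures do not match the coefficient table printed above -- e.g. it lists Deck_E at about 1.34 -- so they appear to come from an earlier run; re-check them after re-running.)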

Surprises: 
Pclass_2 odds_ratio is higher than that of Pclass_1
Pclass 2: odds of survival are 2.2x those of Pclass 3 
Pclass 1: odds of survival are 66% higher than those of Pclass 3 
Family_size_1 (2-4) is 25% worse than traveling Alone -- this seems to contradict our EDA 

#### change 1/0 assignment

In [38]:
# Probability of the positive class for each CV row, then a custom cut-off:
# anything below 0.4 is labelled 0, everything else 1.
pred_proba = model.predict_proba(X_cv)[:, 1]
pred = np.where(pred_proba < 0.4, 0, 1).tolist()

In [36]:
# Confusion matrix for the thresholded predictions on the CV split
# (rows = actual, columns = predicted).
cm = confusion_matrix(y_cv, pred)
cm_df = pd.DataFrame(cm, columns=['Pred No', 'Pred Yes'])
cm_df.index = ['Actual No', 'Actual Yes']
cm_df

Unnamed: 0,Pred No,Pred Yes
Actual No,51,9
Actual Yes,12,42


In [39]:
# Precision / recall / F1 for the thresholded predictions on the CV split.
threshold_report = classification_report(y_true=y_cv, y_pred=pred, digits=3)
print(threshold_report)

              precision    recall  f1-score   support

           0      0.806     0.900     0.850        60
           1      0.872     0.759     0.812        54

    accuracy                          0.833       114
   macro avg      0.839     0.830     0.831       114
weighted avg      0.837     0.833     0.832       114



#### Final model

In [41]:
# Final model: the tuned settings (L1 penalty, C = 0.5), refitted on the full
# 85% development set and then scored once on the untouched hold-out split.
model = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
X = sm.add_constant(X)
X_final = sm.add_constant(X_final)
model.fit(X, Y)

# Accuracy on the development set, then on the hold-out set.
print(
    *(model.score(features, labels)
      for features, labels in ((X, Y), (X_final, y_final))),
    sep='\n',
)

0.8401585204755614
0.835820895522388


In [50]:
# Build the design matrix for the rows to be submitted WITHOUT rebinding
# `inputs`, so this cell can be re-run and earlier cells that read
# `inputs.columns` keep working.
#
# The columns are aligned to the ones the final model was fitted on (`X`):
# get_dummies only creates columns for levels present in the frame it is
# given, so a level missing here would otherwise shift or drop a column.
# Columns the model never saw are discarded; missing ones are filled with 0.
train_columns = X.columns
X_submit = inputs.reindex(columns=train_columns, fill_value=0)

# Set the constant explicitly instead of calling sm.add_constant: its default
# has_constant='skip' adds nothing when the frame already contains a constant
# column, which would silently change the column layout.
if 'const' in train_columns:
    X_submit['const'] = 1.0

# Probability of the positive class, cut at 0.5 (below -> 0, otherwise -> 1).
pred_proba = model.predict_proba(X_submit)[:, 1]
pred = [0 if p < 0.5 else 1 for p in pred_proba]

In [45]:
# Evaluate on the hold-out split using predictions made for that split.
# `pred` is deliberately not used: in notebook order it holds the predictions
# for the submission rows, which do not line up with `y_final`. Separate
# names also keep `pred` intact for the submission cell.
holdout_proba = model.predict_proba(X_final)[:, 1]
holdout_pred = [0 if p < 0.5 else 1 for p in holdout_proba]
print(classification_report(y_final, holdout_pred, digits=3))

              precision    recall  f1-score   support

           0      0.845     0.887     0.866        80
           1      0.820     0.759     0.788        54

    accuracy                          0.836       134
   macro avg      0.833     0.823     0.827       134
weighted avg      0.835     0.836     0.835       134



In [None]:
# NOTE(review): this cell has no execution count and `predfinal` is not
# defined anywhere in the visible notebook, so running it as-is would raise
# NameError. The output name 'nn_submit_1.csv' suggests it was carried over
# from a different (neural-network) notebook -- confirm, then fix or delete.
# NOTE(review): the whole `raw_data` frame is assigned to a single column
# here; presumably raw_data['PassengerId'] was intended -- verify.
predfinal['PassengerId'] = raw_data
# Keep only the two columns of the submission, in this order.
predfinal = predfinal[['PassengerId', 'Survived']]
# A bare expression in the middle of a cell is not displayed.
predfinal
predfinal.to_csv('nn_submit_1.csv', index = False)

In [56]:
# Assemble the submission: passenger ids from the raw table next to the
# predicted labels, written without the index column.
submit = pd.DataFrame({
    'PassengerId': raw_data['PassengerId'].values,
    'Survived': pred,
})
submit.to_csv('log_final.csv', index=False)

score on Kaggle: 0.7799