In [6]:
import pandas as pd
import numpy as np
from os import path
import titanic_utils as tu

In [103]:
def read_data(train_path='../data/titanic/raw/titanic-train.csv',
              test_path='../data/titanic/raw/titanic-test.csv'):
    """Load the raw train and test CSVs and stack them into one frame.

    Parameters
    ----------
    train_path : str
        CSV holding the labelled rows; must contain a ``PassengerId`` column.
    test_path : str
        CSV holding the unlabelled rows; must contain a ``PassengerId`` column.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, indexed by ``PassengerId``, with the
        columns sorted alphabetically (``sort=True``). Every test row carries
        ``Survived == -888`` so the two sets can be told apart later.
    """
    train_df = pd.read_csv(train_path, index_col='PassengerId')
    test_df = pd.read_csv(test_path, index_col='PassengerId')
    # -888 is the placeholder that marks a row as coming from the test file.
    test_df = test_df.assign(Survived=-888)
    return pd.concat((train_df, test_df), axis=0, sort=True)

def process_data(df):
    """Turn the combined raw frame into the feature frame used for modelling.

    The steps run in this order: add ``Title``, fill missing values, add the
    fare bin, add the age/family features, clean ``Cabin`` and derive ``Deck``,
    add ``IsMale``, one-hot encode the categorical columns, drop the source
    columns that are no longer needed, and finally reorder the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns Name, Fare, Age, Parch, SibSp, Sex,
        Cabin, Ticket, Pclass and Embarked.

    Returns
    -------
    pandas.DataFrame
        Whatever ``tu.reorder_columns`` returns for the encoded frame.
    """
    # Title is added before the missing-value step runs.
    with_title = df.assign(Title=df.Name.map(tu.get_title))
    filled = tu.fill_missing_values(with_title)

    # Fare bucketed into four quantile-based bins.
    fare_labels = ['very_low', 'low', 'high', 'very_high']
    binned = filled.assign(Fare_Bin=pd.qcut(filled.Fare, 4, labels=fare_labels))

    # Age and family features.
    mother_mask = (
        (binned.Sex == 'female')
        & (binned.Parch > 0)
        & (binned.Age > 18)
        & (binned.Title != 'Miss')
    )
    derived = binned.assign(
        AgeState=np.where(binned.Age >= 18, 'Adult', 'Child'),
        FamilySize=binned.Parch + binned.SibSp + 1,
        IsMother=np.where(mother_mask, 1, 0),
    )

    # Deck feature: a cabin value of exactly 'T' is blanked out before mapping.
    cabin_cleaned = derived.assign(
        Cabin=np.where(derived.Cabin == 'T', np.nan, derived.Cabin)
    )
    with_deck = cabin_cleaned.assign(Deck=cabin_cleaned.Cabin.map(tu.get_deck))

    # Feature encoding.
    with_gender = with_deck.assign(IsMale=np.where(with_deck.Sex == 'male', 1, 0))
    encoded = pd.get_dummies(
        with_gender,
        columns=['Deck', 'Pclass', 'Title', 'Fare_Bin', 'Embarked', 'AgeState'],
    )

    # Drop the raw columns that have been replaced by derived features.
    trimmed = encoded.drop(['Cabin', 'Name', 'Ticket', 'Parch', 'SibSp', 'Sex'], axis=1)

    # Reorder columns.
    return tu.reorder_columns(trimmed)
def gen_submission_file(model, X, filename, passenger_ids=None,
                        output_dir='../data/titanic/external'):
    """Predict with ``model`` and write a two-column submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X : array-like
        Feature rows to predict on.
    filename : str
        Name of the CSV file created inside ``output_dir``.
    passenger_ids : sequence, optional
        Ids written to the ``PassengerId`` column, one per row of ``X``.
        When omitted, the index of the module-level ``test_df`` is used, which
        requires that variable to have been defined before the call.
    output_dir : str, optional
        Directory the file is written to.

    Returns
    -------
    None
        The function only writes the file.
    """
    if passenger_ids is None:
        # Backward-compatible fallback: relies on the notebook-level test_df.
        passenger_ids = test_df.index
    predictions = model.predict(X)
    sub_df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
    # to_csv returns None when given a path, so its result is not kept.
    sub_df.to_csv(path.join(output_dir, filename), index=False)
    
def write_data(df):
    """Split the combined frame on the -888 placeholder and save both parts.

    Rows whose ``Survived`` differs from -888 go to ``train.csv`` with every
    column; rows whose ``Survived`` equals -888 go to ``test.csv`` without the
    ``Survived`` column. Both files land in ``../data/titanic/processed/``.
    """
    is_test_row = df.Survived == -888

    # train data
    df[~is_test_row].to_csv('../data/titanic/processed/train.csv')

    # test data
    feature_names = [name for name in df.columns if name != 'Survived']
    df.loc[is_test_row, feature_names].to_csv('../data/titanic/processed/test.csv')


In [8]:
# Keep the raw frame under its own name so it stays available for inspection.
raw_df = read_data()
df = process_data(raw_df)
write_data(df)
# A bare expression is only rendered when it is the last statement of the cell,
# so the preview goes at the end.
df.head()

## Regression

In [42]:
from sklearn.model_selection import train_test_split
import sklearn

In [10]:
# Reload the processed splits from disk, indexed by passenger id.
train_df = pd.read_csv(
    '../data/titanic/processed/train.csv',
    index_col='PassengerId',
)
test_df = pd.read_csv(
    '../data/titanic/processed/test.csv',
    index_col='PassengerId',
)

In [30]:
# Feature matrix: every column from 'Age' onward, cast to float.
# A plain 2-D ndarray is used instead of np.matrix, which NumPy no longer
# recommends and which recent scikit-learn releases refuse as input.
X = train_df.loc[:, 'Age':].astype('float').to_numpy()
# Target vector as a 1-D ndarray.
y = train_df.Survived.to_numpy()

In [35]:
# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=0,
)

In [38]:
# Sanity check: feature and label shapes for the train split, then the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(712, 32) (712,)
(179, 32) (179,)


In [41]:
# Compare the share of positive labels in both splits.
train_survival_rate = y_train.mean()
test_survival_rate = y_test.mean()
print(f'Mean survival in training set : {train_survival_rate}')
print(f'Mean survival in test set : {test_survival_rate}')

Mean survival in training set : 0.38342696629213485
Mean survival in test set : 0.3854748603351955


### 1. Base Model

In [47]:
from sklearn.dummy import DummyClassifier

In [82]:
# Baseline: a classifier that always predicts the most frequent training label.
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0).fit(X_train, y_train)
baseline_score = model_dummy.score(X_test, y_test)
print('Score for baseline model : {0:.2f}'.format(baseline_score))

Score for baseline model : 0.61


In [51]:
# !conda update -y scikit-learn

### Performance Metrics

In [52]:
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix, recall_score

In [61]:
# Accuracy of the baseline on the held-out split.
baseline_predictions = model_dummy.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print('Accuracy for base line model is : {0:.2f}'.format(baseline_accuracy))

Accuracy for base line model is : 0.61


In [62]:
# Confusion matrix of the baseline on the held-out split.
baseline_predictions = model_dummy.predict(X_test)
baseline_confusion = confusion_matrix(y_test, baseline_predictions)
print('Confusion matrix for base line model is : \n {}'.format(baseline_confusion))

Confusion matrix for base line model is : 
 [[110   0]
 [ 69   0]]


In [66]:
# Precision and recall of the baseline; the prediction is computed once and reused.
baseline_predictions = model_dummy.predict(X_test)
baseline_precision = precision_score(y_test, baseline_predictions)
baseline_recall = recall_score(y_test, baseline_predictions)
print('Precision score for base line model is : {0:.2f}'.format(baseline_precision))
print('Recall score for base line model is : {0:.2f}'.format(baseline_recall))

Precision score for base line model is : 0.00
Recall score for base line model is : 0.00


## Kaggle Submission (Base model)

In [106]:
# Same column slice as the training features ('Age' onward), cast to float.
# A plain 2-D ndarray is used instead of np.matrix, which NumPy no longer
# recommends and which recent scikit-learn releases refuse as input.
test_X = test_df.loc[:, 'Age':].astype('float').to_numpy()
gen_submission_file(model_dummy, test_X, '01_dummy.csv')