In [1]:
import pandas as pd
import numpy as np
from os import path
import titanic_utils as tu

  return f(*args, **kwds)


In [2]:
def read_data():
    train_df = pd.read_csv('../data/titanic/raw/titanic-train.csv',index_col="PassengerId")
    test_df = pd.read_csv('../data/titanic/raw/titanic-test.csv', index_col='PassengerId')
    test_df['Survived'] = -888
    return pd.concat((train_df, test_df), axis=0, sort=True)

def process_data(df):
    return (
        df.assign(Title = lambda x: x.Name.map(tu.get_title))
         # working missing values - start with this
         .pipe(tu.fill_missing_values)
         # create fare bin feature
         .assign(Fare_Bin = lambda x: pd.qcut(x.Fare, 4, labels=['very_low','low','high','very_high']))
         # create age state
         .assign(AgeState = lambda x : np.where(x.Age >= 18, 'Adult','Child'))
         .assign(FamilySize = lambda x : x.Parch + x.SibSp + 1)
         .assign(IsMother = lambda x : np.where(((x.Sex == 'female') & (x.Parch > 0) & (x.Age > 18) & (x.Title != 'Miss')), 1, 0))
          # create deck feature
         .assign(Cabin = lambda x: np.where(x.Cabin == 'T', np.nan, x.Cabin)) 
         .assign(Deck = lambda x : x.Cabin.map(tu.get_deck))
         # feature encoding 
         .assign(IsMale = lambda x : np.where(x.Sex == 'male', 1,0))
         .pipe(pd.get_dummies, columns=['Deck', 'Pclass','Title', 'Fare_Bin', 'Embarked','AgeState'])
         # add code to drop unnecessary columns
         .drop(['Cabin','Name','Ticket','Parch','SibSp','Sex'], axis=1)
         # reorder columns
         .pipe(tu.reorder_columns)
    )
def gen_submission_file(model, X , filename):   
    predictions = model.predict(X)
    sub_df = pd.DataFrame({ 'PassengerId' : test_df.index, 'Survived' : predictions})
    sub_df = sub_df.to_csv(path.join('../data/titanic/external', filename), index=False)
    
def write_data(df):
    # train data
    df[df.Survived != -888].to_csv('../data/titanic/processed/train.csv') 
    # test data
    columns = [column for column in df.columns if column != 'Survived']
    df[df.Survived == -888][columns].to_csv('../data/titanic/processed/test.csv') 


In [3]:
df = read_data()
df = process_data(df)
df.head()
write_data(df)

In [None]:
# Sex vs Survived
# train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index = False).mean().sort_values(by='Survived', ascending=False)

## Regression

In [37]:
from sklearn.model_selection import train_test_split
import sklearn

In [38]:
train_df = pd.read_csv('../data/titanic/processed/train.csv',index_col="PassengerId")
test_df = pd.read_csv('../data/titanic/processed/test.csv', index_col='PassengerId')

In [39]:
X = np.matrix(train_df.loc[:,'Age':].astype('float'))
y = train_df.Survived.ravel()

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=0)

In [41]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [11]:
print(f'Mean survival in training set : {y_train.mean()}')
print(f'Mean survival in test set : {y_test.mean()}')

Mean survival in training set : 0.38342696629213485
Mean survival in test set : 0.3854748603351955


### 1. Base Model

In [12]:
from sklearn.dummy import DummyClassifier

In [13]:
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)
model_dummy.fit(X_train,y_train)
print('Score for baseline model : {0:.2f}'.format(model_dummy.score(X_test, y_test)))

Score for baseline model : 0.61


In [14]:
# !conda update -y sklearn

### Performance Metrics

In [15]:
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix, recall_score

In [30]:
print('Accuracy for base line model is : {0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))
print('Confusion matrix for base line model is : \n {}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))
print('Precision score for base line model is : {0:.2f}'.format(precision_score(y_test, model_dummy.predict(X_test))))
print('Recall score for base line model is : {0:.2f}'.format(recall_score(y_test, model_dummy.predict(X_test))))

Accuracy for base line model is : 0.61
Confusion matrix for base line model is : 
 [[110   0]
 [ 69   0]]
Precision score for base line model is : 0.00
Recall score for base line model is : 0.00


  _warn_prf(average, modifier, msg_start, len(result))


Confusion matrix for base line model is : 
 [[110   0]
 [ 69   0]]


Precision score for base line model is : 0.00
Recall score for base line model is : 0.00


  _warn_prf(average, modifier, msg_start, len(result))


## Kaggle Submission (Base model)

In [19]:
test_X = np.matrix(test_df.loc[:,'Age':].astype('float'))
gen_submission_file(model_dummy,test_X,'01_dummy.csv')

## Kaggle Submission (Stage -2)

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
model_lr = LogisticRegression(random_state=0)
model_lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
print('Score for logistic regression {0:.2f}'.format(model_lr.score(X_test,y_test)))
print('Accuracy for base line model is : {0:.2f}'.format(accuracy_score(y_test, model_lr.predict(X_test))))
print('Confusion matrix for base line model is : \n {}'.format(confusion_matrix(y_test, model_lr.predict(X_test))))
print('Precision score for base line model is : {0:.2f}'.format(precision_score(y_test, model_lr.predict(X_test))))
print('Recall score for base line model is : {0:.2f}'.format(recall_score(y_test, model_lr.predict(X_test))))
gen_submission_file(model_lr,test_X,'02_dummy.csv')

Score for logistic regression 0.83
Accuracy for base line model is : 0.83
Confusion matrix for base line model is : 
 [[95 15]
 [15 54]]
Precision score for base line model is : 0.78
Recall score for base line model is : 0.78


In [47]:
model_lr.coef_
model_lr.intercept_

array([1.28209196])

## Kaggle Submission (Stage -3)

In [48]:
from sklearn.model_selection import GridSearchCV

In [54]:
parameters = {'C': [1.0,10.0,100.0,1000.0], 'penalty' : ['l1','l2']}
clf = GridSearchCV(model_lr,param_grid=parameters,cv=3)

In [55]:
clf.fit(X_train, y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [64]:
print('Best Score : {}'.format(clf.best_score_))
print('Best Params : {}'.format(clf.best_params_))


Best Score : 0.8272760581025659
Best Params : {'C': 10.0, 'penalty': 'l2'}


'0.22.2.post1'

In [61]:
gen_submission_file(clf,test_X,'03_dummy.csv')