In [1]:
import os
# Step one directory up from the kernel's start directory; the relative
# 'data/processed' path used later is resolved against the new location.
# Caution: the target is derived from the current directory, so running this
# cell a second time moves up one more level.
new_cwd = os.path.join(os.getcwd(), os.pardir)
os.chdir(new_cwd)
print(os.getcwd())

C:\Users\tomas\sources\data1\titanic


# Build base model

In [2]:
import pandas as pd
import numpy as np
import src.features.feature_normalization as fn
import src.data.utils as utils

## Read processed data

In [3]:
# Locations of the processed train/test CSV files, relative to the working directory.
processed_data_path = 'data/processed'
train_data_path, test_data_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Both frames are indexed by the passenger identifier column.
df_train = pd.read_csv(train_data_path, index_col='PassengerId')
df_test = pd.read_csv(test_data_path, index_col='PassengerId')

In [4]:
# Summarize the training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 44 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_capt            891 non-null int64
Title_col             891 non-null int64
Title_don             891 non-null int64
Title_dona            891 non-null int64
Title_dr              891 non-

In [5]:
# Derive feature matrices and target vectors from the labelled frame, using
# 'Survived' as the target column.
# NOTE(review): the split logic lives in src.data.utils and is not visible here —
# presumably a hold-out split; confirm the ratio and whether it is seeded.
X_train, X_test, y_train, y_test = utils.get_train_test_matrices(df_train, y_id='Survived')

In [6]:
# Compare the share of survivors in both target vectors to check the split is balanced.
train_survival_rate = np.mean(y_train)
test_survival_rate = np.mean(y_test)
print(f'mean survival in train: {train_survival_rate}')
print(f'mean survival in test: {test_survival_rate}')

mean survival in train: 0.38342696629213485
mean survival in test: 0.3854748603351955


## Create dummy model

In [7]:
from sklearn.dummy import DummyClassifier

In [8]:
# Baseline that always predicts the most frequent class of y_train.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the estimator repr as the cell output.
dummy_model = DummyClassifier(random_state=0, strategy='most_frequent').fit(X_train, y_train)
dummy_model

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

## Dummy model - metrics

In [9]:
# Score of the baseline on the held-out matrices (accuracy for classifiers).
dummy_test_score = dummy_model.score(X_test, y_test)
print(f'Score: {dummy_test_score:.2f}')

Score: 0.61


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [11]:
# (true labels, predicted labels) pair; later cells unpack it positionally
# into the scikit-learn metric functions.
dummy_test_labels = dummy_model.predict(X_test)
test_prediction = [y_test, dummy_test_labels]
dummy_test_accuracy = accuracy_score(y_test, dummy_test_labels)
print(f"Accuracy score: {dummy_test_accuracy:.2}")

Accuracy score: 0.61


In [12]:
# Confusion matrix of true vs. predicted labels for the dummy model
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(*test_prediction)

array([[110,   0],
       [ 69,   0]], dtype=int64)

In [13]:
# Precision and recall of the dummy model.
# The recorded confusion matrix has no predicted positives, so both print as 0.0
# and scikit-learn emits a warning because precision is undefined in that case.
print(f"{precision_score(*test_prediction):.2}")
print(f"{recall_score(*test_prediction):.2}")

0.0
0.0


  'precision', 'predicted', average, warn_for)


In [14]:
# Hand the dummy model and the unlabelled frame to the project helper, targeting 'dummy_01.csv'.
# NOTE(review): output directory and file layout are decided inside
# utils.create_submission_file, which is not visible here.
utils.create_submission_file(dummy_model, df_test, 'dummy_01.csv')

In [15]:
# Project helper that prints accuracy, confusion matrix, precision and recall in one call
# (same figures as the individual metric cells above).
utils.show_metrics(*test_prediction)

Accuracy score: 0.61
Confusion matrix: [[110   0]
 [ 69   0]]
Precision: 0.0
Recall: 0.0


## Logistic regression model

In [16]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults and a fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the estimator repr as the cell output.
lr_model = LogisticRegression(random_state=0).fit(X_train, y_train)
lr_model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Evaluate the logistic regression on the held-out matrices.
lr_test_labels = lr_model.predict(X_test)
predictions = [y_test, lr_test_labels]
utils.show_metrics(y_test, lr_test_labels)

Accuracy score: 0.83
Confusion matrix: [[94 16]
 [15 54]]
Precision: 0.77
Recall: 0.78


In [18]:
# Hand the logistic regression model and the unlabelled frame to the project helper, targeting 'lr_01.csv'.
# NOTE(review): output directory and file layout are decided inside
# utils.create_submission_file, which is not visible here.
utils.create_submission_file(lr_model, df_test, 'lr_01.csv')

In [19]:
# Learned coefficients: a single row with one weight per feature column.
lr_model.coef_

array([[-0.02769073,  0.00460697, -0.46424383,  0.        , -0.99967398,
         0.07805807, -0.14595291, -0.39697824,  0.50689048,  1.11868925,
         0.39484247, -0.12194943, -0.28413195,  0.96923123,  0.51826752,
        -0.338031  , -0.21491052,  0.27207455,  0.        ,  0.        ,
        -0.24038588,  0.        ,  0.16290156, -0.01094179,  1.15470575,
         0.33568059,  0.15531057,  0.07035602, -1.46688786,  1.09894502,
         0.21781875, -0.79468679,  0.30135855,  0.10812921,  0.18515388,
         0.26714235,  0.28872142,  0.40845009,  0.49927105,  0.49150544,
         0.15869126,  0.42108203,  0.72838572]])

## Hyperparameter optimization

In [20]:
# Base estimator for the grid search below.
# The solver is pinned to 'liblinear' because the grid includes penalty='l1':
# liblinear supports both 'l1' and 'l2', whereas the 'lbfgs' default of
# scikit-learn >= 0.22 rejects 'l1'. On releases whose default for this problem
# was liblinear (the recorded output shows solver='warn'), the results are unchanged.
lr_model2 = LogisticRegression(random_state=0, solver='liblinear')

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Search space: inverse regularization strength crossed with the penalty type.
params = {
    'C': [1, 10, 100, 500, 1000],
    'penalty': ['l1', 'l2'],
}
# Exhaustive search over the grid with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=lr_model2, param_grid=params, cv=3)

In [23]:
# Cross-validate every C/penalty combination on the training matrices; with
# refit=True (see the repr below) the best combination is refit on all of X_train.
grid_search.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 500, 1000], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [24]:
# Parameter combination that achieved the best mean cross-validation score.
grid_search.best_params_

{'C': 1, 'penalty': 'l2'}

In [25]:
# Mean cross-validation score of the winning parameter combination.
best_cv_score = grid_search.best_score_
print(f'best score: {best_cv_score:.2}')

best score: 0.83


In [26]:
# Score of the refit best estimator on the held-out matrices.
tuned_test_score = grid_search.score(X_test, y_test)
print(f'lr ver 2 score: {tuned_test_score:.2}')

lr ver 2 score: 0.83


In [27]:
# Hand the fitted grid search (which predicts with its refit best estimator) and the
# unlabelled frame to the project helper, targeting 'lr_02.csv'.
# NOTE(review): output directory and file layout are decided inside
# utils.create_submission_file, which is not visible here.
utils.create_submission_file(grid_search, df_test, 'lr_02.csv')

# Feature normalization

In [28]:
# Re-inspect the training frame before normalizing selected columns.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 44 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_capt            891 non-null int64
Title_col             891 non-null int64
Title_don             891 non-null int64
Title_dona            891 non-null int64
Title_dr              891 non-

In [29]:
# Continuous columns to rescale before refitting the model.
features_to_normalize = ['Age', 'Fare']
# The return values are discarded, so fn.normalize presumably modifies the frames
# in place — TODO confirm. If so, re-running this cell normalizes already-normalized data.
# NOTE(review): train and test are normalized in separate calls; if the helper derives
# its statistics from the frame it receives, the two frames end up on different
# scales — verify against src.features.feature_normalization.
fn.normalize(df_train, features_to_normalize)
fn.normalize(df_test, features_to_normalize)

In [30]:
# Rebuild the matrices from df_train after the normalization step; this overwrites
# the earlier X_train/X_test/y_train/y_test.
# NOTE(review): comparing scores with the earlier cells assumes the helper produces
# the same split on every call — confirm it is deterministic.
X_train, X_test, y_train, y_test = utils.get_train_test_matrices(df_train, y_id='Survived')

In [31]:
# Fresh estimator configured with the grid-search winner (C=1, penalty='l2').
lr_model3 = LogisticRegression(C=1, penalty='l2', random_state=0)

In [32]:
lr_model.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
lr_model.score(X_test, y_test)

0.8212290502793296