# Build base model

In [1]:
import pandas as pd
import numpy as np
import os

## Read processed data

In [2]:
processed_data_path = 'data/processed'
train_data_path = os.path.join(os.path.pardir, processed_data_path, 'train.csv')
test_data_path = os.path.join(os.path.pardir, processed_data_path, 'test.csv')

df_train = pd.read_csv(train_data_path, index_col='PassengerId')
df_test = pd.read_csv(test_data_path, index_col='PassengerId')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 44 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_capt            891 non-null int64
Title_col             891 non-null int64
Title_don             891 non-null int64
Title_dona            891 non-null int64
Title_dr              891 non-

In [4]:
X = df_train.loc[:, 'Age':].values.astype('float')
y = df_train.loc[:, 'Survived'].ravel()

In [5]:
print(X.shape, y.shape)

(891, 43) (891,)


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 43) (712,)
(179, 43) (179,)


In [7]:
print(f'mean survival in train: {np.mean(y_train)}')
print(f'mean survival in test: {np.mean(y_test)}')

mean survival in train: 0.38342696629213485
mean survival in test: 0.3854748603351955


## Create dummy model

In [8]:
from sklearn.dummy import DummyClassifier

In [9]:
dummy_model = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_model.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

## Dummy model - metrics

In [10]:
print(f'Score: {dummy_model.score(X_test, y_test):.2f}')

Score: 0.61


In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [12]:
test_prediction = [y_test, dummy_model.predict(X_test)]
print(f"Accuracy score: {accuracy_score(*test_prediction):.2}")

Accuracy score: 0.61


In [13]:
confusion_matrix(*test_prediction)

array([[110,   0],
       [ 69,   0]], dtype=int64)

In [14]:
print(f"{precision_score(*test_prediction):.2}")
print(f"{recall_score(*test_prediction):.2}")

0.0
0.0


  'precision', 'predicted', average, warn_for)


In [30]:
def show_metrics(real, predictions):
    from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
    print(f"Accuracy score: {accuracy_score(real, predictions):.2}")
    print(f"Confusion matrix: {confusion_matrix(real, predictions)}")
    print(f"Precision: {precision_score(real, predictions):.2}")
    print(f"Recall: {recall_score(real, predictions):.2}")

In [31]:
def create_submission_file(model, df_test, filename):
    predictions = model.predict(df_test)
    df_submission = pd.DataFrame({'PassengerId': df_test.index, 'Survived': predictions})
    submission_data_file = os.path.join(os.pardir, 'data', 'external', filename)
    df_submission.to_csv(submission_data_file, index=False)


In [32]:
create_submission_file(dummy_model, df_test, 'dummy_01.csv')

In [33]:
show_metrics(*test_prediction)

Accuracy score: 0.61
Confusion matrix: [[110   0]
 [ 69   0]]
Precision: 0.0
Recall: 0.0


## Logistic regression model

In [34]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
predictions = [y_test, lr_model.predict(X_test)]
show_metrics(*predictions)

Accuracy score: 0.83
Confusion matrix: [[94 16]
 [15 54]]
Precision: 0.77
Recall: 0.78


In [36]:
create_submission_file(lr_model, df_test, 'lr_01.csv')

In [41]:
lr_model.coef_.shape
df_train.shape

(891, 44)