In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path = os.path.join(processed_data_path, 'train.csv')
test_file_path = os.path.join(processed_data_path, 'test.csv')

In [3]:
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Fare_Bin_very_low     891 non-null int64
Fare_Bin_low          891 non-null int64
Fare_Bin_high         891 non-null int64
Fare_Bin_very_high    891 non-null int64
Embarked_C            891 non-

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Fare_Bin_very_low     418 non-null int64
Fare_Bin_low          418 non-null int64
Fare_Bin_high         418 non-null int64
Fare_Bin_very_high    418 non-null int64
Embarked_C            418 non-null int64
Embarked_Q            418 n

## Prepare data

In [6]:
# Converts dataframe into matrix without the Survived column
X = train_df.loc[:,'Age':].as_matrix().astype('float')
# ravel converts into single dimensional matrix also called vector. Since one dimensional used lower case 'y'
y = train_df['Survived'].ravel()

  


In [7]:
print(X.shape, y.shape)

(891, 32) (891,)


In [8]:
# train test split
from sklearn.model_selection import train_test_split

# random_state param make sure that same result is returned whenever executed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [9]:
# average survival in train and test
print('mean survival in train : {0:.3f}'.format(np.mean(y_train)))
print('mean survival in test : {0:.3f}'.format(np.mean(y_test)))

mean survival in train : 0.378
mean survival in test : 0.408


## Baseline model

In [10]:
from sklearn.dummy import DummyClassifier

# create model
dummy_model = DummyClassifier(strategy='most_frequent', random_state=1)
# train model
dummy_model.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=1, strategy='most_frequent')

In [11]:
print('Baseline model score : {0:.2f}'.format(dummy_model.score(X_test, y_test)))

Baseline model score : 0.59


In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

print('Baseline model accuracy : {0:.2f}'.format(accuracy_score(y_test, dummy_model.predict(X_test))))
print('Baseline model confusion matrix : \n {0}'.format(confusion_matrix(y_test, dummy_model.predict(X_test))))
print('Baseline model precision : {0:.2f}'.format(precision_score(y_test, dummy_model.predict(X_test))))
print('Baseline model recall : {0:.2f}'.format(recall_score(y_test, dummy_model.predict(X_test))))

Baseline model accuracy : 0.59
Baseline model confusion matrix : 
 [[106   0]
 [ 73   0]]
Baseline model precision : 0.00
Baseline model recall : 0.00


  'precision', 'predicted', average, warn_for)


## First kaggle submission

In [17]:
# convert test_df to matrix
test_X = test_df.as_matrix().astype('float')

  


In [18]:
# make predictions
predictions = dummy_model.predict(test_X)

In [19]:
# create submission dataframe with passengerId and predicted value
submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

In [20]:
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [21]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, 'baseline_submission.csv')

In [22]:
# index=False makes sure that no index column is added
submission_df.to_csv(submission_file_path, index=False)

In [32]:
def get_submission_file(model, filename):
    # convert test_df to matrix
    test_X = test_df.as_matrix().astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # create submission dataframe with passengerId and predicted value
    submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    # submission file
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    # index=False makes sure that no index column is added
    submission_df.to_csv(submission_file_path, index=False)

In [24]:
get_submission_file(dummy_model, 'baseline_submission.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Logistic regression model

In [25]:
from sklearn.linear_model import LogisticRegression

# create model
logistic_model = LogisticRegression(random_state=1)
# train model
logistic_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
def get_model_performance(model):
    print('Baseline model score : {0:.2f}'.format(model.score(X_test, y_test)))
    print('Baseline model accuracy : {0:.2f}'.format(accuracy_score(y_test, model.predict(X_test))))
    print('Baseline model confusion matrix : \n {0}'.format(confusion_matrix(y_test, model.predict(X_test))))
    print('Baseline model precision : {0:.2f}'.format(precision_score(y_test, model.predict(X_test))))
    print('Baseline model recall : {0:.2f}'.format(recall_score(y_test, model.predict(X_test))))

In [27]:
get_model_performance(logistic_model)

Baseline model score : 0.82
Baseline model accuracy : 0.82
Baseline model confusion matrix : 
 [[93 13]
 [19 54]]
Baseline model precision : 0.81
Baseline model recall : 0.74


In [28]:
logistic_model.coef_

array([[-0.0188797 ,  0.00365923, -0.44612568,  0.25235362, -1.00437111,
        -0.27710578,  0.3361231 , -0.2040542 ,  0.70576026,  1.15286614,
         0.23017411, -0.13294319, -0.70411096,  0.87117662,  0.67399112,
        -0.43845825,  0.12534422,  0.3445212 ,  0.27444044,  0.36240362,
         0.54179194,  0.52043891,  0.04447864,  0.24839361,  0.85831588,
         0.24720306,  1.60060026,  0.56233656, -1.20852965,  1.30154098,
        -0.28851509, -1.10792663]])

## Second kaggle submission

In [33]:
get_submission_file(logistic_model, 'logistic_submission.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Hyperparameter optimization

In [35]:
# create model
logistic_model = LogisticRegression(random_state=1)

In [38]:
from sklearn.model_selection import GridSearchCV

params = {'C':[1.0,10.0,100.0,1000.0], 'penalty':['l1','l2']}
logistic_optimized_model = GridSearchCV(logistic_model, param_grid=params, cv=12)

logistic_optimized_model.fit(X_train, y_train)





GridSearchCV(cv=12, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
logistic_optimized_model.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [40]:
logistic_optimized_model.best_score_

0.8370786516853933

In [41]:
get_model_performance(logistic_optimized_model)

Baseline model score : 0.83
Baseline model accuracy : 0.83
Baseline model confusion matrix : 
 [[94 12]
 [19 54]]
Baseline model precision : 0.82
Baseline model recall : 0.74


In [42]:
get_submission_file(logistic_optimized_model, 'logistic_optimized_submission.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Feature normalization and standardization

In [43]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [49]:
# normalization
scaler = MinMaxScaler()
X_train_scaled_minmax = scaler.fit_transform(X_train)
X_test_scaled_minmax = scaler.fit_transform(X_test)

In [50]:
X_train_scaled_minmax[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [51]:
X_train_scaled_minmax

array([[0.37170143, 0.04538098, 0.2       , ..., 0.        , 0.        ,
        0.        ],
       [0.37170143, 0.1111184 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.42196532, 0.02049464, 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.25860769, 0.14346245, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25860769, 0.01473662, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25860769, 0.01571255, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [52]:
# standardization
scaler = StandardScaler()
X_train_scaled_standard = scaler.fit_transform(X_train)
X_test_scaled_standard = scaler.fit_transform(X_test)

## Model after standardization

In [65]:
from sklearn.model_selection import GridSearchCV

params = {'C':[1.0,10.0,100.0,1000.0], 'penalty':['l1','l2']}
logistic_optimized_model = GridSearchCV(logistic_model, param_grid=params, cv=3)

logistic_optimized_model.fit(X_train_scaled_standard, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [66]:
logistic_optimized_model.best_score_

0.827247191011236

In [67]:
get_model_performance(logistic_optimized_model)

Baseline model score : 0.65
Baseline model accuracy : 0.65
Baseline model confusion matrix : 
 [[103   3]
 [ 60  13]]
Baseline model precision : 0.81
Baseline model recall : 0.18


## Model persistence

In [68]:
import pickle

# file paths
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

# open files in write mode
model_pickle_file = open(model_file_path, 'wb')
scaler_pickle_file = open(scaler_file_path, 'wb')

# persist model and scaler
pickle.dump(logistic_optimized_model, model_pickle_file)
pickle.dump(scaler, scaler_pickle_file)

# close files
model_pickle_file.close()
scaler_pickle_file.close()

In [70]:
# load model and scaler objects

# open files in read mode
model_pickle_file = open(model_file_path, 'rb')
scaler_pickle_file = open(scaler_file_path, 'rb')

# load files
loaded_model = pickle.load(model_pickle_file)
loaded_scaler = pickle.load(scaler_pickle_file)

# close files
model_pickle_file.close()
scaler_pickle_file.close()

In [71]:
loaded_model

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [72]:
loaded_scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [75]:
X_test_scaled =loaded_scaler.transform(X_test)
loaded_model.score(X_test_scaled, y_test)

0.8156424581005587