# 2. Building Predictive Models

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Read the processed train and test CSV files from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_processed.csv', 'test_processed.csv')
)

In [3]:
# Summarize the training frame: column names, non-null counts, and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 34 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
PassengerId           891 non-null int64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-

In [4]:
# Same summary for the test frame (it has no 'Survived' column, hence one column fewer).
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 33 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
PassengerId           418 non-null int64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-

In [5]:
# Feature matrix: every column from 'Age' onward (label-based slice), cast to float.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was removed
# from later pandas releases.
# NOTE(review): the slice also includes 'PassengerId' as a feature; confirm that is intended.
X = train_df.loc[:, 'Age':].values.astype('float')
# Target vector: the 'Survived' labels as a flat 1-D array.
y = train_df['Survived'].values.ravel()

In [6]:
# Sanity check: row counts of features and labels must match.
print(f'{X.shape} {y.shape}')

(891, 33) (891,)


In [8]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(f'{X_train.shape} {y_train.shape}')
print(f'{X_test.shape} {y_test.shape}')

(712, 33) (712,)
(179, 33) (179,)


In [9]:
# Compare the positive-class rate of both splits to check they are similarly balanced.
print(f'Mean survival for train data: {y_train.mean():.3f}')
print(f'Mean survival for test data: {y_test.mean():.3f}')

Mean survival for train data: 0.383
Mean survival for test data: 0.385


## Baseline Model

In [10]:
from sklearn.dummy import DummyClassifier

In [11]:
# Baseline classifier configured with the 'most_frequent' strategy.
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [13]:
# Fit the baseline on the training split.
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [14]:
# Baseline accuracy on the held-out split, as reported by the estimator's score().
print(f'Baseline model accuracy: {model_dummy.score(X_test, y_test)}')

Baseline model accuracy: 0.6145251396648045


In [15]:
# Performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [17]:
# Baseline accuracy via sklearn.metrics, rounded to two decimals.
print('Accuracy score: {0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accuracy score: 0.61


In [18]:
# Confusion matrix of the baseline's predictions against the held-out labels.
print(f'Confusion matrix for baseline model: \n {confusion_matrix(y_test, model_dummy.predict(X_test))}')

Confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]


In [19]:
# Precision and recall of the baseline on the held-out split.
# The baseline's confusion matrix has no predictions in the positive column, so
# precision is undefined here; scikit-learn reports 0.0 and emits a warning.
# Predict once and reuse the result for both metrics.
baseline_pred = model_dummy.predict(X_test)
# Precision
print(f'Precision score: {precision_score(y_test, baseline_pred):.2f}')
# Recall
print(f'Recall score: {recall_score(y_test, baseline_pred):.2f}')

Accuracy score: 0.00
Accuracy score: 0.00


  'precision', 'predicted', average, warn_for)


## Logistic Regression Model

In [20]:
# Logistic regression with library-default hyperparameters and a fixed seed,
# fitted on the (unscaled) training split.
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(random_state = 1)
logistic_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Accuracy of the logistic regression model on the held-out split (estimator.score()).
print('Logistic regression score: ', logistic_model.score(X_test, y_test))

Logistic regression score:  0.832402234637


In [26]:
# Evaluate the logistic regression model on the held-out split.
# Predict once and reuse the predictions for every metric; each line is labelled
# with the metric and model it actually reports.
logistic_pred = logistic_model.predict(X_test)
print(f'Accuracy score: {accuracy_score(y_test, logistic_pred):.2f}')
print('Confusion matrix for logistic regression model: \n {0}'.format(confusion_matrix(y_test, logistic_pred)))
print(f'Precision score: {precision_score(y_test, logistic_pred):.2f}')
print(f'Recall score: {recall_score(y_test, logistic_pred):.2f}')

Accuracy score: 0.83
Confusion matrix for baseline model: 
 [[95 15]
 [15 54]]
Accuracy score: 0.78
Accuracy score: 0.78


In [28]:
# Fitted coefficients of the logistic model (one weight per feature column of X).
logistic_model.coef_

array([[ -2.85820422e-02,   4.54079277e-03,  -6.38844993e-05,
         -5.00543528e-01,   6.26460824e-01,  -8.01448460e-01,
          1.28751908e-01,  -1.61688007e-01,  -3.93688106e-01,
          5.18056781e-01,   1.09711000e+00,   3.99528400e-01,
         -1.73461187e-01,  -3.01861412e-01,   9.71685792e-01,
          4.89072392e-01,  -3.48009808e-01,   2.55509650e-01,
          1.21952487e+00,   5.70341503e-01,  -1.44608809e+00,
          1.08834568e+00,  -1.13229427e-01,  -4.61655813e-01,
          1.66925735e-01,   2.51335364e-01,   2.82439209e-01,
          4.12048068e-01,   4.94000132e-01,   4.65652969e-01,
          1.53095275e-01,   3.78656512e-01,   7.34091864e-01]])

## Hyperparameter optimization

In [29]:
# Base estimator for the grid search. The solver is pinned to 'liblinear' because
# the parameter grid also tries the 'l1' penalty, which the default solver of newer
# scikit-learn releases ('lbfgs') does not support.
lr_model = LogisticRegression(random_state=0, solver='liblinear')

In [30]:
# Grid Search
from sklearn.model_selection import GridSearchCV

In [32]:
# Candidate inverse-regularization strengths and penalty types to search over.
params = {
    'C': [1.0, 10.0, 100.0, 1000.0],
    'penalty': ['l1', 'l2'],
}
# cv=3 -> every parameter combination is scored with 3-fold cross-validation.
clf = GridSearchCV(estimator=lr_model, param_grid=params, cv=3)

In [33]:
# Run the grid search on the training split only; the held-out split is not used here.
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
# The best-scoring combination of our parameters found by the grid search
clf.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [36]:
# Score of the best parameter combination (scoring=None, so the estimator's default score).
print('Best score from GridSearch: ', clf.best_score_)

Best score from GridSearch:  0.831460674157


In [37]:
# Held-out accuracy of the search object, which scores with the refit best estimator.
print('Logistic regression score after tuning hyperparameters: ', clf.score(X_test, y_test))

Logistic regression score after tuning hyperparameters:  0.832402234637


## Feature Normalization and Standardization

In [38]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature Normalization

In [49]:
# Min-max normalization: learn the scaling from the training split only, then apply it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [50]:
# Check the range of the second feature column (index 1) after scaling.
X_train_scaled[:,1].min(), X_train_scaled[:,1].max()

(0.0, 1.0)

In [51]:
# Apply the already-fitted scaler to the held-out split (transform only, no refit).
X_test_scaled = scaler.transform(X_test)

#### Feature Standardization

In [52]:
# Standardization: fit on the training split only, then apply the same
# transform to both splits. This rebinds `scaler` and the *_scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [53]:
# Range of the second feature column (index 1) after standardization.
X_train_scaled[:,1].min(), X_train_scaled[:,1].max()

(-0.63598588303483594, 9.5901275214924677)

### Model after standardization

In [43]:
# Repeat the grid search on the scaled training features.
# NOTE(review): this cell's execution count precedes the scaling cells above; run it
# after the standardization cell so X_train_scaled holds the StandardScaler output.
# The solver is pinned to 'liblinear' because the grid also tries the 'l1' penalty,
# which the default solver of newer scikit-learn releases ('lbfgs') does not support.
lr_model = LogisticRegression(random_state=0, solver='liblinear')
params = {'C' : [1.0, 10.0, 100.0, 1000.0], 'penalty' : ['l1', 'l2']}
clf = GridSearchCV(lr_model, param_grid=params, cv=3)  # 3-fold cross-validation
clf.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [44]:
# Best cross-validated score of the grid search fitted on the scaled features.
clf.best_score_

0.8117977528089888

In [46]:
# Held-out accuracy; the test features must be scaled with the same fitted scaler.
print('Logistic regression score after standardization: ', clf.score(X_test_scaled, y_test))

Logistic regression score after standardization:  0.843575418994


## Model Persistence

In [65]:
import pickle

# Build the pickle destinations for the model and the scaler; both live in the
# same 'Models' directory, reached via the parent of the working directory.
model_file_path, scaler_file_path = (
    os.path.join(os.path.pardir, 'Titanic-dataset-experimentation', 'Models', pkl_name)
    for pkl_name in ('LR_model.pkl', 'LR_scaler.pkl')
)

In [66]:
# Persisting the model and scaler.
# Each file is opened, written, and closed inside a single `with` block so the
# handle is released even if pickling fails, and no open file object is left
# dangling between notebook cells.
with open(model_file_path, 'wb') as model_file_pickle:
    pickle.dump(clf, model_file_pickle)

with open(scaler_file_path, 'wb') as scaler_file_pickle:
    pickle.dump(scaler, scaler_file_pickle)

### Loading and testing the saved files

In [70]:
# Load the persisted objects back. `with` guarantees the files are closed even
# if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote yourself
# or otherwise trust.
with open(model_file_path, 'rb') as model_file_pickle:
    clf_loaded = pickle.load(model_file_pickle)

with open(scaler_file_path, 'rb') as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [71]:
# Display the unpickled grid-search object to confirm it round-tripped.
clf_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [72]:
# Display the unpickled scaler to confirm it round-tripped.
scaler_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [73]:
# Transforming data using the loaded scaler object
X_test_scaled = scaler_loaded.transform(X_test)

# Score the reloaded classifier on the held-out split to compare with the pre-pickle score.
print('Score for the persisted LR classifier: ', clf_loaded.score(X_test_scaled, y_test))

Score for the persisted LR classifier:  0.843575418994
