### Building Predictive Models - Part I

In [1]:
import pandas as pd
import os
import numpy as np

#### Import Data

In [2]:
# set the path of the processed data
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path = os.path.join(processed_data_path, 'train.csv')
test_file_path = os.path.join(processed_data_path, 'test.csv')

In [3]:
train_df = pd.read_csv(train_file_path, index_col = 'PassengerId')
test_df = pd.read_csv(test_file_path, index_col = 'PassengerId')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 418 non-null    float64
 1   Fare                418 non-null    float64
 2   FamilySize          418 non-null    int64  
 3   IsMother            418 non-null    int64  
 4   IsMale              418 non-null    int64  
 5   Deck_A              418 non-null    int64  
 6   Deck_B              418 non-null    int64  
 7   Deck_C              418 non-null    int64  
 8   Deck_D              418 non-null    int64  
 9   Deck_E              418 non-null    int64  
 10  Deck_F              418 non-null    int64  
 11  Deck_G              418 non-null    int64  
 12  Deck_z              418 non-null    int64  
 13  Pclass_1            418 non-null    int64  
 14  Pclass_2            418 non-null    int64  
 15  Pclass_3            418 non-null    int64  
 16  Title

#### Data Preparation

In [6]:
X = train_df.loc[:, 'Age':].values.astype('float') # Take all rows & columnns from starting from Age and convert their data type to real float
y = train_df['Survived'].ravel() # For output, use ravel() which is a numpy function which creates a flattend 1-D arrary

In [7]:
print(X.shape, y.shape)

(891, 32) (891,)


In [8]:
# Perform the train test split using the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # random_state is the used as a seed value or randomization value
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)                                                

(712, 32) (712,)
(179, 32) (179,)


In [9]:
# Average survival in train and test
print('Mean Survival in train data: {0:.3f}'.format(np.mean(y_train)))
print('Mean Survival in test data: {0:.3f}'.format(np.mean(y_test)))

Mean Survival in train data: 0.383
Mean Survival in test data: 0.385


##### Validate the Scikit-Learn Version

In [10]:
import sklearn

In [11]:
sklearn.__version__

'0.24.2'

#### Baseline the Model

In [12]:
# import function
from sklearn.dummy import DummyClassifier # To create a baseline model import DummyClassifier

In [13]:
# Creating a model
model_dummy = DummyClassifier(strategy = 'most_frequent', random_state = 0)

In [14]:
# Training a model
model_dummy.fit(X_train, y_train) # It takes input and output data

DummyClassifier(random_state=0, strategy='most_frequent')

In [15]:
print('Score for the Baseline Model: {0:.2f}'.format(model_dummy.score(X_test, y_test))) ## Accuracy

Score for the Baseline Model: 0.61


In [16]:
# Performance Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [17]:
# Accuracy Score
print('Accracy for the Baseline Model is : {0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accracy for the Baseline Model is : 0.61


In [18]:
# Confusion Matrix
print('Confusion matrix for the Baseline Model is : \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

Confusion matrix for the Baseline Model is : 
 [[110   0]
 [ 69   0]]


In [19]:
import warnings
warnings.filterwarnings('ignore')

# Precision Score
print('Precision Score for the Baseline Model is : {0:.2f}'.format(precision_score(y_test, model_dummy.predict(X_test))))

Precision Score for the Baseline Model is : 0.00


In [20]:
# Recall Score
print('Recall Score for the Baseline Model is : {0:.2f}'.format(recall_score(y_test, model_dummy.predict(X_test))))

Recall Score for the Baseline Model is : 0.00


### First Kaggle Submission

In [21]:
# Converting to the Matrix
test_X = test_df.values.astype('float') # Converting the test data frame to 2-D Matrix and converting matrix to floating type

In [22]:
# Make Predictions
predictions = model_dummy.predict(test_X)

In [23]:
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived' : predictions})

In [24]:
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [25]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_data_file = os.path.join(submission_data_path, '01_dummy.csv')

In [26]:
df_submission.to_csv(submission_data_file, index=False)
# index = False to supress the additional column which will be created in the final output file with indexes

In [38]:
 def get_submission_file(model, filename):
        # converting to the matrix
        test_X = test_df.values.astype('float')
        # make predictions
        predictions = model.predict(test_X)
        # dataframe for submission
        df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived' : predictions})
        # generating a submission file
        submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
        submission_data_file = os.path.join(submission_data_path, filename)
        # writing to the submission file
        df_submission.to_csv(submission_data_file, index=False)

In [28]:
# get submission file
get_submission_file(model_dummy, '01_dummy.csv')

In [29]:
!kaggle competitions submit -c titanic -f ..\data\external\01_dummy.csv -m "01 Baseline Model"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 4.77kB/s]
100%|##########| 3.18k/3.18k [00:02<00:00, 1.51kB/s]


#### Logisitical Regression Model

In [30]:
# import function
from sklearn.linear_model import LogisticRegression

In [31]:
# Create the Model
model_lr_1 = LogisticRegression(random_state = 0)

In [32]:
# Train the model
model_lr_1.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [33]:
# Evaluate the model
print('Score for the Logistic Regression Model Version 1 is : {0:.2f}'.format(model_lr_1.score(X_test, y_test))) ## Accuracy

Score for the Logistic Regression Model Version 1 is : 0.83


In [34]:
# Evaluating the performance Matrices

# Accuracy
print('Accracy for the Logistic Regression Model Version 1 is : {0:.2f}'.format(accuracy_score(y_test, model_lr_1.predict(X_test))))

# Confusion Matrix
print('Confusion matrix for the Logistic Regression Model Version 1 is : \n {0}'.format(confusion_matrix(y_test, model_lr_1.predict(X_test))))

# Precision Score
print('Precision Score for the Logistic Regression Model Version 1 is : {0:.2f}'.format(precision_score(y_test, model_lr_1.predict(X_test))))

# Precision Score
print('Recall Score for the Logistic Regression Model Version 1 is : {0:.2f}'.format(recall_score(y_test, model_lr_1.predict(X_test))))

Accracy for the Logistic Regression Model Version 1 is : 0.83
Confusion matrix for the Logistic Regression Model Version 1 is : 
 [[95 15]
 [15 54]]
Precision Score for the Logistic Regression Model Version 1 is : 0.78
Recall Score for the Logistic Regression Model Version 1 is : 0.78


In [35]:
# Model Coefficients
model_lr_1.coef_ # model coef or model weights or model weights

array([[-0.03108255,  0.0042563 , -0.49562515,  0.62938038, -0.73371338,
         0.12095211, -0.07736283, -0.37295122,  0.4836195 ,  1.03509223,
         0.26883505, -0.05071579, -0.36235192,  0.90425344,  0.47401862,
        -0.33315492,  0.10930546,  1.05991081,  0.50093029, -1.47897019,
         1.16859477, -0.10330668, -0.21134732,  0.1609539 ,  0.21649565,
         0.2323362 ,  0.4353314 ,  0.43826022,  0.49903308,  0.10782384,
         0.35844359,  0.68667355]])

### Second Kaggle Submission

In [39]:
# Use the get submission file
get_submission_file(model_lr_1, '02_lr.csv')

In [40]:
!kaggle competitions submit -c titanic -f ..\data\external\02_lr.csv -m "02 Second Submission Using Logistical Regression Model"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 15.0kB/s]
100%|##########| 3.18k/3.18k [00:01<00:00, 2.77kB/s]


### Building Predictive Models - Part II

#### Hyperparameter Optimization

In [58]:
## Base Model
model_lr = LogisticRegression(random_state = 0)

In [59]:
from sklearn.model_selection import GridSearchCV # 

In [62]:
parameters = {'C' : [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty' : ['l1', 'l2']}

clf = GridSearchCV(model_lr, param_grid = parameters, cv = 3)
# C parameter controls the penality strength

In [63]:
clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [64]:
clf.best_params_

{'C': 10.0, 'penalty': 'l2'}

In [65]:
print('best score : {0:.2f}'.format(clf.best_score_))

best score : 0.83


In [66]:
# Evaluate the model
print('Score for the Logistic Regression Model Version 3 is : {0:.2f}'.format(clf.score(X_test, y_test)))

Score for the Logistic Regression Model Version 3 is : 0.83


### Third and Final Kaggle Submission

In [69]:
get_submission_file(clf, '03_lr.csv')

In [70]:
!kaggle competitions submit -c titanic -f ..\data\external\03_lr.csv -m "03 Third Submission Using Logistical Regression Model"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 20.3kB/s]
100%|##########| 3.18k/3.18k [00:01<00:00, 2.73kB/s]


#### Feature Normalization and Standardization 

In [78]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature Normalization

In [79]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [80]:
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [81]:
# normalize the test data
X_test_scaled = scaler.transform(X_test)

#### Feature Standardization

In [82]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [83]:
# Base Model
model_lr = LogisticRegression()
parameters = {'C' : [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty' : ['l1', 'l2']}
clf = GridSearchCV(model_lr, param_grid = parameters, cv = 3)
clf.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [84]:
clf.best_score_

0.8089978607476747

In [85]:
print('Score for the Logistic Regression Model Scaled Version is : {0:.2f}'.format(clf.score(X_test_scaled, y_test)))

Score for the Logistic Regression Model Scaled Version is : 0.84


#### Model Persistence

In [94]:
# import pickle library
import pickle

In [95]:
# create the file paths
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [96]:
# open the files to write
model_file_pickle = open(model_file_path, 'wb') # wb is for the binary format
scaler_file_pickle = open(scaler_file_path, 'wb')

In [97]:
# persist the model and scaler
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [98]:
# close the files
model_file_pickle.close()
scaler_file_pickle.close()

#### Load the Persisted File

In [100]:
# Open the files in read mode
model_file_pickle = open(model_file_path, 'rb')
scaler_file_pickle = open(scaler_file_path, 'rb')

# load files
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)

# close the files
model_file_pickle.close()
scaler_file_pickle.close()

In [101]:
clf_loaded

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [102]:
scaler_loaded

StandardScaler()

In [103]:
X_test_scaled = scaler_loaded.transform(X_test)
print('Score for the Persisted Logistic Regression Model is : {0:.2f}'.format(clf_loaded.score(X_test_scaled, y_test)))

Score for the Persisted Logistic Regression Model is : 0.84
