## Build Predictive Models


In [2]:
import pandas as pd
import os
import numpy as np

### Import Data

In [3]:
processed_data_path = os.path.join(os.path.pardir, "data",'processed')
train_file_path = os.path.join(processed_data_path, "train.csv")
test_file_patah = os.path.join(processed_data_path, "test.csv")

In [4]:
####DatFrames

train_df = pd.read_csv(train_file_path, index_col = 'PassengerId')

test_df = pd.read_csv(test_file_patah, index_col = 'PassengerId')



In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

### Data Preparation

In [10]:
X = train_df.loc[:,'Age':].values.astype('float')
y = train_df['Survived'].ravel()

In [15]:
print(X.shape, y.shape)

(891, 32) (891,)


### train_test split

In [20]:
from  sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 0)

print(X_train.shape, y_train.shape)

print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [21]:
print("mean survivel in train: {0: .3f} " .format(np.mean(y_train)))
print("mean survivel in test: {0: .3f} " .format(np.mean(y_test)))

mean survivel in train:  0.383 
mean survivel in test:  0.385 


###### SKlearn version

In [22]:
import sklearn 

sklearn.__version__

'0.21.2'

### BaseLine Model

In [24]:
from sklearn.dummy import DummyClassifier

In [26]:
#Create model

dummy_model = DummyClassifier(strategy ='most_frequent', random_state = 0)


In [27]:
## train the model

dummy_model.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [28]:
print("Score for baseline model : {0: .2f}".format(dummy_model.score(X_test, y_test)))

Score for baseline model :  0.61


### Performence metrics: Accuracy, Confusion Matrix, Precision, Recall

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score 

In [36]:
## Accuracy Score 

print("Accuracy for baseline model : {0: .2f}".format(accuracy_score(y_test, dummy_model.predict(X_test))))

Accuracy for baseline model :  0.61


In [38]:
## Confusion Matrix

print("Confusion matrix for baseline model : \n {0}".format(confusion_matrix(y_test, dummy_model.predict(X_test))))

Confusion matrix for baseline model : 
 [[110   0]
 [ 69   0]]


In [39]:
## Precision and Recall Score 

print("Precision for baseline model : {0: .2f}".format(precision_score(y_test, dummy_model.predict(X_test))))

print("Recall for baseline model : {0: .2f}".format(recall_score(y_test, dummy_model.predict(X_test))))

Precision for baseline model :  0.00
Recall for baseline model :  0.00


  'precision', 'predicted', average, warn_for)


In [58]:
## print the performence Metrices

def print_metrices(model):
    ## Accuracy Score 
    print("Accuracy for  model : {0: .2f}".format(accuracy_score(y_test, model.predict(X_test))))
    print("\n")
    print("Confusion matrix for  model : \n {0}".format(confusion_matrix(y_test, model.predict(X_test))))
    print("\n")
    ## Precision and Recall Score 
    print("Precision for  model : {0: .2f}".format(precision_score(y_test, model.predict(X_test))))
    print("Recall for  model : {0: .2f}".format(recall_score(y_test, model.predict(X_test))))

In [59]:
print_metrices(dummy_model)

Accuracy for  model :  0.61


Confusion matrix for  model : 
 [[110   0]
 [ 69   0]]


Precision for  model :  0.00
Recall for  model :  0.00


## Use Kaagle Data Set

In [41]:
## use the test data set from kaagle

test_X = test_df.values.astype('float')

In [42]:
###  predict 
predictions = dummy_model.predict(test_X)

In [45]:
### creat CSV 

df_submission = pd.DataFrame({'PassengerId': test_df.index, "Survived": predictions})

df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [46]:
### write the file

submission_data_path = os.path.join(os.path.pardir, 'data','external')
submision_file_path = os.path.join(submission_data_path, '01_dummy.csv')


In [47]:
df_submission.to_csv(submision_file_path, index= False)

In [50]:
def get_submission_file(model, filename):
    ## use the test data set from kaagle
    test_X = test_df.values.astype('float')
    
    ## make predictions
    predictions = model.predict(test_X)
    
    ## Create DataFrame for the predictions
    submission_df = pd.DataFrame({'PassengerId': test_df.index, "Survived": predictions})    
    ## Create the prediction csv file
    submission_data_path = os.path.join(os.path.pardir, 'data','external')
    submision_file_path = os.path.join(submission_data_path, filename)
    df_submission.to_csv(submision_file_path, index= False)
    

In [51]:
get_submission_file(dummy_model, '01_dummy.csv')

### LogisticRegression

In [61]:
from sklearn.linear_model import LogisticRegression

In [65]:
## model

model_l_r = LogisticRegression(random_state =0)

#Train
model_l_r.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
## score 

print("Scorefor the model {0: .2f}".format(model_l_r.score(X_test, y_test)))


Scorefor the model  0.83


### Performence Metrics

In [67]:
print_metrices(model_l_r)

Accuracy for  model :  0.83


Confusion matrix for  model : 
 [[95 15]
 [15 54]]


Precision for  model :  0.78
Recall for  model :  0.78


#### model coefficinets

In [68]:
model_l_r.coef_

array([[-0.02844175,  0.00455341, -0.5004576 ,  0.61774057, -0.81275851,
         0.12903828, -0.16993328, -0.39612356,  0.52013941,  1.09848813,
         0.40536714, -0.18206889, -0.30193394,  0.96541477,  0.48235054,
        -0.344792  ,  0.27361348,  1.21989989,  0.56621016, -1.44437763,
         1.07590817, -0.11233172, -0.47594905,  0.16305224,  0.24595004,
         0.28139246,  0.41257857,  0.49230706,  0.46124568,  0.14942057,
         0.37267581,  0.7302975 ]])

### Create Kaagle Submission

In [70]:
get_submission_file(model_l_r, '02_logic_regresson.csv')

## Hyperparameter Optimization using GridSearchCV

In [71]:
model_l_r =LogisticRegression(random_state = 0)

In [78]:
## GridSerachCV

from sklearn.model_selection import GridSearchCV

params = {
    'C':[1.0, 10.0, 50.0, 100.0, 1000.0],
    'penalty':['l1','l2']
}

gscv = GridSearchCV(model_l_r, param_grid = params, cv=3)

In [79]:
## train data
gscv.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [83]:
##Best params
gscv.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [84]:
gscv.best_score_

0.8328651685393258

In [85]:
print("Base Model score : {0: .2f}".format(gscv.score(X_test, y_test)))

Base Model score :  0.83


### Test Kaggle Data

In [86]:
 get_submission_file(gscv, "0_l_r_gridsearhCv.csv")

## Feature Normalization

In [87]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [90]:
# feature normalization

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

X_test_scaled = scaler.transform(X_test)

### Feature Standardization

In [91]:
stan =  StandardScaler()

X_train_scaled = stan.fit_transform(X_train)
X_test_scaled = stan.transform(X_test)

In [92]:
model_l_r =LogisticRegression(random_state = 0)
## GridSerachCV

from sklearn.model_selection import GridSearchCV

params = {
    'C':[1.0, 10.0, 50.0, 100.0, 1000.0],
    'penalty':['l1','l2']
}

gscv = GridSearchCV(model_l_r, param_grid = params, cv=3)

## train data
gscv.fit(X_train_scaled, y_train)





Base Model score :  0.84




In [94]:
##Best params
gscv.best_score_


0.8132022471910112

In [95]:
print("Base Model score : {0: .2f}".format(gscv.score(X_test_scaled, y_test)))

Base Model score :  0.84



# Persist Models using Pickle

In [97]:
#import pickle library
import pickle

In [99]:
#path
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [102]:
#Save model
with open(model_file_path, 'wb') as file:
    pickle.dump(gscv, file)

In [103]:
#Save scalar

with open(scaler_file_path, "wb") as file:
    pickle.dump(stan, file)

### Load the model

In [105]:
##load model
with open(model_file_path, 'rb') as file:
    l_r_model_loded = pickle.load(file)
    
with  open(scaler_file_path, 'rb') as file:
    scaler_loded = pickle.load(file)

In [106]:
l_r_model_loded

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [107]:
scaler_loded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [108]:
### Test the models

X_test_scaled = scaler_loded.transform(X_test)

print("Loaded model score  : {0: .2f}".format(l_r_model_loded.score(X_test_scaled, y_test)))

Loaded model score  :  0.84
