# Hosting a Model Using an API

In [78]:
#Import Python Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [79]:

# Load the Titanic training data from a CSV file located in the working directory.
df = pd.read_csv("titanic-train.csv")

# Show the column index of the loaded frame as this cell's output.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [80]:
# Keep only the target (Survived) and the columns used as model inputs.
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
df2 = df.loc[:, selected_columns]
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [81]:
# Report the number of missing values per column of interest.
# Each pair is (label shown in the output, actual column name in df2).
null_counts = df2.isnull().sum()
for label, column in [('Age', 'Age'),
                      ('PClass', 'Pclass'),
                      ('SibSp', 'SibSp'),
                      ('Parch', 'Parch'),
                      ('Survived', 'Survived')]:
    print(label + ' missing ', null_counts[column])

Age missing  177
PClass missing  0
SibSp missing  0
Parch missing  0
Survived missing  0


In [82]:
# Replace missing ages with the column mean, then confirm none are left.
# NOTE(review): the mean is computed over every row, i.e. before the
# train/test split performed later in the notebook.
mean_age = df2['Age'].mean()
df2['Age'] = df2['Age'].fillna(mean_age)

print('Age missing ', df2['Age'].isnull().sum())

Age missing  0


In [83]:
# Display the column index of df2 as this cell's output.
df2.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [84]:
# Convert categorical columns of df2 to indicator columns; in the displayed
# result the Sex column is replaced by Sex_female / Sex_male.
df3 = pd.get_dummies(df2)
df3.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male
0,0,3,22.0,1,0,0,1
1,1,1,38.0,1,0,1,0
2,1,3,26.0,0,0,1,0
3,1,1,35.0,1,0,1,0
4,0,3,35.0,0,0,0,1


In [85]:
from sklearn.model_selection import train_test_split

# Separate the model inputs from the target before splitting.
features = df3[['Pclass','Sex_male','Sex_female','Age','SibSp','Parch']]
target = df3['Survived']

# 70/30 split, stratified on the target so both parts keep the same
# survived / not-survived proportions; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    features,
    target,
    train_size=0.7,
    stratify=target.values,
    random_state=123,
)

# Note: what gets printed below is the training feature frame (train_X).
print("Labels for training and testing data")
print(train_X)

Labels for training and testing data
     Pclass  Sex_male  Sex_female        Age  SibSp  Parch
199       2         0           1  24.000000      0      0
468       3         1           0  29.699118      0      0
198       3         0           1  29.699118      0      0
574       3         1           0  16.000000      0      0
776       3         1           0  29.699118      0      0
148       2         1           0  36.500000      0      2
227       3         1           0  20.500000      0      0
408       3         1           0  21.000000      0      0
762       3         1           0  20.000000      0      0
166       1         0           1  29.699118      0      1
62        1         1           0  45.000000      1      0
427       2         0           1  19.000000      0      0
662       1         1           0  47.000000      0      0
471       3         1           0  38.000000      0      0
732       2         1           0  29.699118      0      0
610       3        



In [86]:
# Percentage of each class (0 / 1) in the training and testing targets,
# to check that the stratified split kept the class balance.
for split_name, labels in (('Training', train_y), ('Testing', test_y)):
    print(split_name + ' : ', np.bincount(labels) / float(len(labels)) * 100.0)

Training :  [61.63723917 38.36276083]
Testing :  [61.56716418 38.43283582]


In [87]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Build the random forest classifier (`clf` is the conventional short name
# for a classifier object).
clf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=5,
    oob_score=True,     # keep the out-of-bag estimate so it can be inspected after fitting
    n_jobs=-1,
    random_state=1234,  # fixed seed for repeatable results
    verbose=True,       # emits the [Parallel(...)] progress lines on fit/predict
)

# Fit on the training features so the forest learns how they relate to the
# training target (the Survived label).
clf.fit(train_X, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=True, random_state=1234, verbose=True,
            warm_start=False)

In [88]:
# Score the fitted forest on the training split (the same data it was fitted on).
clf.score(train_X, train_y)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


0.8426966292134831

In [89]:
# Out-of-bag score, available because the forest was created with oob_score=True.
clf.oob_score_

0.826645264847512

In [90]:

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [91]:
from sklearn.metrics import accuracy_score
print("Predicting Titanic on the train set using Random Forest")

# Predict on the rows the forest was fitted on.
y_train_pred = clf.predict(train_X)

# Overall accuracy (4 decimals), then per-class metrics and the confusion matrix.
train_accuracy = accuracy_score(train_y, y_train_pred)
print("Accurary : ", round(train_accuracy, 4))
print(classification_report(train_y, y_train_pred))
print(confusion_matrix(train_y, y_train_pred))


Predicting Titanic on the train set using Random Forest


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


Accurary :  0.8427
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       384
           1       0.83      0.74      0.78       239

   micro avg       0.84      0.84      0.84       623
   macro avg       0.84      0.82      0.83       623
weighted avg       0.84      0.84      0.84       623

[[348  36]
 [ 62 177]]


In [92]:
print("Predicting Titanic on the test set using Random Forest")

# Predict on the held-out rows that were not used for fitting.
y_pred = clf.predict(test_X)

# Overall accuracy (4 decimals), then per-class metrics and the confusion matrix.
test_accuracy = accuracy_score(test_y, y_pred)
print("Accurary : ", round(test_accuracy, 4))
print(classification_report(test_y, y_pred))
print(confusion_matrix(test_y, y_pred))


Predicting Titanic on the test set using Random Forest


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


Accurary :  0.8209
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       165
           1       0.82      0.68      0.74       103

   micro avg       0.82      0.82      0.82       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.82      0.82      0.82       268

[[150  15]
 [ 33  70]]


## Save Model

In [95]:
# Save your model
# `sklearn.externals.joblib` was removed from recent scikit-learn releases;
# prefer the standalone `joblib` package and fall back to the old location
# only on installations that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl')
print("Model dumped!")


# Saving the data columns from training.
# Use the columns of train_X (the frame passed to clf.fit) rather than df3:
# df3 also contains the target 'Survived' and orders the columns differently,
# so a consumer aligning its input to this list would feed the model the
# wrong number / order of features.
model_columns = list(train_X.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Models columns dumped!")

Model dumped!
Models columns dumped!


## Load Model

In [None]:
# Load the model that you just saved.
# The restored object is the random forest that the save cell wrote to
# 'model.pkl', even though the variable is named `lr`.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
lr = joblib.load('model.pkl')



In [94]:
print("Predicting Titanic on the test set using Random Forest")

# Re-run the test-set evaluation with the model restored from disk; matching
# the earlier test metrics shows the save/load round trip preserved it.
y_pred = lr.predict(test_X)

reloaded_accuracy = accuracy_score(test_y, y_pred)
print("Accurary : ", round(reloaded_accuracy, 4))
print(classification_report(test_y, y_pred))
print(confusion_matrix(test_y, y_pred))



Predicting Titanic on the test set using Random Forest


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.1s finished


Accurary :  0.8209
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       165
           1       0.82      0.68      0.74       103

   micro avg       0.82      0.82      0.82       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.82      0.82      0.82       268

[[150  15]
 [ 33  70]]
