# Used to train/test the predicition model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
#Import Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [3]:
data_path = '/home/jovyan/work/data/training_titanic_dataset.csv'
df = pd.read_csv(data_path, index_col=0)

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Imp_Embarked_mf,Imp_Age_Mean,Imp_Age_Median,Imp_Age_Rand,Imp_Det_Lin,Imp_Sto_Lin
0,1,0,3,1,22.0,1,0,7.25,1.0,1.0,22.0,22.0,22.0,22.0,22.0
1,2,1,1,0,38.0,1,0,71.2833,2.0,2.0,38.0,38.0,38.0,38.0,38.0
2,3,1,3,0,26.0,0,0,7.925,1.0,1.0,26.0,26.0,26.0,26.0,26.0
3,4,1,1,0,35.0,1,0,53.1,1.0,1.0,35.0,35.0,35.0,35.0,35.0
4,5,0,3,1,35.0,0,0,8.05,1.0,1.0,35.0,35.0,35.0,35.0,35.0


In [5]:
# Create the test/train split
x = df.drop('Survived', axis=1)
y = df['Survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)

In [6]:
# Make a list of the imputed ages column names
age_list = ['Imp_Age_Mean', 'Imp_Age_Median', 'Imp_Age_Rand', 'Imp_Det_Lin', 'Imp_Sto_Lin']

# Make a list of the classifiers to evaluate
classifiers = [
    GaussianNB(),
    SGDClassifier(),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(kernel="rbf", C=0.025, probability=True),
    GradientBoostingClassifier()
    ]

In [7]:
# Using Loops we are going to go through the different imputations and models
model_results = []

for age in age_list:
    x_train_tempdf = x_train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Imp_Embarked_mf', age]]
    x_test_tempdf = x_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Imp_Embarked_mf', age]]
    for classifier in classifiers:
        pipe = Pipeline(steps=[('classifier', classifier)])
        pipe.fit(x_train_tempdf, y_train)
        #print(f"The Results for {age} imputation and classifier is {classifier}")
        #print("model score: %.3f" % pipe.score(x_test_tempdf, y_test))
        name = f"{age}-{classifier}"
        model_results.append((name, pipe.score(x_test_tempdf, y_test)))

In [8]:
#get model results in order
model_results = sorted(model_results, key=lambda x: x[1], reverse=True)
#Print top 5 models
model_results[:10]

[('Imp_Det_Lin-GradientBoostingClassifier()', 0.8430493273542601),
 ('Imp_Sto_Lin-GradientBoostingClassifier()', 0.8385650224215246),
 ('Imp_Det_Lin-RandomForestClassifier()', 0.8340807174887892),
 ('Imp_Sto_Lin-RandomForestClassifier()', 0.8295964125560538),
 ('Imp_Age_Mean-GradientBoostingClassifier()', 0.820627802690583),
 ('Imp_Age_Rand-GradientBoostingClassifier()', 0.820627802690583),
 ('Imp_Age_Median-GradientBoostingClassifier()', 0.8116591928251121),
 ('Imp_Age_Mean-RandomForestClassifier()', 0.8026905829596412),
 ('Imp_Age_Rand-RandomForestClassifier()', 0.7982062780269058),
 ('Imp_Age_Rand-GaussianNB()', 0.7892376681614349)]

In [9]:
#Lets take the GBC as is and run it through the kaggle test data

In [10]:
clf = GradientBoostingClassifier()
gcb_x_train = x_train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Imp_Embarked_mf', 'Imp_Det_Lin']]
gcb_x_test = x_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Imp_Embarked_mf', 'Imp_Det_Lin']]
clf.fit(gcb_x_train, y_train)

GradientBoostingClassifier()

In [11]:
clf.score(gcb_x_test, y_test)

0.8430493273542601

In [None]:
filename = '/home/jovyan/work/src/app/assets/models/current_models/finalized_gbc_model_11_30_2020.sav'
pickle.dump(clf, open(filename, 'wb'))

In [12]:
#This is the predict

In [28]:
# Read in the Kaggle Test Dataframe
data_path = '/home/jovyan/work/data/test.csv'
testdf = pd.read_csv(data_path, index_col=0)

gender = {
    'male': 1,
    'female':0
}

embark = {
    'S': 1,
    'C':2,
    'Q':3
}

testdf['Sex'] = testdf['Sex'].apply(lambda x: gender.get(x))
testdf['Embarked'] = testdf['Embarked'].apply(lambda x: embark.get(x))
# Now to build in the imputation function for Embarked and age
import sys
sys.path.insert(1, '/home/jovyan/work/src/app')
from modules.data_pre import lin_model_age
from modules.data_pre import most_common

testdf['Imp_Embarked_mf'] = testdf['Embarked']
testdf['Imp_Embarked_mf'] = testdf['Imp_Embarked_mf'].apply(most_common, mf_value=testdf['Embarked'].value_counts().idxmax().tolist())
testdf['Imp_Fare_mf'] = testdf['Fare']
testdf['Imp_Fare_mf'] = testdf['Imp_Fare_mf'].apply(most_common, mf_value=testdf['Fare'].value_counts().idxmax().tolist())
testdf['Imp_Det_Lin'] = testdf.apply(lin_model_age, axis=1)

In [1]:
def survive_pred(data):
    xnew = [[data['Pclass'], data['Sex'], data['SibSp'],data['Parch'], data['Imp_Fare_mf'], data['Imp_Embarked_mf'], data['Imp_Det_Lin']]]
    pred_surive = clf.predict(xnew)[0]
    return pred_surive

SyntaxError: invalid syntax (<ipython-input-1-4099fc9b966a>, line 3)

In [32]:
testdf['Pred_Survive'] = testdf.apply(survive_pred, axis=1)

In [37]:
subdf = testdf[['Pred_Survive']]

In [40]:
save_data_path = '/home/jovyan/work/data/kaggle_sub.csv'
subdf.to_csv(save_data_path)

In [None]:
'''
This model validated at .77990 on Kaggle.  So while pretty good, it needs some work.  
However, as the purpose of this project to build a full end to end prediction with rendezvous architecture
to demonstrate how to build a model, develop it, and continue improvements using microservcies.
So the next step is to save the model and start working on the API.
'''