In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, roc_auc_score

In [2]:
df = pd.read_csv("http://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
target = "Survived"
ignore_cols = ["Name", "PassengerId", "Ticket", "Cabin"]
numeric_preds = list(set(df._get_numeric_data().columns) - set([target] +ignore_cols))
cat_preds = list(set(df.columns) - set([target] + ignore_cols + numeric_preds))

In [11]:
numeric_preds

['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']

In [12]:
cat_preds

['Sex', 'Embarked']

In [5]:
df_prep = pd.concat([df[numeric_preds], df[[target]], pd.get_dummies(df[cat_preds], drop_first=True)], axis=1).fillna(0)
df_prep.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Pclass,Survived,Sex_male,Embarked_Q,Embarked_S
0,22.0,7.25,1,0,3,0,1,0,1
1,38.0,71.2833,1,0,1,1,0,0,0
2,26.0,7.925,0,0,3,1,0,0,1
3,35.0,53.1,1,0,1,1,0,0,1
4,35.0,8.05,0,0,3,0,1,0,1


In [6]:
predictors = list(set(df_prep.columns) - set([target]))

In [13]:
predictors

['Embarked_Q',
 'Age',
 'Sex_male',
 'Fare',
 'SibSp',
 'Parch',
 'Pclass',
 'Embarked_S']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_prep[predictors], df_prep[target], test_size=0.2, random_state=42)

In [8]:
rf = RandomForestClassifier(n_estimators=100, oob_score=True, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, oob_score=True, random_state=42)

In [14]:
rf.predict(X_test)

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [9]:
pd.DataFrame(confusion_matrix(y_test, rf.predict(X_test)))

Unnamed: 0,0,1
0,93,12
1,24,50


In [17]:
df_prep.dtypes

Age           float64
Fare          float64
SibSp           int64
Parch           int64
Pclass          int64
Survived        int64
Sex_male        uint8
Embarked_Q      uint8
Embarked_S      uint8
dtype: object

In [31]:
event = {'id': 111,
            'Embarked_Q': 0 ,
            'Age': 22.0,
            'Sex_male': 1,
            'Fare':7.2500,
            'SibSp': 1,
            'Parch':0,
            'Pclass':3,
            'Embarked_S':1}

In [26]:
ordered_features = [
    data['Embarked_Q'],
    data['Age'],
    data['Sex_male'],
    data['Fare'],
    data['SibSp'],
    data['Parch'],
    data['Pclass'],
    data['Embarked_S']
]

features = np.array(ordered_features).reshape(1, -1)

In [28]:
prediction = model.predict_proba(features)[0][1]

In [32]:
scoreModel(event)

{'statusCode': 200, 'body': '{"id": 111, "prediction": 0.15167416722370064}'}

In [30]:
import json
import numpy as np

model = rf

def scoreModel(event):
    
    try:
        data = event

        ordered_features = [
            data['Embarked_Q'],
            data['Age'],
            data['Sex_male'],
            data['Fare'],
            data['SibSp'],
            data['Parch'],
            data['Pclass'],
            data['Embarked_S']
        ]

        features = np.array(ordered_features).reshape(1, -1)
        prediction = model.predict_proba(features)[0][1]
        
        
        return {
            'statusCode': 200,
            'body': json.dumps({'id': data['id'], 'prediction': prediction})
        }
    except Exception as e:
        return {
            'statusCode': 500,
            'body': json.dumps({'message': str(e)})
        }