# Modelling

 - Finding an appropriate prediction model for the preprocessed dataset.  
 - Using 10 fold cross validation in parallel.  
 - There is a slight class imbalance so we use roc_auc as a metric.
 - Grid search over multiple models and their hyper-parameters.
 - For simplicity restricting to LogReg, SVM and Random Forest.
 - Aim is not to get best score, but to showcase the approach.

In [119]:
import sys
import joblib
import sklearn
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


sys.path.append("../deployment/src/")
from preprocessor import Preprocessor

% matplotlib inline
% load_ext autoreload
% autoreload 2

# Notebook-wide plotting defaults: seaborn "darkgrid" theme and a 10x5 default figure size.
sns.set_style("darkgrid")
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = (10, 5)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. Loading Data

In [2]:
# Read the raw dataset from the CSV file; `raw` is the frame used by every later cell.
raw = pd.read_csv("../data/titanic.csv")
# Last expression of the cell: displays the first five rows.
raw.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2. Eval Tools
- Build the functions that evaluate a model, summarize the results and run a hyperparameter search.

In [130]:
def _eval(model: BaseEstimator, train_df: pd.DataFrame, val_df: pd.DataFrame) -> (float, float):
    """
    Fit ``model`` on one train/validation split and score it on both parts.

    A new ``Preprocessor`` is fitted on the training frame only and then
    applied to the validation frame. The preprocessed frames must contain
    a "survived" column, which is used as the target.

    :param model: estimator exposing ``fit`` and ``predict_proba``; it is fitted in place.
    :param train_df: training part of the split.
    :param val_df: validation part of the split.
    :return: roc_auc for train and val set.
    """
    target = "survived"

    prep = Preprocessor()
    prepared_train = prep.fit_transform(train_df)
    prepared_val = prep.transform(val_df)

    def _auc(frame):
        # The second column of predict_proba is used as the ranking score.
        probs = model.predict_proba(frame.drop(target, axis=1))[:, 1]
        return roc_auc_score(frame[target], probs)

    model.fit(prepared_train.drop(target, axis=1), prepared_train[target])
    return _auc(prepared_train), _auc(prepared_val)


def evaluate_model(model_name: str, model: BaseEstimator, raw_df: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate a model with a 10-fold stratified cross-validation, folds run in parallel.

    Each fold is preprocessed and scored independently by ``_eval``; the
    per-fold roc_auc scores are then condensed by ``summary``.

    :param model_name: label used as the index of the returned summary row.
    :param model: estimator handed to ``_eval`` for every fold.
    :param raw_df: unprocessed data containing a "survived" column (used for stratification).
    :return: one-row DataFrame with mean and std of the train and validation scores.
    """
    cv = StratifiedKFold(n_splits=10)
    # cv.split yields *positional* row indices, so rows must be selected with
    # .iloc; label-based .loc only works by accident on a default RangeIndex.
    folds = [(raw_df.iloc[train_ix], raw_df.iloc[val_ix])
             for train_ix, val_ix in cv.split(raw_df, raw_df["survived"])]

    parallel = joblib.Parallel(n_jobs=-1, backend="multiprocessing")
    scores = parallel(joblib.delayed(_eval)(model, train_df, val_df)
                      for train_df, val_df in folds)

    # Each element of scores is the (train_score, val_score) pair of one fold.
    train_scores = [score[0] for score in scores]
    val_scores = [score[1] for score in scores]
    return summary(model_name, train_scores, val_scores)


def summary(model_name: str, train_scores: list, val_scores: list) -> pd.DataFrame:
    """
    Summarize the train and validation scores for a given model.

    :param model_name: label used as the index of the single result row.
    :param train_scores: per-fold scores on the training data.
    :param val_scores: per-fold scores on the validation data.
    :return: one-row DataFrame with columns train_mean, train_std, val_mean, val_std.
    """
    stats = {}
    for prefix, scores in (("train", train_scores), ("val", val_scores)):
        stats[prefix + "_mean"] = np.mean(scores)
        stats[prefix + "_std"] = np.std(scores)
    return pd.DataFrame([stats], index=[model_name])


def create_grid(model_cls: type, **kwargs) -> pd.DataFrame:
    """
    Create a "grid", i.e. the cartesian product of all values passed as kwargs.

    :param model_cls: class (or other callable with a ``__name__``) instantiated once
        per parameter combination as ``model_cls(**params)``.
    :param kwargs: one iterable of candidate values per hyper-parameter name.
    :return: DataFrame with 2 columns: "name" (class name followed by the parameter
        dict, e.g. ``Cls{'C': 1}``) and "model" (the instantiated, unfitted model).
    """
    # The parameter names do not change between combinations, so take them once.
    keys = list(kwargs)
    names = []
    models = []
    for combination in itertools.product(*kwargs.values()):
        params = dict(zip(keys, combination))
        names.append(model_cls.__name__ + str(params))
        models.append(model_cls(**params))
    return pd.DataFrame({"name": names, "model": models})


def evaluate_all(grid: pd.DataFrame, data: pd.DataFrame) -> pd.DataFrame:
    """
    Run ``evaluate_model`` for every configuration listed in ``grid``.

    :param grid: DataFrame with a "name" and a "model" column, one row per configuration.
    :param data: raw data passed unchanged to every evaluation.
    :return: a DataFrame of summaries, one row per configuration, in grid order.
    """
    run = joblib.delayed(evaluate_model)
    jobs = (run(model_name=name, model=model, raw_df=data)
            for name, model in zip(grid["name"], grid["model"]))
    pool = joblib.Parallel(n_jobs=-1, backend="multiprocessing")
    summaries = pool(jobs)
    return pd.concat(summaries, axis=0)

### 3. Build Grid
- Build a list of all possible hyperparameter configurations for each model.

In [152]:
# Logistic Regression
# Grid: C in {0.01, 0.1, 1, 10} x penalty in {l1, l2} -> 8 configurations.
# NOTE(review): the "l1" penalty needs a solver that supports it; confirm that the
# default solver of the installed scikit-learn version does, or pass one explicitly.

logistic_grid = create_grid(LogisticRegression, C=[10**i for i in range(-2, 2)], 
                                                penalty=["l1", "l2"])
print("n_models:", len(logistic_grid))
logistic_grid.head()

n_models: 8


Unnamed: 0,model,name
0,"LogisticRegression(C=0.01, class_weight=None, ...","LogisticRegression{'C': 0.01, 'penalty': 'l1'}"
1,"LogisticRegression(C=0.01, class_weight=None, ...","LogisticRegression{'C': 0.01, 'penalty': 'l2'}"
2,"LogisticRegression(C=0.1, class_weight=None, d...","LogisticRegression{'C': 0.1, 'penalty': 'l1'}"
3,"LogisticRegression(C=0.1, class_weight=None, d...","LogisticRegression{'C': 0.1, 'penalty': 'l2'}"
4,"LogisticRegression(C=1, class_weight=None, dua...","LogisticRegression{'C': 1, 'penalty': 'l1'}"


In [153]:
# SVM
# Grid: 4 values of C (0.01..10) x 3 kernels x 5 values of gamma (0.001..10) -> 60 configurations.
# probability=[True] is a single-valued axis: it is set on every SVC because the
# evaluation code calls predict_proba.

svm_grid = create_grid(SVC, C=[10**i for i in range(-2, 2)], 
                            kernel=["poly", "rbf", "linear"], 
                            gamma=[10**i for i in range(-3, 2)], 
                            probability=[True])
print("n_models:", len(svm_grid))
svm_grid.head()

n_models: 60


Unnamed: 0,model,name
0,"SVC(C=0.01, cache_size=200, class_weight=None,...","SVC{'C': 0.01, 'kernel': 'poly', 'gamma': 0.00..."
1,"SVC(C=0.01, cache_size=200, class_weight=None,...","SVC{'C': 0.01, 'kernel': 'poly', 'gamma': 0.01..."
2,"SVC(C=0.01, cache_size=200, class_weight=None,...","SVC{'C': 0.01, 'kernel': 'poly', 'gamma': 0.1,..."
3,"SVC(C=0.01, cache_size=200, class_weight=None,...","SVC{'C': 0.01, 'kernel': 'poly', 'gamma': 1, '..."
4,"SVC(C=0.01, cache_size=200, class_weight=None,...","SVC{'C': 0.01, 'kernel': 'poly', 'gamma': 10, ..."


In [154]:
# Random Forest
# Grid: n_estimators in 10, 20, ..., 100 (10 values) x max_depth in 3..5 (3 values)
# x min_samples_leaf in 1, 3, 5, 7, 9 (5 values) -> 150 configurations.

forest_grid = create_grid(RandomForestClassifier, n_estimators=[10*i for i in range(1, 11)], max_depth=range(3,6), min_samples_leaf=range(1, 11, 2))
print("n_models:", len(forest_grid))
forest_grid.head()

n_models: 150


Unnamed: 0,model,name
0,(),"RandomForestClassifier{'n_estimators': 10, 'ma..."
1,(),"RandomForestClassifier{'n_estimators': 10, 'ma..."
2,(),"RandomForestClassifier{'n_estimators': 10, 'ma..."
3,(),"RandomForestClassifier{'n_estimators': 10, 'ma..."
4,(),"RandomForestClassifier{'n_estimators': 10, 'ma..."


In [156]:
# Final Grid
# Stack the three per-model grids row-wise into one search space.
# Each grid keeps its own row labels, so the index of all_grid is not unique;
# configurations are identified by the "name" column instead.

all_grid = pd.concat([logistic_grid, svm_grid, forest_grid], axis=0)
print("n_models:", len(all_grid))

n_models: 218


### 4. Grid Search
- Evaluate all models for each configuration in the grid.
- In practice a more careful analysis of hyperparams should be performed.

In [157]:
# Cross-validate every configuration of the combined grid on the raw data.
res = evaluate_all(all_grid, raw)
# Rank best-first: highest mean validation score, ties broken by the lowest std.
res = res.sort_values(by=["val_mean", "val_std"], ascending=[False, True])
res.head()

Unnamed: 0,train_mean,train_std,val_mean,val_std
"RandomForestClassifier{'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 1}",0.913639,0.004047,0.871868,0.035366
"RandomForestClassifier{'n_estimators': 90, 'max_depth': 5, 'min_samples_leaf': 1}",0.914926,0.004784,0.869822,0.037582
"RandomForestClassifier{'n_estimators': 40, 'max_depth': 5, 'min_samples_leaf': 3}",0.907931,0.005689,0.869734,0.03915
"RandomForestClassifier{'n_estimators': 70, 'max_depth': 5, 'min_samples_leaf': 1}",0.914,0.004219,0.868842,0.03812
"RandomForestClassifier{'n_estimators': 20, 'max_depth': 5, 'min_samples_leaf': 1}",0.911295,0.004463,0.868653,0.036818


In [182]:
# Compare best results
# The index labels of res look like "ClassName{...params...}"; the part before the
# first "{" is the classifier name. Splitting once and taking element 0 works for any
# parameter string, whereas stacking the split lists into a 2-D array requires every
# label to split into the same number of pieces.

classifier = np.asarray(res.index.str.split("{", n=1).str[0])
# Best mean validation score reached by each classifier family.
res.groupby(classifier)["val_mean"].max()

LogisticRegression        0.853492
RandomForestClassifier    0.871868
SVC                       0.852999
Name: val_mean, dtype: float64

### 5. Select and Save Best Model
- Include saving the preprocessor.
- Not much difference in performance; in practice would choose the simpler linear model (logistic regression).
- Pickle the best model and test that loading works.

In [190]:
# Refit the winning configuration on the full dataset and persist it together
# with the preprocessor it was trained with.
preprocessor = Preprocessor()
train_df = preprocessor.fit_transform(raw)

# res is sorted best-first, so its first index label names the winning configuration.
best_model_str = res.index[0].split("{")
# Fetch the estimator from the grid by name rather than eval()-ing source code spliced
# together from the label: the lookup cannot break on parameter values containing "{"
# and executes no generated code. clone() yields a fresh, unfitted estimator with the
# same hyper-parameters, so the grid entry itself is left untouched.
models_by_name = dict(zip(all_grid["name"], all_grid["model"]))
best_model = sklearn.base.clone(models_by_name[res.index[0]])
best_model.fit(train_df.drop("survived", axis=1), train_df["survived"])

# Score on the training data itself (the second predict_proba column is the score).
train_preds = best_model.predict_proba(train_df.drop("survived", axis=1))[:,1]
train_score = roc_auc_score(train_df["survived"], train_preds)

# Persist the fitted preprocessor and the fitted model as separate files.
joblib.dump(preprocessor, '../deployment/pkl/preprocessor.pkl')
joblib.dump(best_model, '../deployment/pkl/model.pkl') 

print("Final train score: {:.4f}".format(train_score))

Final train score: 0.9079


In [191]:
# Test loading
# Round-trip check: reload both pickles and score the reloaded pair on the same data.

l_proc = joblib.load('../deployment/pkl/preprocessor.pkl')
l_model = joblib.load('../deployment/pkl/model.pkl')

t_df = l_proc.transform(raw)
t_features = t_df.drop("survived", axis=1)
t_labels = t_df["survived"]
t_preds = l_model.predict_proba(t_features)[:, 1]
t_score = roc_auc_score(t_labels, t_preds)

print("Final train score: {:.4f}".format(t_score))

Final train score: 0.9079
