# Modeling

In [2]:
import numpy as np
import pandas as pd
pd.options.display.max_columns=None

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

In [5]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import mlflow

In [4]:
mlflow.set_registry_uri("./mlruns/")
mlflow.set_experiment("Titanic")

AttributeError: module 'mlflow' has no attribute 'set_registry_uri'

In [29]:
train_df = joblib.load('trainset.csv')
test_df = joblib.load('testset.csv')

In [30]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['Survived'], shuffle=True)

In [31]:
train_df.shape, val_df.shape, test_df.shape

((712, 17), (179, 17), (418, 16))

In [32]:
train_df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Family',
       'Age_band', 'Alone', 'Fare_cat', 'Initial_1', 'Initial_2', 'Initial_3',
       'Initial_4', 'Embarked_1', 'Embarked_2'],
      dtype='object')

In [33]:
NUMERIC_FEATURES = [
    'Age', 'SibSp', 'Parch', 'Fare', 'Family', 
]

CATEGORICAL_FEATURES = [
    'Pclass', 'Sex', 'Age_band', 'Alone', 'Fare_cat', 'Initial_1', 'Initial_2', 'Initial_3',
       'Initial_4', 'Embarked_1', 'Embarked_2'
]

LABEL = 'Survived'
FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES

In [34]:
X_train, y_train = train_df[FEATURES], train_df[LABEL]
X_val, y_val = val_df[FEATURES], val_df[LABEL]
X_test = test_df

In [35]:
from sklearn.preprocessing import StandardScaler

ss_scaler = StandardScaler()

X_train[NUMERIC_FEATURES] = ss_scaler.fit_transform(X_train[NUMERIC_FEATURES])
X_val[NUMERIC_FEATURES] = ss_scaler.transform(X_val[NUMERIC_FEATURES])
X_test[NUMERIC_FEATURES] = ss_scaler.transform(X_test[NUMERIC_FEATURES])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[NUMERIC_FEATURES] = ss_scaler.fit_transform(X_train[NUMERIC_FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[NUMERIC_FEATURES] = ss_scaler.transform(X_val[NUMERIC_FEATURES])


In [36]:
from sklearn.metrics import accuracy_score

In [55]:
models = [
    [
        LogisticRegression(),
        {
            'penalty' : 'l2',
            'C' : 1,
        }
    ],
    [
        SVC(),
        {
            'C' : 1,
            'kernel' : 'rbf',
        }
    ],
    [
        KNeighborsClassifier(),
        {
            'n_neighbors' : 5,
        }
    ],
    [
        DecisionTreeClassifier(),
        {
            'max_depth' : None,
            'random_state' : 42,
        }
    ],
    [
        ExtraTreeClassifier(),
        {
            'max_depth' : None,
            'random_state' : 42,
        }
    ],
    [
        RandomForestClassifier(),
        {
            'max_depth' : None,
            'random_state' : 42,
        }
    ],
    [
        AdaBoostClassifier(),
        {
            'n_estimators' : 50,
            'random_state' : 42,
        }
    ],
    [
        XGBClassifier(),
        {
            'max_depth' : None,
            'random_state' : 42,
        }
    ],
    [
        LGBMClassifier(),
        {
            'max_depth' : None,
            'random_state' : 42, 
        }
    ],
    [
        CatBoostClassifier(verbose=0),
        {
            'max_depth' : None,
            'random_state' : 42, 
        }
    ],
]

In [56]:
def mlflow_run_sklearn(
    run_name,
    tags,
    model_params,
    model,
    X_train,
    y_train,
    X_val=None,
    y_val=None,
):
    with mlflow.start_run(run_name=run_name):
        model.set_params(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        
        print(f"{run_name} : {acc}")
        
        mlflow.set_tags(tags)
        mlflow.log_params(model_params)
        mlflow.log_metric('acc', acc)
        mlflow.sklearn.log_model(model, 'sk_models')

In [57]:
for model_name, (model, model_params) in zip([
    'lr', 'svc', 'knn', 'dt', 'et', 'rf', 'ab', 'xgb', 'lgbm', 'cb'
], models):
    mlflow_run_sklearn(
        run_name=model_name + "_baseline",
        tags={
            'model_name' : model_name,
        },
        model_params=model_params,
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val
    )

lr_baseline : 0.8379888268156425
svc_baseline : 0.8435754189944135
knn_baseline : 0.8491620111731844
dt_baseline : 0.7597765363128491
et_baseline : 0.7262569832402235
rf_baseline : 0.8044692737430168
ab_baseline : 0.8324022346368715
xgb_baseline : 0.8156424581005587
lgbm_baseline : 0.8491620111731844
Learning rate set to 0.008911
0:	learn: 0.6871914	total: 87.7ms	remaining: 1m 27s
1:	learn: 0.6824900	total: 89.8ms	remaining: 44.8s
2:	learn: 0.6766355	total: 92.2ms	remaining: 30.6s
3:	learn: 0.6706831	total: 94.6ms	remaining: 23.6s
4:	learn: 0.6648818	total: 96.8ms	remaining: 19.3s
5:	learn: 0.6593994	total: 98.1ms	remaining: 16.3s
6:	learn: 0.6542849	total: 100ms	remaining: 14.2s
7:	learn: 0.6493357	total: 104ms	remaining: 12.9s
8:	learn: 0.6442401	total: 106ms	remaining: 11.7s
9:	learn: 0.6391813	total: 108ms	remaining: 10.7s
10:	learn: 0.6345856	total: 109ms	remaining: 9.83s
11:	learn: 0.6295030	total: 111ms	remaining: 9.15s
12:	learn: 0.6246199	total: 114ms	remaining: 8.63s
13:	lear