# Fastai EDA AND XGboost Optuna Hyperparameters Tuning

> This notebook uses the fastai library for EDA and Optuna for hyperparameter tuning on the Kaggle Homesite Quote Conversion dataset.

- toc: true
- badges: true
- comments: true
- categories: [kaggle]
- author: Tracy Dinh
- image: images/chart-preview.png

## Import Libraries

In [None]:
#hide
!pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz
import fastbook
fastbook.setup_book()

In [None]:
!pip install optuna

In [None]:
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold


from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
# NOTE(review): plot_roc_curve and plot_confusion_matrix were removed in
# scikit-learn 1.2 (superseded by RocCurveDisplay / ConfusionMatrixDisplay), so
# these two imports fail on newer releases — confirm the installed version.
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score


from sklearn import preprocessing
from sklearn import model_selection
import sklearn.datasets


import xgboost as xgb
from xgboost import XGBClassifier

import optuna

import matplotlib.pyplot as plt

from fastbook import *
from fastai.tabular.all import *
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
import random as rd



# Cap how many rows / columns pandas renders when a DataFrame is displayed.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 8)

## Download Data

In [None]:
# Copy the Kaggle API token from the Drive folder into ~/.kaggle.
!mkdir -p ~/.kaggle
!cp /content/gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
# Mode 600: only the owner can read or write the credentials file.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Directory that holds the competition files; created (with parents) if it does not exist yet.
path = Path('/content/gdrive/MyDrive/Kaggle/' + 'data/homesite-quote')
path.mkdir(parents=True, exist_ok=True)
# Display the resolved location as the cell output.
path


Path('/content/gdrive/MyDrive/Kaggle/data/homesite-quote')

In [None]:
!kaggle competitions download -c homesite-quote-conversion -p /content/gdrive/MyDrive/Kaggle/data/homesite-quote


In [None]:
# Extract the three archives next to the zips.
# -q: quiet; -n: never overwrite files that already exist, so re-running the cell is harmless.
! unzip -q -n '{path}/train.csv.zip' -d '{path}'
! unzip -q -n '{path}/test.csv.zip' -d '{path}'
! unzip -q -n '{path}/sample_submission.csv.zip' -d '{path}'

In [None]:
# Load the raw train and test tables from `path`.
df, test_df = (
    pd.read_csv(path/csv_name, low_memory=False)
    for csv_name in ('train.csv', 'test.csv')
)

## EDA with Fastai

In [None]:
# Name of the target column, passed as the dependent variable to the fastai helpers below.
dep_var='QuoteConversion_Flag'

As 'QuoteNumber' is unique, set it as index

In [None]:
# Move the unique QuoteNumber identifier out of the columns and into the index of both frames.
df_train, df_test = (frame.set_index('QuoteNumber') for frame in (df, test_df))


Use Fastai function to add relevant datetime fields

In [None]:
def _expand_quote_date(frame):
    """Parse `Original_Quote_Date` as datetime and pass the frame through fastai's `add_datepart`."""
    frame['Original_Quote_Date'] = pd.to_datetime(frame['Original_Quote_Date'])
    return add_datepart(frame, 'Original_Quote_Date')


# Apply the same date expansion to the train and test frames.
df_train = _expand_quote_date(df_train)
df_test = _expand_quote_date(df_test)

Drop the two fields below because they have constant values

In [None]:
# Columns removed from both frames because they carry a single constant value.
constant_cols = ['PropertyField6', 'GeographicField10A']
# `columns=` already selects the axis, so the former `axis=1` was redundant.
# Reassigning instead of `inplace=True` avoids mutating a frame across cells.
df_train = df_train.drop(columns=constant_cols)
df_test = df_test.drop(columns=constant_cols)

Use Fastai function to identify continuous and categorical variables

In [None]:
# Let fastai partition the feature columns into continuous and categorical name lists;
# `dep_var` is passed so the target column is handled separately.
cont_names, cat_names = cont_cat_split(df_train,dep_var=dep_var)
# Show how many columns landed in each group.
len(cont_names), len(cat_names)

(155, 152)

'procs' takes care of categorifying the categorical variables, filling in missing values, and normalising the data

In [None]:
# Preprocessing steps applied by TabularPandas, in this order.
procs = [Categorify, FillMissing, Normalize]
# Stratified 80/20 split on the target. A fixed random_state makes the split (and
# every score computed from it) reproducible after a kernel restart.
splits = TrainTestSplitter(test_size=0.2, random_state=42, stratify=df_train[dep_var])(df_train)

Create a TabularPandas dataset

In [None]:
# Bundle the frame, preprocessing steps, column roles and train/valid split into one
# fastai tabular object; CategoryBlock declares the target as a class label.
to = TabularPandas(
    df_train,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=dep_var,
    y_block=CategoryBlock(),
    splits=splits,
)


In [None]:
# Only batch sizes belong here. The former `layers`, `embed_ps` and `ps` arguments are
# model settings (for `tabular_learner`), not DataLoader options, and had no effect
# on the loaders, so they are dropped.
dls = to.dataloaders(bs=4096, val_bs=512)

## XGBoost with Optuna

> Use Optuna to select the best hyperparameters for the XGBoost model. The code is adapted from https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna

In [None]:
# Processed features and flattened label arrays for the training split ...
X_train_fa = to.train.xs
y_train_fa = to.train.ys.values.ravel()
# ... and for the validation split.
X_valid_fa = to.valid.xs
y_valid_fa = to.valid.ys.values.ravel()

Define the hyperparameter search space to test

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBoost classifier and return its validation ROC AUC.

    Args:
        trial: the Optuna trial used to sample this run's hyperparameters.

    Returns:
        ROC AUC of the predicted positive-class probabilities on the validation
        split of `to`.
    """
    X_train_fa, y_train_fa = to.train.xs, to.train.ys.values.ravel()
    X_valid_fa, y_valid_fa = to.valid.xs, to.valid.ys.values.ravel()
    param = {
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; it samples the same log-uniform range.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 1000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        # NOTE(review): the seed is searched like a hyperparameter, which mostly
        # fits noise; kept so the search space matches the recorded runs.
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)

    # The validation split drives early stopping and is also scored below, so the
    # returned AUC is an optimistic estimate.
    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
    # constructor rather than on fit() — confirm against the installed version.
    model.fit(X_train_fa,y_train_fa,eval_set=[(X_valid_fa,y_valid_fa)],early_stopping_rounds=100,verbose=False)

    # Probability of the positive class (column 1) for each validation row.
    preds = model.predict_proba(X_valid_fa)[:,1]

    auc = roc_auc_score(y_valid_fa, preds)

    return auc

Fit model using Optuna

In [None]:
# TPESampler is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across kernel restarts.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2021-07-03 13:18:30,783][0m A new study created in memory with name: no-name-fe83554f-1bcf-4f1e-976a-1d2d12ba22da[0m
[32m[I 2021-07-03 13:18:54,840][0m Trial 0 finished with value: 0.9614490168531203 and parameters: {'lambda': 0.6618680618471974, 'alpha': 0.04239837985417904, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 48, 'min_child_weight': 253}. Best is trial 0 with value: 0.9614490168531203.[0m
[32m[I 2021-07-03 13:19:22,792][0m Trial 1 finished with value: 0.96370909267955 and parameters: {'lambda': 8.086230640660201, 'alpha': 0.004463392998945948, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 24, 'min_child_weight': 81}. Best is trial 1 with value: 0.96370909267955.[0m
[32m[I 2021-07-03 13:20:05,297][0m Trial 2 finished with value: 0.9663443798690221 and parameters: {'lambda': 0.06952864689008562, 'alpha': 0.03710247783593982, 'colsample_bytree': 0.4, 'subs

Save best trials

In [None]:
# Report how many trials ran and the hyperparameters of the best one.
n_finished = len(study.trials)
best_params = study.best_trial.params
print(f'Number of finished trials: {n_finished}')
print(f'Best trial: {best_params}')



Number of finished trials: 50
Best trial: {'lambda': 0.08287684030183871, 'alpha': 0.021800136799959794, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44}


In [None]:
# Take the winning hyperparameters straight from the study instead of a hand-copied
# literal, so this dict always matches the study that was actually run.
Best_trial_fastai = dict(study.best_trial.params)

### Optuna Visualisation

plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.

In [None]:

# Per-trial objective value plus the running best, for the study fitted above.
optuna.visualization.plot_optimization_history(study)


plot_parallel_coordinate: interactively visualizes the hyperparameters and scores


In [None]:
# One line per trial across all hyperparameter axes and the objective value.
optuna.visualization.plot_parallel_coordinate(study)

plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search went and which parts of the space were explored more.


In [None]:
# Objective value against each individual hyperparameter.
optuna.visualization.plot_slice(study)

plot_contour: plots parameter interactions on an interactive chart. You can choose which hyperparameters you would like to explore.


In [None]:
# Pairwise contour plots for a subset of the hyperparameters. Each name is listed
# once: the original list repeated 'subsample' and carried a commented-out entry.
optuna.visualization.plot_contour(
    study,
    params=['alpha', 'lambda', 'subsample', 'learning_rate'],
)

Visualize parameter importances.

In [None]:
# Relative importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

Visualize empirical distribution function

In [None]:
# Empirical distribution function of the objective values reached by the trials.
optuna.visualization.plot_edf(study)

### Model Best Optuna Trials

In [None]:
# Winning hyperparameters from the study, plus the two settings that `objective`
# keeps fixed and Optuna therefore does not record. Deriving the dict from `study`
# avoids a hand-copied literal that goes stale when the search is re-run.
Best_trial_fastai = {
    **study.best_trial.params,
    'n_estimators': 1000,
    'tree_method': 'gpu_hist',
}


Use StratifiedKFold cross-validation to estimate the ROC AUC of the best parameters

In [None]:
# 5-fold stratified cross-validation of the best hyperparameters on the training split.
kf = StratifiedKFold(n_splits=5,random_state=48,shuffle=True)
auc=[]  # list contains auc for each fold
for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train_fa, y_train_fa), start=1):
    X_tr, X_val = X_train_fa.iloc[trn_idx], X_train_fa.iloc[val_idx]
    y_tr, y_val = y_train_fa[trn_idx], y_train_fa[val_idx]
    # A fresh classifier per fold. After the loop `model` is the last fold's
    # classifier, and later cells that use `model` get that one.
    model = xgb.XGBClassifier(**Best_trial_fastai)
    # The held-out fold is used for early stopping and is also the fold scored below.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:,1]))
    print(fold, auc[-1])

1 0.9670692718197905
2 0.9666783942446252
3 0.9624103790810723
4 0.9650869557007921
5 0.9661417297238831


Calculate mean of all folds

In [None]:
# Average of the per-fold ROC AUC scores collected in `auc`.
np.mean(auc)


0.9654773461140327

Prepare the test set for inference using the fastai preprocessing

In [None]:
# Binary ROC AUC metric object, attached to the learner below.
roc_auc_binary = RocAucBinary()
# NOTE(review): in the visible cells this learner is never trained; it is only used
# to reach `learn.dls.test_dl` — confirm whether a learner is needed at all.
learn = tabular_learner(dls, metrics=roc_auc_binary)

In [None]:
# Build a test DataLoader from the test frame via the learner's DataLoaders.
# The former `.iloc[:]` was a no-op full slice and is dropped.
dl_test = learn.dls.test_dl(df_test)

In [None]:
# Feature frame of the test set, taken from the test DataLoader's dataset.
X_test=dl_test.dataset.xs

Save Kaggle Submission File

In [None]:
# NOTE(review): `model` is whatever it was last bound to — in this notebook the
# classifier fitted on the final cross-validation fold. Confirm that is the intended
# model for the submission.
preds = model.predict_proba(X_test)[:,1]
sample = pd.read_csv(path/'sample_submission.csv')
# Item assignment always writes the column. Attribute assignment would silently set a
# plain Python attribute if the column name were ever missing.
# NOTE(review): values are assigned by position, so this assumes the rows of `X_test`
# are in the same order as the sample submission — verify.
sample['QuoteConversion_Flag'] = preds
sample.to_csv(path/'xgb_optuna_fastaidata.csv', index=False)

Kaggle Score: 0.96633