In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from joblib import dump
import warnings, os

In [2]:
# Read the processed train / validation tables; the first CSV column is the row index.
train_csv = "../data/processed/df_train_final.csv"
valid_csv = "../data/processed/df_valid_final.csv"
df_train, df_valid = (pd.read_csv(csv_path, index_col=0) for csv_path in (train_csv, valid_csv))

In [3]:
# Split each table positionally: column 0 is the outcome, columns 2 onward are
# the features (column 1 is used as neither).
X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
X_valid, y_valid = df_valid.iloc[:, 2:], df_valid.iloc[:, 0]

# Stack train on top of validation; the final models are refitted on this
# combined set once tuning is finished.
X_comb = pd.concat([X_train, X_valid])
y_comb = np.concatenate([y_train, y_valid])

In [4]:
def pred_eval(model, average='micro'):
    """Fit ``model`` on the training split and print its validation scores.

    The estimator is fitted in place on the notebook-level ``X_train`` /
    ``y_train`` and then scored on ``X_valid`` / ``y_valid``. Precision, recall
    and F1 are printed on one line, rounded to four decimals.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    average : str, default 'micro'
        Averaging strategy forwarded unchanged to ``precision_score``,
        ``recall_score`` and ``f1_score``. With ``'micro'`` on a single-label
        multiclass target the three scores coincide, so pass ``'macro'`` or
        ``'weighted'`` when per-class behaviour matters.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    # one shared ``average`` keeps the three metrics directly comparable
    f1 = f1_score(y_valid, y_pred, average=average)
    precision = precision_score(y_valid, y_pred, average=average)
    recall = recall_score(y_valid, y_pred, average=average)

    print(f"precision score: {precision:.4f}, recall score: {recall:.4f}, f1 score: {f1:.4f}")
    return model

## Fitting baseline models

In [5]:
# Random forest baseline with default hyper-parameters.
# The seed is pinned so the printed scores are reproducible on re-run.
rf_clf = pred_eval(RandomForestClassifier(random_state=0))

precision score: 0.2131, recall score: 0.2131, f1 score: 0.2131


In [6]:
# Bagging baseline with default hyper-parameters.
# The seed is pinned so the printed scores are reproducible on re-run.
bag_clf = pred_eval(BaggingClassifier(random_state=0))

precision score: 0.1877, recall score: 0.1877, f1 score: 0.1877


In [7]:
# Multinomial logistic regression baseline; the iteration cap is raised to 1000.
# NOTE(review): `multi_class` is reported as deprecated in newer scikit-learn
# releases — confirm against the pinned version before upgrading.
log_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
log_clf = pred_eval(log_model)

precision score: 0.2300, recall score: 0.2300, f1 score: 0.2300


In [8]:
# k-nearest-neighbours baseline, voting over the 25 closest training points
knn_model = KNeighborsClassifier(n_neighbors=25)
knn_clf = pred_eval(knn_model)

precision score: 0.1946, recall score: 0.1946, f1 score: 0.1946


In [9]:
# Gradient-boosted trees baseline (XGBoost defaults, fixed seed)
xgb_model = XGBClassifier(random_state=0)
xgb_clf = pred_eval(xgb_model)

precision score: 0.2111, recall score: 0.2111, f1 score: 0.2111


## Model tuning
Some of the well-performing models (random forest and gradient boosting) are chosen for tuning to further improve their performance.

In [10]:
# Silence all warnings for the rest of the session through the standard filter
# mechanism. `warnings.warn` itself is left intact, so code that records or
# re-enables warnings (e.g. inside `warnings.catch_warnings()`) keeps working.
warnings.filterwarnings("ignore")

#### Gradient boosting

In [12]:
# Accumulates the winning value from each tuning stage; later stages splat it
# into the estimator so earlier choices are carried forward.
parameters = {}

# Stage 1: tree depth.
# XGBClassifier is slow to fit, so the parameters are tuned one at a time with
# an exhaustive grid search per stage instead of one large joint grid.
param_grid = {'max_depth': range(3,7)}

xgb_stage = XGBClassifier(**parameters, learning_rate=0.1, n_estimators=200, nthread=8, random_state=0)
gsearch = GridSearchCV(
    estimator=xgb_stage,
    param_grid=param_grid,
    scoring='f1_micro',
    n_jobs=8,
    cv=5,
    verbose=1,
)
gsearch.fit(X_train, y_train)

# remember the winner, then display the best model, its parameters and its
# mean cross-validated micro-F1
parameters.update(gsearch.best_params_)
(gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=200,
               n_jobs=8, nthread=8, num_parallel_tree=1,
               objective='multi:softprob', predictor='auto', random_state=0, ...),
 {'max_depth': 3},
 0.22001516300227442)

In [13]:
# Stage 2: gamma, searched over 0.0, 0.1, ..., 0.4 with the values already in
# `parameters` held fixed.
param_grid = {'gamma': [x/10 for x in range(5)]}

xgb_stage = XGBClassifier(**parameters, learning_rate=0.1, n_estimators=200, nthread=8, random_state=0)
gsearch = GridSearchCV(
    estimator=xgb_stage,
    param_grid=param_grid,
    scoring='f1_micro',
    n_jobs=8,
    cv=5,
)
gsearch.fit(X_train, y_train)

parameters.update(gsearch.best_params_)
(gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_)

(XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=200,
               n_jobs=8, nthread=8, num_parallel_tree=1,
               objective='multi:softprob', predictor='auto', random_state=0, ...),
 {'gamma': 0.3},
 0.22100075815011372)

In [14]:
# Stage 3: subsample, seven values from 0.6 upward in steps of 1/20.
# This stage fits 500 trees per model (the previous stages used 200).
param_grid = {'subsample': [x/20 + 0.6 for x in range(7)]}

xgb_stage = XGBClassifier(**parameters, learning_rate=0.1, n_estimators=500, nthread=8, random_state=0)
gsearch = GridSearchCV(
    estimator=xgb_stage,
    param_grid=param_grid,
    scoring='f1_micro',
    n_jobs=8,
    cv=5,
)
gsearch.fit(X_train, y_train)

parameters.update(gsearch.best_params_)
(gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_)

(XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=500,
               n_jobs=8, nthread=8, num_parallel_tree=1,
               objective='multi:softprob', predictor='auto', random_state=0, ...),
 {'subsample': 0.8},
 0.21887793783169068)

In [15]:
# Stage 4 (last): reg_alpha regularisation strength on a log-spaced grid
# from 1e-5 to 10.
param_grid = {'reg_alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}

xgb_stage = XGBClassifier(**parameters, learning_rate=0.1, n_estimators=500, nthread=8, random_state=0)
gsearch = GridSearchCV(
    estimator=xgb_stage,
    param_grid=param_grid,
    scoring='f1_micro',
    n_jobs=8,
    cv=5,
)
gsearch.fit(X_train, y_train)

parameters.update(gsearch.best_params_)
(gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_)

(XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=500,
               n_jobs=8, nthread=8, num_parallel_tree=1,
               objective='multi:softprob', predictor='auto', random_state=0, ...),
 {'reg_alpha': 0.001},
 0.22100075815011372)

In [16]:
# Refit with the tuned parameters on the combined train + validation data,
# using a smaller learning rate (0.01 instead of the 0.1 used while searching)
# together with more boosting rounds (1000).
xgb_clf = XGBClassifier(**parameters, learning_rate=0.01, n_estimators=1000, nthread=8, random_state=0)
xgb_clf.fit(X_comb, y_comb)

# Save the model. `exist_ok=True` makes directory creation idempotent, so the
# cell can be re-run without a separate existence check.
os.makedirs("../models", exist_ok=True)
xgb_clf.save_model("../models/xgb.json")

#### Random forest

In [21]:
# Same staged approach as for the gradient boosting model: `parameters`
# collects the winning values so later stages can reuse them.
parameters = {}

# Random forests fit faster, so several parameters are searched jointly.
# First stage: the parameters controlling how each tree is grown.
#   bootstrap         - whether each tree is built on a bootstrap sample
#   max_depth         - maximum depth of a tree
#   min_samples_split - minimum number of samples required to split a node
#   max_leaf_nodes    - upper bound on the number of leaves in a tree
#   min_samples_leaf  - minimum number of samples required at a leaf
param_grid = {
    "bootstrap": [True, False],
    "max_depth": range(5, 30, 5),
    "min_samples_split": [2, 5, 10],
    "max_leaf_nodes": range(5, 55, 10),
    "min_samples_leaf": range(1, 11, 2)
}

# `verbose` must be an integer: newer scikit-learn releases validate
# constructor arguments and reject a float such as 2.5. A value of 3 keeps the
# per-fit score logging.
gsearch = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                       param_grid=param_grid, scoring='f1_micro', n_jobs=8, cv=5, verbose=3)
gsearch.fit(X_train, y_train)
parameters.update(gsearch.best_params_)
gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_

Fitting 5 folds for each of 750 candidates, totalling 3750 fits


(RandomForestClassifier(bootstrap=False, max_depth=15, max_leaf_nodes=45,
                        min_samples_leaf=3, random_state=0),
 {'bootstrap': False,
  'max_depth': 15,
  'max_leaf_nodes': 45,
  'min_samples_leaf': 3,
  'min_samples_split': 2},
 0.22744503411675515)

In [23]:
# Second stage: number of trees, number of features considered per split, and
# the split-quality criterion, with the tree-shape parameters already stored
# in `parameters` held fixed.
# NOTE(review): scikit-learn documents "log_loss" and "entropy" as the same
# criterion, so one of the two could probably be dropped — confirm.
param_grid = {
    "n_estimators": [100, 500, 1000],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy", "log_loss"]
}
# `verbose` must be an integer: newer scikit-learn releases validate
# constructor arguments and reject a float such as 2.5. A value of 3 keeps the
# per-fit score logging.
gsearch = GridSearchCV(estimator=RandomForestClassifier(**parameters, random_state=0),
                       param_grid=param_grid, scoring='f1_micro', n_jobs=8, cv=5, verbose=3)
gsearch.fit(X_train, y_train)
parameters.update(gsearch.best_params_)
gsearch.best_estimator_, gsearch.best_params_, gsearch.best_score_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 2/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=5;, score=0.218 total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=10;, score=0.218 total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=3, min_samples_split=2;, score=0.215 total time=   0.6s
[CV 2/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=3, min_samples_split=10;, score=0.218 total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=2;, score=0.217 total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=10;, score=0.218 total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=7, min_samples_split=5;, score=0.217 total time=   0.6s
[CV 

[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.218 total time=   0.7s
[CV 1/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=10;, score=0.218 total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=3, min_samples_split=5;, score=0.218 total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=2;, score=0.218 total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=5;, score=0.215 total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=7, min_samples_split=2;, score=0.219 total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=7, min_samples_split=10;, score=0.219 total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_sam

[CV 4/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.216 total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=5;, score=0.215 total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=3, min_samples_split=2;, score=0.218 total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=3, min_samples_split=10;, score=0.218 total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=2;, score=0.218 total time=   0.6s
[CV 2/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=10;, score=0.217 total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_samples_leaf=7, min_samples_split=2;, score=0.216 total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=5, max_leaf_nodes=5, min_sam

(RandomForestClassifier(bootstrap=False, max_depth=15, max_leaf_nodes=45,
                        min_samples_leaf=3, n_estimators=500, random_state=0),
 {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 500},
 0.22774829416224412)

In [27]:
# Refit the tuned random forest on the combined train + validation data.
# A fresh estimator is built from the accumulated `parameters` (the same
# configuration the search selected) rather than refitting
# `gsearch.best_estimator_` in place, so the search object still holds the
# model its `best_score_` was measured for.
rf_clf = RandomForestClassifier(**parameters, random_state=0)
rf_clf.fit(X_comb, y_comb)

# Save the random forest model, creating the output folder if it is missing so
# this cell does not depend on another cell having created it.
os.makedirs("../models", exist_ok=True)
dump(rf_clf, "../models/rf.joblib")

['../models/rf.joblib']

## Final notes
The model tuning could be made more thorough — for example, by running a second grid search over values close to those selected by the first — but I chose to stop here given the time constraints.