In [1]:
!pip install shap

Installing collected packages: slicer, shap
Successfully installed shap-0.43.0 slicer-0.0.7


In [2]:
!pip install optuna

Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [3]:
## import packages
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np
import pickle

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
import shap
import optuna

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance
import lightgbm as lgb

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
## load data by locations
# Each location ("sj", "iq") ships four CSVs: train/validation x features/labels.
# Paths are relative to the notebook's working directory.

# --- sj ---
sj_train_features, sj_train_labels = (
    pd.read_csv("sj_train_features.csv"),
    pd.read_csv("sj_train_labels.csv"),
)
sj_validation_features, sj_validation_labels = (
    pd.read_csv("sj_validation_features.csv"),
    pd.read_csv("sj_validation_labels.csv"),
)

# --- iq ---
iq_train_features, iq_train_labels = (
    pd.read_csv("iq_train_features.csv"),
    pd.read_csv("iq_train_labels.csv"),
)
iq_validation_features, iq_validation_labels = (
    pd.read_csv("iq_validation_features.csv"),
    pd.read_csv("iq_validation_labels.csv"),
)

In [None]:
# Remove the columns that are not used as model inputs from every feature frame.
# The frames are rebound (no inplace=True) and missing columns are ignored, so this
# cell can be re-run safely: a second run is a no-op instead of raising KeyError.
columns_to_drop = ['weekly_median_cases', 'city']

sj_train_features = sj_train_features.drop(columns=columns_to_drop, errors='ignore')
sj_validation_features = sj_validation_features.drop(columns=columns_to_drop, errors='ignore')
iq_train_features = iq_train_features.drop(columns=columns_to_drop, errors='ignore')
iq_validation_features = iq_validation_features.drop(columns=columns_to_drop, errors='ignore')

#### LightGBM

In [None]:
### LightGBM time split validation integrated with Optuna
import optuna

def lgb_objective_ts_cv(trial, train_features, train_labels):
  """Optuna objective: time-series cross-validated L1 error of a LightGBM regressor.

  Samples one hyperparameter configuration from `trial`, evaluates it with
  `lgb.cv` on 5 `TimeSeriesSplit` folds of the training data, and returns a
  score to be minimized.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    train_features: training features, passed to `lgb.Dataset` and to
      `TimeSeriesSplit.split`.
    train_labels: training targets, passed as the `label` of `lgb.Dataset`.

  Returns:
    Mean validation L1 error at the last boosting iteration plus its standard
    deviation across folds (penalizes configurations that are unstable
    between folds).
  """

  # Every parameter is suggested under its own name. Re-using a name inside a
  # trial makes Optuna return the first sampled value and omit the second
  # parameter from `study.best_params`.
  # NOTE(review): LightGBM only applies `bagging_fraction` when `bagging_freq` > 0
  # and no `bagging_freq` is set here -- confirm whether bagging is intended.
  param_grid = {
      "num_iterations": trial.suggest_int("num_iterations", 20, 100, step=10),
      "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, step=0.01),
      "num_leaves": trial.suggest_int("num_leaves", 8, 72, step=1),
      "max_depth": trial.suggest_int("max_depth", 3, 8),
      "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 0.1, step=0.01),
      "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100, step=5),
      "bagging_fraction": trial.suggest_float(
          "bagging_fraction", 0.7, 0.95, step=0.01
      ),
      "feature_fraction_bynode": trial.suggest_float(
          "feature_fraction_bynode", 0.7, 0.95, step=0.01
      ),
      "feature_fraction": trial.suggest_float(
          "feature_fraction", 0.8, 0.95, step=0.01
      )
  }

  # Ordered (non-shuffled) folds: each validation fold comes after its training rows.
  tscv = TimeSeriesSplit(n_splits=5)
  folds = tscv.split(train_features)

  dtrain = lgb.Dataset(train_features, label=train_labels)

  # Fixed (non-tuned) settings.
  param_grid['objective'] = "regression"
  param_grid['metric'] = "l1"
  param_grid['verbosity'] = -1
  param_grid['boosting_type'] = "gbdt"

  lgbcv = lgb.cv(param_grid,
                 dtrain,
                 folds=folds,
                 shuffle=False)
  # Score = last-iteration mean L1 across folds + its standard deviation.
  cv_score = lgbcv['valid l1-mean'][-1] + lgbcv['valid l1-stdv'][-1]

  return cv_score

##### Hyperparameter Tuning

In [None]:
## SJ model
# Search hyperparameters with Optuna (time-series CV objective), minimizing the CV score.
# The sampler is seeded so the search is reproducible after a kernel restart.
study = optuna.create_study(
        direction="minimize",
        study_name = "LightGBM Regression for sj",
        sampler = optuna.samplers.TPESampler(seed=42)
    )
func = lambda trial: lgb_objective_ts_cv(trial, sj_train_features, sj_train_labels)
study.optimize(func, n_trials = 100)
parameters = study.best_params

# Refit on the full training set with the best trial's hyperparameters.
# No eval_metric is passed: there is no eval_set, so it would never be evaluated, and
# sklearn's mean_absolute_error does not follow LightGBM's custom-metric contract.
reg_sj = lgb.LGBMRegressor(**parameters,
                        random_state = 42)
reg_sj.fit(sj_train_features, sj_train_labels)
# Round to the nearest integer (astype(int) alone truncates toward zero, biasing
# predictions downward), then floor negative predictions at 0.
Y_pred = np.clip(np.rint(reg_sj.predict(sj_validation_features)), 0, None).astype(int)

print("\n\nFinal MAE for validation set is {}".format(mean_absolute_error(sj_validation_labels, Y_pred)))

In [None]:
## IQ model
# Search hyperparameters with Optuna (time-series CV objective), minimizing the CV score.
# The sampler is seeded so the search is reproducible after a kernel restart.
study = optuna.create_study(
        direction="minimize",
        study_name = "LightGBM Regression for iq",
        sampler = optuna.samplers.TPESampler(seed=42)
    )
func = lambda trial: lgb_objective_ts_cv(trial, iq_train_features, iq_train_labels)
study.optimize(func, n_trials = 100)
parameters = study.best_params

# Refit on the full training set with the best trial's hyperparameters.
# No eval_metric is passed: there is no eval_set, so it would never be evaluated, and
# sklearn's mean_absolute_error does not follow LightGBM's custom-metric contract.
reg_iq = lgb.LGBMRegressor(**parameters,
                        random_state = 42)
reg_iq.fit(iq_train_features, iq_train_labels)
# Round to the nearest integer (astype(int) alone truncates toward zero, biasing
# predictions downward), then floor negative predictions at 0.
Y_pred = np.clip(np.rint(reg_iq.predict(iq_validation_features)), 0, None).astype(int)

print("\n\nFinal MAE for validation set is {}".format(mean_absolute_error(iq_validation_labels, Y_pred)))

##### Feature Importance Analysis

In [None]:
# explain the model's predictions using SHAP
# Wrap the fitted `reg_sj` model in a SHAP explainer and compute SHAP values
# for the sj training features.
explainer = shap.Explainer(reg_sj)
shap_values = explainer(sj_train_features)

# summarize the effects of all the features
shap.plots.beeswarm(shap_values)

In [None]:
## SHAP importance for LightGBM
# Bar plot of the `shap_values` currently in scope.
# NOTE(review): `shap_values` is reassigned for the iq model further down, so this
# cell shows the sj model only when the notebook is run top to bottom.
shap.plots.bar(shap_values)

In [None]:
# explain the model's predictions using SHAP
# Wrap the fitted `reg_iq` model in a SHAP explainer and compute SHAP values
# for the iq training features. This overwrites the `explainer` and
# `shap_values` produced for the sj model.
explainer = shap.Explainer(reg_iq)
shap_values = explainer(iq_train_features)

# summarize the effects of all the features
shap.plots.beeswarm(shap_values)

In [None]:
## SHAP importance for LightGBM
# Bar plot of the `shap_values` currently in scope (the iq model's values when
# the notebook is run top to bottom).
shap.plots.bar(shap_values)