# Histogram-based Gradient Boosting

## Import required libraries

In [1]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report



## Load dataset and discretize popularity

In [10]:
# Load the merged dataset.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# local copy of data_merged.csv before running.
df = pd.read_csv('/Users/haochenyang/Desktop/EECS545/Project/data_merged.csv')

# Threshold used to turn the `popularity` column into a binary label.
# NOTE(review): the name suggests this is the column mean, but the value is
# hard-coded — confirm it still matches the data.
mean_popularity = 45

# Vectorised thresholding: 1 where popularity >= threshold, otherwise 0.
# Gives the same labels as a per-row Python loop without iterating in Python.
df["popularity"] = (df["popularity"] >= mean_popularity).astype("int64")

# Show the class balance of the resulting binary label.
print(df["popularity"].value_counts())

1    20660
0    19896
Name: popularity, dtype: int64


## Split features, labels and train-test data

In [11]:
# Labels: the `popularity` column.
y = df['popularity']

# Features: every other column of the frame.
X = df.drop(columns=['popularity'])

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=21,
)

## Histogram-based gradient boosting

In [13]:
# Baseline histogram-based gradient-boosting classifier with a binary
# cross-entropy loss and up to 1000 boosting iterations.
# `random_state` is fixed (same seed as the estimator used in the Bayesian
# search in this notebook) so that re-running the notebook reproduces the
# reported scores.
gb_clf = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    max_iter=1000,
    random_state=42,
)

In [14]:
# Train the baseline classifier on the training split.
gb_clf.fit(X_train, y_train)



In [15]:
# Score of the fitted model on the data it was trained on; compare with the
# test-split score to gauge overfitting.
gb_clf.score(X_train, y_train)

0.86607693256072

In [16]:
# Score of the fitted model on the held-out test split.
gb_clf.score(X_test, y_test)

0.8388806706114399

In [17]:
# Predicted class labels for the held-out test split (used for the report).
y_pred = gb_clf.predict(X_test)

In [18]:
# Print the confusion matrix followed by the per-class report for the
# baseline model's test-split predictions.
print(
    confusion_matrix(y_test, y_pred),
    "Classification report\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

[[3504  474]
 [ 833 3301]]
Classification report

              precision    recall  f1-score   support

           0       0.81      0.88      0.84      3978
           1       0.87      0.80      0.83      4134

    accuracy                           0.84      8112
   macro avg       0.84      0.84      0.84      8112
weighted avg       0.84      0.84      0.84      8112



## Bayesian Search

In [20]:
!pip3 install skopt
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective
from matplotlib import pyplot as plt

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement skopt (from versions: none)[0m
[31mERROR: No matching distribution found for skopt[0m
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


ModuleNotFoundError: No module named 'skopt'

In [6]:
# Bayesian hyper-parameter search over the histogram gradient-boosting model.
# Both the estimator and the search itself are seeded so the search is
# repeatable from a fresh kernel.
opt = BayesSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    {
        # Search space: inclusive ranges for each tuned hyper-parameter.
        'max_iter': Integer(100, 200),
        'learning_rate': Real(0.01, 1),
        'max_leaf_nodes': Integer(10, 100),
        'min_samples_leaf': Integer(10, 100),
    },
    n_iter=128,       # number of parameter settings to evaluate
    cv=3,             # 3-fold cross-validation for each setting
    random_state=42,  # seed for the optimizer's sampling
)

In [None]:
import os

from joblib import dump

# Where the fitted search object is persisted.
MODEL_PATH = "Saved models/GradBoost"

# Create the output directory *before* the long-running search, so a missing
# folder cannot make the save fail after all the fitting work is done.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Run the hyper-parameter search on the training split.
opt.fit(X_train, y_train)

# Persist the fitted search object to disk.
dump(opt, MODEL_PATH)

## Best parameters and scores obtained

In [97]:
# Summarise the search: best cross-validation score, score on the held-out
# test split, and the winning hyper-parameters.
summary_lines = (
    "val. score: %s" % opt.best_score_,
    "test score: %s" % opt.score(X_test, y_test),
    "best params: %s" % str(opt.best_params_),
)
print("\n".join(summary_lines))

val. score: 0.89182343378041
test score: 0.8911049178424201
best params: OrderedDict([('learning_rate', 0.060833085502393476), ('max_iter', 200), ('max_leaf_nodes', 100), ('min_samples_leaf', 78)])


In [98]:
# Predicted class labels for the test split from the fitted search object
# (overwrites the baseline model's `y_pred`).
y_pred = opt.predict(X_test)

In [99]:
# Print the confusion matrix followed by the per-class report for the
# current test-split predictions in `y_pred`.
print(
    confusion_matrix(y_test, y_pred),
    "Classification report\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

[[24379  1052]
 [ 2699  6316]]
Classification report

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     25431
           1       0.86      0.70      0.77      9015

    accuracy                           0.89     34446
   macro avg       0.88      0.83      0.85     34446
weighted avg       0.89      0.89      0.89     34446



When the number of iterations, `max_leaf_nodes`, `min_samples_leaf`, etc. are increased, the learning rate selected by the search goes down automatically and the results remain more or less the same.

## Visualize results with partial dependence plots

In [100]:
# Disabled: partial-dependence plot of the search objective.
# NOTE(review): the search space in this notebook defines `max_iter`,
# `learning_rate`, `max_leaf_nodes` and `min_samples_leaf` — there is no
# `n_estimators` dimension, so confirm the `dimensions` argument before
# re-enabling this cell.
# plot_objective(opt.optimizer_results_[0],
#                    dimensions=["n_estimators", "learning_rate"],
#                    n_minimum_search=int(1e8))