<a href="https://colab.research.google.com/github/winnie-224/RecommenderSystems/blob/main/gbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LightGBM is a gradient boosting framework that uses tree-based learning algorithms. It is designed to be distributed and efficient, and offers the following advantages:

- Fast training speed and high efficiency.
- Low memory usage.
- Great accuracy.
- Support for parallel and GPU learning.
- Capable of handling large-scale data.

In [None]:
# LightGBM: A Highly Efficient Gradient Boosting Decision Tree
import pandas as pd
import lightgbm as lightGBM
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
!pip install recommenders

Collecting recommenders
  Downloading recommenders-1.2.0-py3-none-any.whl (356 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m356.0/356.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cornac<2,>=1.15.2 (from recommenders)
  Downloading cornac-1.18.0-cp310-cp310-manylinux1_x86_64.whl (21.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightfm<2,>=1.17 (from recommenders)
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting locust<3,>

In [None]:
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var
)

This is a regression task: the model predicts ratings, and the mean absolute error (MAE) is the metric used to evaluate it. The basic parameters to adjust are the number of leaves (MAX_LEAF), the maximum number of trees (NUM_OF_TREES), and the learning rate (LEARNING_RATE).


In [None]:
# ---- Experiment configuration ----

# Number of top items to recommend
TOP_K = 10

# MovieLens data size to load: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "1M"

# Column names used throughout the notebook
USER_COL, ITEM_COL, RATING_COL = "userID", "itemID", "rating"
PREDICTION_COL = "prediction"
ITEM_FEAT_COL = "genre"

# Ratio passed to the train/test splitter
SPLIT_RATIO = 0.7

# LightGBM settings: leaves per tree, number of trees, learning rate, eval metric
MAX_LEAF, NUM_OF_TREES = 64, 100
LEARNING_RATE = 0.05
METRIC = "mae"

# Seed passed to the data splitter
SEED = 42


In [None]:
# Keyword arguments later unpacked into the LightGBM regressor constructor.
# Tunable values come from the configuration constants; the rest are fixed here.
params = dict(
    objective="regression",
    boosting_type="gbdt",
    metric=METRIC,
    num_leaves=MAX_LEAF,
    n_estimators=NUM_OF_TREES,
    boost_from_average=True,
    n_jobs=-1,
    learning_rate=LEARNING_RATE,
)


In [None]:
# Load the MovieLens ratings, naming the user / item / rating columns after the
# configured constants and requesting the genre column as well.
# Each movie's genres come back as one '|' separated string, e.g. "Animation|Children's|Comedy".
_header_columns = [USER_COL, ITEM_COL, RATING_COL]
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE, header=_header_columns, genres_col=ITEM_FEAT_COL
)

100%|██████████| 5.78k/5.78k [00:00<00:00, 16.3kKB/s]


To use genres in our model, we multi-hot-encode them with scikit-learn's MultiLabelBinarizer.

In [None]:
# Show the genre column as loaded; the column is addressed through ITEM_FEAT_COL
# like everywhere else in the notebook instead of a hard-coded name.
data[ITEM_FEAT_COL]


0                         Drama
1                         Drama
2                         Drama
3                         Drama
4                         Drama
                   ...         
1000204             Documentary
1000205                   Drama
1000206                   Drama
1000207    Comedy|Drama|Western
1000208             Documentary
Name: genre, Length: 1000209, dtype: object

In [None]:
# Multi-hot encode the genres: split every '|' separated string into a list of
# genre names, fit the binarizer on those lists, and store each encoded row
# back into the genre column as a plain Python list.
genres_encoder = MultiLabelBinarizer()
data[ITEM_FEAT_COL] = genres_encoder.fit_transform(
    [genre_string.split("|") for genre_string in data[ITEM_FEAT_COL]]
).tolist()
print("Genres :", genres_encoder.classes_)
data.head()


Genres : ['Action' 'Adventure' 'Animation' "Children's" 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


Unnamed: 0,userID,itemID,rating,genre
0,1,1193,5.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,1193,5.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,12,1193,4.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,15,1193,4.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,17,1193,5.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Count of distinct genres learned by the encoder (length of its classes_ array).
number_of_genres = genres_encoder.classes_.shape[0]


In [None]:
# Expand the per-row 'genre' lists into one column per genre, named
# genre_1 ... genre_N. The new frame reuses data's index: a column-wise
# pd.concat matches rows by index label, not by position, so a fresh 0..n-1
# index would misalign rows whenever data's index is not the default range.
expanded_genre = pd.DataFrame(
    data[ITEM_FEAT_COL].tolist(),
    columns=[f"{ITEM_FEAT_COL}_{i + 1}" for i in range(number_of_genres)],
    index=data.index,
)


In [None]:
# Attach the expanded genre columns and remove the list-valued 'genre' column.
# The list column and any genre_* columns already present in data are dropped
# before concatenating, which keeps the cell idempotent: a plain concat followed
# by an in-place drop appends another copy of every genre column on each re-run
# (the drop then fails, but data has already been reassigned), and those
# duplicated columns end up being fed to the model as extra features.
data = pd.concat(
    [
        data.drop(columns=[ITEM_FEAT_COL, *expanded_genre.columns], errors="ignore"),
        expanded_genre,
    ],
    axis=1,
)
data.head()

Unnamed: 0,userID,itemID,rating,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,1193,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1193,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,1193,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,1193,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,1193,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Data splitting
# Stratified split of the ratings into train and test, using the configured
# ratio, user/item column names and seed.
split_kwargs = dict(
    ratio=SPLIT_RATIO,
    col_user=USER_COL,
    col_item=ITEM_COL,
    seed=SEED,
)
train, test = python_stratified_split(data, **split_kwargs)

In [None]:
# Report the size of each split and how many distinct users / items it contains.
# dropna=False makes nunique count a missing value as its own entry, the same
# way len(Series.unique()) does.
print(f"""
Train:
Total Ratings: {len(train)}
Unique Users: {train[USER_COL].nunique(dropna=False)}
Unique Items: {train[ITEM_COL].nunique(dropna=False)}

Test:
Total Ratings: {len(test)}
Unique Users: {test[USER_COL].nunique(dropna=False)}
Unique Items: {test[ITEM_COL].nunique(dropna=False)}
""")


Train:
Total Ratings: 700103
Unique Users: 6040
Unique Items: 3662

Test:
Total Ratings: 300106
Unique Users: 6040
Unique Items: 3550



In [None]:
# Model Training
# Create the LightGBM regressor, passing every entry of the `params` dict as a
# keyword argument to the constructor. Nothing is fitted in this cell.
lightGBM_regressor = lightGBM.LGBMRegressor(**params)


In [None]:
# Fit the regressor on every column except the rating (as a plain array),
# with the rating column as the target. Feature selection happens inside the
# timed block, so it is included in the reported training time.
with Timer() as train_time:
    feature_names = train.columns.difference([RATING_COL])
    lightGBM_regressor.fit(
        X=train[feature_names].to_numpy(),
        y=train[RATING_COL].to_numpy(),
    )

print(f"Took {train_time.interval} seconds for training.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.261932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 700103, number of used features: 56
[LightGBM] [Info] Start training from score 3.581873
Took 8.130323968000084 seconds for training.


In [None]:
# Model Evaluation
# Predict ratings for the held-out rows. The feature matrix is built exactly
# the way it was for training (every column except the rating, converted to a
# plain array with .values), so the model receives the same kind of input it
# was fitted on rather than a DataFrame carrying column names it never saw.
with Timer() as test_time:
    y_pred = lightGBM_regressor.predict(
        test[test.columns.difference([RATING_COL])].values
    )

print(f"Took {test_time.interval} seconds for prediction.")

Took 3.85704014599969 seconds for prediction.


In [None]:
# Build the prediction frame: user, item and true rating taken from the test
# split, plus the model output in the prediction column. assign() returns a new
# frame, so the test split itself is left untouched.
pred = test[[USER_COL, ITEM_COL, RATING_COL]].assign(**{PREDICTION_COL: y_pred})
pred.head()

Unnamed: 0,userID,itemID,rating,prediction
28501,1,48,5.0,3.171093
18914,1,2018,4.0,3.626035
11041,1,919,4.0,4.153002
54908,1,531,4.0,3.674927
28157,1,2340,3.0,3.364754


In [None]:
# Rating metrics
# Every metric takes the same column-name arguments, so they are declared once
# and unpacked into each call.
metric_kwargs = dict(
    col_user=USER_COL,
    col_item=ITEM_COL,
    col_rating=RATING_COL,
    col_prediction=PREDICTION_COL,
)
eval_rmse = rmse(test, pred, **metric_kwargs)
eval_mae = mae(test, pred, **metric_kwargs)
eval_rsquared = rsquared(test, pred, **metric_kwargs)
eval_exp_var = exp_var(test, pred, **metric_kwargs)

In [None]:
# Print one line per metric, each value formatted as a fixed-point float.
report_lines = [
    "Model:\t\tLightGBM",
    f"RMSE:\t\t{eval_rmse:f}",
    f"MAE:\t\t{eval_mae:f}",
    f"R2:\t\t{eval_rsquared:f}",
    f"Exp var:\t{eval_exp_var:f}",
]
print("\n".join(report_lines))

Model:		LightGBM
RMSE:		1.022369
MAE:		0.829762
R2:		0.161531
Exp var:	0.161532
