In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [3]:
data = pd.read_csv("data/ensemble_data.csv")
data = data.sort_values(by=["customer_id"])
data.time_passed_last_purchase = data.time_passed_last_purchase.astype(np.float64)
data.prod_gender_group = data.prod_gender_group.astype('category')
data.prod_age_bin = data.prod_age_bin.astype('category')
data.FN = data.FN.astype('category')
data.Active = data.Active.astype('category')
data.club_member_status = data.club_member_status.astype('category')
data.fashion_news_frequency = data.fashion_news_frequency.astype('category')
data.customer_age_bin = data.customer_age_bin.astype('category')
data.customer_gender_group = data.customer_gender_group.astype('category')
data.dtypes 

customer_id                    object
article_id                      int64
label                         float64
prod_gender_group            category
prod_avg_price                float64
prod_rebuy_count              float64
prod_age_bin                 category
prod_sold_count               float64
quotient                      float64
FN                           category
Active                       category
club_member_status           category
fashion_news_frequency       category
age                           float64
customer_age_bin             category
customer_gender_group        category
customer_rebuy_count          float64
customer_avg_price            float64
numberOfArticles              float64
same_prod_rebuy_count         float64
time_passed_last_purchase     float64
avg_purchase_time             float64
dtype: object

## Training LightGBM

In [4]:
train = data[:int(data.shape[0] * 0.9)]  # first 90%
test = data[int(data.shape[0] * 0.9):]
q_train = train.groupby("customer_id")["customer_id"].count().to_numpy()
train_label = train.label
train = train.drop(["label", "customer_id", "article_id"], axis=1)

q_test = test.groupby("customer_id")["customer_id"].count().to_numpy()
test_label = test.label
test = test.drop(["label", "customer_id", "article_id"], axis=1)

In [30]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric= ["ndcg", "map@"],
    learning_rate=0.00001,
    num_iterations= 50,
    max_depth=256,
    num_leaves=4096,
)

model.fit(
    train, train_label, group=q_train, eval_set=[(test, test_label)], eval_group=[q_test], eval_at=[1, 3, 6, 9, 12],
    callbacks=[lgb.callback.log_evaluation(), lgb.callback.early_stopping(5)],
    categorical_feature = 'auto'
)

[1]	valid_0's ndcg@1: 0.497189	valid_0's ndcg@3: 0.485307	valid_0's ndcg@6: 0.478112	valid_0's ndcg@9: 0.475568	valid_0's ndcg@12: 0.475759
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@1: 0.561254	valid_0's ndcg@3: 0.556065	valid_0's ndcg@6: 0.5499	valid_0's ndcg@9: 0.547296	valid_0's ndcg@12: 0.546302
[3]	valid_0's ndcg@1: 0.583234	valid_0's ndcg@3: 0.573397	valid_0's ndcg@6: 0.564245	valid_0's ndcg@9: 0.559755	valid_0's ndcg@12: 0.557262
[4]	valid_0's ndcg@1: 0.587664	valid_0's ndcg@3: 0.577231	valid_0's ndcg@6: 0.568536	valid_0's ndcg@9: 0.563853	valid_0's ndcg@12: 0.562352
[5]	valid_0's ndcg@1: 0.593798	valid_0's ndcg@3: 0.58043	valid_0's ndcg@6: 0.570185	valid_0's ndcg@9: 0.566358	valid_0's ndcg@12: 0.564636
[6]	valid_0's ndcg@1: 0.592605	valid_0's ndcg@3: 0.580869	valid_0's ndcg@6: 0.57113	valid_0's ndcg@9: 0.566308	valid_0's ndcg@12: 0.565184
[7]	valid_0's ndcg@1: 0.592776	valid_0's ndcg@3: 0.58099	valid_0's ndcg@6: 0.571959	valid_0's ndcg@9: 0.

LGBMRanker(learning_rate=1e-05, max_depth=256, metric=['ndcg', 'map@'],
           num_iterations=50, num_leaves=4096, objective='lambdarank')