In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from dask import dataframe as dd
from distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4)
client = Client(cluster)

In [2]:
data = dd.read_csv("data/ensemble_train/ensemble_data.csv/*.part", blocksize="64MB")
data = data.drop(["Unnamed: 0"], axis=1)
data.time_passed_last_purchase = data.time_passed_last_purchase.astype(np.float64)
data.prod_gender_group = data.prod_gender_group.astype('category')
data.prod_age_bin = data.prod_age_bin.astype('category')
data.FN = data.FN.astype('category')
data.Active = data.Active.astype('category')
data.club_member_status = data.club_member_status.astype('category')
data.fashion_news_frequency = data.fashion_news_frequency.astype('category')
data.customer_age_bin = data.customer_age_bin.astype('category')
data.customer_gender_group = data.customer_gender_group.astype('category')
data.dtypes 

customer_id                    object
article_id                      int64
label                         float64
prod_gender_group            category
prod_avg_price                float64
prod_rebuy_count              float64
prod_age_bin                 category
prod_sold_count               float64
quotient                      float64
customer_index                  int64
FN                           category
Active                       category
club_member_status           category
fashion_news_frequency       category
age                           float64
customer_age_bin             category
customer_gender_group        category
customer_rebuy_count          float64
customer_avg_price            float64
numberOfArticles              float64
same_prod_rebuy_count         float64
time_passed_last_purchase     float64
avg_purchase_time             float64
dtype: object

## Training LightGBM

In [9]:
number_of_samples = data.customer_index.max().compute()
train = data[data["customer_index"] <= int(number_of_samples * 0.9)].compute()

test = data[data["customer_index"] > int(number_of_samples * 0.9)].compute()
q_train = train.groupby("customer_id")["customer_id"].count()
train_label = train.label
train = train.drop(["label", "customer_id", "article_id"], axis=1)

q_test = test.groupby("customer_id")["customer_id"].count()
test_label = test.label
test = test.drop(["label", "customer_id", "article_id"], axis=1)

In [19]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric= "ndcg",
    learning_rate=0.00001,
    max_depth=512,
    num_leaves=4096,
    num_iterations=30
    
)

model.fit(
    train, train_label, group=q_train.values, eval_set=[(test, test_label)], eval_group=[q_test.values], eval_at=[12, 7, 3],
    callbacks=[lgb.callback.log_evaluation(), lgb.callback.early_stopping(5, first_metric_only=False)],
    categorical_feature = 'auto'
)

model.booster_.save_model('weights/lbm_lamda_ranker.txt',  num_iteration=model.best_iteration_)



[1]	valid_0's ndcg@3: 0.116454	valid_0's ndcg@7: 0.134626	valid_0's ndcg@12: 0.156435
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@3: 0.12267	valid_0's ndcg@7: 0.141015	valid_0's ndcg@12: 0.161672
[3]	valid_0's ndcg@3: 0.124038	valid_0's ndcg@7: 0.140933	valid_0's ndcg@12: 0.160462
[4]	valid_0's ndcg@3: 0.126126	valid_0's ndcg@7: 0.142083	valid_0's ndcg@12: 0.16235
[5]	valid_0's ndcg@3: 0.126637	valid_0's ndcg@7: 0.143083	valid_0's ndcg@12: 0.163276
[6]	valid_0's ndcg@3: 0.127478	valid_0's ndcg@7: 0.143824	valid_0's ndcg@12: 0.164874
[7]	valid_0's ndcg@3: 0.126255	valid_0's ndcg@7: 0.143469	valid_0's ndcg@12: 0.165623
[8]	valid_0's ndcg@3: 0.129009	valid_0's ndcg@7: 0.145279	valid_0's ndcg@12: 0.167652
[9]	valid_0's ndcg@3: 0.129102	valid_0's ndcg@7: 0.146129	valid_0's ndcg@12: 0.169474
[10]	valid_0's ndcg@3: 0.128951	valid_0's ndcg@7: 0.146121	valid_0's ndcg@12: 0.169276
[11]	valid_0's ndcg@3: 0.128943	valid_0's ndcg@7: 0.147642	valid_0's ndcg@12: 0.1

<lightgbm.basic.Booster at 0x2291adce790>

## Prediction LightGBM