In [1]:
import random
import numpy as np
import pandas as pd
from dask import dataframe as dd
import lightgbm as lgb

In [2]:
data = dd.read_csv("data/ensemble_train/ensemble_data.csv/*.part", blocksize="64MB", dtype={"label": np.float32, "prod_rebuy_count": np.float32, "prod_sold_count":np.float32, "age_std":np.float32,
                                                                                            "age_mean":np.float32, "quotient":np.float32, "age":np.float32, "numberOfArticles":np.float32,
                                                                                            "same_prod_rebuy_count":np.float32, "time_passed_last_purchase":np.float32, "avg_purchase_time":np.float32,
                                                                                            "d2v":np.float32, "dl":np.float32, "tf_idf":np.float32, "nmf":np.float32, "lda":np.float32,
                                                                                            "freq_week":np.float32, "different_week":np.float32})
data = data.drop(["Unnamed: 0"], axis=1)
data.dtypes 

customer_id                   object
article_id                     int64
label                        float32
d2v                          float32
dl                           float32
tf_idf                       float32
nmf                          float32
lda                          float32
prod_rebuy_count             float32
prod_sold_count              float32
age_std                      float32
age_mean                     float32
quotient                     float32
customer_index                 int64
age                          float32
freq_week                    float32
different_week               float32
numberOfArticles             float32
same_prod_rebuy_count        float32
time_passed_last_purchase    float32
avg_purchase_time            float32
dtype: object

## Training LightGBM

In [3]:
number_of_customer = data.customer_index.max().compute()
randomlist = random.sample(range(0, number_of_customer),number_of_customer//10)
train = data[~data["customer_index"].isin(randomlist)].compute()
train.drop(columns="customer_index", inplace=True)

test = data[data["customer_index"].isin(randomlist)].compute()
test.drop(columns="customer_index", inplace=True)
q_train = train.groupby("customer_id")["customer_id"].count()
train_label = train.label
train = train.drop(["label", "customer_id", "article_id"], axis=1)

q_test = test.groupby("customer_id")["customer_id"].count()
test_label = test.label
test = test.drop(["label", "customer_id", "article_id"], axis=1)

In [4]:
train.head()

Unnamed: 0,d2v,dl,tf_idf,nmf,lda,prod_rebuy_count,prod_sold_count,age_std,age_mean,quotient,age,freq_week,different_week,numberOfArticles,same_prod_rebuy_count,time_passed_last_purchase,avg_purchase_time
0,6.1e-05,3.1e-05,6.8e-05,6.6e-05,2.4e-05,0.001319,0.054453,0.611296,0.351449,0.00503,0.096386,0.538462,0.038462,0.004706,0.0,1.0,0.0
1,3.3e-05,4e-05,5.1e-05,0.000152,5.5e-05,0.000635,0.195201,0.627555,0.216291,0.015509,0.096386,0.538462,0.038462,0.004706,0.0,1.0,0.0
2,0.000104,4.8e-05,0.000105,2.9e-05,8e-06,0.0,0.024458,0.588786,0.485814,0.02664,0.096386,0.538462,0.038462,0.004706,0.0,1.0,0.0
3,2.6e-05,2.3e-05,5.5e-05,2.6e-05,7e-06,0.003487,0.034149,0.616232,0.381128,0.024864,0.096386,0.538462,0.038462,0.004706,0.0,1.0,0.0
4,7.7e-05,1.3e-05,4.7e-05,4.9e-05,2.1e-05,0.001851,0.008768,0.518117,0.561772,0.00799,0.096386,0.538462,0.038462,0.004706,0.0,1.0,0.0


In [7]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric= "ndcg",
    boosting_type="dart",
#   learning_rate=0.01,
    num_leaves=256,
    n_estimators=512,
    importance_type='gain',
    device= "gpu",
    gpu_platform_id= 0,
    gpu_device_id= 0,
#   max_depth=128,
    num_iterations=300,
#   verbose=10,
)

model.fit(
    train, train_label, group=q_train.values, eval_set=[(test, test_label)], eval_group=[q_test.values], eval_at=[12],
    callbacks=[lgb.callback.log_evaluation()],
)

model.booster_.save_model('weights/lbm_lamda_ranker.txt',  num_iteration=model.best_iteration_)

[1]	valid_0's ndcg@12: 0.150674
[2]	valid_0's ndcg@12: 0.190583
[3]	valid_0's ndcg@12: 0.203985
[4]	valid_0's ndcg@12: 0.20805
[5]	valid_0's ndcg@12: 0.210754
[6]	valid_0's ndcg@12: 0.211326
[7]	valid_0's ndcg@12: 0.213659
[8]	valid_0's ndcg@12: 0.212812
[9]	valid_0's ndcg@12: 0.213798
[10]	valid_0's ndcg@12: 0.214535
[11]	valid_0's ndcg@12: 0.214652
[12]	valid_0's ndcg@12: 0.214416
[13]	valid_0's ndcg@12: 0.214693
[14]	valid_0's ndcg@12: 0.215667
[15]	valid_0's ndcg@12: 0.216504
[16]	valid_0's ndcg@12: 0.216323
[17]	valid_0's ndcg@12: 0.21675
[18]	valid_0's ndcg@12: 0.216315
[19]	valid_0's ndcg@12: 0.217175
[20]	valid_0's ndcg@12: 0.217386
[21]	valid_0's ndcg@12: 0.217808
[22]	valid_0's ndcg@12: 0.217798
[23]	valid_0's ndcg@12: 0.217768
[24]	valid_0's ndcg@12: 0.21849
[25]	valid_0's ndcg@12: 0.2189
[26]	valid_0's ndcg@12: 0.218536
[27]	valid_0's ndcg@12: 0.218722
[28]	valid_0's ndcg@12: 0.218541
[29]	valid_0's ndcg@12: 0.219514
[30]	valid_0's ndcg@12: 0.220527
[31]	valid_0's ndcg@12: 

<lightgbm.basic.Booster at 0x2ce8404cd90>

In [8]:
for i in model.feature_importances_.argsort()[::-1]:
    print(train.columns[i], model.feature_importances_[i] / model.feature_importances_.sum())

prod_sold_count 0.2836786055996997
tf_idf 0.25924597119303733
lda 0.08416608189245862
d2v 0.06079268822580631
quotient 0.049681174374713845
age_mean 0.03371351459172249
prod_rebuy_count 0.03364375484289956
nmf 0.032261554126551156
same_prod_rebuy_count 0.031878400620133404
age_std 0.028964150954797416
age 0.025798145230368653
dl 0.022262848135965
numberOfArticles 0.019369335589310293
time_passed_last_purchase 0.01711205403704486
freq_week 0.01194918871525492
different_week 0.005298889806617426
avg_purchase_time 0.00018364206361899492
