In [22]:
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from dask import dataframe as dd

In [23]:
# Columns stored as pandas/dask categoricals for LightGBM.
categorical_columns = [
    "prod_gender_group",
    "prod_age_bin",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "customer_age_bin",
    "customer_gender_group",
]

# One dtype mapping for the whole frame: a single float cast plus the categoricals.
column_dtypes = {"time_passed_last_purchase": np.float64}
column_dtypes.update({column: "category" for column in categorical_columns})

# Lazily read every CSV part in 64MB blocks, drop the "Unnamed: 0" column
# (presumably a leftover index written by an earlier to_csv -- verify) and
# apply all dtype conversions in one pass.
data = (
    dd.read_csv("data/ensemble_train/ensemble_data.csv/*.part", blocksize="64MB")
    .drop(["Unnamed: 0"], axis=1)
    .astype(column_dtypes)
)
data.dtypes

customer_id                    object
article_id                      int64
label                         float64
prod_gender_group            category
prod_avg_price                float64
prod_rebuy_count              float64
prod_age_bin                 category
prod_sold_count               float64
quotient                      float64
customer_index                  int64
FN                           category
Active                       category
club_member_status           category
fashion_news_frequency       category
age                           float64
customer_age_bin             category
customer_gender_group        category
customer_rebuy_count          float64
customer_avg_price            float64
numberOfArticles              float64
same_prod_rebuy_count         float64
time_passed_last_purchase     float64
avg_purchase_time             float64
dtype: object

## Training LightGBM

In [24]:
# Split by customer_index: rows whose index is within the lowest 90% of the
# index range are used for training, the remainder for validation.
number_of_samples = data.customer_index.max().compute()
split_index = int(number_of_samples * 0.9)
non_feature_columns = ["label", "customer_id", "article_id"]

train = data[data["customer_index"] <= split_index].compute()
# LightGBM interprets `group` as the sizes of *consecutive* row runs, while
# groupby() returns sizes ordered by sorted customer_id. Sorting the rows by
# customer_id (stable, so the within-customer order is kept) makes both
# orderings agree and guarantees each query's rows are contiguous.
train = (
    train.drop(columns="customer_index")
    .sort_values("customer_id", kind="stable")
    .reset_index(drop=True)
)
q_train = train.groupby("customer_id")["customer_id"].count()
assert q_train.sum() == len(train), "train group sizes do not cover every row"
train_label = train.label
train = train.drop(non_feature_columns, axis=1)

test = data[data["customer_index"] > split_index].compute()
# Same alignment requirement applies to the evaluation groups.
test = (
    test.drop(columns="customer_index")
    .sort_values("customer_id", kind="stable")
    .reset_index(drop=True)
)
q_test = test.groupby("customer_id")["customer_id"].count()
assert q_test.sum() == len(test), "test group sizes do not cover every row"
test_label = test.label
test = test.drop(non_feature_columns, axis=1)

In [26]:
# LambdaRank configuration, evaluated with NDCG.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    learning_rate=0.00001,
    max_depth=512,
    num_leaves=4096,
    num_iterations=30,
)
model = lgb.LGBMRanker(**ranker_params)

# Log the validation metrics every round and stop once they have not
# improved for 5 consecutive rounds.
training_callbacks = [
    lgb.callback.log_evaluation(),
    lgb.callback.early_stopping(5, first_metric_only=False),
]

# `group` / `eval_group` carry the per-customer row counts for the ranking objective;
# NDCG is reported at cut-offs 12, 7 and 3.
model.fit(
    train,
    train_label,
    group=q_train.values,
    eval_set=[(test, test_label)],
    eval_group=[q_test.values],
    eval_at=[12, 7, 3],
    callbacks=training_callbacks,
    categorical_feature='auto',
)

# Persist the booster truncated at the best iteration; the returned Booster is the cell output.
model.booster_.save_model('weights/lbm_lamda_ranker.txt', num_iteration=model.best_iteration_)



[1]	valid_0's ndcg@3: 0.124696	valid_0's ndcg@7: 0.145493	valid_0's ndcg@12: 0.167735
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@3: 0.123928	valid_0's ndcg@7: 0.144043	valid_0's ndcg@12: 0.165996
[3]	valid_0's ndcg@3: 0.13315	valid_0's ndcg@7: 0.153663	valid_0's ndcg@12: 0.176615
[4]	valid_0's ndcg@3: 0.135169	valid_0's ndcg@7: 0.154629	valid_0's ndcg@12: 0.178029
[5]	valid_0's ndcg@3: 0.138138	valid_0's ndcg@7: 0.157366	valid_0's ndcg@12: 0.18062
[6]	valid_0's ndcg@3: 0.139125	valid_0's ndcg@7: 0.159193	valid_0's ndcg@12: 0.181639
[7]	valid_0's ndcg@3: 0.139991	valid_0's ndcg@7: 0.160573	valid_0's ndcg@12: 0.18285
[8]	valid_0's ndcg@3: 0.140646	valid_0's ndcg@7: 0.160256	valid_0's ndcg@12: 0.183002
[9]	valid_0's ndcg@3: 0.140914	valid_0's ndcg@7: 0.161181	valid_0's ndcg@12: 0.183933
[10]	valid_0's ndcg@3: 0.141954	valid_0's ndcg@7: 0.162355	valid_0's ndcg@12: 0.185497
[11]	valid_0's ndcg@3: 0.141957	valid_0's ndcg@7: 0.162318	valid_0's ndcg@12: 0.18

<lightgbm.basic.Booster at 0x229a83ab910>

## Prediction with LightGBM

In [1]:
import gc
import cudf
import numpy as np
import pandas as pd
import lightgbm as lgb

# Reload the ranker saved by the training section from its text model file.
model_path = 'weights/lbm_lamda_ranker.txt'
model = lgb.Booster(model_file=model_path)

In [2]:
# Customer features: drop the index column, keep one row per customer and
# order by customer_id so `customer_ids` below is sorted.
customers = (
    pd.read_pickle("data/ensemble/customers.pkl")
    .drop(columns="customer_index")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by=["customer_id"])
)

# Article features and per-(customer, article) history, de-duplicated on their keys.
articles = pd.read_pickle("data/ensemble/articles.pkl").drop_duplicates(subset=["article_id"])
customer_hist = pd.read_pickle("data/ensemble/customer_hist.pkl").drop_duplicates(
    subset=["customer_id", "article_id"]
)

# Host-side id arrays, taken before the frames are moved to the GPU.
customer_ids = customers.customer_id.values
batch_size = 512
# The full article list repeated once per customer of a batch.
article_ids = articles.article_id.values.tolist() * batch_size

# Move the lookup tables to the GPU for the merges in the prediction loop.
customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)

# Article ids as strings with a leading '0', used to build the prediction strings.
article_ids_str = ('0' + articles.article_id.astype(str)).to_numpy()

# Empty frame that will hold the predictions.
submission = pd.DataFrame({"customer_id": [], "predict": []})

In [None]:
%%time
# Score every (customer, article) pair batch by batch and keep the TOP_K
# highest-scoring articles per customer.
TARGET_WEEK = 39                               # week value assigned to every candidate pair
DEFAULT_LAST_PURCHASE_WEEK = TARGET_WEEK - 29  # fill value for pairs with no history (=> 29 weeks passed)
TOP_K = 12                                     # number of articles written per customer

n_articles = articles.shape[0]
n_customers = len(customer_ids)

# The renamed lookup tables do not change between batches, so build them once.
article_features = articles.rename(columns={
    "age_bin": "prod_age_bin", "gender_group": "prod_gender_group",
    "rebuy_count": "prod_rebuy_count", "price": "prod_avg_price",
})
customer_features = customers.rename(columns={
    "age_bin": "customer_age_bin", "gender_group": "customer_gender_group",
    "rebuy_count": "customer_rebuy_count", "price": "customer_avg_price",
    "article_id": "article_hist", "week": "week_hist",
})

df_list = []
for batch_start in range(0, n_customers, batch_size):
    # Slice the *current* batch; the final batch may be shorter than batch_size.
    customer_ids_batch = customer_ids[batch_start:batch_start + batch_size]
    n_batch = len(customer_ids_batch)
    n_pairs = n_batch * n_articles

    # `article_ids` holds the article list repeated batch_size times; only a
    # short final batch needs a (copied) prefix of it.
    batch_article_ids = article_ids if n_pairs == len(article_ids) else article_ids[:n_pairs]

    df = cudf.DataFrame({
        "customer_id": np.repeat(customer_ids_batch, n_articles),  # integer repeat count
        "article_id": batch_article_ids,
        "week": TARGET_WEEK,
    })
    # cuDF merges do not guarantee the output row order, but the reshape below
    # relies on rows being customer-major / article-minor. Remember the
    # original position and restore it after the merges.
    df["_row_pos"] = np.arange(n_pairs)

    df = df.merge(article_features, on="article_id", how="inner")
    df = df.merge(customer_features, on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id", "article_id"], how="left")
    if len(df) != n_pairs:
        raise ValueError(
            f"expected {n_pairs} candidate rows after merging, got {len(df)}; "
            "customers/articles lookup tables do not cover every id"
        )
    df = df.sort_values("_row_pos")

    df["same_prod_rebuy_count"] = df["same_prod_rebuy_count"].fillna(0)
    df["avg_purchase_time"] = df["avg_purchase_time"].fillna(0)
    # Vectorised column arithmetic instead of a column-wise DataFrame.apply.
    df["time_passed_last_purchase"] = (
        df["week"] - df["time_passed_last_purchase"].fillna(DEFAULT_LAST_PURCHASE_WEEK)
    )

    df.drop(columns=["customer_id", "article_id", "week", "_row_pos"], inplace=True)
    # NOTE(review): the remaining column order is assumed to match the feature
    # order used for training -- confirm against model.feature_name().
    ensemble_scores = model.predict(df.to_numpy()).reshape((n_batch, n_articles))

    # argsort is ascending; after flipping, the best scores come first, so the
    # top-K are the *first* K columns.
    indices = np.flip(np.argsort(ensemble_scores, axis=1), axis=1)[:, :TOP_K]
    predicts = [' '.join(row) for row in article_ids_str[indices]]
    df_list.append(pd.DataFrame({"customer_id": customer_ids_batch, "predict": predicts}))

    del df
    gc.collect()
    processed = batch_start + n_batch
    print('\r' + f'{processed}: %{round(100 * processed / n_customers, 2)}', end='')

# Concatenate once instead of growing the frame inside the loop.
if df_list:
    submission = pd.concat(df_list, ignore_index=True)
print("\n")

556032: %81.15

In [None]:
# Preview the first rows of the submission frame.
submission.head()