In [1]:
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
from dask import dataframe as dd

In [2]:
# Load only the two columns used below and attach, to every transaction row,
# the total number of rows of its customer (`prod_number`).
transactions = dd.read_csv('data/transactions_train.csv')[["customer_id", "article_id"]]
customer_purchase_number = transactions.groupby("customer_id").size().to_frame("prod_number").reset_index()
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

# Materialise each set of unique ids exactly once. Calling len() and then
# iterating a lazy dask Series evaluates the whole graph (CSV read + merge)
# separately for each call; a single .compute() yields an in-memory pandas
# Series that can be measured and enumerated for free.
c_ids = transactions.customer_id.unique().compute()
number_of_customer = len(c_ids)
# .tolist() yields native Python scalars, which json.dump accepts as keys.
customer_encoding = {c_id: i for i, c_id in enumerate(c_ids.tolist())}
p_ids = transactions.article_id.unique().compute()
number_of_products = len(p_ids)
product_encoding = {p_id: i for i, p_id in enumerate(p_ids.tolist())}

# Persist the raw-id -> dense-index mappings.
# NOTE: JSON object keys are always strings, so any non-string id (e.g. an
# integer article_id, if the CSV parser inferred one) is written as its string
# form and must be converted back by whoever loads these files.
with open('model_data/customer_id_encoding.json', 'w') as fp:
    json.dump(customer_encoding, fp)

with open('model_data/product_id_encoding.json', 'w') as fp:
    json.dump(product_encoding, fp)

# Replace the raw ids with their dense integer codes (still lazy dask ops).
transactions.customer_id = transactions.customer_id.map(customer_encoding)
transactions.article_id = transactions.article_id.map(product_encoding)
# From here on p_ids holds the encoded product ids rather than the raw ones.
p_ids = list(product_encoding.values())
del c_ids, product_encoding, customer_encoding, customer_purchase_number

In [3]:
# Random 90/10 row split, then repair it so that every customer left in the
# test set also has rows in the training set.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# Tag every test row with whether its customer also occurs in train.
# The train ids are de-duplicated first: merging against the raw column is a
# many-to-many join that repeats each test row once per matching train row.
# A left join is enough because rows that exist only in train are never
# selected by either filter below.
df = test.merge(
    train[["customer_id"]].drop_duplicates(),
    on=["customer_id"],
    how="left",
    indicator=True,
)

# Test rows whose customer is missing from train (or who has a single purchase
# overall) are moved into the training set.
train = dd.concat(
    [
        train,
        df[(df._merge == 'left_only') | (df.prod_number == 1)][["customer_id", "article_id"]],
    ],
    axis=0,
    ignore_index=True,
    interleave_partitions=True,
    ignore_order=True,
)

# Keep in test only customers that are present in train and have more than one
# purchase overall.
df = df[(df._merge == 'both') & (df.prod_number > 1)][["customer_id"]].drop_duplicates()
test = test.merge(df, how="inner", on="customer_id")

train = train.drop(["prod_number"], axis=1)
test = test.drop(["prod_number"], axis=1)

# Vectorised dtype cast; applymap would call np.float32 once per cell and is
# deprecated in recent pandas releases.
# NOTE: float32 represents integers exactly only up to 2**24, so encoded ids
# beyond that would no longer be distinguishable after this cast.
cols = ["customer_id", "article_id"]
train[cols] = train[cols].astype(np.float32)
test[cols] = test[cols].astype(np.float32)

# Materialise both splits as in-memory pandas DataFrames.
train = train.compute()
test = test.compute()
del transactions, df

In [4]:
# Order both splits by customer so that each customer's rows are contiguous.
train = train.sort_values(by=["customer_id"])
test = test.sort_values(by=["customer_id"])

# Rows per customer, in ascending customer_id order (groupby sorts its keys),
# i.e. the same order as the sorted frames above. These counts are later passed
# to the ranker as query group sizes.
q_train = train.groupby("customer_id").size().reset_index(name="prod_number")
q_test = test.groupby("customer_id").size().reset_index(name="prod_number")

# Treat the encoded article id as a categorical feature rather than a number.
train["article_id"] = train["article_id"].astype("category")
test["article_id"] = test["article_id"].astype("category")

In [5]:
# Add the relevance label and drop the customer key; the per-customer grouping
# is carried separately by q_train / q_test.
# NOTE(review): every row receives the same label (1.0), so the ranking target
# contains no contrast between items within a group - confirm whether negative
# samples were meant to be added before training.
train = train.assign(label=1.0).drop(columns=["customer_id"])
test = test.assign(label=1.0).drop(columns=["customer_id"])

# Persist the features and the group sizes for the training section.
q_train.to_pickle("data/lgbm_data/q_train_data.pkl")
q_test.to_pickle("data/lgbm_data/q_test_data.pkl")
train.to_pickle("data/lgbm_data/train_data.pkl")
test.to_pickle("data/lgbm_data/test_data.pkl")

## Training LightGBM

In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from dask import dataframe as dd

In [7]:
# Reload the prepared frames and per-customer group sizes from disk
# (takes roughly 10 seconds).
train, test, q_train, q_test = map(
    pd.read_pickle,
    (
        "data/lgbm_data/train_data.pkl",
        "data/lgbm_data/test_data.pkl",
        "data/lgbm_data/q_train_data.pkl",
        "data/lgbm_data/q_test_data.pkl",
    ),
)

In [10]:
# Fit a LightGBM ranker on the single categorical feature `article_id`, with
# one query group per customer (group sizes come from q_train / q_test).
#
# Early stopping and periodic logging are configured through callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of `fit` were
# removed in LightGBM 4.0, while the callback form also works on 3.3.x.
#
# NOTE(review): the label column was written as a constant 1.0 earlier in this
# notebook. When all relevance labels in a group are equal, every ordering has
# the same DCG, so the reported NDCG is 1 regardless of the model and early
# stopping has nothing meaningful to monitor.
gbm = lgb.LGBMRanker()
gbm.fit(
    train[["article_id"]],
    train.label,
    group=q_train.prod_number,
    eval_set=[(test[["article_id"]], test.label)],
    eval_group=[q_test.prod_number],
    eval_at=[1, 3, 5, 7, 10],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=10),           # print validation metrics every 10 rounds
    ],
)

[10]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 1	valid_0's ndcg@5: 1	valid_0's ndcg@7: 1	valid_0's ndcg@10: 1


LGBMRanker()