In [1]:
!pip install lightgbm
!pip install h5py

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m925.9 kB/s[0m eta [36m0:00:00[0m:01[0m00:01[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
[0mCollecting h5py
  Downloading h5py-3.6.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: h5py
Successfully installed h5py-3.6.0
[0m

In [1]:
import gc
import cudf
import h5py
import numpy as np
import pandas as pd
import lightgbm as lgb

# Restore the trained booster from its saved model file; the params dict
# requests LightGBM's GPU device type.
model = lgb.Booster(
    params={"device_type": "gpu"},
    model_file='weights/lbm_lamda_ranker.txt',
)

In [2]:
# Number of customers scored per batch in the prediction loop.
batch_size = 512

# Load the feature tables (pandas first), de-duplicating on their keys.
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
customers = (
    pd.read_pickle("data/ensemble/customers.pkl")
    .drop(columns="customer_index")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by=["customer_id"])
)
articles = (
    pd.read_pickle("data/ensemble/articles.pkl")
    .drop_duplicates(subset=["article_id"])
)
customer_hist = (
    pd.read_pickle("data/ensemble/customer_hist.pkl")
    .drop_duplicates(subset=["customer_id", "article_id"])
)

# Host-side id arrays, taken before the frames are moved to the GPU.
customer_ids = customers.customer_id.values
# The article id column repeated `batch_size` times, so that a full batch of
# (customer, article) pairs can be built by slicing a prefix. A tiled NumPy
# array replaces the former `list * batch_size`: it still supports len() and
# slicing, but slices are views and it converts to a cudf column without
# walking tens of millions of Python ints on every batch.
article_ids = np.tile(articles.article_id.to_numpy(), batch_size)

# Move the tables to the GPU for the per-batch merges.
customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)

# Article ids rendered as strings with a leading '0', in `articles` row order.
article_ids_str = ('0' + articles.article_id.astype(str)).to_numpy()
# Empty result frame; the prediction loop fills/replaces it.
submission = pd.DataFrame({"customer_id":[],"predict":[]})

In [3]:
# id -> row position lookups, in the current row order of each GPU table.
customer_encoding = dict(zip(customers.customer_id.to_numpy(), range(len(customers))))
product_encoding = dict(zip(articles.article_id.to_numpy(), range(len(articles))))

# Open each score file read-only and keep a handle on its dataset.
# The underlying files are never closed here; the datasets are sliced lazily
# by the prediction loop, so they have to stay open while it runs.
d2v = h5py.File('personalization/ensemble/d2vf.h5', mode="r")["d2v"]
dl = h5py.File('personalization/ensemble/dl.h5', mode="r")["dl"]
tf_idf = h5py.File('personalization/ensemble/tf_idf.h5', mode="r")["tf_idf"]
nmf = h5py.File('personalization/ensemble/nmf.h5', mode="r")["nmf"]
lda = h5py.File('personalization/ensemble/lda.h5', mode="r")["lda"]

In [8]:
%%time
df_list = []
loop_size = len(customer_ids) + batch_size
for batch_i in range(batch_size, loop_size, batch_size):
    customer_ids_batch = customer_ids[batch_i-batch_size:batch_i]
    customer_ids_batch = np.repeat(customer_ids_batch, len(article_ids)/batch_size)
    df = cudf.DataFrame({"customer_id": customer_ids_batch, "article_id": article_ids[:customer_ids_batch.shape[0]],
                         "d2v": d2v[batch_i-batch_size:batch_i].reshape(-1), "dl":dl[batch_i-batch_size:batch_i].reshape(-1),
                         "tf_idf":tf_idf[batch_i-batch_size:batch_i].reshape(-1), "nmf":nmf[batch_i-batch_size:batch_i].reshape(-1),
                         "lda":lda[batch_i-batch_size:batch_i].reshape(-1)})
    df = df.merge(articles, on="article_id", how="inner")
    df = df.merge(customers, on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id","article_id"], how="left")
    df.same_prod_rebuy_count = df.same_prod_rebuy_count.fillna(0)
    df.avg_purchase_time = df.avg_purchase_time.fillna(0)
    df.time_passed_last_purchase = df.time_passed_last_purchase.fillna(1)
    df.drop(columns=["customer_id", "article_id"], inplace=True)
    model_inputs = df.to_numpy()
    del df
    gc.collect()
    ensemble_scores = model.predict(model_inputs, device = 'gpu', gpu_platform_id = 0, gpu_device_id = 0).reshape((customer_ids[batch_i-batch_size:batch_i].shape[0], articles.shape[0]))
    indices = np.flip(np.argsort(ensemble_scores, axis=1), axis=1)[:,-12:]
    predicts = list(map(lambda x: ' '.join(x), article_ids_str[indices]))
    submission = pd.concat([submission, pd.DataFrame({"customer_id":customer_ids[batch_i-batch_size:batch_i],"predict":predicts})])
    print('\r' + f'{batch_i}: %{round(100*batch_i/loop_size, 2)}', end='')
print("\n")

685056: %99.98

CPU times: user 6d 5h 52min 32s, sys: 9min 29s, total: 6d 6h 2min 2s
Wall time: 14h 38min 38s


In [9]:
# customer_id -> predicted article string. Built column-wise instead of with
# iterrows(), which materialises a Series per row; a later duplicate
# customer_id still overrides an earlier one, as before.
submission_dict = dict(zip(submission["customer_id"], submission["predict"]))

In [10]:
base_submission = pd.read_csv("data/sample_submission.csv")
# Overwrite the sample prediction wherever we produced one; customers missing
# from submission_dict map to NaN and fall back to their sample prediction.
# Vectorised replacement for the former row-wise apply(axis=1).
base_submission["prediction"] = (
    base_submission["customer_id"]
    .map(submission_dict)
    .fillna(base_submission["prediction"])
)

In [11]:
# Persist the final predictions (without the index column), then preview them.
base_submission.to_csv(path_or_buf="submissions/submission.csv", index=False)
base_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0876357001 0717816005 0831269009 0890021001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0888945002 0687704002 0877268004 0900670001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0903870001 0902017001 0699923121 0153115021 04...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0751471001 0918522001 0924243001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0882059002 0816166010 0182909001 0516000001 06...
