In [14]:
import random
import numpy as np
import pandas as pd
from dask import dataframe as dd
import lightgbm as lgb

In [7]:
# Columns parsed as float32; columns not listed here are left to dask's
# dtype inference.
float32_columns = [
    "label",
    "prod_rebuy_count",
    "prod_sold_count",
    "age_std",
    "age_mean",
    "quotient",
    "age",
    "numberOfArticles",
    "same_prod_rebuy_count",
    "time_passed_last_purchase",
    "avg_purchase_time",
]

# Read every partition file of the training set in 64MB blocks.
data = dd.read_csv(
    "data/ensemble_train/ensemble_data.csv/*.part",
    blocksize="64MB",
    dtype=dict.fromkeys(float32_columns, np.float32),
)
# Discard the "Unnamed: 0" column.
data = data.drop(["Unnamed: 0"], axis=1)
data.dtypes

customer_id                   object
article_id                     int64
label                        float32
prod_rebuy_count             float32
prod_sold_count              float32
age_std                      float32
age_mean                     float32
quotient                     float32
customer_index                 int64
age                          float32
numberOfArticles             float32
same_prod_rebuy_count        float32
time_passed_last_purchase    float32
avg_purchase_time            float32
dtype: object

In [8]:
#data.time_passed_last_purchase = data.time_passed_last_purchase.astype(np.float64)
#data.prod_gender_group = data.prod_gender_group.astype('category')
#data.prod_age_bin = data.prod_age_bin.astype('category')
#data.FN = data.FN.astype('category')
#data.Active = data.Active.astype('category')
#data.club_member_status = data.club_member_status.astype('category')
#data.fashion_news_frequency = data.fashion_news_frequency.astype('category')
#data.customer_age_bin = data.customer_age_bin.astype('category')
#data.customer_gender_group = data.customer_gender_group.astype('category')

## Training LightGBM

In [9]:
# Hold out roughly one customer in ten as the evaluation set; all rows of a
# customer land on the same side of the split.
SPLIT_SEED = 42  # fixed seed so the split is identical on every run

# Sample from the customer_index values that actually occur, so the highest
# index is eligible too and the numbering does not have to be gap-free.
customer_indices = sorted(data.customer_index.unique().compute().tolist())
number_of_customer = len(customer_indices)
randomlist = random.Random(SPLIT_SEED).sample(customer_indices, number_of_customer // 10)
holdout_mask = data["customer_index"].isin(randomlist)


def _load_split(mask):
    """Materialize the rows selected by `mask` with each customer's rows contiguous.

    The sort is stable, so rows of one customer keep their original relative order.
    """
    return data[mask].compute().sort_values("customer_id", kind="stable")


def _ranker_inputs(frame):
    """Split a customer-sorted frame into (features, labels, group_sizes).

    `group_sizes` holds the number of rows per customer in row order
    (`sort=False` keeps first-appearance order, which equals row order for a
    frame whose customers are contiguous). The ranker consumes group sizes as
    consecutive runs of rows, so sizes and rows must be aligned.
    """
    group_sizes = frame.groupby("customer_id", sort=False)["customer_id"].count()
    assert group_sizes.sum() == len(frame), "group sizes must cover every row"
    labels = frame["label"]
    features = frame.drop(columns=["customer_index", "label", "customer_id", "article_id"])
    return features, labels, group_sizes


train, train_label, q_train = _ranker_inputs(_load_split(~holdout_mask))
test, test_label, q_test = _ranker_inputs(_load_split(holdout_mask))

In [10]:
# Preview the first rows of the training feature frame.
train.head()

Unnamed: 0,prod_rebuy_count,prod_sold_count,age_std,age_mean,quotient,age,numberOfArticles,same_prod_rebuy_count,time_passed_last_purchase,avg_purchase_time
0,0.016981,0.001384,0.582456,0.431195,0.007102,0.096386,0.004706,0.0,1.0,0.0
1,0.0,0.000923,0.526745,0.171037,0.001773,0.096386,0.004706,0.0,1.0,0.0
2,0.000588,0.002307,0.561633,0.279337,0.002218,0.096386,0.004706,0.0,1.0,0.0
3,0.0,0.032764,0.464198,0.30235,0.000192,0.096386,0.004706,0.0,1.0,0.0
4,0.0,0.000923,0.524866,0.178622,0.002662,0.096386,0.004706,0.0,1.0,0.0


In [17]:
# LambdaRank ranker with DART boosting, trained on the GPU and evaluated with NDCG.
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
#   learning_rate=0.01,
    num_leaves=256,
    # Number of boosting rounds. This cell previously passed both
    # n_estimators=512 and num_iterations=50; the two are aliases of the same
    # LightGBM parameter, so only one value is stated here. 50 is the value
    # that was passed as `num_iterations`.
    n_estimators=50,
    importance_type='gain',
    device="gpu",
    gpu_platform_id=0,
    gpu_device_id=0,
#   max_depth=128,
#   verbose=10,
)

# `group` / `eval_group` give the number of consecutive rows belonging to each
# customer; NDCG is reported at 12 on the held-out customers after every round.
model.fit(
    train, train_label, group=q_train.values, eval_set=[(test, test_label)], eval_group=[q_test.values], eval_at=[12],
    callbacks=[lgb.callback.log_evaluation()],
)

# NOTE(review): no early-stopping callback is passed to fit(), so
# best_iteration_ presumably does not truncate the saved model - confirm.
model.booster_.save_model('weights/lbm_lamda_ranker.txt',  num_iteration=model.best_iteration_)



[1]	valid_0's ndcg@12: 0.113431
[2]	valid_0's ndcg@12: 0.119593
[3]	valid_0's ndcg@12: 0.133406
[4]	valid_0's ndcg@12: 0.144236
[5]	valid_0's ndcg@12: 0.146609
[6]	valid_0's ndcg@12: 0.147411
[7]	valid_0's ndcg@12: 0.148621
[8]	valid_0's ndcg@12: 0.149093
[9]	valid_0's ndcg@12: 0.149816
[10]	valid_0's ndcg@12: 0.150007
[11]	valid_0's ndcg@12: 0.150618
[12]	valid_0's ndcg@12: 0.150643
[13]	valid_0's ndcg@12: 0.150657
[14]	valid_0's ndcg@12: 0.15069
[15]	valid_0's ndcg@12: 0.150354
[16]	valid_0's ndcg@12: 0.150608
[17]	valid_0's ndcg@12: 0.150782
[18]	valid_0's ndcg@12: 0.151144
[19]	valid_0's ndcg@12: 0.150984
[20]	valid_0's ndcg@12: 0.150933
[21]	valid_0's ndcg@12: 0.151439
[22]	valid_0's ndcg@12: 0.151341
[23]	valid_0's ndcg@12: 0.151022
[24]	valid_0's ndcg@12: 0.151372
[25]	valid_0's ndcg@12: 0.152019
[26]	valid_0's ndcg@12: 0.151673
[27]	valid_0's ndcg@12: 0.151796
[28]	valid_0's ndcg@12: 0.151864
[29]	valid_0's ndcg@12: 0.151949
[30]	valid_0's ndcg@12: 0.152349
[31]	valid_0's ndcg@

<lightgbm.basic.Booster at 0x16912f9e280>

In [18]:
# Print every feature with its share of the summed importance, largest first.
importances = model.feature_importances_
total_importance = importances.sum()
for position in importances.argsort()[::-1]:
    print(train.columns[position], importances[position] / total_importance)

prod_sold_count 0.3871510235133944
same_prod_rebuy_count 0.18926619342993617
time_passed_last_purchase 0.08102875300143439
quotient 0.07569838576832018
age 0.06763288233957136
age_mean 0.06289784752067851
age_std 0.052353041999002224
prod_rebuy_count 0.04485166147753122
numberOfArticles 0.038807749101788
avg_purchase_time 0.0003124618483435447


In [1]:
!pip install lightgbm

[0m

In [1]:
import gc
import cudf
import numpy as np
import pandas as pd
import lightgbm as lgb

# Load the ranker from the text model file written by the training section.
model = lgb.Booster(model_file='weights/lbm_lamda_ranker.txt')

In [2]:
# Number of customers scored per batch.
batch_size = 512

# Customer features: one row per customer_id, ordered by customer_id.
customers = (
    pd.read_pickle("data/ensemble/customers.pkl")
    .drop(columns="customer_index")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by=["customer_id"])
)
# Article features: one row per article_id.
articles = pd.read_pickle("data/ensemble/articles.pkl").drop_duplicates(subset=["article_id"])
# Customer/article history features: one row per (customer_id, article_id) pair.
customer_hist = pd.read_pickle("data/ensemble/customer_hist.pkl").drop_duplicates(
    subset=["customer_id", "article_id"]
)

# Host-side id sequences, taken before the frames move to the GPU.
customer_ids = customers["customer_id"].values
# The article id list repeated once per customer slot of a full batch.
article_ids = articles["article_id"].values.tolist() * batch_size

customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)

# Article ids as strings with a leading '0', in the row order of `articles`.
article_ids_str = ('0' + articles["article_id"].astype(str)).to_numpy()
# Empty frame that will receive one row of predictions per customer.
submission = pd.DataFrame({"customer_id": [], "predict": []})

In [3]:
%%time
df_list = []
loop_size = len(customer_ids) + batch_size
for batch_i in range(batch_size, loop_size, batch_size):
    customer_ids_batch = customer_ids[batch_i-batch_size:batch_i]
    customer_ids_batch = np.repeat(customer_ids_batch, len(article_ids)/batch_size)
    df = cudf.DataFrame({"customer_id": customer_ids_batch, "article_id": article_ids[:customer_ids_batch.shape[0]]})
    df = df.merge(articles, on="article_id", how="inner")
    df = df.merge(customers, on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id","article_id"], how="left")
    df.same_prod_rebuy_count = df.same_prod_rebuy_count.fillna(0)
    df.avg_purchase_time = df.avg_purchase_time.fillna(0)
    df.time_passed_last_purchase = df.time_passed_last_purchase.fillna(1)
    df.drop(columns=["customer_id", "article_id"], inplace=True)
    model_inputs = df.to_numpy()
    del df
    gc.collect()
    ensemble_scores = model.predict(model_inputs, device = 'gpu', gpu_platform_id = 0, gpu_device_id = 0).reshape((customer_ids[batch_i-batch_size:batch_i].shape[0], articles.shape[0]))
    indices = np.flip(np.argsort(ensemble_scores, axis=1), axis=1)[:,-12:]
    predicts = list(map(lambda x: ' '.join(x), article_ids_str[indices]))
    submission = pd.concat([submission, pd.DataFrame({"customer_id":customer_ids[batch_i-batch_size:batch_i],"predict":predicts})])
    print('\r' + f'{batch_i}: %{round(100*batch_i/loop_size, 2)}', end='')
print("\n")

685056: %99.98

CPU times: user 2d 9h 39min 23s, sys: 4min 41s, total: 2d 9h 44min 4s
Wall time: 5h 39min 3s


In [4]:
# Map each customer_id to its space-separated prediction string.
# Zipping the two columns avoids building a Series per row as iterrows does;
# as with a row-by-row loop, a later duplicate customer_id overwrites an earlier one.
submission_dict = dict(zip(submission["customer_id"], submission["predict"]))

In [5]:
# Start from the sample submission and overwrite the prediction of every
# customer the model scored; customers missing from `submission_dict` keep
# their sample prediction.
base_submission = pd.read_csv("data/sample_submission.csv")
# Vectorized lookup: customers without an entry map to NaN and are filled
# from the existing column.
model_predictions = base_submission["customer_id"].map(submission_dict)
base_submission["prediction"] = model_predictions.fillna(base_submission["prediction"])

In [6]:
# Write the final submission file (without the index column) and preview it.
base_submission.to_csv("submissions/submission.csv",index=False)
base_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0832997004 0789690001 0805947001 0717490070 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0920753001 0905781001 0738713036 0770703001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0877274002 0201219014 0889816002 0879827001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0751471001 0918522001 0924243001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0903540001 0866488003 0907903001 0910949002 08...
