In [1]:
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from dask import dataframe as dd

In [2]:
data = dd.read_csv("data/ensemble_train/ensemble_data.csv/*.part", blocksize="64MB")
data = data.drop(["Unnamed: 0"], axis=1)
data.dtypes 

customer_id               object
article_id                 int64
label                    float64
prod_rebuy_count         float64
prod_sold_count          float64
age_std                  float64
age_mean                 float64
quotient                 float64
customer_index             int64
age                      float64
numberOfArticles         float64
same_prod_rebuy_count    float64
dtype: object

In [3]:
#data.time_passed_last_purchase = data.time_passed_last_purchase.astype(np.float64)
#data.prod_gender_group = data.prod_gender_group.astype('category')
#data.prod_age_bin = data.prod_age_bin.astype('category')
#data.FN = data.FN.astype('category')
#data.Active = data.Active.astype('category')
#data.club_member_status = data.club_member_status.astype('category')
#data.fashion_news_frequency = data.fashion_news_frequency.astype('category')
#data.customer_age_bin = data.customer_age_bin.astype('category')
#data.customer_gender_group = data.customer_gender_group.astype('category')

## Training LightGBM

In [4]:
number_of_customer = data.customer_index.max().compute()
randomlist = random.sample(range(0, number_of_customer),number_of_customer//10)
train = data[~data["customer_index"].isin(randomlist)].compute()
train.drop(columns="customer_index", inplace=True)

test = data[data["customer_index"].isin(randomlist)].compute()
test.drop(columns="customer_index", inplace=True)
q_train = train.groupby("customer_id")["customer_id"].count()
train_label = train.label
train = train.drop(["label", "customer_id", "article_id"], axis=1)

q_test = test.groupby("customer_id")["customer_id"].count()
test_label = test.label
test = test.drop(["label", "customer_id", "article_id"], axis=1)

In [6]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric= "ndcg",
    boosting_type="dart",
#    learning_rate=0.01,
#    num_leaves=2048,
    n_estimators=512,
    importance_type='gain',
#    max_depth=128,
#    num_iterations=30,
#    verbose=10,
)

model.fit(
    train, train_label, group=q_train.values, eval_set=[(test, test_label)], eval_group=[q_test.values], eval_at=[12],
    callbacks=[lgb.callback.log_evaluation(), lgb.callback.early_stopping(5, first_metric_only=False)],
)

model.booster_.save_model('weights/lbm_lamda_ranker.txt',  num_iteration=model.best_iteration_)

[1]	valid_0's ndcg@12: 0.0701629
[2]	valid_0's ndcg@12: 0.0467735
[3]	valid_0's ndcg@12: 0.0581014
[4]	valid_0's ndcg@12: 0.0734156
[5]	valid_0's ndcg@12: 0.0813166
[6]	valid_0's ndcg@12: 0.0793523
[7]	valid_0's ndcg@12: 0.0814062
[8]	valid_0's ndcg@12: 0.0910874
[9]	valid_0's ndcg@12: 0.0953217
[10]	valid_0's ndcg@12: 0.0944371
[11]	valid_0's ndcg@12: 0.0921821
[12]	valid_0's ndcg@12: 0.0990012
[13]	valid_0's ndcg@12: 0.0958133
[14]	valid_0's ndcg@12: 0.0993463
[15]	valid_0's ndcg@12: 0.100068
[16]	valid_0's ndcg@12: 0.100486
[17]	valid_0's ndcg@12: 0.102877
[18]	valid_0's ndcg@12: 0.103182
[19]	valid_0's ndcg@12: 0.101821
[20]	valid_0's ndcg@12: 0.101764
[21]	valid_0's ndcg@12: 0.102268
[22]	valid_0's ndcg@12: 0.101792
[23]	valid_0's ndcg@12: 0.10244
[24]	valid_0's ndcg@12: 0.10255
[25]	valid_0's ndcg@12: 0.1028
[26]	valid_0's ndcg@12: 0.103459
[27]	valid_0's ndcg@12: 0.103499
[28]	valid_0's ndcg@12: 0.103563
[29]	valid_0's ndcg@12: 0.104097
[30]	valid_0's ndcg@12: 0.104131
[31]	vali

In [None]:
for i in model.feature_importances_.argsort()[::-1]:
    print(train.columns[i], model.feature_importances_[i]/model.feature_importances_.sum())

In [1]:
import gc
import cudf
import numpy as np
import pandas as pd
import lightgbm as lgb

model = lgb.Booster(model_file='weights/lbm_lamda_ranker.txt')

In [2]:
customers = pd.read_pickle("data/ensemble/customers.pkl")
customers.drop(columns="customer_index", inplace=True)
customers.drop_duplicates(subset=["customer_id"], inplace=True)
customers = customers.sort_values(by=["customer_id"])
articles = pd.read_pickle("data/ensemble/articles.pkl")
articles.drop_duplicates(subset=["article_id"], inplace=True)
customer_hist = pd.read_pickle("data/ensemble/customer_hist.pkl")
customer_hist.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)
article_ids = articles.article_id.values.tolist()
customer_ids = customers.customer_id.values
customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)
batch_size = 512
article_ids = article_ids * batch_size
article_ids_str = ('0' + articles.article_id.astype(str)).to_numpy()
submission = pd.DataFrame({"customer_id":[],"predict":[]})

In [3]:
%%time
df_list = []
loop_size = len(customer_ids) + batch_size
for batch_i in range(batch_size, loop_size, batch_size):
    customer_ids_batch = customer_ids[batch_i-batch_size:batch_i]
    customer_ids_batch = np.repeat(customer_ids_batch, len(article_ids)/batch_size)
    df = cudf.DataFrame({"customer_id": customer_ids_batch, "article_id": article_ids[:customer_ids_batch.shape[0]], "week": 39})
    df = df.merge(articles.rename(columns={"age_bin":"prod_age_bin", "gender_group":"prod_gender_group", \
                                           "rebuy_count":"prod_rebuy_count","price":"prod_avg_price"}), on="article_id", how="inner")
    df = df.merge(customers.rename(columns={"age_bin":"customer_age_bin", "gender_group":"customer_gender_group",\
                                            "rebuy_count":"customer_rebuy_count","price":"customer_avg_price", \
                                            "article_id":"article_hist", "week":"week_hist"}), on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id","article_id"], how="left")
    df.same_prod_rebuy_count = df.same_prod_rebuy_count.fillna(0)
    df.avg_purchase_time = df.avg_purchase_time.fillna(0)
    df.time_passed_last_purchase = df.time_passed_last_purchase.fillna(39 - 29) # 6 mounths is nearly 29 week
    df.time_passed_last_purchase = df.apply(lambda x: x.week - x.time_passed_last_purchase)
    df.drop(columns=["customer_id", "article_id","week"], inplace=True)
    ensemble_scores = model.predict(df.to_numpy()).reshape((customer_ids[batch_i-batch_size:batch_i].shape[0], articles.shape[0]))
    indices = np.flip(np.argsort(ensemble_scores, axis=1), axis=1)[:,-12:]
    predicts = list(map(lambda x: ' '.join(x), article_ids_str[indices]))
    submission = pd.concat([submission, pd.DataFrame({"customer_id":customer_ids[batch_i-batch_size:batch_i],"predict":predicts})])
    del df
    gc.collect()
    print('\r' + f'{batch_i}: %{round(100*batch_i/loop_size, 2)}', end='')
print("\n")

685056: %99.98

CPU times: user 12h 15min 45s, sys: 7min 20s, total: 12h 23min 6s
Wall time: 1h 50min 48s


In [7]:
submission_dict = {}
for i,row in submission.iterrows():
    submission_dict[row.customer_id] = row.predict

In [13]:
base_submission = pd.read_csv("data/sample_submission.csv")
base_submission.prediction = base_submission.apply(lambda x: submission_dict[x.customer_id] if x.customer_id in submission_dict.keys() else x.prediction, axis=1)

In [16]:
base_submission.to_csv("submissions/submission.csv",index=False)
base_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0870998004 0859400005 0866660002 0921347004 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0608776026 0743530002 0898918006 0732311003 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0730863012 0900256002 0535035002 0867503001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0922625004 0832520001 0915618001 0909370001 04...
