In [None]:
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from dask import dataframe as dd

In [None]:
# Read all CSV partitions and remove the "Unnamed: 0" column
# (presumably an index written out when the CSV was saved - confirm), then show the schema.
data = dd.read_csv(
    "data/ensemble_train/ensemble_data.csv/*.part",
    blocksize="64MB",
).drop(["Unnamed: 0"], axis=1)
data.dtypes

In [None]:
#data.time_passed_last_purchase = data.time_passed_last_purchase.astype(np.float64)
#data.prod_gender_group = data.prod_gender_group.astype('category')
#data.prod_age_bin = data.prod_age_bin.astype('category')
#data.FN = data.FN.astype('category')
#data.Active = data.Active.astype('category')
#data.club_member_status = data.club_member_status.astype('category')
#data.fashion_news_frequency = data.fashion_news_frequency.astype('category')
#data.customer_age_bin = data.customer_age_bin.astype('category')
#data.customer_gender_group = data.customer_gender_group.astype('category')

## Training LightGBM

In [None]:
# Hold out roughly 10% of customers (chosen by customer_index) as the evaluation set.
SPLIT_SEED = 42  # fixed seed so the customer split is reproducible across kernel restarts

number_of_customer = int(data.customer_index.max().compute())
# The largest customer_index is itself a valid id, so the sampling range must include it
# (range(0, max) would make that customer impossible to select for the test set).
randomlist = random.Random(SPLIT_SEED).sample(
    range(0, number_of_customer + 1), number_of_customer // 10
)


def _to_ranker_inputs(frame):
    """Split a materialised frame into (features, labels, group_sizes) for LGBMRanker.

    LightGBM interprets ``group`` positionally: the first ``group[0]`` rows form the
    first query, the next ``group[1]`` rows the second, and so on. The rows are therefore
    stable-sorted by ``customer_id`` first, so each customer's rows are contiguous and the
    group sizes come out in the same order as the rows.
    """
    ordered = frame.drop(columns="customer_index").sort_values("customer_id", kind="stable")
    group_sizes = ordered.groupby("customer_id", sort=False)["customer_id"].count()
    labels = ordered.label
    features = ordered.drop(["label", "customer_id", "article_id"], axis=1)
    return features, labels, group_sizes


is_test_customer = data["customer_index"].isin(randomlist)

train, train_label, q_train = _to_ranker_inputs(data[~is_test_customer].compute())
test, test_label, q_test = _to_ranker_inputs(data[is_test_customer].compute())

In [None]:
# LambdaRank ranker evaluated with NDCG, using DART boosting and gain-based importances.
# Left at library defaults (previously tried, currently disabled):
# learning_rate=0.01, num_leaves=2048, max_depth=128, num_iterations=30, verbose=10
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=512,
    importance_type='gain',
)
model = lgb.LGBMRanker(**ranker_params)

# NOTE(review): LightGBM's documentation says early stopping is not available in dart
# mode, so the early_stopping callback below may have no effect here - confirm.
fit_callbacks = [
    lgb.callback.log_evaluation(),
    lgb.callback.early_stopping(5, first_metric_only=False),
]

# Group sizes tell the ranker how many consecutive rows belong to each customer;
# the held-out customers are scored with NDCG@12 during training.
model.fit(
    train,
    train_label,
    group=q_train.values,
    eval_set=[(test, test_label)],
    eval_group=[q_test.values],
    eval_at=[12],
    callbacks=fit_callbacks,
)

# Persist the booster so the inference section can reload it without retraining.
model.booster_.save_model('weights/lbm_lamda_ranker.txt', num_iteration=model.best_iteration_)

In [None]:
# Print every feature with its share of the total importance, most important first.
importances = model.feature_importances_
ranked_positions = importances.argsort()[::-1]
for i in ranked_positions:
    share = importances[i] / importances.sum()
    print(train.columns[i], share)

In [2]:
import gc
import cudf
import numpy as np
import pandas as pd
import lightgbm as lgb

# Reload the ranker as a raw Booster from the file written by the training section,
# so inference can run in a fresh kernel without retraining.
model = lgb.Booster(model_file='weights/lbm_lamda_ranker.txt')

In [3]:
# Load the three lookup tables as pandas frames, dropping duplicate keys;
# customers are ordered by customer_id.
customers = (
    pd.read_pickle("data/ensemble/customers.pkl")
    .drop(columns="customer_index")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by=["customer_id"])
)
articles = pd.read_pickle("data/ensemble/articles.pkl").drop_duplicates(subset=["article_id"])
customer_hist = pd.read_pickle("data/ensemble/customer_hist.pkl").drop_duplicates(
    subset=["customer_id", "article_id"]
)

# Host-side id collections are taken from the pandas frames before conversion to cuDF.
# article_ids is the full article list repeated once per customer in a batch.
batch_size = 512
customer_ids = customers.customer_id.values
article_ids = articles.article_id.values.tolist() * batch_size

# Convert the lookup tables to cuDF frames for the batched merges.
customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)

# Article ids as strings with a leading '0', in the same row order as `articles`.
article_ids_str = ('0' + articles.article_id.astype(str)).to_numpy()
# Empty frame with the two submission columns.
submission = pd.DataFrame({"customer_id":[],"predict":[]})

In [4]:
# Preview the first rows of the customer feature table (a cuDF frame at this point).
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,age_bin,gender_group,rebuy_count,price,numberOfArticles
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0,0,1,1,0.39759,3,2,0.0,0.050831,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0,0,1,1,0.108434,1,2,0.0,0.022017,0.025189
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0,0,1,1,0.096386,1,0,0.0,0.037271,0.005038
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1,1,1,0,0.433735,3,2,0.0,0.03422,0.010076
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0,0,1,1,0.048193,1,1,0.0,0.038119,0.002519


In [5]:
# Preview the first rows of the article feature table (a cuDF frame at this point).
articles.head()

Unnamed: 0,article_id,gender_group,price,rebuy_count,age_bin,prod_sold_count,quotient
0,108775044,2,0.016658,0.010206,0,0.000538,0.001179
27,111565001,2,0.012318,0.022987,0,0.04411,0.006522
54,111586001,2,0.02679,0.019888,0,0.127488,0.004821
74,111593001,2,0.026118,0.022327,0,0.123722,0.002481
94,111609001,2,0.018999,0.015368,1,0.025282,0.018896


In [None]:
%%time
df_list = []
loop_size = len(customer_ids) + batch_size
for batch_i in range(batch_size, loop_size, batch_size):
    customer_ids_batch = customer_ids[batch_i-batch_size:batch_i]
    customer_ids_batch = np.repeat(customer_ids_batch, len(article_ids)/batch_size)
    df = cudf.DataFrame({"customer_id": customer_ids_batch, "article_id": article_ids[:customer_ids_batch.shape[0]], "week": 39})
    df = df.merge(articles.rename(columns={"age_bin":"prod_age_bin", "gender_group":"prod_gender_group", \
                                           "rebuy_count":"prod_rebuy_count","price":"prod_avg_price"}), on="article_id", how="inner")
    df = df.merge(customers.rename(columns={"age_bin":"customer_age_bin", "gender_group":"customer_gender_group",\
                                            "rebuy_count":"customer_rebuy_count","price":"customer_avg_price", \
                                            "article_id":"article_hist", "week":"week_hist"}), on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id","article_id"], how="left")
    df.same_prod_rebuy_count = df.same_prod_rebuy_count.fillna(0)
    df.avg_purchase_time = df.avg_purchase_time.fillna(0)
    df.time_passed_last_purchase = df.time_passed_last_purchase.fillna(39 - 29) # 6 mounths is nearly 29 week
    df.time_passed_last_purchase = df.apply(lambda x: x.week - x.time_passed_last_purchase)
    df.drop(columns=["customer_id", "article_id","week"], inplace=True)
    ensemble_scores = model.predict(df.to_numpy()).reshape((customer_ids[batch_i-batch_size:batch_i].shape[0], articles.shape[0]))
    indices = np.flip(np.argsort(ensemble_scores, axis=1), axis=1)[:,-12:]
    predicts = list(map(lambda x: ' '.join(x), article_ids_str[indices]))
    submission = pd.concat([submission, pd.DataFrame({"customer_id":customer_ids[batch_i-batch_size:batch_i],"predict":predicts})])
    del df
    gc.collect()
    print('\r' + f'{batch_i}: %{round(100*batch_i/loop_size, 2)}', end='')
print("\n")

In [None]:
# Map each customer_id to its prediction string. Built in one pass over the two columns
# rather than with iterrows(); as with row-by-row assignment, a later row wins if a
# customer_id appears more than once.
submission_dict = dict(zip(submission["customer_id"], submission["predict"]))

In [None]:
# Start from the sample submission and overwrite the prediction for every customer we
# scored; customers missing from submission_dict keep their existing prediction.
base_submission = pd.read_csv("data/sample_submission.csv")
base_submission["prediction"] = [
    submission_dict.get(customer_id, existing_prediction)
    for customer_id, existing_prediction in zip(
        base_submission["customer_id"], base_submission["prediction"]
    )
]

In [None]:
# Write the final submission without the pandas index column, then preview the first rows.
base_submission.to_csv("submissions/submission.csv",index=False)
base_submission.head()