In [1]:
# %pip install -U lightgbm==3.3.2

In [3]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [3]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Notebook-wide runtime setup.
warnings.filterwarnings("ignore")  # silence all warnings for the rest of the session
tqdm.pandas()  # registers `progress_apply` on pandas objects (used further down)
InteractiveShell.ast_node_interactivity = "all"  # display every expression in a cell, not only the last one
sys.path.append("../")  # parent directory, so that the `src` package can be imported

In [4]:
# !ln -s /home/tarique/myvenv/lib/gcc/x86_64-linux-gnu/5.4.0/libgomp.so /usr/lib/x86_64-linux-gnu/
# /home/tarique/myvenv/lib/gcc/x86_64-linux-gnu/5.4.0/libgomp.so

In [5]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,
    UserGroupTimeHistory,
    TimeHistory,
#     ALS,
#     BPR,
    ItemCF,
    UserGroupItemCF
)
from src.retrieval.collector import RuleCollector

from src.features import cum_sale, week_sale, repurchase_ratio, purchased_before, popularity

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity
)

In [6]:
# Project directories, relative to the notebook's working directory.
_src_root = Path("..") / "src"
data_dir = _src_root / "data"
model_dir = _src_root / "models"

In [7]:
# Experiment configuration.
TRAIN_WEEK_NUM = 5  # passed to train_model(...) as the number of training weeks
WEEK_NUM = 2 + TRAIN_WEEK_NUM  # exclusive upper bound of the per-week loops below
# Earlier experiment names: "pivot", "v1"
VERSION_NAME = "Large_Recall"  # sub-folder name used under interim/ and processed/

valid_week, train_n_week = 1, TRAIN_WEEK_NUM  # arguments for the train_model(...) call

In [8]:
import os
# Create the per-version output folders.
# `parents=True` also creates a missing "interim"/"processed" parent folder (a plain
# os.mkdir raises FileNotFoundError in that case) and `exist_ok=True` makes the
# cell safe to re-run without a separate existence check.
for stage in ("interim", "processed"):
    (data_dir / stage / VERSION_NAME).mkdir(parents=True, exist_ok=True)

In [9]:
# In this notebook the flag is only read by the commented-out per-week loops
# (`if week == 0 and not TEST: continue`), where False skips the week-0 iteration.
TEST = False # * Set to `False` for local experiments to save time

Prepare data: encoding ids and preprocessing

In [10]:
# Project data helper built on `data_dir`; used below through `dh.load_data(...)`.
dh = DataHelper(data_dir)

In [11]:
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

In [12]:
# Load the dataset stored under the name "encoded_full".
# The result is indexed below with the keys "inter", "user" and "item".
data = dh.load_data(name="encoded_full")

In [13]:
# Load the user_id -> index mapping; the `with` block closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as f:
    uid2idx = pickle.load(f)

# Sample submission with its customer ids translated through that mapping.
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [14]:
# Per-customer spending tier: quantile bucket (5 buckets, integer labels) of the
# customer's mean purchase price.
user_info = (
    data["inter"]
    .groupby(['customer_id'])['price']
    .mean()
    .reset_index(name='mean_price')
)
user_info['purchase_ability'] = pd.qcut(user_info['mean_price'], 5, labels=False)
user_info = user_info.drop(columns='mean_price')

# Age brackets as intervals: (-1, 19], (19, 29], ..., (69, 119].
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

# Drop a `purchase_ability` column left over from a previous run of this cell so
# that re-executing it does not create `purchase_ability_x` / `purchase_ability_y`.
data['user'] = (
    data['user']
    .drop(columns='purchase_ability', errors='ignore')
    .merge(user_info, on='customer_id', how='left')
)

In [15]:
# # * week = 0: test
# # * week = 1: valid
# # * week > 1: train

# for week in range(WEEK_NUM):
#     # * use sliding window to generate candidates
#     if week == 0 and not TEST:
#         continue
#     trans = data["inter"]

#     start_date, end_date = calc_valid_date(week)
#     print(f"Week {week}: [{start_date}, {end_date})")

#     train, valid = dh.split_data(trans, start_date, end_date)
#     train = train.merge(data['user'][['customer_id','age_bins','user_gender']], on='customer_id', how='left')
#     train = train.merge(user_info, on='customer_id', how='left')

#     train['t_dat'] = pd.to_datetime(train['t_dat'])
#     last_week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=7)]
#     last_2week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=14)]
#     last_60day = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=60)]
#     last_80day = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=80)]

#     if week != 0:
#         customer_list = valid["customer_id"].values
#     else:
#         customer_list = submission['customer_id'].values

#     # * ========================== Retrieval Strategies ==========================

#     candidates = RuleCollector().collect(
#         week_num = week,
#         trans_df = trans,
#         customer_list=customer_list,
#         rules=[
#             UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=200, scale=True, name='1'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['purchase_ability'], n=200, scale=True, name='2'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['user_gender'], n=200, scale=True, name='3'),
#             OrderHistory(train, days=35, n=200),
#             OrderHistoryDecay(train, days=7, n=200),
#             TimeHistory(customer_list, last_week, n=200),
#             ItemCF(last_80day, last_2week, top_k=10, name='1'),
#             ItemCF(last_60day, last_2week, top_k=10, name='2'),
#             ItemCF(last_2week, last_2week, top_k=10, name='3'),
#             UserGroupItemCF(last_80day, last_2week, 'age_bins', top_k=10, name='1'),
#             UserGroupItemCF(last_60day, last_2week, 'age_bins', top_k=10, name='2'),
#             UserGroupItemCF(last_2week, last_2week, 'age_bins', top_k=10, name='3'),
#             UserGroupItemCF(last_80day, last_2week, 'purchase_ability', top_k=10, name='4'),
#             UserGroupItemCF(last_60day, last_2week, 'purchase_ability', top_k=10, name='5'),
#             UserGroupItemCF(last_2week, last_2week, 'purchase_ability', top_k=10, name='6'),
#         ],
#         min_pos_rate=0.0,
#         norm=True,
#         compress=False,
#     )

#     candidates, _ = reduce_mem_usage(candidates)
#     candidates = (
#         pd.pivot_table(
#             candidates,
#             values="score",
#             index=["customer_id", "article_id"],
#             columns=["method"],
#             aggfunc=np.sum,
#         )
#         .reset_index()
#     )
#     rule_names = [x for x in candidates.columns if x not in ["customer_id", "article_id"]]
#     candidates['score'] = candidates[rule_names].sum(axis=1)
#     candidates['rank'] = candidates.groupby(['customer_id'])['score'].rank(ascending=False)
#     candidates = candidates[candidates['rank']<=200]

#     candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
#     valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [16]:
# # * week = 0: test

# for week in range(1):
#     # * use sliding window to generate candidates
#     if week == 0 and not TEST:
#         continue
#     trans = data["inter"]

#     start_date, end_date = calc_valid_date(week)
#     print(f"Week {week}: [{start_date}, {end_date})")

#     train, valid = dh.split_data(trans, start_date, end_date)
#     train = train.merge(data['user'][['customer_id','age_bins','user_gender']], on='customer_id', how='left')
#     train = train.merge(user_info, on='customer_id', how='left')

#     train['t_dat'] = pd.to_datetime(train['t_dat'])
#     last_week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=7)]
#     last_2week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=14)]
#     last_60day = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=60)]
#     last_80day = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=80)]

#     if week != 0:
#         customer_list = valid["customer_id"].values
#     else:
#         customer_list = submission['customer_id'].values

#     # * ========================== Retrieval Strategies ==========================

#     candidates = RuleCollector().collect(
#         week_num = week,
#         trans_df = trans,
#         customer_list=customer_list,
#         rules=[
#             UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=200, scale=True, name='1'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['purchase_ability'], n=200, scale=True, name='2'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['user_gender'], n=200, scale=True, name='3'),
#             OrderHistory(train, days=35, n=200),
#             OrderHistoryDecay(train, days=7, n=200),
#             TimeHistory(customer_list, last_week, n=200),
#             ItemCF(last_80day, last_2week, top_k=10, name='1'),
#             ItemCF(last_60day, last_2week, top_k=10, name='2'),
#             ItemCF(last_2week, last_2week, top_k=10, name='3'),
#             UserGroupItemCF(last_80day, last_2week, 'age_bins', top_k=10, name='1'),
#             UserGroupItemCF(last_60day, last_2week, 'age_bins', top_k=10, name='2'),
#             UserGroupItemCF(last_2week, last_2week, 'age_bins', top_k=10, name='3'),
#             UserGroupItemCF(last_80day, last_2week, 'purchase_ability', top_k=10, name='4'),
#             UserGroupItemCF(last_60day, last_2week, 'purchase_ability', top_k=10, name='5'),
#             UserGroupItemCF(last_2week, last_2week, 'purchase_ability', top_k=10, name='6'),
#         ],
#         min_pos_rate=0.0,
#         norm=True,
#         compress=False,
#     )

#     candidates, _ = reduce_mem_usage(candidates)
#     candidates = (
#         pd.pivot_table(
#             candidates,
#             values="score",
#             index=["customer_id", "article_id"],
#             columns=["method"],
#             aggfunc=np.sum,
#         )
#         .reset_index()
#     )
#     rule_names = [x for x in candidates.columns if x not in ["customer_id", "article_id"]]
#     candidates['score'] = candidates[rule_names].sum(axis=1)
#     candidates['rank'] = candidates.groupby(['customer_id'])['score'].rank(ascending=False)
#     candidates = candidates[candidates['rank']<=200]

#     candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
#     valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [17]:
# del train, valid, last_week, customer_list, candidates
# gc.collect()

## Feature engineering


In [18]:
# Short aliases for the three tables of the loaded dataset.
user, item, inter = data["user"], data["item"], data["inter"]

In [19]:
# # merge `product_code`
# data['user']['age_bucket'] = pd.cut(data['user']['age'].fillna(22), [-1, 19, 29, 39, 49, 59, 69, 119],
#                                     labels=[0, 1, 2, 3, 4, 5, 6])
# inter = inter.merge(item[["article_id", "product_code", "department_no"]], on="article_id", how="left")
# inter = inter.merge(user[["customer_id", "age_bucket"]], on="customer_id", how="left")
# inter["age_bucket"] = inter["age_bucket"].astype(int)
# # calculate week number
# inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7
# inter = inter.sort_values(["customer_id", "t_dat"]).reset_index(drop=True)

In [20]:
# inter.head()

In [21]:
# inter.shape

Week Sale

In [22]:
# inter["i_sale"] = week_sale(inter, ["article_id"])
# inter["p_sale"] = week_sale(inter, ["product_code"])
# inter["i_sale_uni"] = week_sale(inter, ["article_id"], True)
# inter["p_sale_uni"] = week_sale(inter, ["product_code"], True)
# inter["lw_i_sale"] = week_sale(inter, ["article_id"], step=1) # * last week sale
# inter["lw_p_sale"] = week_sale(inter, ["product_code"], step=1)
# inter["lw_i_sale_uni"] = week_sale(inter, ["article_id"], True, step=1)
# inter["lw_p_sale_uni"] = week_sale(inter, ["product_code"], True, step=1)

# inter["i_sale_by_channel"] = week_sale(inter, ["article_id","sales_channel_id"])
# inter["p_sale_by_channel"] = week_sale(inter, ["product_code","sales_channel_id"])
# inter["i_sale_uni_by_channel"] = week_sale(inter, ["article_id","sales_channel_id"], True)
# inter["p_sale_uni_by_channel"] = week_sale(inter, ["product_code","sales_channel_id"], True)

# inter["i_sale_by_age"] = week_sale(inter, ["article_id","age_bins"])
# inter["p_sale_by_age"] = week_sale(inter, ["product_code","age_bins"])
# inter["i_sale_by_dept"] = week_sale(inter, ["article_id","department_no"])
# inter["p_sale_by_dept"] = week_sale(inter, ["product_code","department_no"])
# inter["i_sale_by_dept_channel"] = week_sale(inter, ["article_id","department_no","sales_channel_id"])
# inter["p_sale_by_dept_channel"] = week_sale(inter, ["product_code","department_no","sales_channel_id"])

# inter["i_sale_uni_by_age"] = week_sale(inter, ["article_id","age_bucket"], True)
# inter["p_sale_uni_by_age"] = week_sale(inter, ["product_code","age_bucket"], True)
# inter["lw_i_sale_by_age"] = week_sale(inter, ["article_id","age_bucket"], step=1)
# inter["lw_p_sale_by_age"] = week_sale(inter, ["product_code","age_bucket"], step=1)

# inter["i_sale_by_channel_age"] = week_sale(inter, ["article_id","sales_channel_id","age_bucket"])
# inter["p_sale_by_channel_age"] = week_sale(inter, ["product_code","sales_channel_id","age_bucket"])
# inter["i_sale_uni_by_channel_age"] = week_sale(inter, ["article_id","sales_channel_id","age_bucket"], True)
# inter["p_sale_uni_by_channel_age"] = week_sale(inter, ["product_code","sales_channel_id","age_bucket"], True)
# inter["i_sale_by_dept_age"] = week_sale(inter, ["article_id","department_no","age_bucket"])
# inter["p_sale_by_dept_age"] = week_sale(inter, ["product_code","department_no","age_bucket"])

In [23]:
# inter["lw_i_sale_by_channel"] = week_sale(inter, ["article_id","sales_channel_id"], step=1)
# inter["lw_p_sale_by_channel"] = week_sale(inter, ["product_code","sales_channel_id"], step=1)

# inter["i_sale_ratio"] = inter["i_sale"] / (inter["p_sale"] + 1e-6)
# inter["i_sale_uni_ratio"] = inter["i_sale_uni"] / (inter["p_sale_uni"] + 1e-6)
# inter["lw_i_sale_ratio"] = inter["lw_i_sale"] / (inter["lw_p_sale"] + 1e-6)
# inter["lw_i_sale_uni_ratio"] = inter["lw_i_sale_uni"] / (inter["lw_p_sale_uni"] + 1e-6)

# inter["i_uni_ratio"] = inter["i_sale"] / (inter["i_sale_uni"] + 1e-6)
# inter["p_uni_ratio"] = inter["p_sale"] / (inter["p_sale_uni"] + 1e-6)
# inter["lw_i_uni_ratio"] = inter["lw_i_sale"] / (inter["lw_i_sale_uni"] + 1e-6)
# inter["lw_p_uni_ratio"] = inter["lw_p_sale"] / (inter["lw_p_sale_uni"] + 1e-6)

# inter["i_sale_trend"] = (inter["i_sale"] - inter["lw_i_sale"]) / (inter["lw_i_sale"] + 1e-6)
# inter["p_sale_trend"] = (inter["p_sale"] - inter["lw_p_sale"]) / (inter["lw_p_sale"] + 1e-6)

# item_feats = [
#     "product_type_no",
#     "product_group_name",
#     "graphical_appearance_no",
#     "colour_group_code",
#     "perceived_colour_value_id",
#     "perceived_colour_master_id",
# ]
# inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

# for f in tqdm(item_feats):
#     inter[f"{f}_sale"] = week_sale(inter, [f], f"{f}_sale")
#     inter[f"lw_{f}_sale"] = week_sale(inter, [f], f"{f}_sale", step=1)
#     inter[f"{f}_sale_trend"] = (inter[f"{f}_sale"] - inter[f"lw_{f}_sale"]) / (inter[f"lw_{f}_sale"] + 1e-6)
    
# inter = inter.drop(columns=item_feats)

In [24]:
##Repurchase Ratio
# inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'])
# inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'])

##Popularity
# inter['i_pop'] = popularity(inter, 'article_id', week_num=WEEK_NUM)
# inter['p_pop'] = popularity(inter, 'product_code', week_num=WEEK_NUM)

##Already Bought Item
# inter["purchased_item"] = purchased_before(inter, ["article_id"])
# inter["purchased_pro"] = purchased_before(inter, ["product_code"])

In [25]:
## inter.to_parquet(data_dir / "interim/processed_inter.pqt")
### inter.to_parquet(data_dir / "interim/processed_inter_v2.pqt")
# inter.to_parquet(data_dir / "interim/processed_inter_v3.pqt")

## Merge Features


In [26]:
# Load the cached interaction table (version 3 of the cache file).
# Earlier cache versions, kept for reference:
# inter = pd.read_parquet(data_dir / "interim/processed_inter.pqt")
# inter = pd.read_parquet(data_dir / "interim/processed_inter_v2.pqt")
inter = pd.read_parquet(data_dir / "interim/processed_inter_v3.pqt")

In [27]:
# Make `data` point at the interaction table loaded from processed_inter_v3.pqt.
data["inter"] = inter

In [28]:
# Article clustering table; source notebook:
# https://www.kaggle.com/code/beezus666/k-means-and-feature-importance-for-articles/notebook?scriptVersionId=94269787
article_cluster = pd.read_parquet(data_dir/'articles_new.parquet')

# Translate article ids through the item_id -> index mapping; the `with` block
# closes the pickle file instead of leaking the handle.
with open(data_dir/"index_id_map/item_id2index.pkl", "rb") as f:
    itemid2idx = pickle.load(f)
article_cluster['article_id'] = article_cluster['article_id'].map(itemid2idx)

# NOTE(review): the renames presumably avoid a clash with the item table's own
# `department_no` when the cluster columns are merged in; confirm.
article_cluster = article_cluster.rename(columns={'department_no':'department_no_cluster', 'ct':'cluster'})

Merge user and item embeddings

In [29]:
# #* embeddings from DSSM model
# dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
# dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)\
# # * embeddings from YouTubeDNN model
# yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
# yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)

# for i in tqdm(range(WEEK_NUM)):
#     if i == 0 and not TEST:
#         continue
        
#     candidate = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")
#     # * merge features
#     candidate = merge_week_data(data, i, candidate)
#     candidate = candidate.merge(article_cluster[['article_id','colour_group_name_cat_cat_code', 'cluster',
#                                                          'department_no_cluster']],  on='article_id', how='left')
    
#     candidate["dssm_similarity"] = calc_embd_similarity(candidate, dssm_user_embd, dssm_item_embd)
#     candidate["yt_similarity"] = calc_embd_similarity(candidate, yt_user_embd, yt_item_embd)
#     candidate.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")
    
# del dssm_user_embd, dssm_item_embd, yt_user_embd, yt_item_embd, article_cluster, itemid2idx

## Ranking


In [31]:
# Load the candidate and label frames for weeks 1 .. WEEK_NUM - 1, keyed by week number.
processed_dir = data_dir/"processed"/VERSION_NAME
candidates, labels = {}, {}
for i in tqdm(range(1, WEEK_NUM)):
    candidates[i] = pd.read_parquet(processed_dir/f"week{i}_candidate.pqt")
    labels[i] = pd.read_parquet(processed_dir/f"week{i}_label.pqt")

100%|██████████| 6/6 [00:34<00:00,  5.73s/it]


In [32]:
# List the columns carried by the week-1 candidate frame.
candidates[1].columns.tolist()

['customer_id',
 'article_id',
 'ItemCF_1',
 'ItemCF_2',
 'ItemCF_3',
 'OrderHistoryDecay_1',
 'OrderHistory_1',
 'TimeHistory_1',
 'UGItemCF_1',
 'UGItemCF_2',
 'UGItemCF_3',
 'UGItemCF_4',
 'UGItemCF_5',
 'UGItemCF_6',
 'UGTimeHistory_1',
 'UGTimeHistory_2',
 'UGTimeHistory_3',
 'score',
 'rank',
 'label',
 'week',
 't_dat',
 'price',
 'sales_channel_id',
 'product_code',
 'department_no',
 'i_sale',
 'p_sale',
 'i_sale_uni',
 'p_sale_uni',
 'lw_i_sale',
 'lw_p_sale',
 'lw_i_sale_uni',
 'lw_p_sale_uni',
 'i_sale_by_channel',
 'p_sale_by_channel',
 'i_sale_uni_by_channel',
 'p_sale_uni_by_channel',
 'i_sale_by_age',
 'p_sale_by_age',
 'i_sale_by_dept',
 'p_sale_by_dept',
 'i_sale_by_dept_channel',
 'p_sale_by_dept_channel',
 'lw_i_sale_by_channel',
 'lw_p_sale_by_channel',
 'i_sale_ratio',
 'i_sale_uni_ratio',
 'lw_i_sale_ratio',
 'lw_i_sale_uni_ratio',
 'i_uni_ratio',
 'p_uni_ratio',
 'lw_i_uni_ratio',
 'lw_p_uni_ratio',
 'i_sale_trend',
 'p_sale_trend',
 'product_type_no_sale',
 'lw

In [33]:
# Model inputs: every candidate column except "label", "t_dat" and "week".
feats = [col for col in candidates[1].columns if col not in ("label", "t_dat", "week")]

# Columns that are cast to pandas categoricals and handed to LightGBM
# as `categorical_feature`.
cat_features = [
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "customer_id",
    "article_id",
    "product_code",
    "colour_group_name_cat_cat_code",
    "department_no",
#     "age",
    "sales_channel_id",
    "article_gender",
    "season_type",
    "user_gender",
    "age_bucket",
    "cluster",
    "department_no_cluster"
]

In [34]:
# * Convert categorical featues as `CategoricalDtype`
cate_dict = {}        
for feat in tqdm(cat_features):
    if feat in data['user'].columns:
        value_set = set(data['user'][feat].unique())
    elif feat in data['item'].columns:
        value_set = set(data['item'][feat].unique())
    elif feat in article_cluster.columns:
        value_set = set(article_cluster[feat].unique())
    else:
        value_set = set(data['inter'][feat].unique())
    cate_dict[feat] = CategoricalDtype(categories=value_set)
    
for i in tqdm(range(1,WEEK_NUM)):
    for feat in cat_features:
        candidates[i][feat] = candidates[i][feat].astype(cate_dict[feat])

100%|██████████| 22/22 [00:01<00:00, 14.79it/s]
100%|██████████| 6/6 [01:52<00:00, 18.80s/it]


In [35]:
# Remove the two names from the notebook namespace.
# Note that `data["inter"]` was assigned the same `inter` object earlier, so the
# interaction table itself stays alive through `data`.
del inter, article_cluster

## Train


In [36]:
def train_model(valid_week_num, train_week_num=4):
    """Train a LightGBM LambdaRank model and save it under ``model_dir``.

    Parameters
    ----------
    valid_week_num : int
        Value of the ``week`` column whose rows form the validation set.
    train_week_num : int, default 4
        Number of weeks used for training: rows whose ``week`` lies in
        ``(valid_week_num, valid_week_num + train_week_num]``.

    Returns
    -------
    lightgbm.Booster
        The trained booster. The file written to
        ``model_dir / f"lgb_ranker_{valid_week_num}.model"`` is saved with
        ``num_iteration=ranker.best_iteration``.

    Notes
    -----
    Reads the notebook-level ``candidates``, ``feats``, ``cat_features``,
    ``WEEK_NUM`` and ``model_dir``.
    """
    params = {
        "objective": "lambdarank",
        "boosting_type": "gbdt",
        "metric": "map",
        "max_depth": 9,
        "num_leaves": 256,
        "learning_rate": 0.05,
        "verbose": -1,
        "eval_at": 12,
    }

    print("Validating week:", valid_week_num)

    full_data = pd.concat(candidates[i] for i in range(1, WEEK_NUM))
    in_train_window = (valid_week_num < full_data["week"]) & (
        full_data["week"] <= valid_week_num + train_week_num
    )
    train = full_data[in_train_window]
    valid = full_data[full_data["week"] == valid_week_num]
    del full_data, in_train_window  # the concatenated frame is not needed past this point

    # Rows of one query (week, customer) must be contiguous for LambdaRank.
    train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(drop=True)
    valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)

    print("Train positive rate:", train.label.mean())  # mean of the `label` column
    print("Train shape:", train.shape)

    # Query group sizes.
    # * The keys are cast to int so that unobserved categories do not show up as
    #   zero-sized groups.
    # * `sort=False` yields the groups in order of first appearance, i.e. in row
    #   order. `customer_id` is a categorical, so the frames above are ordered by
    #   its category order; a sorted groupby on the int-cast keys would order the
    #   sizes numerically and could assign them to the wrong rows.
    train_group = (
        train[["week", "customer_id"]]
        .astype("int")
        .groupby(["week", "customer_id"], sort=False)
        .size()
        .values
    )
    valid_group = (
        valid[["customer_id"]]
        .astype("int")
        .groupby("customer_id", sort=False)
        .size()
        .values
    )

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        group=train_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        group=valid_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    print("Training starts...!")

    # Callbacks replace the deprecated `early_stopping_rounds` / `verbose_eval`
    # keyword arguments of `lgb.train`.
    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=10),
        ],
    )

    ranker.save_model(
        model_dir / f"lgb_ranker_{valid_week_num}.model",
        num_iteration=ranker.best_iteration,
    )

    return ranker

In [37]:
def train_binary_model(valid_week_num, train_week_num=1):
    """Train a LightGBM binary classifier on the candidates and save it under ``model_dir``.

    Parameters
    ----------
    valid_week_num : int
        Key into ``candidates`` of the frame used for validation.
    train_week_num : int, default 1
        Number of frames used for training: ``candidates[valid_week_num + 1]``
        through ``candidates[valid_week_num + train_week_num]``.

    Returns
    -------
    lightgbm.Booster
        The trained booster. The file written to
        ``model_dir / f"lgb_binary_{valid_week_num}.model"`` is saved with
        ``num_iteration=ranker.best_iteration``.

    Notes
    -----
    Reads the notebook-level ``candidates``, ``feats``, ``cat_features`` and
    ``model_dir``.
    """
    print("Validating week:", valid_week_num)
    train = pd.concat(candidates[valid_week_num+j] for j in range(1,train_week_num+1))
    valid = candidates[valid_week_num]

    print("Train positive rate:", train.label.mean())  # mean of the `label` column
    print("Train shape:", train.shape)

    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": "auc",
        "max_depth": 9,
        "num_leaves": 256,
        "learning_rate": 0.05,
        "verbose": -1,
    }

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    print("Training starts...!")

    # Callbacks replace the deprecated `early_stopping_rounds` / `verbose_eval`
    # keyword arguments of `lgb.train`.
    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=10),
        ],
    )

    ranker.save_model(
        model_dir / f"lgb_binary_{valid_week_num}.model",
        num_iteration=ranker.best_iteration,
    )

    return ranker

In [38]:
# ranker_2 = train_binary_model(valid_week, train_n_week)

# [204]	valid_0's auc: 0.688491  3 week

In [39]:
# ranker_2 = lgb.Booster(model_file=model_dir / "lgb_binary_1.model")

In [40]:
# sns.set(rc={'figure.figsize':(8, 18)})
# feat_importance = pd.DataFrame(
#     {"feature": feats, "importance": ranker_2.feature_importance()}
# ).sort_values(by="importance", ascending=False)
# sns.barplot(y="feature", x="importance", data=feat_importance)

In [None]:
# Train the LambdaRank model: validate on `valid_week`, train on `train_n_week` weeks.
ranker = train_model(valid_week, train_n_week)
# Scores noted from earlier runs:
# 0.82846 4-week
# [35]	valid_0's map@12: 0.705772 3 week

Validating week: 1
Train positive rate: 0.00288883582585698
Train shape: (74171747, 110)
Training starts...!


In [None]:
# Rebind `ranker` to the model file that train_model writes for validation week 1
# (that file is saved with num_iteration=best_iteration).
ranker = lgb.Booster(model_file=model_dir / "lgb_ranker_1.model")

In [None]:
# Horizontal bar chart of the ranker's feature importances, largest first.
sns.set(rc={'figure.figsize':(8, 18)})
feat_importance = pd.DataFrame({"feature": feats, "importance": ranker.feature_importance()})
feat_importance = feat_importance.sort_values(by="importance", ascending=False)
sns.barplot(y="feature", x="importance", data=feat_importance)

## Validate

In [None]:
# Validation candidates: the week-1 frame (the same object as candidates[1], not a copy).
val_candidates = candidates[1]

In [None]:
def predict(ranker, candidates, batch_size = 5_000_000):
    """Score candidates in batches and return each customer's articles ordered by score.

    Parameters
    ----------
    ranker : object exposing ``predict(frame)``
        For example the ``lightgbm.Booster`` trained above.
    candidates : pandas.DataFrame
        Needs ``customer_id``, ``article_id`` and every column listed in the
        notebook-level ``feats``. A ``prob`` column holding the scores is added
        to this frame in place.
    batch_size : int, default 5_000_000
        Number of rows passed to ``ranker.predict`` per call.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``customer_id`` (int) and
        ``prediction`` (list of article ids, highest score first).
    """
    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    # Slice by position (`iloc`), not by label: label slicing with `.loc` only
    # selects the intended rows when the frame has a default 0..n-1 index.
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][feats])
    candidates["prob"] = probs

    pred_lgb = (
        candidates[['customer_id','article_id','prob']]
        .sort_values(by=["customer_id","prob"], ascending=False)
        .reset_index(drop=True)
        .rename(columns={'article_id':'prediction'})
        .drop_duplicates(['customer_id', 'prediction'], keep='first')
    )
    pred_lgb['customer_id'] = pred_lgb['customer_id'].astype(int)
    # Within each customer the rows keep the descending-score order set above.
    pred_lgb = pred_lgb.groupby("customer_id")["prediction"].progress_apply(list).reset_index()
    return pred_lgb

In [None]:
# pred = predict(ranker_2, val_candidates)

# label = labels[1]
# label = pd.merge(label, pred, on="customer_id", how="left")

# map_at_k(label["article_id"], label["prediction"], k=12)

# # 0.028297597743100006 6 week 0.0250

# # 0.02863472423766257 4 week
# # 0.02882844782917304 4 week
# # 0.028909064246733205 4 week
# # 0.029366338019601947 4 week # 0.0268
# # 0.029402247883421353 6 week

# # 0.022458255193628294 3 week binary

In [None]:
# print(recall_at_k(label["article_id"], label["prediction"], k=200))
# print(recall_at_k(label["article_id"], label["prediction"], k=150))
# print(recall_at_k(label["article_id"], label["prediction"], k=12))

# # 0.22609381056959738
# # 0.20330711927052456
# # 0.05715520910553712

In [None]:
# MAP@12 of the ranker on the validation week.
pred = predict(ranker, val_candidates)

# Left join: every row of the week-1 labels is kept, with or without a prediction.
label = labels[1].merge(pred, on="customer_id", how="left")

map_at_k(label["article_id"], label["prediction"], k=12)

# 0.028297597743100006 6 week 0.0250

# 0.02863472423766257 4 week
# 0.02882844782917304 4 week
# 0.028909064246733205 4 week
# 0.029366338019601947 4 week # 0.0268

# 0.029402247883421353 6 week
# 0.02606931574775391 3 week ranker

In [None]:
# Recall of the ranked lists at cut-offs 200, 150 and 12.
for cutoff in (200, 150, 12):
    print(recall_at_k(label["article_id"], label["prediction"], k=cutoff))

# 0.22610105862711796
# 0.19685366988994157
# 0.059677054468807014

In [None]:
# Deliberate stop for "Run All": execution halts here so that the Test / Submit
# cells below are only run by hand. An explicit exception replaces the former
# undefined-name placeholder, which stopped the run with an unexplained NameError.
raise RuntimeError("Stopped before the Test section; run the cells below manually.")

## Test

In [None]:
# Drop the per-week candidate frames before loading the test candidates,
# then ask the garbage collector to reclaim the memory.
del val_candidates, candidates
gc.collect()

In [None]:
# Load the week-0 candidates and cast the categorical columns with the dtypes in `cate_dict`.
test_path = data_dir/"processed"/VERSION_NAME/"week0_candidate.pqt"
test_candidates = pd.read_parquet(test_path)
for feat in cat_features:
    test_candidates[feat] = test_candidates[feat].astype(cate_dict[feat])

In [None]:
# Score the week-0 candidates; the result has one ranked article list per customer.
test_pred = predict(ranker, test_candidates)

In [None]:
# Index -> original id mappings for users and items. The `with` blocks close
# the pickle files instead of leaking the handles.
with open(data_dir/"index_id_map/user_index2id.pkl", "rb") as f:
    idx2uid = pickle.load(f)
with open(data_dir/"index_id_map/item_index2id.pkl", "rb") as f:
    idx2iid = pickle.load(f)

def parse(x):
    """Format one customer's ranked article indices as a submission string.

    Only the first 12 entries of ``x`` are used. Each of them is looked up in the
    notebook-level ``idx2iid`` mapping, converted to ``str`` and prefixed with
    ``'0'``; the results are joined with single spaces.
    """
    # Truncate first so that only the 12 entries that are kept get looked up.
    return ' '.join('0' + str(idx2iid[i]) for i in x[:12])

In [None]:
# Turn each customer's article list into the space-separated submission string.
test_pred['prediction'] = test_pred['prediction'].progress_apply(parse)

In [None]:
# Reload the user_id -> index mapping and a fresh sample submission; the `with`
# block closes the pickle file instead of leaking the handle.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as f:
    uid2idx = pickle.load(f)
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [None]:
# Swap the `prediction` column that came with sample_submission.csv for the model's
# predictions, then map the customer indices back through `idx2uid`.
submission = submission.drop(columns='prediction').merge(test_pred, on='customer_id', how='left')
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [None]:
# Write the submission to the current working directory (the ".gz" suffix makes pandas gzip it).
submission.to_csv('submission.csv.gz', index=False)

In [None]:
# Quick visual check of the first rows of the submission.
submission.head()

### Submit

In [None]:
# Free-form description of the run.
# NOTE(review): the rules listed here differ from the rule list in the (commented-out)
# retrieval cell of this notebook, and the score / min_pos_rate differ from the text
# hard-coded in the `kaggle competitions submit` command below; confirm which is current.
submit_msg = """
0.029402247883421353 6-week
---
OrderHistory(train, 3),
OrderHistory(train, 7),
OrderHistoryDecay(train, 3, n=50),
OrderHistoryDecay(train, 7, n=50),
ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),
UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),
UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),
TimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),
TimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),
TimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),
TimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),
SaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)
---
min_pos_rate = 0.005 new features
pivot
"""

In [None]:
# Display the message text.
submit_msg

In [None]:
# NOTE(review): the -m text is wrapped in single quotes but itself contains single quotes
# (name='1', ['age_bins']), so the shell will not pass it through verbatim; it also does not
# match `submit_msg` above (different score and min_pos_rate). Confirm the intended message.
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f "submission.csv.gz" -m '\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),\nSaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)\n---\nmin_pos_rate = 0.006\npivot\n'
