In [1]:
# %pip install -U lightgbm==3.3.2

In [2]:
# !git clone 'https://github.com/Wp-Zhang/H-M-Fashion-RecSys.git'

In [None]:
# !unzip /content/H-M-Fashion-RecSys-main.zip
# !mv ./H-M-Fashion-RecSys-main/* ./

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings("ignore")
sys.path.append("./") # path to the `src`` folder
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [2]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,
    UserGroupTimeHistory,
    UserGroupSaleTrend,
    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,
    OutOfStock,
)
from src.retrieval.collector import RuleCollector

from src.features import cum_sale, week_sale, repurchase_ratio, purchased_before

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

data_dir = Path("/content/data/")
model_dir = Path("/content/models/")

TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2

VERSION_NAME = "pivot"
TEST = False # * Set as `False` when do local experiments to save time

import os
if not os.path.exists(data_dir/"interim"/VERSION_NAME):
    os.mkdir(data_dir/"interim")
    os.mkdir(data_dir/"interim"/VERSION_NAME)
if not os.path.exists(data_dir/"processed"/VERSION_NAME):
    os.mkdir(data_dir/"processed")
    os.mkdir(data_dir/"processed"/VERSION_NAME)

dh = DataHelper(data_dir)

# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

data = dh.load_data(name="encoded_full")
uid2idx = pickle.load(open(data_dir/"index_id_map/user_id2index.pkl", "rb"))
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [5]:
# listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
# data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [None]:
# # * WEEK_NUM = 0: test
# # * WEEK_NUM = 1: valid
# # * WEEK_NUM > 1: train
# for week in range(1,WEEK_NUM):
#     # * use sliding window to generate candidates
#     if week == 0 and not TEST:
#         continue
#     trans = data["inter"]

#     start_date, end_date = calc_valid_date(week)
#     print(f"Week {week}: [{start_date}, {end_date})")
    
#     train, valid = dh.split_data(trans, start_date, end_date)
#     train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

#     last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
#     last_week_start = last_week_start.strftime("%Y-%m-%d")
#     last_week = train.loc[train.t_dat >= last_week_start]
    
#     last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
#     last_3day_start = last_3day_start.strftime("%Y-%m-%d")
#     last_3days = train.loc[train.t_dat >= last_3day_start]

#     if week != 0:
#         customer_list = valid["customer_id"].values
#     else:
#         customer_list = submission['customer_id'].values

#     # * ========================== Retrieval Strategies ==========================

#     candidates = RuleCollector().collect(
#         week_num = week,
#         trans_df = trans,
#         customer_list=customer_list,
#         rules=[
#             OrderHistory(train, days=3, name='1'),
#             OrderHistory(train, days=7, name='2'),
#             OrderHistoryDecay(train, days=3, n=50, name='1'),
#             OrderHistoryDecay(train, days=7, n=50, name='2'),
#             ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
#             ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
#             ItemPair(OrderHistoryDecay(train, days=3, n=50).retrieve(), name='3'),
#             ItemPair(OrderHistoryDecay(train, days=7, n=50).retrieve(), name='4'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
#             UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
#             UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=50),
#             TimeHistory(customer_list, last_week, n=50, name='1'),
#             TimeHistory(customer_list, last_3days, n=50, name='2'),
#             TimeHistoryDecay(customer_list, train, days=3, n=50, name='1'),
#             TimeHistoryDecay(customer_list, train, days=7, n=50, name='2'),
#             SaleTrend(customer_list, train, days=7, n=50),
#         ],
#         filters=[OutOfStock(trans)],
#         min_pos_rate=0.006,
#         compress=False,
#     )
    
#     candidates = (
#         pd.pivot_table(
#             candidates,
#             values="score",
#             index=["customer_id", "article_id"],
#             columns=["method"],
#             aggfunc=np.sum,
#         )
#         .reset_index()
#     )

#     candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
#     valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [7]:
# # * use the threshold in week 1 to generate candidates for test data, see the log in the upper cell 
# if TEST:
#     week = 0
#     trans = data["inter"]
    
#     start_date, end_date = calc_valid_date(week)
#     print(f"Week {week}: [{start_date}, {end_date})")
    
#     train, valid = dh.split_data(trans, start_date, end_date)
#     train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

#     last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
#     last_week_start = last_week_start.strftime("%Y-%m-%d")
#     last_week = train.loc[train.t_dat >= last_week_start]
    
#     last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
#     last_3day_start = last_3day_start.strftime("%Y-%m-%d")
#     last_3days = train.loc[train.t_dat >= last_3day_start]

#     customer_list = submission['customer_id'].values

#     # * ========================== Retrieval Strategies ==========================

#     candidates = RuleCollector().collect(
#         week_num = week,
#         trans_df = trans,
#         customer_list=customer_list,
#         rules=[
#             OrderHistory(train, days=3, name='1'),
#             OrderHistory(train, days=7, name='2'),
#             OrderHistoryDecay(train, days=3, n=50, name='1'),
#             OrderHistoryDecay(train, days=7, n=50, name='2'),
#             ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
#             ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
#             ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
#             ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
#             UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=15, name='1'),
#             UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=20.5, name='2'),
#             UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=2),
#             TimeHistory(customer_list, last_week, n=9, name='1'),
#             TimeHistory(customer_list, last_3days, n=16, name='2'),
#             TimeHistoryDecay(customer_list, train, days=3, n=12, name='1'),
#             TimeHistoryDecay(customer_list, train, days=7, n=8, name='2'),
#             SaleTrend(customer_list, train, days=7, n=2),
#         ],
#         filters=[OutOfStock(trans)],
#         min_pos_rate=0.006,
#         compress=False,
#     )
    
#     candidates, _ = reduce_mem_usage(candidates)
#     candidates = (
#         pd.pivot_table(
#             candidates,
#             values="score",
#             index=["customer_id", "article_id"],
#             columns=["method"],
#             aggfunc=np.sum,
#         )
#         .reset_index()
#     )

#     candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
#     valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [8]:
# del train, valid, last_week, customer_list, candidates
# gc.collect()

73

## Feature engineering


In [9]:
user = data["user"]
item = data["item"]
inter = data["inter"].copy()

# merge `product_code`
# inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")

# calculate week number
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7

inter = inter.sort_values(["customer_id", "t_dat"]).reset_index(drop=True)

import gc 
gc.collect()

67

In [10]:
# inter_make_feature = inter.copy()

# inter_make_feature['t_dat'] = pd.to_datetime(inter_make_feature['t_dat'])
# last_week_start = pd.to_datetime("2020-09-01")
# last_week_end = pd.to_datetime("2020-06-01")

# inter_make_feature = inter_make_feature.loc[(inter_make_feature.t_dat <= last_week_start)]

# inter_make_feature = inter_make_feature.merge(user, on="customer_id", how="left")
# inter_make_feature = inter_make_feature.merge(item, on="article_id", how="left")

In [11]:
# inter_make_feature

In [12]:
# to_delete_list = []


# for col in ['price', 'sales_channel_id', 'age']:
#   AGGs =  ['mean', 'std',]
#   if col =='price':
#     AGGs =  ['mean', 'std', 'sum', 'max', 'min']
#   for AGG in AGGs:
#     dict_agg = inter_make_feature.groupby('customer_id')[col].agg(AGG)
#     inter[f'aggfeatures_customer_{col}_{AGG}_all'] = inter['customer_id'].map(dict_agg).fillna(-999)
#     to_delete_list.append(f'aggfeatures_customer_{col}_{AGG}_all')

#   for AGG in AGGs:
#     dict_agg = inter_make_feature.groupby('article_id')[col].agg(AGG)
#     inter[f'aggfeatures_article_{col}_{AGG}_all'] = inter['customer_id'].map(dict_agg).fillna(-999)
#     to_delete_list.append(f'aggfeatures_article_{col}_{AGG}_all')

In [13]:
# last_week_end = pd.to_datetime("2020-06-01")

# inter_make_feature = inter_make_feature.loc[(inter_make_feature.t_dat > last_week_end)]


# for col in ['price', 'sales_channel_id', 'age']:
#   AGGs =  ['mean', 'std',]
#   if col =='price':
#     AGGs =  ['mean', 'std', 'sum', 'max', 'min']
#   for AGG in AGGs:
#     dict_agg = inter_make_feature.groupby('customer_id')[col].agg(AGG)
#     inter[f'aggfeatures_customer_{col}_{AGG}_3'] = inter['customer_id'].map(dict_agg).fillna(-999)
#     to_delete_list.append(f'aggfeatures_customer_{col}_{AGG}_3')

#   for AGG in AGGs:
#     dict_agg = inter_make_feature.groupby('article_id')[col].agg(AGG)
#     inter[f'aggfeatures_article_{col}_{AGG}_3'] = inter['customer_id'].map(dict_agg).fillna(-999)
#     to_delete_list.append(f'aggfeatures_article_{col}_{AGG}_3')

# del inter_make_feature 
# import gc 
# gc.collect()

In [14]:
# inter['diff_price_customer'] = inter[f'aggfeatures_article_price_mean_all']/inter[f'aggfeatures_customer_price_mean_all']
# inter['diff_channel_customer'] = inter[f'aggfeatures_article_sales_channel_id_mean_all']-inter[f'aggfeatures_customer_sales_channel_id_mean_all']


# inter['diff_price_customer'].fillna(-999, inplace=True)
# inter['diff_channel_customer'].fillna(-999, inplace=True)

In [15]:
# for col in to_delete_list:
#   del inter[col]

In [16]:
inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")


inter["item_sale"] = week_sale(inter, ["article_id"])
inter["pro_sale"] = week_sale(inter, ["product_code"])
inter["item_sale_uni"] = week_sale(inter, ["article_id"], True)
inter["pro_sale_uni"] = week_sale(inter, ["product_code"], True)

inter["item_sale_ratio"] = inter["item_sale"] / (inter["pro_sale"] + 1e-6)
inter["item_sale_uni_ratio"] = inter["pro_sale_uni"] / (inter["pro_sale_uni"] + 1e-6)

item_feats = [
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]
inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

for feat in tqdm(item_feats):
    inter[f"{feat}_sale"] = week_sale(inter, [feat], f"{feat}_sale")

inter = inter.drop(columns=item_feats)

inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'])
inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'])

inter["purchased_item"] = purchased_before(inter, ["article_id"])
inter["purchased_pro"] = purchased_before(inter, ["product_code"])

print(inter.shape)
inter.head()

inter.to_parquet(data_dir / "processed/processed_inter.pqt")

100%|██████████| 6/6 [00:51<00:00,  8.59s/it]


(31788324, 23)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,product_code,item_sale,pro_sale,item_sale_uni,...,product_type_no_sale,product_group_name_sale,graphical_appearance_no_sale,colour_group_code_sale,perceived_colour_value_id_sale,perceived_colour_master_id_sale,i_repurchase_ratio,p_repurchase_ratio,purchased_item,purchased_pro
0,2018-12-27,1,29517,0.044051,1,91,11547,10,16,7,...,3201,13090,13932,8525,14427,10582,0.126904,0.141921,0.0,0.0
1,2018-12-27,1,100,0.035576,1,91,37,17,84,16,...,3146,13090,13932,13323,14427,13281,0.098616,0.129139,0.0,0.0
2,2018-12-27,1,30328,0.030492,1,91,11823,3,15,2,...,3201,13090,13932,8525,14427,10582,0.166667,0.15894,0.0,0.0
3,2019-05-02,1,50725,0.010153,2,73,20579,1,5,1,...,2381,10972,12895,4407,10953,5609,,0.169014,0.0,0.0
4,2019-05-25,1,16004,0.050831,2,70,5599,160,351,129,...,2145,8706,11189,9805,10172,9739,0.189273,0.28126,0.0,0.0


## Merge Features


In [3]:
inter = pd.read_parquet(data_dir / "processed/processed_inter.pqt")
data["inter"] = inter

# #* embeddings from DSSM model
# dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
# dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)
# # * embeddings from YouTubeDNN model
# yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
# yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)

for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue
        
    candidate = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")
    # * merge features
    candidate = merge_week_data(data, i, candidate)
    # # * merge DSSM user and item embeddings
    # candidate["dssm_similarity"] = calc_embd_similarity(candidate, dssm_user_embd, dssm_item_embd)
    # # * merge YouTubeDNN user and item embeddings
    # candidate["yt_similarity"] = calc_embd_similarity(candidate, yt_user_embd, yt_item_embd)

    candidate.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")

# del dssm_user_embd, dssm_item_embd, yt_user_embd, yt_item_embd
# gc.collect()

KeyboardInterrupt: ignored

## Ranking


In [3]:
candidates = {}
labels = {}
for i in tqdm(range(1, WEEK_NUM)):
    candidates[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")
    labels[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_label.pqt")

feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "sales_channel_id",
        "t_dat",
        "week",
        "WeekSaleTrend_item",
        "WeekSaleTrend_pro",
        "ThreeDaySaleTrend_item",
        "ThreeDaySaleTrend_pro",
    ]
]
cat_features = [
    "customer_id",
    "article_id",
    "product_code",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]

# * Convert categorical featues as `CategoricalDtype`
cate_dict = {}        
for feat in tqdm(cat_features):
    if feat in data['user'].columns:
        value_set = set(data['user'][feat].unique())
    elif feat in data['item'].columns:
        value_set = set(data['item'][feat].unique())
    else:
        value_set = set(data['inter'][feat].unique())
    cate_dict[feat] = CategoricalDtype(categories=value_set)

for i in tqdm(range(1,WEEK_NUM)):
    for feat in cat_features:
        candidates[i][feat] = candidates[i][feat].astype(cate_dict[feat])

100%|██████████| 5/5 [00:03<00:00,  1.42it/s]
100%|██████████| 14/14 [00:00<00:00, 15.53it/s]
100%|██████████| 5/5 [00:07<00:00,  1.58s/it]


### Train


In [33]:
data['user']

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,1,1,1,2,3,49,112979
1,2,1,1,2,3,25,57313
2,3,1,1,2,3,24,139157
3,4,1,1,2,3,54,128530
4,5,2,2,2,5,52,52372
...,...,...,...,...,...,...,...
1371975,1371976,1,1,2,3,24,169172
1371976,1371977,1,1,2,3,21,87256
1371977,1371978,2,2,2,5,21,95708
1371978,1371979,2,2,2,5,18,188280


In [4]:
inter = data['inter']
inter.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,3,40180,0.050831,2
1,2018-09-20,3,10521,0.030492,2
2,2018-09-20,8,6388,0.015237,2
3,2018-09-20,8,46305,0.016932,2
4,2018-09-20,8,46306,0.016932,2


In [35]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
inter['t_dat'] = pd.to_datetime(inter['t_dat'])
last_week_start = pd.to_datetime("2020-09-14")
last_week_end = pd.to_datetime("2020-06-01")

inter = inter.loc[(inter.t_dat <= last_week_start)]


In [37]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os

if os.path.exists('./articleid_model_cbow.model'):
    model_sg = Word2Vec.load('./articleid_model_cbow.model')
else:
    feedid_seq_list = inter.groupby(['customer_id']).article_id.apply(lambda x: [str(id) for id in x] ).values
    
    model_sg = Word2Vec(feedid_seq_list,  size=64, window=32, min_count=1, sg=0, sample=1e-3, negative=15, workers=32, seed=1, iter=10)
    model_sg.save('./articleid_model_cbow.model')

In [38]:
feedid_seq_list = inter.groupby('customer_id').article_id.apply(lambda x: [str(id) for id in x] )
feedid_seq_list = feedid_seq_list.reset_index()

In [39]:
from collections import defaultdict

dict_customer_id_vec = defaultdict(lambda : np.ones(64)/64)

for item in tqdm(feedid_seq_list.values):
  if len(item[1])>1:
    vec = np.mean(list(map(lambda x: model_sg[x], item[1])), axis=0)
  else:
    vec = model_sg[item[1][0]]
  
  dict_customer_id_vec[item[0]] = vec/np.sqrt(np.sum(vec**2))
  

100%|██████████| 1356132/1356132 [03:25<00:00, 6591.36it/s]


In [40]:
# * some rules are skipped for some weeks, we need to concat them together
# * to merge the columns
full_data = pd.concat([candidates[i] for i in range(1, WEEK_NUM)], ignore_index=True)

In [42]:
dict_v = {}
def get_embedding(x):
  try:
    return dict_v[x]
  except:
    try:
      vec = model_sg[str(x)].astype('float32')
    except:
      vec = np.ones(64).astype('float32')/64
    dict_v[x] = vec/np.sqrt(np.sum(vec**2))
    return dict_v[x]

article_embedding = np.vstack(list(map(get_embedding, tqdm(full_data.article_id.values)))).astype('float32')

del dict_v
import gc  
gc.collect()

100%|██████████| 12845149/12845149 [00:07<00:00, 1778392.57it/s]


226

In [43]:
customer_embedding = np.vstack(list(map(lambda x: dict_customer_id_vec[x].astype('float32'), tqdm(full_data.customer_id.values)))).astype('float32')

100%|██████████| 12845149/12845149 [00:14<00:00, 906678.98it/s]


In [44]:
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

question1_vec = article_embedding
question2_vec = customer_embedding

full_data['cosine']     = (article_embedding * customer_embedding).sum(axis=-1) #[cosine(x, y)       for (x, y) in tqdm(zip(question1_vec, question2_vec))]

# full_data['cityblock']  = [cityblock(x, y)    for (x, y) in tqdm(zip(question1_vec, question2_vec))]
# full_data['canberra']   = [canberra(x, y)     for (x, y) in tqdm(zip(question1_vec, question2_vec))]
# full_data['euclidean']  = [euclidean(x, y)    for (x, y) in tqdm(zip(question1_vec, question2_vec))]
# full_data['minkowski']  = [minkowski(x, y, 3) for (x, y) in tqdm(zip(question1_vec, question2_vec))]
# full_data['braycurtis'] = [braycurtis(x, y)   for (x, y) in tqdm(zip(question1_vec, question2_vec))]

# full_data['skew_q1'] = [skew(x) for x in question1_vec]
# full_data['skew_q2'] = [skew(x) for x in question2_vec]
# full_data['kur_q1']  = [kurtosis(x) for x in question1_vec]
# full_data['kur_q2']  = [kurtosis(x) for x in question2_vec]

# full_data['skew_diff'] = np.abs(data['skew_q1'] - data['skew_q2'])
# full_data['kur_diff']  = np.abs(data['kur_q1'] - data['kur_q2'])

In [24]:
# extra_feats = ['cosine', 'cityblock', 'canberra', 'euclidean', 'minkowski', 'braycurtis']

In [45]:
for AGG in ['sum', 'mean']:
  dictmap = inter.groupby('customer_id').price.agg(AGG)
  full_data[f'customer_id_price_{AGG}'] = full_data['customer_id'].map(dictmap)

for AGG in ['sum', 'mean']:
  dictmap = inter.groupby('article_id').price.agg(AGG)
  full_data[f'article_id_price_{AGG}'] = full_data['article_id'].map(dictmap)

for AGG in ['mean']:
  dictmap = inter.groupby('article_id').sales_channel_id.agg(AGG)
  full_data[f'article_id_sales_channel_id_{AGG}'] = full_data['article_id'].map(dictmap)

  dictmap = inter.groupby('customer_id').sales_channel_id.agg(AGG)
  full_data[f'customer_id_sales_channel_id_{AGG}'] = full_data['customer_id'].map(dictmap)

full_data['diff_price'] = full_data['customer_id_price_mean']/full_data['article_id_price_mean']
full_data['diff_channel'] = full_data['article_id_sales_channel_id_mean']-full_data['customer_id_sales_channel_id_mean']


In [51]:
extra_feats = ['cosine', ] 

In [27]:
# for i in range(64):
#   full_data[f'extra_feat{i}'] = article_embedding[:,i].astype('float16')
#   full_data[f'extra_feat{i}'] = full_data[f'extra_feat{i}'].astype('float16')

In [28]:
# for i in range(64):
#   full_data[f'extra_feat_{i}'] = customer_embedding[:,i].astype('float16')
#   full_data[f'extra_feat_{i}'] = full_data[f'extra_feat_{i}'].astype('float16')

In [13]:
# extra_feats = [f'extra_feat{i}' for i in range(64)] + [f'extra_feat_{i}' for i in range(64)]

In [None]:
def train_model(full_data, valid_week_num, train_week_num=4):
    print("Validating week:", valid_week_num)
    train = full_data[
        (valid_week_num < full_data["week"])
        & (full_data["week"] <= valid_week_num + train_week_num)
    ]
    valid = full_data[full_data["week"] == valid_week_num]
    train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(
        drop=True
    )
    valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)
    print("Train positive rate:", train.label.mean())  # 0.9946384702188372 4-week

    train_group = train[["customer_id", "article_id", "week"]]
    train_group = train_group.astype(
        "int"
    )  # * convert to int to avoid `0` in groupby count result
    train_group = (
        train_group.groupby(["week", "customer_id"])["article_id"].count().values
    )

    valid_group = valid[["customer_id", "article_id"]]
    valid_group = valid_group.astype(
        "int"
    )  # * convert to int to avoid `0` in groupby count result
    valid_group = valid_group.groupby(["customer_id"])["article_id"].count().values

    train_set = lgb.Dataset(
        data=train[feats+extra_feats],
        label=train["label"],
        group=train_group,
        feature_name=feats+extra_feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats+extra_feats],
        label=valid["label"],
        group=valid_group,
        feature_name=feats+extra_feats,
        categorical_feature=cat_features,
        params=params,
    )

    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=600,
        valid_sets=[valid_set],
        early_stopping_rounds=100,
        verbose_eval=10,
    )
    ranker.save_model(
        f"/content/models/lgb_ranker_{valid_week_num}.model",
        num_iteration=ranker.best_iteration,
    )
    return ranker

params = {
    "objective": "lambdarank",
    "boosting_type": "gbdt",
    "metric": "map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.02,

    "verbose": -1,
    "eval_at": 12,
    # 'device':'gpu'
}

ranker = train_model(full_data, 1, 4) # 0.82846 4-week

Validating week: 1
Train positive rate: 0.0067018773363074985
Training until validation scores don't improve for 100 rounds.
[10]	valid_0's map@12: 0.826219
[20]	valid_0's map@12: 0.827753
[30]	valid_0's map@12: 0.828029
[40]	valid_0's map@12: 0.828277
[50]	valid_0's map@12: 0.8281


In [None]:
# def train_model(full_data, valid_week_num, train_week_num=4):
#     print("Validating week:", valid_week_num)

    
#     train = full_data[
#         (valid_week_num < full_data["week"])
#         & (full_data["week"] <= valid_week_num + train_week_num)
#     ]
#     valid = full_data[full_data["week"] == valid_week_num]

#     train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(
#         drop=True
#     )
#     valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)
#     print("Train positive rate:", train.label.mean())  # 0.9946384702188372 4-week

#     train_group = train[["customer_id", "article_id", "week"]]
#     train_group = train_group.astype(
#         "int"
#     )  # * convert to int to avoid `0` in groupby count result
#     train_group = (
#         train_group.groupby(["week", "customer_id"])["article_id"].count().values
#     )

#     valid_group = valid[["customer_id", "article_id"]]
#     valid_group = valid_group.astype(
#         "int"
#     )  # * convert to int to avoid `0` in groupby count result
#     valid_group = valid_group.groupby(["customer_id"])["article_id"].count().values

#     train_set = lgb.Dataset(
#         data=train[feats+extra_feats],
#         label=train["label"],
#         # group=train_group,
#         feature_name=feats+extra_feats,
#         categorical_feature=cat_features,
#         params=params,
#     )

#     valid_set = lgb.Dataset(
#         data=valid[feats+extra_feats],
#         label=valid["label"],
#         # group=valid_group,
#         feature_name=feats+extra_feats,
#         categorical_feature=cat_features,
#         params=params,
#     )

#     ranker = lgb.train(
#         params,
#         train_set,
#         num_boost_round=300,
#         valid_sets=[valid_set],
#         early_stopping_rounds=100,
#         verbose_eval=10,
#     )
#     ranker.save_model(
#         f"/content/models/lgb_ranker__{valid_week_num}.model",
#         num_iteration=ranker.best_iteration,
#     )
#     return ranker

# params = {
#     "objective": "binary",
#     "boosting_type": "gbdt",
#     "metric": "binary_logloss",
#     "max_depth": 6,
#     "num_leaves": 128,
#     "learning_rate": 0.01,
#     "verbose": -1,
#     # "eval_at": 12,
#     # 'device':'gpu'
# }

# ranker = train_model(full_data, 1, 4) # 0.82846 4-week

In [56]:
ranker = lgb.Booster(model_file=f"/content/models/lgb_ranker_{1}.model")

val_candidates = full_data[full_data["week"] == 1].reset_index(drop=True)

def predict(ranker, candidates, batch_size = 5_000_000):
    probs = np.zeros(candidates.shape[0])
    for batch in range(0, candidates.shape[0], batch_size):
        outputs = ranker.predict(candidates.loc[batch : batch + batch_size - 1, feats+extra_feats])
        probs[batch : batch + batch_size] = outputs
    candidates["prob"] = probs
    pred_lgb = candidates[['customer_id','article_id','prob']]
    pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
    pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)
    pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')
    pred_lgb['customer_id'] = pred_lgb['customer_id'].astype(int)
    pred_lgb = pred_lgb.groupby("customer_id")["prediction"].progress_apply(list).reset_index()

    return pred_lgb

pred = predict(ranker, val_candidates)
label = labels[1]
label = pd.merge(label, pred, on="customer_id", how="left")

map_at_k(label["article_id"], label["prediction"], k=12)


100%|██████████| 68984/68984 [00:05<00:00, 13402.68it/s]


0.028804126951082563

In [None]:
# 0.02760747899904443
# 0.02832189492487917


In [57]:
df_imp = pd.DataFrame({'feature':feats+extra_feats, 'imp':ranker.feature_importance()}).sort_values(by='imp')
df_imp

Unnamed: 0,feature,imp
25,item_sale_uni_ratio,0
15,UGSaleTrend_1,0
10,SaleTrend_1,0
42,product_group_name,5
39,fashion_news_frequency,7
45,perceived_colour_value_id,8
38,club_member_status,9
37,Active,16
34,purchased_item,23
46,perceived_colour_master_id,24


In [None]:
# def train_data(full_data, valid_week_num, train_week_num=4):
#     print("Validating week:", valid_week_num)
#     train = full_data[
#         (valid_week_num < full_data["week"])
#         & (full_data["week"] <= valid_week_num + train_week_num)
#     ]
#     valid = full_data[full_data["week"] == valid_week_num]
#     train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(
#         drop=True
#     )
#     valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)
#     print("Train positive rate:", train.label.mean())  # 0.9946384702188372 4-week

#     return train, valid 

# train_data, valid_data = train_data(full_data, 1, 4)

### Test

In [None]:
del candidates, train, valid
gc.collect()

In [None]:
test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/"week0_candidate.pqt")
for feat in cat_features:
    test_candidates[feat] = test_candidates[feat].astype(cate_dict[feat])

In [None]:
test_pred = predict(ranker, test_candidates)

In [None]:
idx2uid = pickle.load(open(data_dir/"index_id_map/user_index2id.pkl", "rb"))
idx2iid = pickle.load(open(data_dir/"index_id_map/item_index2id.pkl", "rb"))

In [None]:
def parse(x):
    l = ['0'+str(idx2iid[i]) for i in x]
    l = ' '.join(l[:12])
    return l

In [None]:
test_pred['prediction'] = test_pred['prediction'].progress_apply(lambda x: parse(x))

In [None]:
uid2idx = pickle.load(open(data_dir/"index_id_map/user_id2index.pkl", "rb"))
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [None]:
del submission['prediction']
submission = submission.merge(test_pred, on='customer_id', how='left')
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
submission.head()

### Submit

In [None]:
submit_msg = """
0.028927037894290773 4-week drop
---
OrderHistory(train, days=3, name='1'),
OrderHistory(train, days=7, name='2'),
OrderHistoryDecay(train, days=3, n=50, name='1'),
OrderHistoryDecay(train, days=7, n=50, name='2'),
ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
ItemPair(OrderHistoryDecay(train, days=3, n=50).retrieve(), name='3'),
ItemPair(OrderHistoryDecay(train, days=7, n=50).retrieve(), name='4'),
UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=50),
TimeHistory(customer_list, last_week, n=50, name='1'),
TimeHistory(customer_list, last_3days, n=50, name='2'),
TimeHistoryDecay(customer_list, train, days=3, n=50, name='1'),
TimeHistoryDecay(customer_list, train, days=7, n=50, name='2'),
SaleTrend(customer_list, train, days=7, n=50)
---
min_pos_rate = 0.005
pivot
"""

In [None]:
submit_msg

In [None]:
# ! mkdir ~/.kaggle
# ! cp ../kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# %pip install kaggle

In [None]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f ./submission.csv -m '\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),\nSaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)\n---\nmin_pos_rate = 0.006\npivot\n'