In [1]:
# %pip install -U lightgbm

In [1]:
import pandas as pd
import numpy as np

import pickle

from tqdm import tqdm
from pathlib import Path
import gc

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells.
warnings.filterwarnings("ignore")
# Put ../src on the import path; presumably where the `data` and `retrieval`
# packages imported in the next cell live.
sys.path.append("../src/")
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from data import DataHelper
from data.metrics import map_at_k, hr_at_k, recall_at_k
from retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,
    UserGroupTimeHistory,
    UserGroupSaleTrend,
    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,
    OutOfStock,
)
from retrieval.collector import RuleCollector


In [4]:
# Dataset root, relative to the notebook's working directory.
data_dir = Path("../data/")
# Project helper used by the cells below to load and split the data.
dh = DataHelper(data_dir)

In [5]:
# data = dh.preprocess_data(save=True) # run only once

In [6]:
# Load the saved "encoded_full" dataset. The cells below read its "user",
# "item" and "inter" entries.
data = dh.load_data(name="encoded_full")

In [7]:
# Age bucket edges. pd.cut builds right-closed intervals by default, so the
# buckets are (-1, 19], (19, 29], ..., (69, 119]; ages outside that range or
# missing ages become NaN.
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
# Adds the `age_bins` column directly on the loaded user table.
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [8]:
# Interactions joined with each article's product_code.
# NOTE(review): `trans` is reassigned to plain data["inter"] in cell In [10]
# before any visible cell reads this result, so this merge currently has no
# effect — drop it or use a different name if product_code is needed.
trans = data["inter"].merge(data['item'][['article_id','product_code']], on='article_id', how='left')

## Retrieval

In [9]:
# Sanity check: five weeks before the validation start date (2020-09-16).
# The result, 2020-08-12, is the cutoff used for `last_5week` in the next cell.
pd.to_datetime('2020-09-16') - 5*pd.Timedelta(days=7)

Timestamp('2020-08-12 00:00:00')

In [10]:
# Work on the raw interaction table.
trans = data["inter"]

# Boundaries handed to dh.split_data; every look-back window below is derived
# from VALID_START so the dates cannot drift apart when the split is moved.
VALID_START = "2020-09-16"
VALID_END = "2020-09-23"

train, valid = dh.split_data(trans, VALID_START, VALID_END)
# Customers present in the validation split; candidates are retrieved for them.
customer_list = valid["customer_id"].values


def _days_before_valid(days):
    """Return the date `days` days before VALID_START as a 'YYYY-MM-DD' string.

    A string is returned (not a Timestamp) so the comparisons against
    `train.t_dat` behave exactly like the previous hard-coded literals.
    """
    cutoff = pd.Timestamp(VALID_START) - pd.Timedelta(days=days)
    return cutoff.strftime("%Y-%m-%d")


# Trailing slices of the training data: 7 days, 3 days, 2 weeks and 5 weeks.
last_week = train.loc[train.t_dat >= _days_before_valid(7)]
last_3days = train.loc[train.t_dat >= _days_before_valid(3)]
last_2week = train.loc[train.t_dat >= _days_before_valid(14)]
last_5week = train.loc[train.t_dat >= _days_before_valid(35)]

In [11]:
# Customer -> age bucket lookup shared by every merge below.
_age_lookup = data['user'][['customer_id', 'age_bins']]


def _with_age_bins(frame):
    """Return `frame` with each customer's `age_bins` column attached.

    A frame that already has `age_bins` is returned unchanged, so re-running
    this cell does not create `age_bins_x` / `age_bins_y` duplicates.
    """
    if 'age_bins' in frame.columns:
        return frame
    return frame.merge(_age_lookup, on='customer_id', how='left')


train = _with_age_bins(train)
last_week = _with_age_bins(last_week)
last_3days = _with_age_bins(last_3days)
last_2week = _with_age_bins(last_2week)
last_5week = _with_age_bins(last_5week)

In [12]:
# last_week = last_week.merge(data['item'][['article_id','perceived_colour_master_id','product_group_name']], on='article_id', how='left')

In [13]:
# Build the candidate set for the validation customers by running all 27
# retrieval rules below through RuleCollector. Each rule family is instantiated
# three times: with a numeric setting of 3 / 7 / 14 (presumably a look-back
# window in days — confirm against retrieval.rules) or with the 3-day / 1-week /
# 2-week slices of `train`. `name=` tags otherwise identical rule types.
# NOTE(review): judging by the logged "TOPk Positive rate" / "skip" lines, rules
# that fall below `min_pos_rate` appear to be truncated or dropped — confirm in
# RuleCollector.
candidates = RuleCollector().collect(
    # data=data,
    valid = valid,
    customer_list=customer_list,
    rules=[
        # OrderHistory(train, 7),
        # ItemPair(OrderHistory(train, 7).retrieve()),
        # UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], 24),
        # OrderHistoryDecay(train, 7),
        # Per-customer purchase history rules.
        OrderHistory(train, 3),
        OrderHistory(train, 7),
        OrderHistory(train, 14),
        OrderHistoryDecay(train, 3, n=50),
        OrderHistoryDecay(train, 7, n=50),
        OrderHistoryDecay(train, 14, n=50),
        # ItemPair is seeded with the output of each history rule above, so every
        # history rule is retrieved a second time here.
        ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
        ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
        ItemPair(OrderHistory(train, 14).retrieve(), name='3'),
        ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='4'),
        ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='5'),
        ItemPair(OrderHistoryDecay(train, 14, n=50).retrieve(), name='6'),
        # Rules grouped by the `age_bins` column.
        UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
        UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
        UserGroupTimeHistory(data, customer_list, last_2week, ['age_bins'], n=50, name='3'),
        UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 3, n=50),
        UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50),
        UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 14, n=50),
        # Rules without a user grouping.
        TimeHistory(customer_list, last_week, n=50, name='1'),
        TimeHistory(customer_list, last_3days, n=50, name='2'),
        TimeHistory(customer_list, last_2week, n=50, name='3'),
        TimeHistoryDecay(customer_list, train, 3, n=50),
        TimeHistoryDecay(customer_list, train, 7, n=50),
        TimeHistoryDecay(customer_list, train, 14, n=50),
        SaleTrend(customer_list, train, 3, n=50),
        SaleTrend(customer_list, train, 7, n=50),
        SaleTrend(customer_list, train, 14, n=50),
    ],
    # NOTE(review): `trans` is the full interaction table, including the
    # validation week split off in In [10]. Confirm OutOfStock does not look at
    # post-split transactions, otherwise this filter leaks validation data.
    filters=[OutOfStock(trans)],
    min_pos_rate=0.006,
    compress=False,
)

Retrieve items by rules:   4%|▎         | 1/27 [00:31<13:36, 31.42s/it]

Positive rate: 0.03038


Retrieve items by rules:   7%|▋         | 2/27 [00:59<12:15, 29.40s/it]

Positive rate: 0.02859


Retrieve items by rules:  11%|█         | 3/27 [01:29<11:53, 29.71s/it]

Positive rate: 0.02577


Retrieve items by rules:  15%|█▍        | 4/27 [02:34<16:47, 43.81s/it]

Positive rate: 0.01413
Positive rate: 0.01295


Retrieve items by rules:  19%|█▊        | 5/27 [03:44<19:25, 53.00s/it]

Positive rate: 0.01230


Retrieve items by rules:  22%|██▏       | 6/27 [04:53<20:31, 58.66s/it]

Positive rate: 0.01519


Retrieve items by rules:  26%|██▌       | 7/27 [05:16<15:39, 47.00s/it]

Positive rate: 0.01472


Retrieve items by rules:  30%|██▉       | 8/27 [05:41<12:37, 39.87s/it]

Positive rate: 0.01374


Retrieve items by rules:  33%|███▎      | 9/27 [06:10<10:59, 36.62s/it]

Positive rate: 0.00908


Retrieve items by rules:  37%|███▋      | 10/27 [06:52<10:47, 38.08s/it]

Positive rate: 0.00853


Retrieve items by rules:  41%|████      | 11/27 [07:42<11:08, 41.77s/it]

Positive rate: 0.00825


Retrieve items by rules:  44%|████▍     | 12/27 [08:42<11:49, 47.28s/it]

TOP15.0 Positive rate: 0.00602


Retrieve items by rules:  48%|████▊     | 13/27 [09:42<11:55, 51.08s/it]

TOP20.5 Positive rate: 0.00608


Retrieve items by rules:  52%|█████▏    | 14/27 [10:39<11:30, 53.11s/it]

TOP11.0 Positive rate: 0.00601


Retrieve items by rules:  59%|█████▉    | 16/27 [12:42<10:31, 57.38s/it]

skip
TOP2.0 Positive rate: 0.00652


Retrieve items by rules:  67%|██████▋   | 18/27 [14:54<09:18, 62.01s/it]

skip
TOP9.0 Positive rate: 0.00631


Retrieve items by rules:  70%|███████   | 19/27 [15:52<08:05, 60.68s/it]

TOP16.0 Positive rate: 0.00601


Retrieve items by rules:  78%|███████▊  | 21/27 [17:43<05:48, 58.05s/it]

skip
TOP12.0 Positive rate: 0.00607


Retrieve items by rules:  81%|████████▏ | 22/27 [19:29<06:02, 72.46s/it]

TOP8.0 Positive rate: 0.00616


Retrieve items by rules:  85%|████████▌ | 23/27 [21:06<05:19, 79.92s/it]

TOP11.0 Positive rate: 0.00604


Retrieve items by rules:  93%|█████████▎| 25/27 [23:46<02:36, 78.47s/it]

skip
TOP2.0 Positive rate: 0.00758


Retrieve items by rules: 100%|██████████| 27/27 [25:54<00:00, 57.57s/it]

skip





In [14]:
# Reshape the long candidate table into one row per (customer_id, article_id)
# pair with one score column per retrieval method. Pairs a method did not
# retrieve stay NaN (the fillna below is intentionally disabled).
candidates = (
    pd.pivot_table(
        candidates,
        values="score",
        index=["customer_id", "article_id"],
        columns=["method"],
        # The string alias selects the same grouped sum that np.sum was mapped
        # to, without the FutureWarning newer pandas emits for the callable.
        aggfunc="sum",
    )
    .reset_index()
    # .fillna(0)
)

In [15]:
# (number of candidate rows, number of columns) of the current candidate table.
candidates.shape

(15630864, 24)

In [16]:
# Share of candidate rows that are true positives for validation customers.
# `label_item` holds what each validation customer bought; the membership test
# below assumes it is a list-like of article ids per customer — TODO confirm.
label = valid[["customer_id", "article_id"]].rename(columns={"article_id": "label_item"})
tmp_items = candidates.merge(label, on=["customer_id"], how="left")
# Keep only customers that have labels. The explicit copy means the new column
# below is written to an independent frame, not to a slice of the merge result.
tmp_items = tmp_items[tmp_items["label_item"].notnull()].copy()
# Plain zip instead of DataFrame.apply(axis=1): same 0/1 values without
# building a Series object for every one of the millions of rows.
tmp_items["label"] = [
    1 if article in purchased else 0
    for article, purchased in zip(tmp_items["article_id"], tmp_items["label_item"])
]
pos_rate = tmp_items["label"].mean()
pos_rate
# 0.007686350632672472
# 0.0056985812619375735
# 0.0062356084250075995

0.0062356084250075995

In [17]:
# Keep one row per (customer_id, article_id). After the pivot in In [14] these
# pairs are the former index and should already be unique; this is a safeguard.
candidates = candidates.drop_duplicates(['customer_id','article_id'])

In [18]:
# Collapse to one row per customer holding the list of candidate article ids.
# NOTE(review): each list keeps the row order of `candidates`; it is not sorted
# by any rule score, so rank-sensitive metrics at small k depend on that order.
candidates = candidates.groupby('customer_id')['article_id'].apply(list).reset_index()

In [19]:
# Rename by rebinding instead of inplace=True, so the cell produces a fresh
# frame rather than mutating the one built by the previous cell.
candidates = candidates.rename(columns={'article_id': 'prediction'})
# Left join keeps every validation row; customers without any candidate get a
# missing `prediction`.
valid2 = pd.merge(valid, candidates, on="customer_id", how="left")

In [20]:
# MAP@12, hit rate@12 and recall@12 of the raw candidate lists against the
# validation purchases. Because ast_node_interactivity is "all", each of the
# three top-level calls displays its own value, in this order.
map_at_k(valid2["article_id"], valid2["prediction"], k=12)
hr_at_k(valid2["article_id"], valid2["prediction"], k=12)
recall_at_k(valid2["article_id"], valid2["prediction"], k=12)
# 0.025620866741013788

0.007122429128294375

0.06400034790676098

0.028341529076267406

In [21]:
# Average number of candidates per validation customer.
# NOTE(review): len() raises on a missing `prediction`, so this assumes every
# validation customer received at least one candidate.
valid2['prediction'].apply(len).mean()
# Results of earlier runs, as "mean candidates per customer  recall@1000":
# 31.335150179751828 0.09236951948783895
# 56.68628957439406 0.1210664822292757
# 49.73995419227647 0.11656190014664647

49.73995419227647

In [22]:
# Recall with a cutoff of k=1000, far above the ~50 candidates per customer
# reported above, so this is effectively the recall of the whole candidate set.
recall_at_k(valid2["article_id"], valid2["prediction"], k=1000)

0.11656190014664647

In [23]:
# Recall per candidate: recall@1000 divided by the mean candidate-list length,
# a rough measure of how efficiently the candidate budget is spent.
recall_at_k(valid2["article_id"], valid2["prediction"], k=1000) / valid2['prediction'].apply(len).mean()
# Values from earlier runs:
# 0.002504861238505555
# 0.0029477924617551813

0.0023434259648905343