In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [3]:
import os
import threadpoolctl
import warnings

from rectools.models import ImplicitALSWrapperModel
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares

warnings.filterwarnings('ignore')

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

  from .autonotebook import tqdm as notebook_tqdm


<threadpoolctl.threadpool_limits at 0x7d8d4e314a50>

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from src.utils import processing, calculate_recall, create_index_hnsw 

In [6]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

Unnamed: 0,add_date,shop_id,product_id,category_id,price,title,old_price
0,2024-09-30 03:07:09,350,1025536,8922,6990,6048 48557 44342 53515 17614,6990
1,2024-09-19 03:07:08,350,1025614,10600,330,42548 40244 23485 10853 6545 2343,330
2,2024-09-30 03:07:09,350,1025656,231,13240,14034 42002 48557 56089 46836 55620 50785,26490
3,2024-09-30 03:07:09,350,1025659,14546,7550,32640 50164 48557 25314 23255 42146,12590
4,2024-09-30 03:07:09,350,1025668,1949,1290,51789 48557 56089 20149,1290


In [7]:
actions = pd.read_parquet(f'{Path.cwd().parent}/data/train_actions.pq', engine='pyarrow')
actions.head()

Unnamed: 0,user_id,loc_user_id,action,date,products,pageId
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,[],2571824865
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,[],3834364438
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,[],2448628415
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,[],3875013967
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,[],3025531174


#### Разворачиваем данные по айдишникам продуктов

In [8]:
_actions = (
    actions
    .copy()
    .explode('products')
    .assign(products=lambda x: x.products.fillna(-1).astype(np.int64))
)
print(_actions.shape)

_catalog = (
    catalog
    .copy()
    .assign(product_id=lambda x: x.product_id.astype(np.int64))
)

(6859002, 6)


#### Разворачиваем данные по айдишникам продуктов

In [9]:
_actions['action_datetime'] = pd.to_datetime(_actions['date'])

_actions.action = _actions.action + 10
_actions.action = _actions.action.replace({14: 0, 13: 4, 18: 2, 19: 3, 10: 5, 17: 1, 11: 6, 12: 7, 16: 8, 15: 9})
_actions = _actions.sort_values(by=['action', 'date'])
print(_actions.shape)

(6859002, 7)


#### Новая шкала action в train_actions.pq в порядке возрастания важности
    0 - clearB (удаление всех товаров из корзины)
    1 - visit (посещение страницы с товаром)
    2 - visitCategory (посещение страницы с группой товаров)
    3 - search (поиск товара)           
    4 - delB (удаление товара из корзины)
    5 - view (просмотр товара)
    6 - like (лайк товара)
    7 - addB (добавление товара в корзину)
    8 - listB (посещение страницы корзины и вывод списка товаров в корзине)
    9 - order (оформление заказа)
    

#### Переименовываем поля и обьеденияем датасеты каталога и дейсвтвий пользователя    
'products' теперь 'item_id', 'action'  превратился в 'weight' (мы перевели код действия в шкалу по возрастанию ранее)

In [10]:
interactions = _actions.rename(columns={'products': 'item_id', 'action' : 'weight', 'action_datetime': 'datetime'})

In [11]:
items = _catalog.rename(columns={'product_id': 'item_id'})

In [12]:
interactions_merge = interactions.merge(items, on='item_id', how='left')
interactions_merge.shape

(6859002, 13)

In [13]:
interactions_merge['day'] = interactions_merge['datetime'].dt.day
interactions_merge['day_of_week'] = interactions_merge['datetime'].dt.dayofweek  # Monday=0, Sunday=6

interactions_merge['hour'] = interactions_merge['datetime'].dt.hour
interactions_merge['minute'] = interactions_merge['datetime'].dt.minute

In [14]:
interactions_merge['price_diff'] = interactions_merge['price'] - interactions_merge['old_price']

interactions_merge['category_id'] = interactions_merge['category_id'].fillna(0)
interactions_merge['price_diff'] = interactions_merge['price_diff'].fillna(0)

interactions_merge['category_id'] = interactions_merge['category_id'].astype(np.int32)

In [15]:
inters = interactions_merge.merge(
    interactions_merge.groupby('user_id')['loc_user_id'].count().reset_index(name='loc_user_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 19)


In [16]:
inters = inters.merge(
    interactions_merge.groupby('user_id')['category_id'].count().reset_index(name='category_id_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 20)


In [17]:
for action in sorted(interactions_merge.weight.unique()):
    inters = inters.merge(
        interactions_merge[interactions_merge.weight  == action].groupby('user_id')['weight'].count().reset_index(name=f'action_{action}_count'),
        on='user_id',
        how='left'
    )
    inters[f'action_{action}_count'] = inters[f'action_{action}_count'].fillna(0)
print(inters.shape)

(6859002, 30)


In [18]:
inters['delta_sec'] = inters.sort_values(by=['datetime']).groupby('user_id')['datetime'].diff()
inters['delta_sec'] = inters['delta_sec'].dt.total_seconds()
inters['delta_sec'] = inters['delta_sec'].fillna(0)

In [19]:
inters['delta_day'] = inters.sort_values(by=['datetime']).groupby('user_id')['day'].diff()
inters['delta_day'] = inters['delta_day'].fillna(0)

inters['delta_hour'] = inters.sort_values(by=['datetime']).groupby('user_id')['hour'].diff()
inters['delta_hour'] = inters['delta_hour'].fillna(0)

inters['delta_min'] = inters.sort_values(by=['datetime']).groupby('user_id')['minute'].diff()
inters['delta_min'] = inters['delta_min'].fillna(0)

In [20]:
inters = inters.drop(columns=['day', 'hour', 'minute'])

In [21]:
inters = inters.drop(columns=['price', 'old_price', 'date', 'add_date', 'loc_user_id', 'shop_id'])

In [22]:
inters = inters[inters['item_id'] != '/*']
inters = inters[inters['user_id'] != '/*']

In [23]:
inters[inters.user_id == 'e4362420-f6ca-11ec-86e0-002590c0647c'].head(500)

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
379,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:26,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
383,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:41,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,15.0,0.0,0.0,0.0
387,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:51,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,10.0,0.0,0.0,0.0
388,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:52,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
404,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,3450082821,2024-09-07 00:01:37,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,25.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295889,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:04,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295906,e4362420-f6ca-11ec-86e0-002590c0647c,5,6990711,252042197,2024-09-27 06:14:21,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295912,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:25,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6847200,e4362420-f6ca-11ec-86e0-002590c0647c,9,5422963,4263215769,2024-09-07 00:01:12,2247,30744 37270 46732 29313 52878 60036 63016,5,-5700.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [24]:
inters.head()

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,-1,89919732,2024-09-07 05:11:00,0,,5,0.0,141,...,0.0,27.0,2.0,1.0,1.0,0.0,24.0,0.0,0.0,1.0
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,-1,851020872,2024-09-07 05:50:01,0,,5,0.0,91,...,0.0,25.0,0.0,2.0,5.0,1.0,6.0,0.0,0.0,1.0
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,-1,1515288183,2024-09-07 07:37:02,0,,5,0.0,85,...,0.0,14.0,0.0,2.0,6.0,0.0,8.0,0.0,0.0,1.0
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,-1,1497267364,2024-09-07 11:52:12,0,,5,0.0,8270,...,44.0,694.0,1.0,125.0,1729.0,112.0,3.0,0.0,0.0,0.0
4,9c087566-8065-11ed-86e0-002590c0647c,0,-1,3039170152,2024-09-07 12:14:30,0,,5,0.0,182,...,1.0,26.0,0.0,3.0,8.0,0.0,9.0,0.0,0.0,0.0


#### Тренировка модели для распознования ключевого события

In [25]:
inters[((inters.weight == 1) | (inters.weight == 4)) & (inters.user_id == '0038911e-79a3-11ef-9b7b-002590c82436')].sort_values(by=['user_id', 'pageId', 'item_id'])

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3522281,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1015778690,2024-09-25 18:09:12,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,23166.0,0.0,7.0,-34.0
3337690,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,125129932,2024-09-24 18:18:04,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,2.0,0.0,0.0,0.0
3138513,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1281889168,2024-09-23 18:17:15,0,,0,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,149.0,0.0,0.0,3.0
3338009,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:22,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,0.0,0.0,0.0,0.0
3338048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:40,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,18.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3539012,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:20,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,1.0,0.0,0.0,0.0
3539032,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:25,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,4.0,0.0,0.0,0.0
3539048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:31,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,6.0,0.0,0.0,0.0
3539063,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:34,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,3.0,0.0,0.0,0.0


In [26]:
max_date = inters['datetime'].max()
train_raw = inters[inters['datetime'] >= max_date - pd.Timedelta(days=22)].copy()
train_raw = train_raw[train_raw['datetime'] <= max_date - pd.Timedelta(days=0)]
test_raw = inters[inters['datetime'] > max_date - pd.Timedelta(days=3)].copy()

In [27]:
train = train_raw.drop(columns=['datetime', 'item_id', 'pageId'])
test = test_raw.drop(columns=['datetime', 'item_id', 'pageId'])

In [28]:
test.sample()

Unnamed: 0,user_id,weight,category_id,title,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
6338038,c35f7204-0864-11ee-a1c4-002590c82437,5,0,,4,0.0,115,115,0.0,77.0,...,0.0,17.0,2.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0


In [29]:
train.shape, test.shape

((6858963, 22), (1062417, 22))

### Классы не сбалансированы

In [30]:
train.weight.value_counts()

weight
1    4214660
2    1041842
5    1040483
8     356922
3      81142
7      50120
6      34672
4      26951
9      11802
0        369
Name: count, dtype: int64

### Кооректируем баланс классов

In [31]:
train = train[train.weight.isin([0, 4, 3, 6, 7, 9])].drop_duplicates()
train.shape

(154138, 22)

In [32]:
train_portion = pd.concat(
    [
        train[train.weight.isin([0, 1, 3, 6, 7, 9])],
        train_raw.sample(154138)],
    axis='rows'
)

In [33]:
train_portion.weight.unique()

array([0, 3, 6, 7, 9, 5, 1, 2, 8, 4])

In [34]:
h2o.init(verbose=False)
print(f'{h2o.cluster().show_status()}')
h2o.no_progress()

0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_varsey_9q0qpv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.758 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


None


In [35]:
test.shape

(1062417, 22)

In [36]:
train_h2o = h2o.H2OFrame(train_portion)
test_h2o = h2o.H2OFrame(test.sample(40_000))

In [37]:
y = "weight"
x = set(train_h2o.names) - set([y, 'user_id', 'shop_id', 'day', 'hour', 'pageId', 'datetime', 'item_id', 'is_useful', 'title'])
x

{'action_0_count',
 'action_1_count',
 'action_2_count',
 'action_3_count',
 'action_4_count',
 'action_5_count',
 'action_6_count',
 'action_7_count',
 'action_8_count',
 'action_9_count',
 'category_id',
 'category_id_count',
 'day_of_week',
 'delta_day',
 'delta_hour',
 'delta_min',
 'delta_sec',
 'loc_user_count',
 'price_diff'}

In [38]:
train_h2o[y] = train_h2o[y].asfactor()
test_h2o[y] = test_h2o[y].asfactor()

In [39]:
train_h2o['category_id'] = train_h2o['category_id'].asfactor()
test_h2o['category_id'] = test_h2o['category_id'].asfactor()

### для снижения уровня рутины и фокусировки на цели - качесвтенных предсказаниях - используем automl

In [40]:
gbm_model = H2OGradientBoostingEstimator(seed=1234)
gbm_model.train(x=list(x), y=y, training_frame = train_h2o,  validation_frame = test_h2o, max_runtime_secs=120)

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,500.0,247495.0,5.0,5.0,5.0,10.0,32.0,31.064

0,1,2,3,4,5,6,7,8,9,Error,Rate
117.0,207.0,0.0,41.0,0.0,0.0,3.0,1.0,0.0,0.0,0.6829268,252 / 369
39.0,85078.0,304.0,8578.0,0.0,0.0,689.0,24.0,0.0,0.0,0.1017189,"9,634 / 94,712"
3.0,19260.0,403.0,3805.0,0.0,0.0,58.0,3.0,0.0,0.0,0.9828744,"23,129 / 23,532"
3.0,5284.0,0.0,30966.0,0.0,0.0,44.0,2.0,0.0,0.0,0.1469186,"5,333 / 36,299"
0.0,40.0,0.0,3.0,30.0,18.0,70.0,387.0,41.0,20.0,0.9507389,579 / 609
0.0,1765.0,0.0,274.0,0.0,14511.0,1540.0,4459.0,172.0,436.0,0.3733644,"8,646 / 23,157"
0.0,3124.0,0.0,282.0,0.0,286.0,27041.0,3583.0,148.0,120.0,0.2181066,"7,543 / 34,584"
2.0,2542.0,0.0,255.0,1.0,361.0,5411.0,38958.0,438.0,2210.0,0.223604,"11,220 / 50,178"
0.0,481.0,0.0,80.0,1.0,270.0,570.0,4931.0,1360.0,375.0,0.8314328,"6,708 / 8,068"
0.0,471.0,0.0,88.0,0.0,78.0,860.0,5830.0,74.0,3539.0,0.6765082,"7,401 / 10,940"

k,hit_ratio
1,0.7151865
2,0.9138922
3,0.9709115
4,0.9898035
5,0.9967074
6,0.9988672
7,0.9996886
8,0.9999683
9,1.0000001
10,1.0000001

0,1,2,3,4,5,6,7,8,9,Error,Rate
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 1
9.0,22380.0,88.0,1923.0,0.0,0.0,558.0,2.0,0.0,0.0,0.1033654,"2,580 / 24,960"
0.0,5102.0,108.0,909.0,0.0,0.0,62.0,3.0,0.0,0.0,0.9825356,"6,076 / 6,184"
0.0,54.0,0.0,361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1301205,54 / 415
0.0,1.0,0.0,1.0,0.0,3.0,9.0,72.0,11.0,0.0,1.0,97 / 97
0.0,717.0,0.0,87.0,0.0,3780.0,517.0,892.0,38.0,81.0,0.3815445,"2,332 / 6,112"
0.0,83.0,0.0,3.0,0.0,1.0,226.0,19.0,0.0,1.0,0.3213213,107 / 333
0.0,5.0,0.0,0.0,0.0,2.0,35.0,187.0,2.0,6.0,0.2109705,50 / 237
0.0,70.0,0.0,18.0,2.0,51.0,150.0,1016.0,249.0,43.0,0.8442777,"1,350 / 1,599"
0.0,2.0,0.0,0.0,0.0,2.0,8.0,29.0,0.0,21.0,0.6612903,41 / 62

k,hit_ratio
1,0.682825
2,0.8997
3,0.9675
4,0.988575
5,0.995775
6,0.9984251
7,0.9996251
8,0.9999751
9,1.0000001
10,1.0000001

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2024-10-13 00:32:51,0.260 sec,0.0,0.9,2.3025851,0.8123902,,,0.9,2.3025851,0.754525,,
,2024-10-13 00:32:54,2.933 sec,1.0,0.8426293,1.8781714,0.3164795,,,0.8409215,1.8726936,0.348725,,
,2024-10-13 00:32:55,3.827 sec,2.0,0.7977634,1.653275,0.3151093,,,0.7955815,1.6529437,0.3462,,
,2024-10-13 00:32:55,4.297 sec,3.0,0.7589843,1.4960444,0.3147765,,,0.7567727,1.5009332,0.34515,,
,2024-10-13 00:32:59,8.477 sec,13.0,0.5696154,0.9333646,0.3061803,,,0.5754589,0.9706975,0.332175,,
,2024-10-13 00:33:04,13.084 sec,24.0,0.5183363,0.7925869,0.2983416,,,0.529269,0.8384825,0.328475,,
,2024-10-13 00:33:09,18.201 sec,36.0,0.5012005,0.7432872,0.2926663,,,0.5143144,0.7932226,0.321675,,
,2024-10-13 00:33:14,23.693 sec,49.0,0.4917572,0.7178684,0.2850047,,,0.5072622,0.7740365,0.317625,,
,2024-10-13 00:33:15,24.654 sec,50.0,0.4914098,0.7169531,0.2848135,,,0.5070537,0.7735766,0.317175,,

variable,relative_importance,scaled_importance,percentage
category_id,211527.765625,1.0,0.4334366
action_6_count,77646.328125,0.3670739,0.1591033
action_3_count,55861.6054688,0.2640864,0.1144647
action_7_count,50677.4921875,0.2395784,0.1038421
action_1_count,22496.15625,0.1063508,0.0460963
delta_min,19445.125,0.0919271,0.0398446
delta_sec,11367.8076172,0.0537414,0.0232935
action_8_count,9555.7275391,0.0451748,0.0195804
action_9_count,9454.0019531,0.0446939,0.019372
loc_user_count,8091.7602539,0.0382539,0.0165806


In [41]:
# gbm_model.model_performance(test_h2o)

In [42]:
gbm_model.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,category_id,211527.765625,1.0,0.433437
1,action_6_count,77646.328125,0.367074,0.159103
2,action_3_count,55861.605469,0.264086,0.114465
3,action_7_count,50677.492188,0.239578,0.103842
4,action_1_count,22496.15625,0.106351,0.046096
5,delta_min,19445.125,0.091927,0.039845
6,delta_sec,11367.807617,0.053741,0.023294
7,action_8_count,9555.727539,0.045175,0.01958
8,action_9_count,9454.001953,0.044694,0.019372
9,loc_user_count,8091.760254,0.038254,0.016581


#### Расчитываем recall для локальных тестов

#### Тестовые данные - три дня и ключевое событие - 9 (заказ)

In [43]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_13_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
test_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
test_purchasers = test_purchasers[
    (test_purchasers['weight'] == 9) 
    & (test_purchasers['datetime'] > last_3_days)
    & (test_purchasers['datetime'] < last_13_days)
]
print(test_purchasers['datetime'].max(), test_purchasers['datetime'].min(),)
test_purchasers = test_purchasers.groupby(['user_id'])['item_id'].apply(processing)
test_purchasers = test_purchasers.reset_index()
test_purchasers.shape

2024-09-28 23:46:12 2024-09-26 00:38:57


(496, 2)

In [379]:
# Предсказания фильтрацией
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(3*24, unit='hours')
pred_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
pred_purchasers = pred_purchasers[
    (pred_purchasers['weight'].isin([8, 1, 0, 4]))
    & (pred_purchasers['datetime'] > last_8_days)
    & (pred_purchasers['datetime'] < last_3_days)
    & (
            (pred_purchasers.action_7_count > 50) 
            | (pred_purchasers.action_8_count > 20) 
            | (pred_purchasers.action_9_count > 1) 
            # & (pred_purchasers.action_6_count > 0) 
            # & (pred_purchasers.action_3_count > 0)
    )
]
pred_purchasers = pred_purchasers.groupby(['user_id'])['item_id'].apply(processing)
pred_purchasers = pred_purchasers.reset_index()
pred_purchasers.shape

(1883, 2)

In [380]:
# Проверяем сколько пользователей из теста удалось "зацепить" фильтрацией
pred_purchasers[pred_purchasers.user_id.isin(test_purchasers.user_id.unique())].shape

(341, 2)

In [381]:
check_merge = test_purchasers.merge(pred_purchasers[-3000:], on='user_id', how='left')
check_merge.item_id_y.loc[check_merge.item_id_y.isnull()] = check_merge.item_id_y.loc[check_merge.item_id_y.isnull()].apply(lambda x: [])
check_merge.shape

(496, 3)

In [382]:
check_merge.sample(3)

Unnamed: 0,user_id,item_id_x,item_id_y
382,c0c46e2a-4225-11ec-86e0-002590c0647c,[4923178],[]
62,1956c7b0-7c37-11ef-9b7b-002590c82436,[1027036],[]
52,152d12bc-4c9d-11ea-86e0-002590c0647c,"[5154484, 6688812]","[6688812, 5474224, 6821835, 5154484, 4986538, ..."


In [383]:
#### Проверяем полноту фильтрации

In [384]:
check_merge['recall'] = check_merge.apply(calculate_recall, axis=1)
check_merge['recall'].mean().round(4)

0.3428

#### Используем классификатор чтобы предсказать пользователей с целевыми действиями

In [52]:
inters['datetime'].min(), inters['datetime'].max()

(Timestamp('2024-09-07 00:00:04'), Timestamp('2024-09-28 23:59:59'))

In [53]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
pred_data = inters[(inters['datetime'] > last_8_days) & (inters['datetime'] < last_3_days)]

In [54]:
t = h2o.H2OFrame(pred_data[list(x)])

In [55]:
t['category_id'] = t['category_id'].asfactor()

In [56]:
preds = gbm_model.predict(t)

#### Обьединяем предсказания с исходными данными, чтобы фильтровать дальше

In [118]:
res = pd.concat(
    [
        pred_data[list(x) + ['weight', 'user_id', 'item_id', 'datetime']].reset_index(drop=True),
        preds.as_data_frame().reset_index(drop=True)
    ],
    axis='columns',
)

In [119]:
# Максимальный скор с которым предсказали целевое действие
res['score'] = res[res.columns[-10:]].max(axis=1)

In [120]:
# Смотрим корреляцию по данным чтобы понять по чему фильтровать
# res[list(set(res.columns) - set(['user_id']))].sample(1000).corr()

In [385]:
mask = (
    (res.predict.isin([9, 0, 4]) & (res.score > 0.0))
    # |
    # (res.predict.isin([8]) & (res.score > 0.75))
)

In [386]:
# Оцениваем размер таблицы после фильрации
res[
    mask
].groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))[:25]).reset_index().shape

(1957, 2)

In [387]:
# Оцениваем количество пользователей, которое удалось "зацепить" фильтрацией из теста
res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().shape

(261, 2)

In [388]:
ml_users = res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().user_id.to_list()

In [389]:
print(len(pred_purchasers.user_id.unique()))
print(len(ml_users))
print(len(set(ml_users).intersection(set(pred_purchasers.user_id.unique()))))

1883
261
150


In [390]:
# Генерим таблицу с кандидатами по результатам мл - предсказания
ml_pred = res[
    mask
].sort_values(by=['weight'], ascending=False).groupby('user_id')['item_id'].agg(
    lambda x: [y for y in (set(map(str, x))) if y != '-1'][:25]
).reset_index()
ml_pred.shape

(1957, 2)

In [391]:
# Напоминание о размере таблицы с кандидатами после обычной фильтрации - оставшееся от 3000 строк место мы заполнили результатами мл
pred_purchasers.shape

(1883, 2)

In [392]:
data_to_check = pd.concat(
    [
        ml_pred.reset_index(drop=True),
        pred_purchasers.reset_index(drop=True),
    ],
    axis='rows'
)
print(data_to_check.shape)
data_to_check = data_to_check.drop_duplicates(subset='user_id', keep='last') # .reset_index(drop=True)
print(data_to_check[data_to_check.user_id.isin(test_purchasers.user_id.unique())].shape)
data_to_check.shape

(3840, 2)
(452, 2)


(3410, 2)

In [393]:
# Ограничим резульат 3000 строк (как по условию). Так как данные обычной фильтрации эффективнее и они в конце обьединенных данных - ограничим с конца
data_to_check[-3000:].shape

(3000, 2)

#### Проверяем результат локально на данных фильтрация + мл

In [394]:
# check_merge_ml = test_purchasers.merge(data_to_check.sample(3000, random_state=42), on='user_id', how='left')
check_merge_ml = test_purchasers.merge(data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4144

### Построение рекомендаций 
дополним данные после фильрации товарами на основе "оценок" пользователей (ценность действия интерпретируем как оценку)

#### Построение модели

In [71]:
K_RECOS = 25
NUM_THREADS = 24
RANDOM_STATE = 32
ITERATIONS = 25

In [72]:
def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [73]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
recos_data = inters[
    (inters['datetime'] > last_8_days)
    & (inters['datetime'] < last_3_days) 
    # & ((inters['weight'] > 3))   # дейсвтия ниже 5 не привязаны к товару, чтобы не рекомендовать -1 и сэкономить на размере входных данных - фильтруем
]

In [74]:
items = _catalog.rename(columns={'product_id': 'item_id'})

items = items.loc[items['item_id'].isin(recos_data['item_id'])].copy()

item_feature = items[["item_id", "category_id"]].explode("category_id")
item_feature.columns = ["id", "value"]
item_feature["feature"] = "category_id"
item_features = pd.concat((item_feature,))

In [75]:
item_features.sample(3)

Unnamed: 0,id,value,feature
35392,4959463,7081,category_id
78610,6627159,1702,category_id
81579,6742515,8945,category_id


In [76]:
recos_data.weight.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [395]:
recs_dataset = Dataset.construct(
    interactions_df=recos_data[['user_id', 'item_id', 'weight', 'datetime']],
    item_features_df=item_features,
    cat_item_features=["category_id"],
)

model = make_base_model(factors=256, regularization=0.2, alpha=100)
model.fit(recs_dataset)

100%|██████████| 1/1 [01:38<00:00, 98.35s/it]


<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7d8ce8367610>

In [396]:
recos = model.recommend(
    users=data_to_check[-3000:].user_id,
    dataset=recs_dataset,
    k=22,
    filter_viewed=False,
)

In [397]:
recos = recos[recos.item_id > -1]

#### Дополняем данные с предыдущих предсказаний (фильтрация + ml) 

In [398]:
recos_agg = recos.groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))).reset_index()

In [399]:
recos_agg_selected_uid = recos_agg[
    recos_agg.user_id.isin(
        data_to_check[-3000:].user_id
    )
]

In [400]:
recos_agg_selected_uid.shape

(3000, 2)

In [401]:
# Объединяем рекомендации с предсказанями фильтрацией и мл
enriched_data_to_check = data_to_check[-3000:].merge(recos_agg_selected_uid, on='user_id', how='left')

In [402]:
enriched_data_to_check.columns

Index(['user_id', 'item_id_x', 'item_id_y'], dtype='object')

In [403]:
# Заполняем nan пустыми списками для корректного расчета метрики
enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()] = enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()].apply(lambda x: [])

In [404]:
# Обьединяем исходное предсказание рекомендациями по rectools - сначала исходные айди, потом рекоммендации (порядок важен, тк далее оставляем только 25 элементов списка)
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id_x'] + enriched_data_to_check['item_id_y']

In [405]:
enriched_data_to_check['products_count'] = enriched_data_to_check['item_id'].apply(len)
enriched_data_to_check['products_count'].mean()

26.130666666666666

In [406]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id'].apply(lambda x: list(set(x[:25])))

In [407]:
enriched_data_to_check['products_count'] = enriched_data_to_check['item_id'].apply(len)
enriched_data_to_check['products_count'].mean()

21.409

In [408]:
enriched_data_to_check = enriched_data_to_check[['user_id', 'item_id']]

In [409]:
enriched_data_to_check.drop_duplicates(subset=['user_id']).shape

(3000, 2)

In [410]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4174

### Сохраняем результаты

In [411]:
enriched_data_to_check.rename(columns={'item_id': 'products'}).reset_index(drop=True).to_csv("r_final.csv", index=False)

In [412]:
pd.read_csv('r_final.csv').shape

(3000, 2)

In [775]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

catalog['desc'] = catalog['category_id'].astype(str) + ' ' + catalog['title']
catalog = catalog.drop_duplicates(subset='product_id').reset_index(drop=True)
catalog = catalog.dropna(subset=['desc'])
catalog.shape

(89483, 8)

In [776]:
documents = catalog.desc.dropna().to_list()
print(len(documents))
vectorizer = TfidfVectorizer(lowercase=False)
tfidf_matrix = vectorizer.fit_transform(documents).toarray().astype('float32')

89483


In [527]:
index = create_index_hnsw(tfidf_matrix)

In [529]:
# dict(zip(vectorizer.vocabulary_, vectorizer.idf_)) 

In [780]:
query_vector = vectorizer.transform(['230 38721 36072 46732 26240 32815',])

N = 1
_, indices = index.search(query_vector.toarray().reshape(1, -1), 4) 
for indx in range(N):
    match = catalog.iloc[indices[0][indx]].to_dict()
    print(match['product_id'], ' ', match['desc'])
    

5920453   230 38721 36072 46732 26240 32815


In [786]:
def get_similar(x):
    res = []
    for y in x[:4]:
        cand = catalog[catalog.product_id == y]
        if cand.shape[0] > 0:
            to_match = cand.desc.values[0]
            query_vector = vectorizer.transform([to_match, ])
            _, indices = index.search(query_vector.toarray().reshape(1, -1), 2)
            match1 = catalog.iloc[indices[0][0]].to_dict()
            match2 = catalog.iloc[indices[0][1]].to_dict()
            res.append(match1['product_id'])
            res.append(match2['product_id'])
    return res

In [787]:
enriched_data_to_check_w_similar = enriched_data_to_check.copy()

In [788]:
enriched_data_to_check_w_similar['items_sim']  = enriched_data_to_check_w_similar['item_id'].apply(lambda x: get_similar(x))

In [789]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check_w_similar['item_id'] = enriched_data_to_check_w_similar['item_id'] + enriched_data_to_check_w_similar['items_sim'].apply(lambda x: list(set(x)))

In [790]:
enriched_data_to_check_w_similar['item_id'] = enriched_data_to_check_w_similar['item_id'].apply(lambda x: sorted(list(set(x[:25])), reverse=True))

In [791]:
enriched_data_to_check_w_similar['products_count'] = enriched_data_to_check_w_similar['item_id'].apply(len)
enriched_data_to_check_w_similar['products_count'].mean()

23.474

In [792]:
enriched_data_to_check_w_similar.sample(5)

Unnamed: 0,user_id,item_id,items_sim,products_count
1207,6ce9e6ac-4383-11ee-86e0-002590c0647c,"[787768, 734339, 6606440, 6438154, 6438151, 64...","[787768, 4964878, 1334129, 1335152, 6652005, 6...",25
166,15492920-4c9f-11ed-8644-002590c82437,"[718659, 718652, 6886686, 6588035, 6422179, 64...","[718652, 718659, 6886686, 1041541, 6588035, 59...",24
1978,a95cc5ca-025f-11ed-bf62-002590c82437,"[5880976, 5880442, 5418100, 5313529, 5313490, ...","[5880976, 5880442, 5418100, 5399602, 5313490, ...",23
2459,d162bc0e-31b3-11e8-a0d7-002590e45c38,"[6655476, 6627677, 6627675, 6481510, 6481492, ...","[6655476, 6110206, 6627675, 6627677, 6481492, ...",24
2157,b8ae1e90-5b84-11ef-86e0-002590c0647c,"[6534112, 6534088, 6534079, 6534028, 6534001, ...","[6533962, 6534088, 6533962, 6534088, 6533944, ...",21


In [793]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check_w_similar[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 5)


0.0857

In [176]:
# Не забываем выключить кластер h2o на локальной машине
h2o.cluster().shutdown()