In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [3]:
import os
import threadpoolctl
import warnings

from rectools.models import ImplicitALSWrapperModel
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares

warnings.filterwarnings('ignore')

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x780acb85a510>

In [4]:
from src.utils import processing, calculate_recall

In [5]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

Unnamed: 0,add_date,shop_id,product_id,category_id,price,title,old_price
0,2024-09-30 03:07:09,350,1025536,8922,6990,6048 48557 44342 53515 17614,6990
1,2024-09-19 03:07:08,350,1025614,10600,330,42548 40244 23485 10853 6545 2343,330
2,2024-09-30 03:07:09,350,1025656,231,13240,14034 42002 48557 56089 46836 55620 50785,26490
3,2024-09-30 03:07:09,350,1025659,14546,7550,32640 50164 48557 25314 23255 42146,12590
4,2024-09-30 03:07:09,350,1025668,1949,1290,51789 48557 56089 20149,1290


In [6]:
actions = pd.read_parquet(f'{Path.cwd().parent}/data/train_actions.pq', engine='pyarrow')
actions.head()

Unnamed: 0,user_id,loc_user_id,action,date,products,pageId
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,[],2571824865
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,[],3834364438
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,[],2448628415
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,[],3875013967
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,[],3025531174


#### Разворачиваем данные по айдишникам продуктов

In [9]:
_actions = (
    actions
    .copy()
    .explode('products')
    .assign(products=lambda x: x.products.fillna(-1).astype(np.int64))
)
print(_actions.shape)

_catalog = (
    catalog
    .copy()
    .assign(product_id=lambda x: x.product_id.astype(np.int64))
)

(6859002, 6)


#### Разворачиваем данные по айдишникам продуктов

In [10]:
_actions['action_datetime'] = pd.to_datetime(_actions['date'])

_actions.action = _actions.action + 10
_actions.action = _actions.action.replace({14: 0, 13: 4, 18: 2, 19: 3, 10: 5, 17: 1, 11: 6, 12: 7, 16: 8, 15: 9})
_actions = _actions.sort_values(by=['action', 'date'])
print(_actions.shape)

(6859002, 7)


#### Новая шкала action в train_actions.pq в порядке возрастания важности
    0 - clearB (удаление всех товаров из корзины)
    1 - visit (посещение страницы с товаром)
    2 - visitCategory (посещение страницы с группой товаров)
    3 - search (поиск товара)           
    4 - delB (удаление товара из корзины)
    5 - view (просмотр товара)
    6 - like (лайк товара)
    7 - addB (добавление товара в корзину)
    8 - listB (посещение страницы корзины и вывод списка товаров в корзине)
    9 - order (оформление заказа)
    

#### Переименовываем поля и обьеденияем датасеты каталога и дейсвтвий пользователя    
'products' теперь 'item_id', 'action'  превратился в 'weight' (мы перевели код действия в шкалу по возрастанию ранее)

In [11]:
interactions = _actions.rename(columns={'products': 'item_id', 'action' : 'weight', 'action_datetime': 'datetime'})

In [12]:
items = _catalog.rename(columns={'product_id': 'item_id'})

In [13]:
interactions_merge = interactions.merge(items, on='item_id', how='left')
interactions_merge.shape

(6859002, 13)

In [14]:
interactions_merge['day'] = interactions_merge['datetime'].dt.day
interactions_merge['day_of_week'] = interactions_merge['datetime'].dt.dayofweek  # Monday=0, Sunday=6

interactions_merge['hour'] = interactions_merge['datetime'].dt.hour
interactions_merge['minute'] = interactions_merge['datetime'].dt.minute

In [15]:
interactions_merge['price_diff'] = interactions_merge['price'] - interactions_merge['old_price']

interactions_merge['category_id'] = interactions_merge['category_id'].fillna(0)
interactions_merge['price_diff'] = interactions_merge['price_diff'].fillna(0)

interactions_merge['category_id'] = interactions_merge['category_id'].astype(np.int32)

In [16]:
inters = interactions_merge.merge(
    interactions_merge.groupby('user_id')['loc_user_id'].count().reset_index(name='loc_user_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 19)


In [17]:
inters = inters.merge(
    interactions_merge.groupby('user_id')['category_id'].count().reset_index(name='category_id_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 20)


In [18]:
for action in sorted(interactions_merge.weight.unique()):
    inters = inters.merge(
        interactions_merge[interactions_merge.weight  == action].groupby('user_id')['weight'].count().reset_index(name=f'action_{action}_count'),
        on='user_id',
        how='left'
    )
    inters[f'action_{action}_count'] = inters[f'action_{action}_count'].fillna(0)
print(inters.shape)

(6859002, 30)


In [19]:
inters['delta_sec'] = inters.sort_values(by=['datetime']).groupby('user_id')['datetime'].diff()
inters['delta_sec'] = inters['delta_sec'].dt.total_seconds()
inters['delta_sec'] = inters['delta_sec'].fillna(0)

In [20]:
inters['delta_day'] = inters.sort_values(by=['datetime']).groupby('user_id')['day'].diff()
inters['delta_day'] = inters['delta_day'].fillna(0)

inters['delta_hour'] = inters.sort_values(by=['datetime']).groupby('user_id')['hour'].diff()
inters['delta_hour'] = inters['delta_hour'].fillna(0)

inters['delta_min'] = inters.sort_values(by=['datetime']).groupby('user_id')['minute'].diff()
inters['delta_min'] = inters['delta_min'].fillna(0)

In [21]:
inters = inters.drop(columns=['day', 'hour', 'minute'])

In [22]:
inters = inters.drop(columns=['price', 'title', 'old_price', 'date', 'add_date', 'loc_user_id', 'shop_id'])

In [55]:
inters = inters[inters['item_id'] != '/*']
inters = inters[inters['user_id'] != '/*']

In [56]:
inters[inters.user_id == 'e4362420-f6ca-11ec-86e0-002590c0647c'].head(500)

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
379,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:26,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
383,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:41,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,15.0,0.0,0.0,0.0
387,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:51,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,10.0,0.0,0.0,0.0
388,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:52,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
404,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,3450082821,2024-09-07 00:01:37,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,25.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295889,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:04,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295906,e4362420-f6ca-11ec-86e0-002590c0647c,5,6990711,252042197,2024-09-27 06:14:21,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295912,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:25,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6847200,e4362420-f6ca-11ec-86e0-002590c0647c,9,5422963,4263215769,2024-09-07 00:01:12,2247,5,-5700.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [57]:
inters.head()

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,-1,89919732,2024-09-07 05:11:00,0,5,0.0,141,141,...,0.0,27.0,2.0,1.0,1.0,0.0,24.0,0.0,0.0,1.0
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,-1,851020872,2024-09-07 05:50:01,0,5,0.0,91,91,...,0.0,25.0,0.0,2.0,5.0,1.0,6.0,0.0,0.0,1.0
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,-1,1515288183,2024-09-07 07:37:02,0,5,0.0,85,85,...,0.0,14.0,0.0,2.0,6.0,0.0,8.0,0.0,0.0,1.0
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,-1,1497267364,2024-09-07 11:52:12,0,5,0.0,8270,8270,...,44.0,694.0,1.0,125.0,1729.0,112.0,3.0,0.0,0.0,0.0
4,9c087566-8065-11ed-86e0-002590c0647c,0,-1,3039170152,2024-09-07 12:14:30,0,5,0.0,182,182,...,1.0,26.0,0.0,3.0,8.0,0.0,9.0,0.0,0.0,0.0


#### Тренировка модели для распознования ключевого события

In [62]:
inters[((inters.weight == 1) | (inters.weight == 4)) & (inters.user_id == '0038911e-79a3-11ef-9b7b-002590c82436')].sort_values(by=['user_id', 'pageId', 'item_id'])

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3522281,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1015778690,2024-09-25 18:09:12,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,23166.0,0.0,7.0,-34.0
3337690,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,125129932,2024-09-24 18:18:04,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,2.0,0.0,0.0,0.0
3138513,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1281889168,2024-09-23 18:17:15,0,0,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,149.0,0.0,0.0,3.0
3338009,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:22,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,0.0,0.0,0.0,0.0
3338048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:40,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,18.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3539012,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:20,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,1.0,0.0,0.0,0.0
3539032,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:25,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,4.0,0.0,0.0,0.0
3539048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:31,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,6.0,0.0,0.0,0.0
3539063,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:34,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,3.0,0.0,0.0,0.0


In [63]:
max_date = inters['datetime'].max()
train_raw = inters[inters['datetime'] >= max_date - pd.Timedelta(days=22)].copy()
train_raw = train_raw[train_raw['datetime'] <= max_date - pd.Timedelta(days=3)]
test_raw = inters[inters['datetime'] > max_date - pd.Timedelta(days=3)].copy()

In [64]:
train = train_raw.drop(columns=['datetime', 'item_id', 'pageId'])
test = test_raw.drop(columns=['datetime', 'item_id', 'pageId'])

In [65]:
test.sample()

Unnamed: 0,user_id,weight,category_id,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,action_2_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3818553,b570ea84-1433-11ef-9b7b-002590c82436,1,0,4,0.0,132,132,0.0,80.0,39.0,...,0.0,13.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,1.0


In [66]:
train.shape, test.shape

((5796546, 21), (1062417, 21))

### Классы не сбалансированы

In [67]:
train.weight.value_counts()

weight
1    3554985
2     876035
5     875680
8     314936
3      69354
7      44249
6      26346
4      24150
9      10478
0        333
Name: count, dtype: int64

### Кооректируем баланс классов

In [68]:
train = train[train.weight.isin([0, 1, 3, 6, 7, 9])].drop_duplicates()
train.shape

(2539422, 21)

In [69]:
train_portion = pd.concat(
    [
        train[train.weight.isin([0, 1, 3, 6, 7, 9])],
        train_raw.sample(130_000)],
    axis='rows'
)

In [70]:
train_portion.weight.unique()

array([0, 1, 3, 6, 7, 9, 2, 5, 8, 4])

In [71]:
h2o.init(verbose=False)
print(f'{h2o.cluster().show_status()}')
h2o.no_progress()

0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 12 days
H2O_cluster_name:,H2O_from_python_varsey_hn8wp8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.62 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


None


In [72]:
train_portion[train_portion.weight == 0]

Unnamed: 0,user_id,weight,category_id,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,action_2_count,...,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min,item_id,pageId,datetime
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,0,5,0.0,141,141,1.0,101.0,8.0,...,1.0,1.0,0.0,24.0,0.0,0.0,1.0,,,NaT
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,0,5,0.0,91,91,1.0,47.0,9.0,...,2.0,5.0,1.0,6.0,0.0,0.0,1.0,,,NaT
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,0,5,0.0,85,85,2.0,48.0,13.0,...,2.0,6.0,0.0,8.0,0.0,0.0,1.0,,,NaT
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,0,5,0.0,8270,8270,2.0,5244.0,250.0,...,125.0,1729.0,112.0,3.0,0.0,0.0,0.0,,,NaT
4,9c087566-8065-11ed-86e0-002590c0647c,0,0,5,0.0,182,182,1.0,118.0,25.0,...,3.0,8.0,0.0,9.0,0.0,0.0,0.0,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,4b43b8cc-6c03-11ee-b1da-002590c82437,0,0,2,0.0,132,132,1.0,97.0,19.0,...,2.0,8.0,0.0,7.0,0.0,0.0,0.0,-1.0,1688766018,2024-09-11 10:48:08
99,e0ad5878-7143-11ef-9b7b-002590c82436,0,0,3,0.0,68,68,1.0,41.0,7.0,...,1.0,2.0,0.0,7.0,0.0,0.0,0.0,-1.0,1035106084,2024-09-12 20:25:27
285,bc95ac9c-1423-11ef-86e0-002590c0647c,0,0,0,0.0,6001,6001,6.0,3545.0,152.0,...,114.0,1524.0,89.0,2.0,0.0,0.0,0.0,-1.0,232532357,2024-09-23 07:50:13
266,fc8415f0-dc89-11ed-86e0-002590c0647c,0,0,6,0.0,37,37,1.0,18.0,2.0,...,0.0,6.0,0.0,13.0,0.0,0.0,0.0,-1.0,125463581,2024-09-22 10:37:17


In [73]:
train_h2o = h2o.H2OFrame(train_portion)
test_h2o = h2o.H2OFrame(test.sample(40_000))

In [74]:
y = "weight"
x = set(train_h2o.names) - set([y, 'user_id', 'shop_id', 'day', 'hour', 'pageId', 'datetime', 'item_id', 'is_useful'])
x

{'action_0_count',
 'action_1_count',
 'action_2_count',
 'action_3_count',
 'action_4_count',
 'action_5_count',
 'action_6_count',
 'action_7_count',
 'action_8_count',
 'action_9_count',
 'category_id',
 'category_id_count',
 'day_of_week',
 'delta_day',
 'delta_hour',
 'delta_min',
 'delta_sec',
 'loc_user_count',
 'price_diff'}

In [75]:
train_h2o[y] = train_h2o[y].asfactor()
test_h2o[y] = test_h2o[y].asfactor()

In [76]:
train_h2o['category_id'] = train_h2o['category_id'].asfactor()
test_h2o['category_id'] = test_h2o['category_id'].asfactor()

### для снижения уровня рутины и фокусировки на цели - качесвтенных предсказаниях - используем automl

In [77]:
gbm_model = H2OGradientBoostingEstimator(seed=1234)
gbm_model.train(x=list(x), y=y, training_frame = train_h2o,  validation_frame = test_h2o, max_runtime_secs=120)

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,48.0,480.0,252775.0,5.0,5.0,5.0,10.0,32.0,31.28125

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,335.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,335 / 335
0.0,2511708.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.6e-06,"9 / 2,511,717"
0.0,19579.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999489,"19,579 / 19,580"
0.0,31268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"31,268 / 31,268"
0.0,46.0,0.0,0.0,14.0,13.0,64.0,365.0,45.0,10.0,0.9748654,543 / 557
0.0,1634.0,0.0,1.0,0.0,13318.0,1171.0,2949.0,106.0,216.0,0.3133282,"6,077 / 19,395"
0.0,2472.0,0.0,0.0,1.0,471.0,19906.0,3138.0,129.0,105.0,0.2408664,"6,316 / 26,222"
0.0,2600.0,0.0,2.0,2.0,426.0,4041.0,34601.0,336.0,1773.0,0.20968,"9,180 / 43,781"
0.0,510.0,0.0,0.0,0.0,242.0,744.0,4259.0,1106.0,284.0,0.8452064,"6,039 / 7,145"
0.0,518.0,0.0,0.0,0.0,41.0,994.0,5045.0,60.0,2764.0,0.706644,"6,658 / 9,422"

k,hit_ratio
1,0.9677818
2,0.9922815
3,0.9974055
4,0.9990414
5,0.9996692
6,0.9998858
7,0.9999645
8,0.9999918
9,1.0000001
10,1.0000001

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1 / 1
0.0,24655.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.06e-05,"1 / 24,656"
0.0,6263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"6,263 / 6,263"
0.0,482.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,482 / 482
0.0,6.0,0.0,0.0,0.0,2.0,22.0,61.0,9.0,1.0,1.0,101 / 101
0.0,893.0,0.0,0.0,1.0,4091.0,456.0,787.0,23.0,55.0,0.3512528,"2,215 / 6,306"
0.0,95.0,0.0,0.0,0.0,1.0,179.0,21.0,0.0,2.0,0.3993289,119 / 298
0.0,12.0,0.0,0.0,0.0,3.0,42.0,195.0,1.0,7.0,0.25,65 / 260
0.0,72.0,0.0,0.0,0.0,68.0,275.0,937.0,181.0,54.0,0.8859483,"1,406 / 1,587"
0.0,4.0,0.0,0.0,0.0,0.0,5.0,28.0,0.0,9.0,0.8043478,37 / 46

k,hit_ratio
1,0.73275
2,0.899725
3,0.96705
4,0.987375
5,0.994925
6,0.99785
7,0.9994
8,0.99995
9,1.0
10,1.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2024-10-12 10:09:28,0.055 sec,0.0,0.9000000,2.3025851,0.1146319,,,0.9000000,2.3025851,0.425925,,
,2024-10-12 10:09:33,4.100 sec,1.0,0.7782773,1.5150371,0.0343943,,,0.8155189,1.7522178,0.2917,,
,2024-10-12 10:09:39,10.690 sec,3.0,0.6093403,0.9582738,0.0342486,,,0.7062928,1.3831565,0.29195,,
,2024-10-12 10:09:44,15.861 sec,5.0,0.4777831,0.6706354,0.0340235,,,0.6312687,1.2146701,0.290575,,
,2024-10-12 10:09:50,21.454 sec,7.0,0.3779972,0.4932895,0.0339579,,,0.5820973,1.1257001,0.290425,,
,2024-10-12 10:09:55,26.313 sec,9.0,0.3061070,0.3773530,0.0339381,,,0.5516711,1.0796152,0.2901,,
,2024-10-12 10:10:00,31.267 sec,11.0,0.2573241,0.3000391,0.0338841,,,0.5346229,1.0649881,0.289475,,
,2024-10-12 10:10:05,36.227 sec,13.0,0.2258601,0.2474892,0.0338365,,,0.5247218,1.0633209,0.289225,,
,2024-10-12 10:10:10,41.167 sec,15.0,0.2065492,0.2116750,0.0337912,,,0.5193614,1.0715536,0.288375,,
,2024-10-12 10:10:14,46.037 sec,17.0,0.1949069,0.1869481,0.0336994,,,0.5162294,1.0820638,0.287475,,

variable,relative_importance,scaled_importance,percentage
category_id,477982.21875,1.0,0.736269
action_6_count,52158.5429688,0.1091224,0.0803434
action_7_count,41741.1132812,0.0873278,0.0642967
action_3_count,15199.3730469,0.031799,0.0234126
delta_min,14706.1748047,0.0307672,0.0226529
action_1_count,9858.9677734,0.0206262,0.0151864
action_8_count,9232.3203125,0.0193152,0.0142212
delta_sec,8876.0048828,0.0185697,0.0136723
action_9_count,8277.3144531,0.0173172,0.0127501
loc_user_count,5855.4848633,0.0122504,0.0090196


In [79]:
gbm_model.model_performance(test_h2o)

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1 / 1
0.0,24655.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.06e-05,"1 / 24,656"
0.0,6263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"6,263 / 6,263"
0.0,482.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,482 / 482
0.0,6.0,0.0,0.0,0.0,2.0,22.0,61.0,9.0,1.0,1.0,101 / 101
0.0,893.0,0.0,0.0,1.0,4091.0,456.0,787.0,23.0,55.0,0.3512528,"2,215 / 6,306"
0.0,95.0,0.0,0.0,0.0,1.0,179.0,21.0,0.0,2.0,0.3993289,119 / 298
0.0,12.0,0.0,0.0,0.0,3.0,42.0,195.0,1.0,7.0,0.25,65 / 260
0.0,72.0,0.0,0.0,0.0,68.0,275.0,937.0,181.0,54.0,0.8859483,"1,406 / 1,587"
0.0,4.0,0.0,0.0,0.0,0.0,5.0,28.0,0.0,9.0,0.8043478,37 / 46

k,hit_ratio
1,0.73275
2,0.899725
3,0.96705
4,0.987375
5,0.994925
6,0.99785
7,0.9994
8,0.99995
9,1.0
10,1.0


In [96]:
gbm_model.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,category_id,477982.21875,1.0,0.736269
1,action_6_count,52158.542969,0.109122,0.080343
2,action_7_count,41741.113281,0.087328,0.064297
3,action_3_count,15199.373047,0.031799,0.023413
4,delta_min,14706.174805,0.030767,0.022653
5,action_1_count,9858.967773,0.020626,0.015186
6,action_8_count,9232.320312,0.019315,0.014221
7,delta_sec,8876.004883,0.01857,0.013672
8,action_9_count,8277.314453,0.017317,0.01275
9,loc_user_count,5855.484863,0.01225,0.00902


#### Расчитываем recall для локальных тестов

#### Тестовые данные - три дня и ключевое событие - 9 (заказ)

In [80]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
test_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
test_purchasers = test_purchasers[(test_purchasers['weight'] == 9) & (test_purchasers['datetime'] > last_3_days)]
test_purchasers = test_purchasers.groupby(['user_id'])['item_id'].apply(processing)
test_purchasers = test_purchasers.reset_index()
test_purchasers.shape

(496, 2)

In [94]:
inters.columns

Index(['user_id', 'weight', 'item_id', 'pageId', 'datetime', 'category_id',
       'day_of_week', 'price_diff', 'loc_user_count', 'category_id_count',
       'action_0_count', 'action_1_count', 'action_2_count', 'action_3_count',
       'action_4_count', 'action_5_count', 'action_6_count', 'action_7_count',
       'action_8_count', 'action_9_count', 'delta_sec', 'delta_day',
       'delta_hour', 'delta_min'],
      dtype='object')

#### Предсказания фильтрацией

In [813]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(5*24, unit='hours')
pred_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
pred_purchasers = pred_purchasers[
    (pred_purchasers['weight'] > 7)
    & (pred_purchasers['datetime'] > last_8_days)
    & (pred_purchasers['datetime'] < last_3_days)
    & (
            (pred_purchasers.action_7_count > 50) 
            | (pred_purchasers.action_8_count > 20) 
            | (pred_purchasers.action_9_count > 0) 
            # & (pred_purchasers.action_6_count > 500) 
            # & (pred_purchasers.action_3_count > 500)
    )
]
pred_purchasers = pred_purchasers.groupby(['user_id'])['item_id'].apply(processing)
pred_purchasers = pred_purchasers.reset_index()
pred_purchasers.shape

(1762, 2)

#### Проверяем сколько пользователей из теста удалось "зацепить" фильтрацией

In [814]:
pred_purchasers[pred_purchasers.user_id.isin(test_purchasers.user_id.unique())].shape

(496, 2)

In [815]:
check_merge = test_purchasers.merge(pred_purchasers[-3000:], on='user_id', how='left')
check_merge.item_id_y.loc[check_merge.item_id_y.isnull()] = check_merge.item_id_y.loc[check_merge.item_id_y.isnull()].apply(lambda x: [])
check_merge.shape

(496, 3)

In [816]:
check_merge.sample(1)

Unnamed: 0,user_id,item_id_x,item_id_y
456,e842b176-a0a9-11ed-a1c4-002590c82437,[4030080],"[6718122, 4030080, 6166801, 6228709]"


In [817]:
#### Проверяем полноту фильтрации

In [818]:
check_merge['recall'] = check_merge.apply(calculate_recall, axis=1)
check_merge['recall'].mean().round(4)

0.4984

#### Используем классификатор чтобы предсказать пользователей с целевыми дейсвтиями

In [792]:
inters['datetime'].min(), inters['datetime'].max()

(Timestamp('2024-09-07 00:00:04'), Timestamp('2024-09-28 23:59:59'))

In [794]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
pred_data = inters[(inters['datetime'] > last_8_days) & (inters['datetime'] < last_3_days)]

In [795]:
t = h2o.H2OFrame(pred_data[list(x)])

In [796]:
preds = gbm_model.predict(t)

#### Обьедтняем предсказания с исходными данными чтобы фильровать дальше

In [797]:
res = pd.concat(
    [
        pred_data[list(x) + ['weight', 'user_id', 'item_id', 'datetime']].reset_index(drop=True),
        preds.as_data_frame().reset_index(drop=True)
    ],
    axis='columns',
)

In [798]:
# Максимальный скор с которым предсказали целевое действие
res['score'] = res[res.columns[-10:]].max(axis=1)
        

In [799]:
# Смотрим корреляцию по данным чтобы понять по чему фильтровать
# res[list(set(res.columns) - set(['user_id']))].sample(1000).corr()

In [800]:
mask = (
    (res.predict.isin([9]) & (res.score > 0))
    # |
    # (res.predict.isin([8]) & (res.score > 0.85))
)

In [801]:
# Оцениваем размер таблицы после фильрации
res[
    mask
].groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))[:25]).reset_index().shape

(1814, 2)

In [802]:
# Оцениваем количество пользователей, которое удалось "зацепить" фильтрацией из теста
res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().shape

(263, 2)

In [803]:
ml_users = res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().user_id.to_list()

In [804]:
print(len(pred_purchasers.user_id.unique()))
print(len(ml_users))
print(len(set(ml_users).intersection(set(pred_purchasers.user_id.unique()))))


1669
263
263


In [805]:
# Генерим таблицу с кандидатами по результатам мл - предсказания
ml_pred = res[
    mask
].sort_values(by=['weight'], ascending=False).groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index()
ml_pred.shape

(1814, 2)

In [819]:
# Напоминание о размере таблицы с кандидатами после обычной фильтрации - оставшееся от 3000 строк место мы заполнили результатами мл
pred_purchasers.shape

(1762, 2)

In [820]:
data_to_check = pd.concat(
    [
        ml_pred.reset_index(drop=True),
        pred_purchasers.reset_index(drop=True),
    ],
    axis='rows'
)
print(data_to_check.shape)
data_to_check = data_to_check.drop_duplicates(subset='user_id', keep='last') # .reset_index(drop=True)
print(data_to_check[data_to_check.user_id.isin(test_purchasers.user_id.unique())].shape)
data_to_check.shape

(3576, 2)
(496, 2)


(3082, 2)

In [821]:
# Ограничим резульат 3000 строк (как по условию). Так как данные обычной фильтрации эффективнее и они в конце обьединенных данных - ограничим с конца
data_to_check[-3000:].shape

(3000, 2)

#### Проверяем результат локально на данных фильтрация + мл

In [822]:
# check_merge_ml = test_purchasers.merge(data_to_check.sample(3000, random_state=42), on='user_id', how='left')
check_merge_ml = test_purchasers.merge(data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4984

### Построение рекомендаций 
дополним данные после фильрации товарами на основе "оценок" пользователей (ценность действия интерпретируем как оценку)

#### Построение модели

In [823]:
K_RECOS = 25
NUM_THREADS = 24
RANDOM_STATE = 32
ITERATIONS = 25

In [824]:
def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [825]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
recos_data = inters[
    (inters['datetime'] > last_8_days)
    & (inters['datetime'] < last_3_days) 
    & ((inters['weight'] > 3))   # дейсвтия ниже 5 не привязаны к товару, чтобы не рекомендовать -1 и сэкономить на размере входных данных - фильтруем
]

In [826]:
recos_data.weight.unique()

array([4, 5, 6, 7, 8, 9])

In [827]:
recs_dataset = Dataset.construct(
    interactions_df=recos_data[['user_id', 'item_id', 'weight', 'datetime']],
)

model = make_base_model(factors=256, regularization=0.2, alpha=100)
model.fit(recs_dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x78040d436cd0>

In [828]:
recos = model.recommend(
    users=data_to_check[-3000:].user_id,
    dataset=recs_dataset,
    k=20,
    filter_viewed=False,
)

In [829]:
recos[recos.item_id == -1]

Unnamed: 0,user_id,item_id,score,rank


#### Дополняем данные с предыдущих предсказаний (фильтрация + ml) 

In [830]:
recos_agg = recos.groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))).reset_index()

In [831]:
recos_agg.head()

Unnamed: 0,user_id,item_id
0,0038911e-79a3-11ef-9b7b-002590c82436,"[3914454, 4795967, 1758351, 1788591, 1628891, ..."
1,00a8fe40-024d-11ef-9b7b-002590c82436,"[6873585, 6876534, 6166786, 6549415, 6456766, ..."
2,00d23b34-6e90-11ef-9b7b-002590c82436,"[6415369, 6415366, 5040520, 5379376, 5214061, ..."
3,00d6623e-7086-11ef-86e0-002590c0647c,"[3845038, 3709801, 5278573, 1551965, 3907086, ..."
4,00efb9f8-0d3c-11ef-9b7b-002590c82436,"[5479060, 5478679, 5478931, 5478691, 4005144, ..."


In [832]:
recos_agg_selected_uid = recos_agg[
    recos_agg.user_id.isin(
        data_to_check[-3000:].user_id
    )
]

In [833]:
recos_agg_selected_uid.shape

(3000, 2)

In [834]:
# Объединяем рекомендации с предсказанями фильтрацией и мл
enriched_data_to_check = data_to_check[-3000:].merge(recos_agg_selected_uid, on='user_id', how='left')

In [835]:
enriched_data_to_check.columns

Index(['user_id', 'item_id_x', 'item_id_y'], dtype='object')

In [836]:
# Заполняем nan пустыми списками для корректного расчета метрики
enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()] = enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()].apply(lambda x: [])

In [837]:
# Обьединяем исходное предсказание рекомендациями по rectools  - сначала исходные айди, потом рекоммендации (порядок важен, тк далее оставляем только 25 элементов списка)
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id_x'] + enriched_data_to_check['item_id_y']

In [838]:
enriched_data_to_check.sample(5)

Unnamed: 0,user_id,item_id_x,item_id_y,item_id
1885,5c359f4a-8b1f-11ec-a6e9-002590c82437,"[6136903, 6137164, 4909771, 6336265, 6336391, ...","[6385027, 1778379, 635062, 7098138, 6477535, 6...","[6136903, 6137164, 4909771, 6336265, 6336391, ..."
415,5c233acc-accf-11ec-86e0-002590c0647c,"[449427, 2985292, 3862459, 6462064, 2719234, 4...","[1779921, 6556312, 5396179, 6676452, 6337345, ...","[449427, 2985292, 3862459, 6462064, 2719234, 4..."
2519,b711475a-61b3-11e8-8a53-0cc47a6d2fef,"[4999228, 5941346]","[4999228, 5941346, 6682428, 5144350, 5045935, ...","[4999228, 5941346, 4999228, 5941346, 6682428, ..."
2366,a03e2814-558a-11ef-9b7b-002590c82436,"[5162599, 5162536, 5162530, 7059768, 359597, 6...","[5162599, 5162536, 6841395, 5162530, 7059768, ...","[5162599, 5162536, 5162530, 7059768, 359597, 6..."
1034,d765e83c-48f1-11ee-86e0-002590c0647c,"[5036299, 5403910, 5404438, 1899898, 4064091, ...","[5404438, 1899898, 1779660, 5403910, 5404444, ...","[5036299, 5403910, 5404438, 1899898, 4064091, ..."


In [839]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id'].apply(lambda x: list(set(x[:25])))

In [840]:
enriched_data_to_check = enriched_data_to_check[['user_id', 'item_id']]

In [841]:
enriched_data_to_check.drop_duplicates(subset=['user_id']).shape

(3000, 2)

In [842]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4984

### Сохраняем результаты

In [843]:
enriched_data_to_check.rename(columns={'item_id': 'products'}).reset_index().to_csv("r_final.csv", index=False)

In [844]:
pd.read_csv('r_final.csv').shape

(3000, 3)

In [176]:
# Не забываем выключить кластер h2o на локальной машине
h2o.cluster().shutdown()