In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [3]:
import os
import threadpoolctl
import warnings

from rectools.models import ImplicitALSWrapperModel
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares

warnings.filterwarnings('ignore')

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x780acb85a510>

In [4]:
from src.utils import processing, calculate_recall

In [5]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

Unnamed: 0,add_date,shop_id,product_id,category_id,price,title,old_price
0,2024-09-30 03:07:09,350,1025536,8922,6990,6048 48557 44342 53515 17614,6990
1,2024-09-19 03:07:08,350,1025614,10600,330,42548 40244 23485 10853 6545 2343,330
2,2024-09-30 03:07:09,350,1025656,231,13240,14034 42002 48557 56089 46836 55620 50785,26490
3,2024-09-30 03:07:09,350,1025659,14546,7550,32640 50164 48557 25314 23255 42146,12590
4,2024-09-30 03:07:09,350,1025668,1949,1290,51789 48557 56089 20149,1290


In [6]:
actions = pd.read_parquet(f'{Path.cwd().parent}/data/train_actions.pq', engine='pyarrow')
actions.head()

Unnamed: 0,user_id,loc_user_id,action,date,products,pageId
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,[],2571824865
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,[],3834364438
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,[],2448628415
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,[],3875013967
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,[],3025531174


#### Разворачиваем данные по айдишникам продуктов

In [9]:
_actions = (
    actions
    .copy()
    .explode('products')
    .assign(products=lambda x: x.products.fillna(-1).astype(np.int64))
)
print(_actions.shape)

_catalog = (
    catalog
    .copy()
    .assign(product_id=lambda x: x.product_id.astype(np.int64))
)

(6859002, 6)


#### Разворачиваем данные по айдишникам продуктов

In [10]:
_actions['action_datetime'] = pd.to_datetime(_actions['date'])

_actions.action = _actions.action + 10
_actions.action = _actions.action.replace({14: 0, 13: 4, 18: 2, 19: 3, 10: 5, 17: 1, 11: 6, 12: 7, 16: 8, 15: 9})
_actions = _actions.sort_values(by=['action', 'date'])
print(_actions.shape)

(6859002, 7)


#### Новая шкала action в train_actions.pq в порядке возрастания важности
    0 - clearB (удаление всех товаров из корзины)
    1 - visit (посещение страницы с товаром)
    2 - visitCategory (посещение страницы с группой товаров)
    3 - search (поиск товара)           
    4 - delB (удаление товара из корзины)
    5 - view (просмотр товара)
    6 - like (лайк товара)
    7 - addB (добавление товара в корзину)
    8 - listB (посещение страницы корзины и вывод списка товаров в корзине)
    9 - order (оформление заказа)
    

#### Переименовываем поля и обьеденияем датасеты каталога и дейсвтвий пользователя    
'products' теперь 'item_id', 'action'  превратился в 'weight' (мы перевели код действия в шкалу по возрастанию ранее)

In [11]:
interactions = _actions.rename(columns={'products': 'item_id', 'action' : 'weight', 'action_datetime': 'datetime'})

In [12]:
items = _catalog.rename(columns={'product_id': 'item_id'})

In [13]:
interactions_merge = interactions.merge(items, on='item_id', how='left')
interactions_merge.shape

(6859002, 13)

In [14]:
interactions_merge['day'] = interactions_merge['datetime'].dt.day
interactions_merge['day_of_week'] = interactions_merge['datetime'].dt.dayofweek  # Monday=0, Sunday=6

interactions_merge['hour'] = interactions_merge['datetime'].dt.hour
interactions_merge['minute'] = interactions_merge['datetime'].dt.minute

In [15]:
interactions_merge['price_diff'] = interactions_merge['price'] - interactions_merge['old_price']

interactions_merge['category_id'] = interactions_merge['category_id'].fillna(0)
interactions_merge['price_diff'] = interactions_merge['price_diff'].fillna(0)

interactions_merge['category_id'] = interactions_merge['category_id'].astype(np.int32)

In [16]:
inters = interactions_merge.merge(
    interactions_merge.groupby('user_id')['loc_user_id'].count().reset_index(name='loc_user_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 19)


In [17]:
inters = inters.merge(
    interactions_merge.groupby('user_id')['category_id'].count().reset_index(name='category_id_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 20)


In [18]:
for action in sorted(interactions_merge.weight.unique()):
    inters = inters.merge(
        interactions_merge[interactions_merge.weight  == action].groupby('user_id')['weight'].count().reset_index(name=f'action_{action}_count'),
        on='user_id',
        how='left'
    )
    inters[f'action_{action}_count'] = inters[f'action_{action}_count'].fillna(0)
print(inters.shape)

(6859002, 30)


In [19]:
inters['delta_sec'] = inters.sort_values(by=['datetime']).groupby('user_id')['datetime'].diff()
inters['delta_sec'] = inters['delta_sec'].dt.total_seconds()
inters['delta_sec'] = inters['delta_sec'].fillna(0)

In [20]:
inters['delta_day'] = inters.sort_values(by=['datetime']).groupby('user_id')['day'].diff()
inters['delta_day'] = inters['delta_day'].fillna(0)

inters['delta_hour'] = inters.sort_values(by=['datetime']).groupby('user_id')['hour'].diff()
inters['delta_hour'] = inters['delta_hour'].fillna(0)

inters['delta_min'] = inters.sort_values(by=['datetime']).groupby('user_id')['minute'].diff()
inters['delta_min'] = inters['delta_min'].fillna(0)

In [21]:
inters = inters.drop(columns=['day', 'hour', 'minute'])

In [22]:
inters = inters.drop(columns=['price', 'title', 'old_price', 'date', 'add_date', 'loc_user_id', 'shop_id'])

In [55]:
inters = inters[inters['item_id'] != '/*']
inters = inters[inters['user_id'] != '/*']

In [56]:
inters[inters.user_id == 'e4362420-f6ca-11ec-86e0-002590c0647c'].head(500)

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
379,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:26,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
383,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:41,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,15.0,0.0,0.0,0.0
387,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:51,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,10.0,0.0,0.0,0.0
388,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:52,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
404,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,3450082821,2024-09-07 00:01:37,0,5,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,25.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295889,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:04,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295906,e4362420-f6ca-11ec-86e0-002590c0647c,5,6990711,252042197,2024-09-27 06:14:21,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295912,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:25,50448,4,0.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6847200,e4362420-f6ca-11ec-86e0-002590c0647c,9,5422963,4263215769,2024-09-07 00:01:12,2247,5,-5700.0,384,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [57]:
inters.head()

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,-1,89919732,2024-09-07 05:11:00,0,5,0.0,141,141,...,0.0,27.0,2.0,1.0,1.0,0.0,24.0,0.0,0.0,1.0
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,-1,851020872,2024-09-07 05:50:01,0,5,0.0,91,91,...,0.0,25.0,0.0,2.0,5.0,1.0,6.0,0.0,0.0,1.0
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,-1,1515288183,2024-09-07 07:37:02,0,5,0.0,85,85,...,0.0,14.0,0.0,2.0,6.0,0.0,8.0,0.0,0.0,1.0
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,-1,1497267364,2024-09-07 11:52:12,0,5,0.0,8270,8270,...,44.0,694.0,1.0,125.0,1729.0,112.0,3.0,0.0,0.0,0.0
4,9c087566-8065-11ed-86e0-002590c0647c,0,-1,3039170152,2024-09-07 12:14:30,0,5,0.0,182,182,...,1.0,26.0,0.0,3.0,8.0,0.0,9.0,0.0,0.0,0.0


#### Тренировка модели для распознования ключевого события

In [62]:
inters[((inters.weight == 1) | (inters.weight == 4)) & (inters.user_id == '0038911e-79a3-11ef-9b7b-002590c82436')].sort_values(by=['user_id', 'pageId', 'item_id'])

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,day_of_week,price_diff,loc_user_count,category_id_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3522281,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1015778690,2024-09-25 18:09:12,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,23166.0,0.0,7.0,-34.0
3337690,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,125129932,2024-09-24 18:18:04,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,2.0,0.0,0.0,0.0
3138513,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1281889168,2024-09-23 18:17:15,0,0,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,149.0,0.0,0.0,3.0
3338009,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:22,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,0.0,0.0,0.0,0.0
3338048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:40,0,1,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,18.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3539012,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:20,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,1.0,0.0,0.0,0.0
3539032,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:25,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,4.0,0.0,0.0,0.0
3539048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:31,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,6.0,0.0,0.0,0.0
3539063,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:34,0,2,0.0,155,155,...,3.0,22.0,0.0,2.0,10.0,1.0,3.0,0.0,0.0,0.0


In [1380]:
max_date = inters['datetime'].max()
train_raw = inters[inters['datetime'] >= max_date - pd.Timedelta(days=22)].copy()
train_raw = train_raw[train_raw['datetime'] <= max_date - pd.Timedelta(days=0)]
test_raw = inters[inters['datetime'] > max_date - pd.Timedelta(days=3)].copy()

In [1381]:
train = train_raw.drop(columns=['datetime', 'item_id', 'pageId'])
test = test_raw.drop(columns=['datetime', 'item_id', 'pageId'])

In [1382]:
test.sample()

Unnamed: 0,user_id,weight,category_id,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,action_2_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3766403,5c9d7eaa-0110-11ed-86e0-002590c0647c,1,0,4,0.0,296,296,0.0,193.0,29.0,...,1.0,55.0,2.0,3.0,11.0,2.0,12.0,0.0,0.0,0.0


In [1383]:
train.shape, test.shape

((6858963, 21), (1062417, 21))

### Классы не сбалансированы

In [1384]:
train.weight.value_counts()

weight
1    4214660
2    1041842
5    1040483
8     356922
3      81142
7      50120
6      34672
4      26951
9      11802
0        369
Name: count, dtype: int64

### Кооректируем баланс классов

In [1385]:
train = train[train.weight.isin([0, 1, 3, 6, 7, 9])].drop_duplicates()
train.shape

(2987363, 21)

In [1386]:
train_portion = pd.concat(
    [
        train[train.weight.isin([0, 1, 3, 6, 7, 9])],
        train_raw.sample(130_000)],
    axis='rows'
)

In [1387]:
train_portion.weight.unique()

array([0, 1, 3, 6, 7, 9, 5, 2, 4, 8])

In [1388]:
h2o.init(verbose=False)
print(f'{h2o.cluster().show_status()}')
h2o.no_progress()

0,1
H2O_cluster_uptime:,9 hours 4 mins
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_varsey_hn8wp8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10.69 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


None


In [1389]:
train_portion[train_portion.weight == 0]

Unnamed: 0,user_id,weight,category_id,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,action_2_count,...,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min,item_id,pageId,datetime
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,0,5,0.0,141,141,1.0,101.0,8.0,...,1.0,1.0,0.0,24.0,0.0,0.0,1.0,,,NaT
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,0,5,0.0,91,91,1.0,47.0,9.0,...,2.0,5.0,1.0,6.0,0.0,0.0,1.0,,,NaT
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,0,5,0.0,85,85,2.0,48.0,13.0,...,2.0,6.0,0.0,8.0,0.0,0.0,1.0,,,NaT
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,0,5,0.0,8270,8270,2.0,5244.0,250.0,...,125.0,1729.0,112.0,3.0,0.0,0.0,0.0,,,NaT
4,9c087566-8065-11ed-86e0-002590c0647c,0,0,5,0.0,182,182,1.0,118.0,25.0,...,3.0,8.0,0.0,9.0,0.0,0.0,0.0,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,1ded931a-15d2-11ef-9b7b-002590c82436,0,0,5,0.0,48,48,1.0,33.0,11.0,...,0.0,2.0,0.0,10.0,0.0,0.0,0.0,-1.0,3519148408,2024-09-14 12:12:56
175,860596d0-44c3-11ef-9b7b-002590c82436,0,0,0,0.0,2882,2882,5.0,1101.0,110.0,...,158.0,804.0,1.0,5.0,0.0,0.0,0.0,-1.0,4207310809,2024-09-16 20:28:39
367,a7c0cfae-43f8-11ef-9b7b-002590c82436,0,0,5,0.0,112,112,1.0,64.0,6.0,...,4.0,12.0,3.0,9.0,0.0,0.0,0.0,-1.0,2293099223,2024-09-28 17:51:53
89,f8c9fe8e-3298-11ed-9003-002590c82437,0,0,3,0.0,192,192,1.0,122.0,31.0,...,5.0,9.0,0.0,8.0,0.0,0.0,0.0,-1.0,1888667810,2024-09-12 12:11:58


In [1390]:
train_h2o = h2o.H2OFrame(train_portion)
test_h2o = h2o.H2OFrame(test.sample(40_000))

In [1391]:
y = "weight"
x = set(train_h2o.names) - set([y, 'user_id', 'shop_id', 'day', 'hour', 'pageId', 'datetime', 'item_id', 'is_useful'])
x

{'action_0_count',
 'action_1_count',
 'action_2_count',
 'action_3_count',
 'action_4_count',
 'action_5_count',
 'action_6_count',
 'action_7_count',
 'action_8_count',
 'action_9_count',
 'category_id',
 'category_id_count',
 'day_of_week',
 'delta_day',
 'delta_hour',
 'delta_min',
 'delta_sec',
 'loc_user_count',
 'price_diff'}

In [1392]:
train_h2o[y] = train_h2o[y].asfactor()
test_h2o[y] = test_h2o[y].asfactor()

In [1393]:
train_h2o['category_id'] = train_h2o['category_id'].asfactor()
test_h2o['category_id'] = test_h2o['category_id'].asfactor()

### для снижения уровня рутины и фокусировки на цели - качесвтенных предсказаниях - используем automl

In [1394]:
gbm_model = H2OGradientBoostingEstimator(seed=1234)
gbm_model.train(x=list(x), y=y, training_frame = train_h2o,  validation_frame = test_h2o, max_runtime_secs=120)

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,42.0,420.0,208164.0,5.0,5.0,5.0,10.0,32.0,31.359524

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,373.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,373 / 373
0.0,2940396.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 2,940,396"
0.0,19835.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"19,835 / 19,835"
0.0,35958.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"35,958 / 35,958"
0.0,29.0,0.0,0.0,7.0,15.0,65.0,339.0,19.0,15.0,0.9856851,482 / 489
0.0,1738.0,0.0,0.0,0.0,13026.0,1406.0,3137.0,86.0,232.0,0.3362548,"6,599 / 19,625"
0.0,4736.0,0.0,0.0,0.0,439.0,25185.0,3518.0,88.0,98.0,0.2606564,"8,879 / 34,064"
0.0,2884.0,0.0,2.0,0.0,416.0,5047.0,38921.0,276.0,1875.0,0.2124603,"10,500 / 49,421"
0.0,477.0,0.0,0.0,0.0,247.0,762.0,4071.0,845.0,202.0,0.8720472,"5,759 / 6,604"
0.0,568.0,0.0,0.0,0.0,33.0,1146.0,5899.0,56.0,2896.0,0.7267409,"7,702 / 10,598"

k,hit_ratio
1,0.9691768
2,0.9928648
3,0.9976223
4,0.9991233
5,0.9997026
6,0.9998938
7,0.9999686
8,0.9999965
9,0.9999997
10,1.0

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0 / 0
0.0,24859.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 24,859"
0.0,6225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"6,225 / 6,225"
0.0,458.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,458 / 458
0.0,7.0,0.0,0.0,0.0,6.0,20.0,69.0,12.0,2.0,1.0,116 / 116
0.0,809.0,0.0,0.0,0.0,3964.0,557.0,800.0,8.0,54.0,0.3598191,"2,228 / 6,192"
0.0,90.0,0.0,0.0,0.0,3.0,173.0,22.0,1.0,1.0,0.4034483,117 / 290
0.0,18.0,0.0,0.0,0.0,3.0,25.0,158.0,0.0,9.0,0.258216,55 / 213
0.0,74.0,0.0,1.0,1.0,45.0,238.0,1046.0,156.0,53.0,0.9033457,"1,458 / 1,614"
0.0,3.0,0.0,0.0,0.0,0.0,6.0,12.0,0.0,12.0,0.6363636,21 / 33

k,hit_ratio
1,0.73305
2,0.89545
3,0.964075
4,0.9861
5,0.994325
6,0.99775
7,0.999575
8,0.999975
9,0.9999999
10,0.9999999

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2024-10-12 19:13:56,0.022 sec,0.0,0.9000000,2.3025851,0.1121935,,,0.9000000,2.3025851,0.4193,,
,2024-10-12 19:13:59,3.068 sec,1.0,0.7777930,1.5123892,0.0325127,,,0.8152746,1.7511145,0.28855,,
,2024-10-12 19:14:05,8.892 sec,3.0,0.6081732,0.9544436,0.0323828,,,0.7062021,1.3861254,0.289175,,
,2024-10-12 19:14:10,14.698 sec,5.0,0.4759335,0.6661723,0.0322590,,,0.6312387,1.2194139,0.2889,,
,2024-10-12 19:14:16,20.492 sec,7.0,0.3754237,0.4882680,0.0321708,,,0.5819454,1.1301319,0.288375,,
,2024-10-12 19:14:22,26.455 sec,9.0,0.3029938,0.3723115,0.0321284,,,0.5523522,1.0893287,0.288075,,
,2024-10-12 19:14:28,32.345 sec,11.0,0.2535641,0.2945056,0.0320996,,,0.5348281,1.0716679,0.287575,,
,2024-10-12 19:14:34,38.181 sec,13.0,0.2216767,0.2418074,0.0320479,,,0.5250570,1.0711103,0.28675,,
,2024-10-12 19:14:40,43.928 sec,15.0,0.2020427,0.2059869,0.0319972,,,0.5198075,1.0803698,0.286525,,
,2024-10-12 19:14:46,49.855 sec,17.0,0.1903021,0.1810436,0.0319440,,,0.5167675,1.0923106,0.285975,,

variable,relative_importance,scaled_importance,percentage
category_id,537948.0,1.0,0.7336933
action_6_count,65836.421875,0.1223844,0.0897926
action_7_count,46916.2695312,0.0872134,0.0639879
action_3_count,17469.6582031,0.0324746,0.0238264
delta_min,16454.890625,0.0305883,0.0224424
action_1_count,11743.7089844,0.0218306,0.0160169
action_9_count,9727.078125,0.0180818,0.0132665
delta_sec,8700.9570312,0.0161743,0.011867
action_8_count,6658.0922852,0.0123768,0.0090808
loc_user_count,5406.5703125,0.0100504,0.0073739


In [None]:
gbm_model.model_performance(test_h2o)

In [1395]:
gbm_model.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,category_id,537948.0,1.0,0.733693
1,action_6_count,65836.421875,0.122384,0.089793
2,action_7_count,46916.269531,0.087213,0.063988
3,action_3_count,17469.658203,0.032475,0.023826
4,delta_min,16454.890625,0.030588,0.022442
5,action_1_count,11743.708984,0.021831,0.016017
6,action_9_count,9727.078125,0.018082,0.013267
7,delta_sec,8700.957031,0.016174,0.011867
8,action_8_count,6658.092285,0.012377,0.009081
9,loc_user_count,5406.570312,0.01005,0.007374


#### Расчитываем recall для локальных тестов

#### Тестовые данные - три дня и ключевое событие - 9 (заказ)

In [1397]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_13_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
test_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
test_purchasers = test_purchasers[
    (test_purchasers['weight'] == 9) 
    & (test_purchasers['datetime'] > last_3_days)
    & (test_purchasers['datetime'] < last_13_days)
]
print(test_purchasers['datetime'].max(), test_purchasers['datetime'].min(),)
test_purchasers = test_purchasers.groupby(['user_id'])['item_id'].apply(processing)
test_purchasers = test_purchasers.reset_index()
test_purchasers.shape

2024-09-28 23:46:12 2024-09-26 00:38:57


(496, 2)

In [1398]:
inters['datetime'].max() - pd.Timedelta(7, unit='D')

Timestamp('2024-09-21 23:59:59')

In [1399]:
inters.columns

Index(['user_id', 'weight', 'item_id', 'pageId', 'datetime', 'category_id',
       'day_of_week', 'price_diff', 'loc_user_count', 'category_id_count',
       'action_0_count', 'action_1_count', 'action_2_count', 'action_3_count',
       'action_4_count', 'action_5_count', 'action_6_count', 'action_7_count',
       'action_8_count', 'action_9_count', 'delta_sec', 'delta_day',
       'delta_hour', 'delta_min'],
      dtype='object')

#### Предсказания фильтрацией

    

In [1400]:
def processing2(purchasers):
    purchasers = purchasers.map(lambda x: [f'{y}' for y in x] if isinstance(x, list) else [f'{x}'])
    purchasers = list([item for sublist in purchasers for item in sublist if item != '-1'])
    return list(set(purchasers))[:25]

In [1401]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(5*24, unit='hours')
pred_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
pred_purchasers = pred_purchasers[
    (pred_purchasers['weight'].isin([8, 1]))
    & (pred_purchasers['datetime'] > last_8_days)
    & (pred_purchasers['datetime'] < last_3_days)
    & (
            (pred_purchasers.action_7_count > 50) 
            | (pred_purchasers.action_8_count > 20) 
            | (pred_purchasers.action_9_count > 0) 
            # & (pred_purchasers.action_6_count > 0) 
            # & (pred_purchasers.action_3_count > 0)
    )
]
pred_purchasers = pred_purchasers.groupby(['user_id'])['item_id'].apply(processing2)
pred_purchasers = pred_purchasers.reset_index()
pred_purchasers.shape

(3016, 2)

#### Проверяем сколько пользователей из теста удалось "зацепить" фильтрацией

In [1402]:
pred_purchasers[pred_purchasers.user_id.isin(test_purchasers.user_id.unique())].shape

(496, 2)

In [1403]:
check_merge = test_purchasers.merge(pred_purchasers[-3000:], on='user_id', how='left')
check_merge.item_id_y.loc[check_merge.item_id_y.isnull()] = check_merge.item_id_y.loc[check_merge.item_id_y.isnull()].apply(lambda x: [])
check_merge.shape

(496, 3)

In [1404]:
check_merge.sample(100)

Unnamed: 0,user_id,item_id_x,item_id_y
367,b77a0f86-f887-11ed-86e0-002590c0647c,[4004316],[4004316]
485,f7a5ce6e-8275-11ea-8a53-0cc47a6d2fef,"[4064079, 5173426, 5094481, 5379376]","[4064079, 3864580, 5094481, 5114500, 5173483, ..."
469,ed556bf8-e594-11eb-86e0-002590c0647c,"[5450296, 5450620, 5450650, 440399]","[5450650, 5450296, 4973041, 5064721, 5450620, ..."
209,619d2efc-160b-11ef-86e0-002590c0647c,[2786098],"[2786098, 5419213]"
94,2791b7fe-d4da-11ee-9e62-002590c82436,"[1026199, 6164842]","[1026199, 964291, 1854862, 6601346, 5869471, 3..."
...,...,...,...
49,13c1039e-7b18-11ef-86e0-002590c0647c,[6457720],"[6457720, 5011840]"
107,2fa36e2a-3687-11ee-86e0-002590c0647c,"[6548842, 5444338, 1864951, 6548866]","[5182840, 5444338, 6548866, 6548842, 6534124, ..."
492,fd347a70-6e74-11ef-9b7b-002590c82436,[5182849],[5182849]
490,fc19fa7c-7d46-11ee-86e0-002590c0647c,"[2694817, 5135620]","[7059189, 2694817, 5135620, 6926664, 7040796, ..."


In [1405]:
#### Проверяем полноту фильтрации

In [1406]:
check_merge['recall'] = check_merge.apply(calculate_recall, axis=1)
check_merge['recall'].mean().round(4)

0.4964

#### Используем классификатор чтобы предсказать пользователей с целевыми действиями

In [1407]:
inters['datetime'].min(), inters['datetime'].max()

(Timestamp('2024-09-07 00:00:04'), Timestamp('2024-09-28 23:59:59'))

In [1408]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
pred_data = inters[(inters['datetime'] > last_8_days) & (inters['datetime'] < last_3_days)]

In [1409]:
t = h2o.H2OFrame(pred_data[list(x)])

In [1410]:
t['category_id'] = t['category_id'].asfactor()

In [1411]:
preds = gbm_model.predict(t)

#### Обьединяем предсказания с исходными данными, чтобы фильтровать дальше

In [1412]:
res = pd.concat(
    [
        pred_data[list(x) + ['weight', 'user_id', 'item_id', 'datetime']].reset_index(drop=True),
        preds.as_data_frame().reset_index(drop=True)
    ],
    axis='columns',
)

In [1413]:
# Максимальный скор с которым предсказали целевое действие
res['score'] = res[res.columns[-10:]].max(axis=1)

In [1414]:
# Смотрим корреляцию по данным чтобы понять по чему фильтровать
# res[list(set(res.columns) - set(['user_id']))].sample(1000).corr()

In [1415]:
mask = (
    (res.predict.isin([9]) & (res.score > 0))
    # |
    # (res.predict.isin([8]) & (res.score > 0.75))
)

In [1416]:
# Оцениваем размер таблицы после фильрации
res[
    mask
].groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))[:25]).reset_index().shape

(1654, 2)

In [1417]:
# Оцениваем количество пользователей, которое удалось "зацепить" фильтрацией из теста
res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().shape

(236, 2)

In [1418]:
ml_users = res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().user_id.to_list()

In [1419]:
print(len(pred_purchasers.user_id.unique()))
print(len(ml_users))
print(len(set(ml_users).intersection(set(pred_purchasers.user_id.unique()))))

3016
236
236


In [1420]:
# Генерим таблицу с кандидатами по результатам мл - предсказания
ml_pred = res[
    mask
].sort_values(by=['weight'], ascending=False).groupby('user_id')['item_id'].agg(
    lambda x: [y for y in (set(map(str, x))) if y != '-1'][:25]
).reset_index()
ml_pred.shape

(1654, 2)

In [1421]:
# Напоминание о размере таблицы с кандидатами после обычной фильтрации - оставшееся от 3000 строк место мы заполнили результатами мл
pred_purchasers.shape

(3016, 2)

In [1422]:
data_to_check = pd.concat(
    [
        ml_pred.reset_index(drop=True),
        pred_purchasers.reset_index(drop=True),
    ],
    axis='rows'
)
print(data_to_check.shape)
data_to_check = data_to_check.drop_duplicates(subset='user_id', keep='last') # .reset_index(drop=True)
print(data_to_check[data_to_check.user_id.isin(test_purchasers.user_id.unique())].shape)
data_to_check.shape

(4670, 2)
(496, 2)


(3777, 2)

In [1423]:
# Ограничим резульат 3000 строк (как по условию). Так как данные обычной фильтрации эффективнее и они в конце обьединенных данных - ограничим с конца
data_to_check[-3000:].shape

(3000, 2)

#### Проверяем результат локально на данных фильтрация + мл

In [1424]:
# check_merge_ml = test_purchasers.merge(data_to_check.sample(3000, random_state=42), on='user_id', how='left')
check_merge_ml = test_purchasers.merge(data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4964

### Построение рекомендаций 
дополним данные после фильрации товарами на основе "оценок" пользователей (ценность действия интерпретируем как оценку)

#### Построение модели

In [1425]:
K_RECOS = 25
NUM_THREADS = 24
RANDOM_STATE = 32
ITERATIONS = 25

In [1426]:
def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [1427]:
last_3_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
recos_data = inters[
    (inters['datetime'] > last_8_days)
    & (inters['datetime'] < last_3_days) 
    # & ((inters['weight'] > 3))   # дейсвтия ниже 5 не привязаны к товару, чтобы не рекомендовать -1 и сэкономить на размере входных данных - фильтруем
]

In [1429]:
# # Process item features to the form of a flatten dataframe
# items = items.loc[items['item_id'].isin(train['item_id'])].copy()
# 
# genre_feature = items[["item_id", "category_id"]].explode("category_id")
# genre_feature.columns = ["id", "value"]
# genre_feature["feature"] = "category_id"
# item_features = pd.concat((genre_feature,))

In [1430]:
recos_data.weight.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [1431]:
recs_dataset = Dataset.construct(
    interactions_df=recos_data[['user_id', 'item_id', 'weight', 'datetime']],
)

model = make_base_model(factors=256, regularization=0.2, alpha=100)
model.fit(recs_dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7807d79c55d0>

In [1432]:
recos = model.recommend(
    users=data_to_check[-3000:].user_id,
    dataset=recs_dataset,
    k=26,
    filter_viewed=False,
)

In [1433]:
recos = recos[recos.item_id > -1]

#### Дополняем данные с предыдущих предсказаний (фильтрация + ml) 

In [1434]:
recos_agg = recos.groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))).reset_index()

In [1435]:
recos_agg_selected_uid = recos_agg[
    recos_agg.user_id.isin(
        data_to_check[-3000:].user_id
    )
]

In [1436]:
recos_agg_selected_uid.shape

(3000, 2)

In [1437]:
# Объединяем рекомендации с предсказанями фильтрацией и мл
enriched_data_to_check = data_to_check[-3000:].merge(recos_agg_selected_uid, on='user_id', how='left')

In [1438]:
enriched_data_to_check.columns

Index(['user_id', 'item_id_x', 'item_id_y'], dtype='object')

In [1439]:
# Заполняем nan пустыми списками для корректного расчета метрики
enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()] = enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()].apply(lambda x: [])

In [1440]:
# Обьединяем исходное предсказание рекомендациями по rectools  - сначала исходные айди, потом рекоммендации (порядок важен, тк далее оставляем только 25 элементов списка)
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id_x'] + enriched_data_to_check['item_id_y']

In [1441]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id'].apply(lambda x: list(set(x[:25])))

In [1442]:
enriched_data_to_check = enriched_data_to_check[['user_id', 'item_id']]

In [1443]:
enriched_data_to_check.drop_duplicates(subset=['user_id']).shape

(3000, 2)

In [1444]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.4974

### Сохраняем результаты

In [1446]:
enriched_data_to_check.rename(columns={'item_id': 'products'}).reset_index().to_csv("r_final.csv", index=False)

In [1447]:
pd.read_csv('r_final.csv').shape

(3000, 3)

In [176]:
# Не забываем выключить кластер h2o на локальной машине
h2o.cluster().shutdown()