In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

In [7]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [8]:
import os
import threadpoolctl
import warnings

from rectools.models import ImplicitALSWrapperModel
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares

warnings.filterwarnings('ignore')

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x745766f10e10>

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [773]:
from src.utils import processing, calculate_recall, create_index_hnsw 

ImportError: cannot import name 'create_index_hnsw' from 'src.utils' (/home/varsey/PycharmProjects/reshenie-kokos-002/src/utils.py)

In [338]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

Unnamed: 0,add_date,shop_id,product_id,category_id,price,title,old_price
0,2024-09-30 03:07:09,350,1025536,8922,6990,6048 48557 44342 53515 17614,6990
1,2024-09-19 03:07:08,350,1025614,10600,330,42548 40244 23485 10853 6545 2343,330
2,2024-09-30 03:07:09,350,1025656,231,13240,14034 42002 48557 56089 46836 55620 50785,26490
3,2024-09-30 03:07:09,350,1025659,14546,7550,32640 50164 48557 25314 23255 42146,12590
4,2024-09-30 03:07:09,350,1025668,1949,1290,51789 48557 56089 20149,1290


In [12]:
actions = pd.read_parquet(f'{Path.cwd().parent}/data/train_actions.pq', engine='pyarrow')
actions.head()

Unnamed: 0,user_id,loc_user_id,action,date,products,pageId
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,[],2571824865
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,[],3834364438
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,[],2448628415
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,[],3875013967
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,[],3025531174


#### Разворачиваем данные по айдишникам продуктов

In [13]:
_actions = (
    actions
    .copy()
    .explode('products')
    .assign(products=lambda x: x.products.fillna(-1).astype(np.int64))
)
print(_actions.shape)

_catalog = (
    catalog
    .copy()
    .assign(product_id=lambda x: x.product_id.astype(np.int64))
)

(6859002, 6)


#### Разворачиваем данные по айдишникам продуктов

In [14]:
_actions['action_datetime'] = pd.to_datetime(_actions['date'])

_actions.action = _actions.action + 10
_actions.action = _actions.action.replace({14: 0, 13: 4, 18: 2, 19: 3, 10: 5, 17: 1, 11: 6, 12: 7, 16: 8, 15: 9})
_actions = _actions.sort_values(by=['action', 'date'])
print(_actions.shape)

(6859002, 7)


#### Новая шкала action в train_actions.pq в порядке возрастания важности
    0 - clearB (удаление всех товаров из корзины)
    1 - visit (посещение страницы с товаром)
    2 - visitCategory (посещение страницы с группой товаров)
    3 - search (поиск товара)           
    4 - delB (удаление товара из корзины)
    5 - view (просмотр товара)
    6 - like (лайк товара)
    7 - addB (добавление товара в корзину)
    8 - listB (посещение страницы корзины и вывод списка товаров в корзине)
    9 - order (оформление заказа)
    

#### Переименовываем поля и обьеденияем датасеты каталога и дейсвтвий пользователя    
'products' теперь 'item_id', 'action'  превратился в 'weight' (мы перевели код действия в шкалу по возрастанию ранее)

In [15]:
interactions = _actions.rename(columns={'products': 'item_id', 'action' : 'weight', 'action_datetime': 'datetime'})

In [16]:
items = _catalog.rename(columns={'product_id': 'item_id'})

In [17]:
interactions_merge = interactions.merge(items, on='item_id', how='left')
interactions_merge.shape

(6859002, 13)

In [18]:
interactions_merge['day'] = interactions_merge['datetime'].dt.day
interactions_merge['day_of_week'] = interactions_merge['datetime'].dt.dayofweek  # Monday=0, Sunday=6

interactions_merge['hour'] = interactions_merge['datetime'].dt.hour
interactions_merge['minute'] = interactions_merge['datetime'].dt.minute

In [19]:
interactions_merge['price_diff'] = interactions_merge['price'] - interactions_merge['old_price']

interactions_merge['category_id'] = interactions_merge['category_id'].fillna(0)
interactions_merge['price_diff'] = interactions_merge['price_diff'].fillna(0)

interactions_merge['category_id'] = interactions_merge['category_id'].astype(np.int32)

In [20]:
inters = interactions_merge.merge(
    interactions_merge.groupby('user_id')['loc_user_id'].count().reset_index(name='loc_user_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 19)


In [21]:
inters = inters.merge(
    interactions_merge.groupby('user_id')['category_id'].count().reset_index(name='category_id_count'),
    on='user_id',
    how='left'
)
print(inters.shape)

(6859002, 20)


In [22]:
for action in sorted(interactions_merge.weight.unique()):
    inters = inters.merge(
        interactions_merge[interactions_merge.weight  == action].groupby('user_id')['weight'].count().reset_index(name=f'action_{action}_count'),
        on='user_id',
        how='left'
    )
    inters[f'action_{action}_count'] = inters[f'action_{action}_count'].fillna(0)
print(inters.shape)

(6859002, 30)


In [23]:
inters['delta_sec'] = inters.sort_values(by=['datetime']).groupby('user_id')['datetime'].diff()
inters['delta_sec'] = inters['delta_sec'].dt.total_seconds()
inters['delta_sec'] = inters['delta_sec'].fillna(0)

In [24]:
inters['delta_day'] = inters.sort_values(by=['datetime']).groupby('user_id')['day'].diff()
inters['delta_day'] = inters['delta_day'].fillna(0)

inters['delta_hour'] = inters.sort_values(by=['datetime']).groupby('user_id')['hour'].diff()
inters['delta_hour'] = inters['delta_hour'].fillna(0)

inters['delta_min'] = inters.sort_values(by=['datetime']).groupby('user_id')['minute'].diff()
inters['delta_min'] = inters['delta_min'].fillna(0)

In [25]:
inters = inters.drop(columns=['day', 'hour', 'minute'])

In [26]:
inters = inters.drop(columns=['price', 'old_price', 'date', 'add_date', 'loc_user_id', 'shop_id'])

In [27]:
inters = inters[inters['item_id'] != '/*']
inters = inters[inters['user_id'] != '/*']

In [28]:
inters[inters.user_id == 'e4362420-f6ca-11ec-86e0-002590c0647c'].head(500)

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
379,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:26,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
383,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:41,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,15.0,0.0,0.0,0.0
387,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:51,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,10.0,0.0,0.0,0.0
388,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,4263215769,2024-09-07 00:00:52,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
404,e4362420-f6ca-11ec-86e0-002590c0647c,1,-1,3450082821,2024-09-07 00:01:37,0,,5,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,25.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295889,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:04,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295906,e4362420-f6ca-11ec-86e0-002590c0647c,5,6990711,252042197,2024-09-27 06:14:21,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6295912,e4362420-f6ca-11ec-86e0-002590c0647c,5,6770769,252042197,2024-09-27 06:14:25,50448,18203 15838,4,0.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
6847200,e4362420-f6ca-11ec-86e0-002590c0647c,9,5422963,4263215769,2024-09-07 00:01:12,2247,30744 37270 46732 29313 52878 60036 63016,5,-5700.0,384,...,0.0,57.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [29]:
inters.head()

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,-1,89919732,2024-09-07 05:11:00,0,,5,0.0,141,...,0.0,27.0,2.0,1.0,1.0,0.0,24.0,0.0,0.0,1.0
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,-1,851020872,2024-09-07 05:50:01,0,,5,0.0,91,...,0.0,25.0,0.0,2.0,5.0,1.0,6.0,0.0,0.0,1.0
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,-1,1515288183,2024-09-07 07:37:02,0,,5,0.0,85,...,0.0,14.0,0.0,2.0,6.0,0.0,8.0,0.0,0.0,1.0
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,-1,1497267364,2024-09-07 11:52:12,0,,5,0.0,8270,...,44.0,694.0,1.0,125.0,1729.0,112.0,3.0,0.0,0.0,0.0
4,9c087566-8065-11ed-86e0-002590c0647c,0,-1,3039170152,2024-09-07 12:14:30,0,,5,0.0,182,...,1.0,26.0,0.0,3.0,8.0,0.0,9.0,0.0,0.0,0.0


#### Тренировка модели для распознования ключевого события

In [30]:
inters[((inters.weight == 1) | (inters.weight == 4)) & (inters.user_id == '0038911e-79a3-11ef-9b7b-002590c82436')].sort_values(by=['user_id', 'pageId', 'item_id'])

Unnamed: 0,user_id,weight,item_id,pageId,datetime,category_id,title,day_of_week,price_diff,loc_user_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
3522281,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1015778690,2024-09-25 18:09:12,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,23166.0,0.0,7.0,-34.0
3337690,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,125129932,2024-09-24 18:18:04,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,2.0,0.0,0.0,0.0
3138513,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1281889168,2024-09-23 18:17:15,0,,0,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,149.0,0.0,0.0,3.0
3338009,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:22,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,0.0,0.0,0.0,0.0
3338048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,1326997590,2024-09-24 18:20:40,0,,1,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,18.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3539012,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:20,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,1.0,0.0,0.0,0.0
3539032,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:25,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,4.0,0.0,0.0,0.0
3539048,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:31,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,6.0,0.0,0.0,0.0
3539063,0038911e-79a3-11ef-9b7b-002590c82436,1,-1,996145740,2024-09-25 19:43:34,0,,2,0.0,155,...,3.0,22.0,0.0,2.0,10.0,1.0,3.0,0.0,0.0,0.0


In [38]:
max_date = inters['datetime'].max()
train_raw = inters[inters['datetime'] >= max_date - pd.Timedelta(days=22)].copy()
train_raw = train_raw[train_raw['datetime'] <= max_date - pd.Timedelta(days=3)]
test_raw = inters[inters['datetime'] > max_date - pd.Timedelta(days=3)].copy()

In [39]:
train = train_raw.drop(columns=['datetime', 'item_id', 'pageId'])
test = test_raw.drop(columns=['datetime', 'item_id', 'pageId'])

In [40]:
test.sample()

Unnamed: 0,user_id,weight,category_id,title,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,...,action_4_count,action_5_count,action_6_count,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min
4073300,ca963d3e-e8ef-11ee-86e0-002590c0647c,1,0,,5,0.0,973,973,0.0,642.0,...,0.0,68.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0


In [41]:
train.shape, test.shape

((5796546, 22), (1062417, 22))

### Классы не сбалансированы

In [42]:
train.weight.value_counts()

weight
1    3554985
2     876035
5     875680
8     314936
3      69354
7      44249
6      26346
4      24150
9      10478
0        333
Name: count, dtype: int64

### Кооректируем баланс классов

In [43]:
train = train[train.weight.isin([0, 1, 3, 6, 7, 9])].drop_duplicates()
train.shape

(2540506, 22)

In [44]:
train_portion = pd.concat(
    [
        train[train.weight.isin([0, 1, 3, 6, 7, 9])],
        train_raw.sample(130_000)],
    axis='rows'
)

In [45]:
train_portion.weight.unique()

array([0, 1, 3, 6, 7, 9, 2, 5, 8, 4])

In [46]:
h2o.init(verbose=False)
print(f'{h2o.cluster().show_status()}')
h2o.no_progress()

0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_varsey_v7h4ed
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.62 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


None


In [47]:
train_portion[train_portion.weight == 0]

Unnamed: 0,user_id,weight,category_id,title,day_of_week,price_diff,loc_user_count,category_id_count,action_0_count,action_1_count,...,action_7_count,action_8_count,action_9_count,delta_sec,delta_day,delta_hour,delta_min,item_id,pageId,datetime
0,5b08b2fc-0e71-11ee-86e0-002590c0647c,0,0,,5,0.0,141,141,1.0,101.0,...,1.0,1.0,0.0,24.0,0.0,0.0,1.0,,,NaT
1,1af7e04c-6cd1-11ef-9b7b-002590c82436,0,0,,5,0.0,91,91,1.0,47.0,...,2.0,5.0,1.0,6.0,0.0,0.0,1.0,,,NaT
2,d41cdad4-3b4c-11ef-86e0-002590c0647c,0,0,,5,0.0,85,85,2.0,48.0,...,2.0,6.0,0.0,8.0,0.0,0.0,1.0,,,NaT
3,59b9d89a-1de2-11eb-86e0-002590c0647c,0,0,,5,0.0,8270,8270,2.0,5244.0,...,125.0,1729.0,112.0,3.0,0.0,0.0,0.0,,,NaT
4,9c087566-8065-11ed-86e0-002590c0647c,0,0,,5,0.0,182,182,1.0,118.0,...,3.0,8.0,0.0,9.0,0.0,0.0,0.0,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,3200ffea-fbc5-11ed-86e0-002590c0647c,0,0,,4,0.0,343,343,1.0,230.0,...,3.0,12.0,1.0,16.0,0.0,0.0,0.0,-1.0,3505743548,2024-09-20 14:18:24
39,3365d13e-0160-11ef-9b7b-002590c82436,0,0,,1,0.0,3331,3331,11.0,1729.0,...,13.0,23.0,0.0,7.0,0.0,0.0,0.0,-1.0,3155583250,2024-09-10 10:05:21
23,f36c080e-9118-11ed-86e0-002590c0647c,0,0,,0,0.0,235,235,1.0,158.0,...,1.0,5.0,0.0,19.0,0.0,0.0,1.0,-1.0,358330127,2024-09-09 06:56:01
252,52a97bf6-3c82-11ef-86e0-002590c0647c,0,0,,5,0.0,8318,8318,11.0,5866.0,...,136.0,1557.0,86.0,3.0,0.0,0.0,0.0,-1.0,3473946959,2024-09-21 12:44:20


In [48]:
train_h2o = h2o.H2OFrame(train_portion)
test_h2o = h2o.H2OFrame(test.sample(40_000))

In [49]:
y = "weight"
x = set(train_h2o.names) - set([y, 'user_id', 'shop_id', 'day', 'hour', 'pageId', 'datetime', 'item_id', 'is_useful'])
x

{'action_0_count',
 'action_1_count',
 'action_2_count',
 'action_3_count',
 'action_4_count',
 'action_5_count',
 'action_6_count',
 'action_7_count',
 'action_8_count',
 'action_9_count',
 'category_id',
 'category_id_count',
 'day_of_week',
 'delta_day',
 'delta_hour',
 'delta_min',
 'delta_sec',
 'loc_user_count',
 'price_diff',
 'title'}

In [50]:
train_h2o[y] = train_h2o[y].asfactor()
test_h2o[y] = test_h2o[y].asfactor()

In [51]:
train_h2o['category_id'] = train_h2o['category_id'].asfactor()
test_h2o['category_id'] = test_h2o['category_id'].asfactor()

### для снижения уровня рутины и фокусировки на цели - качесвтенных предсказаниях - используем automl

In [52]:
gbm_model = H2OGradientBoostingEstimator(seed=1234)
gbm_model.train(x=list(x), y=y, training_frame = train_h2o,  validation_frame = test_h2o, max_runtime_secs=120)

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,46.0,460.0,249598.0,5.0,5.0,5.0,16.0,32.0,30.173914

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,343.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,343 / 343
0.0,2511926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 2,511,926"
0.0,19600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"19,600 / 19,600"
0.0,31177.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"31,177 / 31,177"
0.0,52.0,0.0,0.0,12.0,11.0,82.0,366.0,52.0,17.0,0.9797297,580 / 592
0.0,1540.0,0.0,0.0,0.0,13442.0,1189.0,2961.0,94.0,249.0,0.3097818,"6,033 / 19,475"
0.0,2461.0,0.0,0.0,0.0,532.0,20176.0,3117.0,108.0,99.0,0.2384403,"6,317 / 26,493"
0.0,2606.0,0.0,0.0,1.0,475.0,4164.0,34774.0,335.0,1955.0,0.215211,"9,536 / 44,310"
0.0,525.0,0.0,0.0,3.0,325.0,743.0,4025.0,999.0,286.0,0.8553432,"5,907 / 6,906"
0.0,515.0,0.0,0.0,0.0,43.0,1067.0,4997.0,61.0,3001.0,0.6901074,"6,683 / 9,684"

k,hit_ratio
1,0.9677305
2,0.9922524
3,0.9975166
4,0.9990829
5,0.9996742
6,0.9998816
7,0.9999599
8,0.9999943
9,0.9999996
10,0.9999999

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4 / 4
0.0,24812.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 24,812"
0.0,6351.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"6,351 / 6,351"
0.0,451.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,451 / 451
0.0,21.0,0.0,0.0,0.0,5.0,16.0,70.0,9.0,1.0,1.0,122 / 122
0.0,1562.0,0.0,1.0,1.0,3548.0,369.0,622.0,12.0,50.0,0.4244931,"2,617 / 6,165"
0.0,137.0,0.0,1.0,0.0,2.0,119.0,21.0,0.0,0.0,0.575,161 / 280
0.0,32.0,0.0,0.0,0.0,2.0,27.0,148.0,3.0,9.0,0.3303167,73 / 221
0.0,248.0,0.0,0.0,0.0,63.0,207.0,818.0,165.0,33.0,0.8924381,"1,369 / 1,534"
0.0,10.0,0.0,0.0,0.0,0.0,4.0,28.0,0.0,18.0,0.7,42 / 60

k,hit_ratio
1,0.72025
2,0.89085
3,0.96065
4,0.979825
5,0.989475
6,0.9955
7,0.9987251
8,0.9998001
9,1.0000001
10,1.0000001

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2024-10-12 20:35:56,0.063 sec,0.0,0.9000000,2.3025851,0.1160765,,,0.9000000,2.3025851,0.415075,,
,2024-10-12 20:36:00,4.481 sec,1.0,0.7781856,1.5146341,0.0342729,,,0.8168486,1.7621945,0.30115,,
,2024-10-12 20:36:06,10.071 sec,3.0,0.6092156,0.9580229,0.0342542,,,0.7103374,1.4081751,0.3017,,
,2024-10-12 20:36:11,15.451 sec,5.0,0.4776310,0.6703206,0.0340059,,,0.6375062,1.2493834,0.301075,,
,2024-10-12 20:36:16,20.795 sec,7.0,0.3779102,0.4931141,0.0339153,,,0.5902592,1.1677441,0.300075,,
,2024-10-12 20:36:22,26.029 sec,9.0,0.3062072,0.3774323,0.0338925,,,0.5617186,1.1284758,0.299625,,
,2024-10-12 20:36:27,31.333 sec,11.0,0.2574597,0.3001557,0.0338543,,,0.5453694,1.1160517,0.299275,,
,2024-10-12 20:36:32,36.630 sec,13.0,0.2261463,0.2476728,0.0338000,,,0.5362473,1.1155689,0.2988,,
,2024-10-12 20:36:38,41.874 sec,15.0,0.2069331,0.2117263,0.0337509,,,0.5315163,1.1281388,0.297975,,
,2024-10-12 20:36:43,47.151 sec,17.0,0.1953624,0.1869971,0.0336899,,,0.5285579,1.1391715,0.2977,,

variable,relative_importance,scaled_importance,percentage
title,480332.28125,1.0,0.7317623
action_6_count,53739.2617188,0.1118793,0.0818691
action_7_count,43213.5546875,0.089966,0.0658337
delta_min,16077.0546875,0.0334707,0.0244926
action_3_count,15105.6015625,0.0314482,0.0230126
action_1_count,10722.1455078,0.0223224,0.0163347
action_9_count,8314.3828125,0.0173096,0.0126665
delta_sec,8230.6972656,0.0171354,0.0125391
action_8_count,6930.2700195,0.0144281,0.0105579
loc_user_count,5562.1347656,0.0115798,0.0084736


In [53]:
gbm_model.model_performance(test_h2o)

0,1,2,3,4,5,6,7,8,9,Error,Rate
0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4 / 4
0.0,24812.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 24,812"
0.0,6351.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"6,351 / 6,351"
0.0,451.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,451 / 451
0.0,21.0,0.0,0.0,0.0,5.0,16.0,70.0,9.0,1.0,1.0,122 / 122
0.0,1562.0,0.0,1.0,1.0,3548.0,369.0,622.0,12.0,50.0,0.4244931,"2,617 / 6,165"
0.0,137.0,0.0,1.0,0.0,2.0,119.0,21.0,0.0,0.0,0.575,161 / 280
0.0,32.0,0.0,0.0,0.0,2.0,27.0,148.0,3.0,9.0,0.3303167,73 / 221
0.0,248.0,0.0,0.0,0.0,63.0,207.0,818.0,165.0,33.0,0.8924381,"1,369 / 1,534"
0.0,10.0,0.0,0.0,0.0,0.0,4.0,28.0,0.0,18.0,0.7,42 / 60

k,hit_ratio
1,0.72025
2,0.89085
3,0.96065
4,0.979825
5,0.989475
6,0.9955
7,0.9987251
8,0.9998001
9,1.0000001
10,1.0000001


In [54]:
gbm_model.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,title,480332.28125,1.0,0.731762
1,action_6_count,53739.261719,0.111879,0.081869
2,action_7_count,43213.554688,0.089966,0.065834
3,delta_min,16077.054688,0.033471,0.024493
4,action_3_count,15105.601562,0.031448,0.023013
5,action_1_count,10722.145508,0.022322,0.016335
6,action_9_count,8314.382812,0.01731,0.012667
7,delta_sec,8230.697266,0.017135,0.012539
8,action_8_count,6930.27002,0.014428,0.010558
9,loc_user_count,5562.134766,0.01158,0.008474


#### Расчитываем recall для локальных тестов

#### Тестовые данные - три дня и ключевое событие - 9 (заказ)

In [55]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_13_days = inters['datetime'].max() - pd.Timedelta(0, unit='D')
test_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
test_purchasers = test_purchasers[
    (test_purchasers['weight'] == 9) 
    & (test_purchasers['datetime'] > last_3_days)
    & (test_purchasers['datetime'] < last_13_days)
]
print(test_purchasers['datetime'].max(), test_purchasers['datetime'].min(),)
test_purchasers = test_purchasers.groupby(['user_id'])['item_id'].apply(processing)
test_purchasers = test_purchasers.reset_index()
test_purchasers.shape

2024-09-28 23:46:12 2024-09-26 00:38:57


(496, 2)

In [56]:
inters['datetime'].max() - pd.Timedelta(7, unit='D')

Timestamp('2024-09-21 23:59:59')

In [57]:
inters.columns

Index(['user_id', 'weight', 'item_id', 'pageId', 'datetime', 'category_id',
       'title', 'day_of_week', 'price_diff', 'loc_user_count',
       'category_id_count', 'action_0_count', 'action_1_count',
       'action_2_count', 'action_3_count', 'action_4_count', 'action_5_count',
       'action_6_count', 'action_7_count', 'action_8_count', 'action_9_count',
       'delta_sec', 'delta_day', 'delta_hour', 'delta_min'],
      dtype='object')

#### Предсказания фильтрацией

    

In [58]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(8*24, unit='hours')
pred_purchasers = inters.sort_values(by=['datetime', 'weight'], ascending=True)
pred_purchasers = pred_purchasers[
    (pred_purchasers['weight'].isin([8, 1]))
    & (pred_purchasers['datetime'] > last_8_days)
    & (pred_purchasers['datetime'] < last_3_days)
    & (
            (pred_purchasers.action_7_count > 50) 
            | (pred_purchasers.action_8_count > 20) 
            | (pred_purchasers.action_9_count > 0) 
            # & (pred_purchasers.action_6_count > 0) 
            # & (pred_purchasers.action_3_count > 0)
    )
]
pred_purchasers = pred_purchasers.groupby(['user_id'])['item_id'].apply(processing)
pred_purchasers = pred_purchasers.reset_index()
pred_purchasers.shape

(3074, 2)

#### Проверяем сколько пользователей из теста удалось "зацепить" фильтрацией

In [59]:
pred_purchasers[pred_purchasers.user_id.isin(test_purchasers.user_id.unique())].shape

(314, 2)

In [60]:
check_merge = test_purchasers.merge(pred_purchasers[-3000:], on='user_id', how='left')
check_merge.item_id_y.loc[check_merge.item_id_y.isnull()] = check_merge.item_id_y.loc[check_merge.item_id_y.isnull()].apply(lambda x: [])
check_merge.shape

(496, 3)

In [61]:
check_merge.sample(100)

Unnamed: 0,user_id,item_id_x,item_id_y
411,cc2e0348-8d2d-11ee-86e0-002590c0647c,"[4987300, 4088073, 5066200]",[]
323,9d3b1990-12bb-11ef-86e0-002590c0647c,"[4966585, 615593, 6773805, 5111692, 6168001]",[]
379,be757f4a-1938-11eb-a6e9-002590c82437,[6530860],[]
314,96c440be-5f04-11eb-86e0-002590c0647c,[5919193],[]
91,24aff1ca-564c-11ee-86e0-002590c0647c,"[1312778, 4946035]","[7064121, 6926697, 7065537, 7064949, 6075575, ..."
...,...,...,...
283,894491ae-badd-11ee-bbb1-002590c82436,"[6436573, 6929214]",[]
254,76393776-549e-11ef-86e0-002590c0647c,[5182840],[]
164,4a69d0c8-bf99-11ee-bbb1-002590c82436,[4008375],[3758833]
143,4147a044-6bfb-11ed-86e0-002590c0647c,[2133910],[]


In [62]:
#### Проверяем полноту фильтрации

In [63]:
check_merge['recall'] = check_merge.apply(calculate_recall, axis=1)
check_merge['recall'].mean().round(4)

0.0563

#### Используем классификатор чтобы предсказать пользователей с целевыми действиями

In [64]:
inters['datetime'].min(), inters['datetime'].max()

(Timestamp('2024-09-07 00:00:04'), Timestamp('2024-09-28 23:59:59'))

In [65]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
pred_data = inters[(inters['datetime'] > last_8_days) & (inters['datetime'] < last_3_days)]

In [66]:
t = h2o.H2OFrame(pred_data[list(x)])

In [67]:
t['category_id'] = t['category_id'].asfactor()

In [68]:
preds = gbm_model.predict(t)

#### Обьединяем предсказания с исходными данными, чтобы фильтровать дальше

In [69]:
res = pd.concat(
    [
        pred_data[list(x) + ['weight', 'user_id', 'item_id', 'datetime']].reset_index(drop=True),
        preds.as_data_frame().reset_index(drop=True)
    ],
    axis='columns',
)

In [70]:
# Максимальный скор с которым предсказали целевое действие
res['score'] = res[res.columns[-10:]].max(axis=1)

In [71]:
# Смотрим корреляцию по данным чтобы понять по чему фильтровать
# res[list(set(res.columns) - set(['user_id']))].sample(1000).corr()

In [72]:
mask = (
    (res.predict.isin([9]) & (res.score > 0))
    # |
    # (res.predict.isin([8]) & (res.score > 0.75))
)

In [73]:
# Оцениваем размер таблицы после фильрации
res[
    mask
].groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))[:25]).reset_index().shape

(1606, 2)

In [74]:
# Оцениваем количество пользователей, которое удалось "зацепить" фильтрацией из теста
res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().shape

(146, 2)

In [75]:
ml_users = res[
    mask
    & (res.user_id.isin(test_purchasers.user_id.unique()))
].groupby('user_id')['item_id'].agg(lambda x: list(set(map(str, x)))[:25]).reset_index().user_id.to_list()

In [76]:
print(len(pred_purchasers.user_id.unique()))
print(len(ml_users))
print(len(set(ml_users).intersection(set(pred_purchasers.user_id.unique()))))

3074
146
116


In [77]:
# Генерим таблицу с кандидатами по результатам мл - предсказания
ml_pred = res[
    mask
].sort_values(by=['weight'], ascending=False).groupby('user_id')['item_id'].agg(
    lambda x: [y for y in (set(map(str, x))) if y != '-1'][:25]
).reset_index()
ml_pred.shape

(1606, 2)

In [78]:
# Напоминание о размере таблицы с кандидатами после обычной фильтрации - оставшееся от 3000 строк место мы заполнили результатами мл
pred_purchasers.shape

(3074, 2)

In [79]:
data_to_check = pd.concat(
    [
        ml_pred.reset_index(drop=True),
        pred_purchasers.reset_index(drop=True),
    ],
    axis='rows'
)
print(data_to_check.shape)
data_to_check = data_to_check.drop_duplicates(subset='user_id', keep='last') # .reset_index(drop=True)
print(data_to_check[data_to_check.user_id.isin(test_purchasers.user_id.unique())].shape)
data_to_check.shape

(4680, 2)
(344, 2)


(3733, 2)

In [80]:
# Ограничим резульат 3000 строк (как по условию). Так как данные обычной фильтрации эффективнее и они в конце обьединенных данных - ограничим с конца
data_to_check[-3000:].shape

(3000, 2)

#### Проверяем результат локально на данных фильтрация + мл

In [81]:
# check_merge_ml = test_purchasers.merge(data_to_check.sample(3000, random_state=42), on='user_id', how='left')
check_merge_ml = test_purchasers.merge(data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.0563

### Построение рекомендаций 
дополним данные после фильрации товарами на основе "оценок" пользователей (ценность действия интерпретируем как оценку)

#### Построение модели

In [82]:
K_RECOS = 25
NUM_THREADS = 24
RANDOM_STATE = 32
ITERATIONS = 25

In [83]:
def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [84]:
last_3_days = inters['datetime'].max() - pd.Timedelta(3, unit='D')
last_8_days = inters['datetime'].max() - pd.Timedelta(27, unit='D')
recos_data = inters[
    (inters['datetime'] > last_8_days)
    & (inters['datetime'] < last_3_days) 
    # & ((inters['weight'] > 3))   # дейсвтия ниже 5 не привязаны к товару, чтобы не рекомендовать -1 и сэкономить на размере входных данных - фильтруем
]

In [731]:
items = _catalog.rename(columns={'product_id': 'item_id'})

items = items.loc[items['item_id'].isin(recos_data['item_id'])].copy()

genre_feature = items[["item_id", "category_id"]].explode("category_id")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "category_id"
item_features = pd.concat((genre_feature,))

In [732]:
item_features.sample(3)

Unnamed: 0,id,value,feature
65657,6220858,16647,category_id
10966,1813223,8927,category_id
54318,5480137,7022,category_id


In [747]:
len(actions.loc_user_id.unique()), len(actions.pageId.unique()), len(items.category_id.unique()), len(inters.loc_user_count.unique()) 

(488970, 1421245, 1235, 1294)

In [752]:
users = inters.dropna(subset=['loc_user_count']).copy()

users = users.loc[users['user_id'].isin(recos_data['user_id'])].copy()

feature_frame = users.reindex(columns=['user_id', 'loc_user_count'])
feature_frame.columns = ["id", "value"]
feature_frame["feature"] = 'loc_user_count'
user_features = pd.concat((feature_frame,))

In [755]:
user_features.sample(3)

Unnamed: 0,id,value,feature
1021367,e0e61258-7111-11ef-9b7b-002590c82436,8,loc_user_count
5105930,0143e38e-f8c8-11ea-8a53-0cc47a6d2fef,2962,loc_user_count
5950962,bd8d2610-94a7-11eb-a6e9-002590c82437,74,loc_user_count


In [756]:
recos_data.weight.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [758]:
recs_dataset = Dataset.construct(
    interactions_df=recos_data[['user_id', 'item_id', 'weight', 'datetime']],
    # user_features_df=user_features,
    # cat_user_features=["loc_user_count"],
    item_features_df=item_features,
    cat_item_features=["category_id"],
)

model = make_base_model(factors=256, regularization=0.2, alpha=100)
model.fit(recs_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x74548015eed0>

In [759]:
recos = model.recommend(
    users=data_to_check[-3000:].user_id,
    dataset=recs_dataset,
    k=20,
    filter_viewed=False,
)

In [760]:
recos = recos[recos.item_id > -1]

#### Дополняем данные с предыдущих предсказаний (фильтрация + ml) 

In [761]:
recos_agg = recos.groupby('user_id')['item_id'].agg(lambda x: list(map(str, x))).reset_index()

In [762]:
recos_agg_selected_uid = recos_agg[
    recos_agg.user_id.isin(
        data_to_check[-3000:].user_id
    )
]

In [763]:
recos_agg_selected_uid.shape

(3000, 2)

In [764]:
# Объединяем рекомендации с предсказанями фильтрацией и мл
enriched_data_to_check = data_to_check[-3000:].merge(recos_agg_selected_uid, on='user_id', how='left')

In [765]:
enriched_data_to_check.columns

Index(['user_id', 'item_id_x', 'item_id_y'], dtype='object')

In [766]:
# Заполняем nan пустыми списками для корректного расчета метрики
enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()] = enriched_data_to_check.item_id_y.loc[enriched_data_to_check.item_id_y.isnull()].apply(lambda x: [])

In [767]:
# Обьединяем исходное предсказание рекомендациями по rectools - сначала исходные айди, потом рекоммендации (порядок важен, тк далее оставляем только 25 элементов списка)
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id_x'] + enriched_data_to_check['item_id_y']

In [768]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check['item_id'] = enriched_data_to_check['item_id'].apply(lambda x: sorted(list(set(x[:25])), reverse=True))

In [769]:
enriched_data_to_check['products_count'] = enriched_data_to_check['item_id'].apply(len)
enriched_data_to_check['products_count'].mean()

21.514333333333333

In [770]:
enriched_data_to_check = enriched_data_to_check[['user_id', 'item_id']]

In [771]:
enriched_data_to_check.drop_duplicates(subset=['user_id']).shape

(3000, 2)

In [772]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 3)


0.0857

### Сохраняем результаты

In [1446]:
enriched_data_to_check.rename(columns={'item_id': 'products'}).reset_index().to_csv("r_final.csv", index=False)

In [1447]:
pd.read_csv('r_final.csv').shape

(3000, 3)

In [775]:
catalog = pd.read_parquet(f'{Path.cwd().parent}/data/stokman_catalog_preprocessed.pq', engine='pyarrow')
catalog.head()

catalog['desc'] = catalog['category_id'].astype(str) + ' ' + catalog['title']
catalog = catalog.drop_duplicates(subset='product_id').reset_index(drop=True)
catalog = catalog.dropna(subset=['desc'])
catalog.shape

(89483, 8)

In [776]:
documents = catalog.desc.dropna().to_list()
print(len(documents))
vectorizer = TfidfVectorizer(lowercase=False)
tfidf_matrix = vectorizer.fit_transform(documents).toarray().astype('float32')

89483


In [527]:
index = create_index_hnsw(tfidf_matrix)

In [529]:
# dict(zip(vectorizer.vocabulary_, vectorizer.idf_)) 

In [780]:
query_vector = vectorizer.transform(['230 38721 36072 46732 26240 32815',])

N = 1
_, indices = index.search(query_vector.toarray().reshape(1, -1), 4) 
for indx in range(N):
    match = catalog.iloc[indices[0][indx]].to_dict()
    print(match['product_id'], ' ', match['desc'])
    

5920453   230 38721 36072 46732 26240 32815


In [786]:
def get_similar(x):
    res = []
    for y in x[:4]:
        cand = catalog[catalog.product_id == y]
        if cand.shape[0] > 0:
            to_match = cand.desc.values[0]
            query_vector = vectorizer.transform([to_match, ])
            _, indices = index.search(query_vector.toarray().reshape(1, -1), 2)
            match1 = catalog.iloc[indices[0][0]].to_dict()
            match2 = catalog.iloc[indices[0][1]].to_dict()
            res.append(match1['product_id'])
            res.append(match2['product_id'])
    return res

In [787]:
enriched_data_to_check_w_similar = enriched_data_to_check.copy()

In [788]:
enriched_data_to_check_w_similar['items_sim']  = enriched_data_to_check_w_similar['item_id'].apply(lambda x: get_similar(x))

In [789]:
# Убеждаемся что добавлением новых товаров мы не вышли 
enriched_data_to_check_w_similar['item_id'] = enriched_data_to_check_w_similar['item_id'] + enriched_data_to_check_w_similar['items_sim'].apply(lambda x: list(set(x)))

In [790]:
enriched_data_to_check_w_similar['item_id'] = enriched_data_to_check_w_similar['item_id'].apply(lambda x: sorted(list(set(x[:25])), reverse=True))

In [791]:
enriched_data_to_check_w_similar['products_count'] = enriched_data_to_check_w_similar['item_id'].apply(len)
enriched_data_to_check_w_similar['products_count'].mean()

23.474

In [792]:
enriched_data_to_check_w_similar.sample(5)

Unnamed: 0,user_id,item_id,items_sim,products_count
1207,6ce9e6ac-4383-11ee-86e0-002590c0647c,"[787768, 734339, 6606440, 6438154, 6438151, 64...","[787768, 4964878, 1334129, 1335152, 6652005, 6...",25
166,15492920-4c9f-11ed-8644-002590c82437,"[718659, 718652, 6886686, 6588035, 6422179, 64...","[718652, 718659, 6886686, 1041541, 6588035, 59...",24
1978,a95cc5ca-025f-11ed-bf62-002590c82437,"[5880976, 5880442, 5418100, 5313529, 5313490, ...","[5880976, 5880442, 5418100, 5399602, 5313490, ...",23
2459,d162bc0e-31b3-11e8-a0d7-002590e45c38,"[6655476, 6627677, 6627675, 6481510, 6481492, ...","[6655476, 6110206, 6627675, 6627677, 6481492, ...",24
2157,b8ae1e90-5b84-11ef-86e0-002590c0647c,"[6534112, 6534088, 6534079, 6534028, 6534001, ...","[6533962, 6534088, 6533962, 6534088, 6533944, ...",21


In [793]:
check_merge_ml = test_purchasers.merge(enriched_data_to_check_w_similar[-3000:], on='user_id', how='left')
check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()] = check_merge_ml.item_id_y.loc[check_merge_ml.item_id_y.isnull()].apply(lambda x: [])
print(check_merge_ml.shape)
check_merge_ml['recall'] = check_merge_ml.apply(calculate_recall, axis=1)
check_merge_ml['recall'].mean().round(4)

(496, 5)


0.0857

In [176]:
# Не забываем выключить кластер h2o на локальной машине
h2o.cluster().shutdown()