# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.notebook import tqdm
# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Ошибка в prefilter_items

В работе функции prefilter_items ошибка связанная с 2-мя певыми пунктами.

Уберем самые популярные товары (их и так купят)

Уберем самые НЕ популярные товары (их и так НЕ купят)

В процессе обьявления popularity мы делим все столбцы на количество юзеров. Соответственно дальше проверка по номеру юзера не может пройти, так как они все разделены на 2499. Так что пропускаются дальше все item_id. И если все сделать правильно(вроде я сделал), то сотается меньше 5000 товаров. так что я взял товары более популярные чем 0.002.

In [3]:
data_val_lvl_1[data_val_lvl_1['user_id'] == 1984]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2187037,1984,40865056205,613,843171,1,2.0,445,-0.49,1415,88,0.0,0.0
2187038,1984,40865056205,613,867188,2,2.0,445,-1.58,1415,88,0.0,0.0
2187039,1984,40865056205,613,1078023,1,2.0,445,-0.49,1415,88,0.0,0.0
2247777,1984,41125116269,628,995242,2,2.0,445,-1.78,1404,90,0.0,0.0


In [4]:
data_val_lvl_1.shape

(169711, 12)

In [5]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
Decreased # items from 83685 to 5001


In [6]:
recommender = MainRecommender(data_train_lvl_1)

100%|██████████| 15/15 [00:06<00:00,  2.30it/s]
100%|██████████| 5001/5001 [00:00<00:00, 23322.62it/s]


In [7]:
recommender.get_als_recommendations(1000, N=5)

[999250, 863632, 888543, 961979, 932420]

In [8]:
recommender.get_own_recommendations(1000, N=5)

[12427353, 881607, 863632, 1139418, 7167836]

In [9]:
recommender.get_similar_items_recommendation(1000, N=5)

[1076199, 1042616, 899229, 7431134, 1040346]

In [10]:
recommender.get_similar_users_recommendation(1000, N=5)

[10285454, 923670, 935578, 894360, 979505]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall? 


In [11]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [12]:
result_lvl_1[result_lvl_1['user_id'] == 1984] 

Unnamed: 0,user_id,actual
1715,1984,"[843171, 867188, 1078023, 995242]"


In [13]:
data_train_lvl_1[data_train_lvl_1['user_id'] == 1984] 

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price


In [14]:
methods = [
    recommender.get_similar_users_recommendation,
    recommender.get_similar_items_recommendation,
    recommender.get_own_recommendations,
    recommender.get_als_recommendations
    ]

# ошибки в recomendere
d  _get_recommendations и get_similar_users_recommendation пришлось добавить защиту от юзерво которых небыло, им разается топ популярных. иначе 1984 юзер крашит recommender. его нету в обучающей выборке data_train_lvl_1 но он есть в data_val_lvl_1.

In [15]:
for method in tqdm(methods):
    print(method.__name__)
    result_lvl_1[method.__name__] = result_lvl_1['user_id'].apply(lambda x: method(x, N=50))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

get_similar_users_recommendation
get_similar_items_recommendation
get_own_recommendations
get_als_recommendations



In [16]:
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,get_similar_users_recommendation,get_similar_items_recommendation,get_own_recommendations,get_als_recommendations
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[997987, 1023196, 832678, 6552318, 1092937, 91...","[921763, 1007512, 994223, 5577022, 5567876, 10...","[856942, 9297615, 5577022, 9655212, 1124029, 1...","[885290, 1037332, 1062572, 1097909, 916993, 83..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[879504, 965772, 7160764, 8090956, 9881593, 69...","[880888, 819978, 1007512, 10344714, 860703, 97...","[911974, 1076580, 1103898, 5567582, 1007414, 1...","[9677366, 1021324, 875392, 950206, 945998, 889..."


In [17]:
columns = ['get_similar_users_recommendation', 'get_similar_items_recommendation', 'get_own_recommendations', 'get_als_recommendations']
recall_at_5, recall_at_50 =  [] , []
for algoritm in columns:
    recall_at_5.append(result_lvl_1.apply(lambda row: recall_at_k(row[algoritm], row['actual'], k=5), axis=1).mean())
    recall_at_50.append(result_lvl_1.apply(lambda row: recall_at_k(row[algoritm], row['actual'], k=50), axis=1).mean())

pd.DataFrame({'Algoritms': columns, 'Recall@5': recall_at_5,'Recall@50': recall_at_50})

Unnamed: 0,Algoritms,Recall@5,Recall@50
0,get_similar_users_recommendation,0.001272,0.008264
1,get_similar_items_recommendation,0.00557,0.024042
2,get_own_recommendations,0.021254,0.067177
3,get_als_recommendations,0.007845,0.036438


get_own_recomendations дают наилучший результат.

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

In [18]:
result_lvl_1_2 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1_2.columns=['user_id', 'actual']
result_lvl_1_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [19]:
N = [20, 50, 100, 200, 500]
for n in tqdm(N):
    print()
    result_lvl_1_2[n] = result_lvl_1_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=n))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))









In [20]:
result_lvl_1_2.head(2)

Unnamed: 0,user_id,actual,20,50,100,200,500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 9297615, 5577022, 9655212, 1124029, 1...","[856942, 9297615, 5577022, 9655212, 1124029, 1...","[856942, 9297615, 5577022, 9655212, 1124029, 1...","[856942, 9297615, 5577022, 9655212, 1124029, 1...","[856942, 9297615, 5577022, 9655212, 1124029, 1..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[911974, 1076580, 1103898, 5567582, 1007414, 1...","[911974, 1076580, 1103898, 5567582, 1007414, 1...","[911974, 1076580, 1103898, 5567582, 1007414, 1...","[911974, 1076580, 1103898, 5567582, 1007414, 1...","[911974, 1076580, 1103898, 5567582, 1007414, 1..."


In [21]:
recall_2_at_5 = []
for n in N:
    recall_2_at_5.append(result_lvl_1_2.apply(lambda row: recall_at_k(row[n], row['actual'], k=n), axis=1).mean())

pd.DataFrame({'N': N, 'Recall@N': recall_2_at_5})

Unnamed: 0,N,Recall@N
0,20,0.044035
1,50,0.067177
2,100,0.090297
3,200,0.112957
4,500,0.142392


Recall возвращает процент купленых среди нами предположеных товаров. Соотвественно, чем больше К тем больше recall. В связи с этим, будет разумно выбирать K такого же размера как и формируемая нами рекомендация, если дальше мы не идем на второй круг рекомендаций. Если мы идем на второй лвл для ранжирования, тогда К нужно брать размером с переходящую выборку. Что бы в ней содержалось как можно больше правильных кандидатов.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [523]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
own_recomend = users_lvl_2.copy()

In [524]:
users_lvl_2

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1092937, 917033, 950202, 944588, 879..."
1,2021,"[950935, 1119454, 835578, 863762, 1013928, 653..."
2,1753,"[967041, 963686, 9553382, 883186, 942166, 9359..."
3,2120,"[1126899, 1070820, 5569471, 1075368, 5585510, ..."
4,1346,"[5569309, 5574377, 1135983, 1129982, 5569993, ..."
...,...,...
2149,1446,"[6391134, 827332, 12487356, 13007435, 981677, ..."
2150,1784,"[917406, 849578, 886395, 956127, 950935, 10745..."
2151,436,"[934399, 953180, 915459, 1092363, 918046, 8787..."
2152,1697,"[993838, 922847, 886103, 1016800, 5568489, 131..."


In [525]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2

Unnamed: 0,user_id,item_id,flag
0,2070,1105426,1
0,2070,1092937,1
0,2070,917033,1
0,2070,950202,1
0,2070,944588,1
...,...,...,...
2153,1745,9419525,1
2153,1745,1076769,1
2153,1745,1023196,1
2153,1745,968359,1


In [526]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)
for_predict = targets_lvl_2.copy()

In [527]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1092937,1.0
2,2070,1092937,1.0
3,2070,917033,0.0
4,2070,950202,0.0
...,...,...,...
112199,1745,9419525,0.0
112200,1745,1076769,0.0
112201,1745,1023196,0.0
112202,1745,968359,0.0


In [528]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1092937,1.0,1089,MEAT-PCKGD,National,LUNCHMEAT,BOLOGNA,16OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [529]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [530]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [531]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(**kwargs)


In [532]:
train_preds.shape

(112204, 2)

In [533]:
train_preds[1]

array([0.42292141, 0.57707859])

In [534]:
for_predict['predict_proba'] = train_preds[:,1]

In [535]:
for_predict.head(2)

Unnamed: 0,user_id,item_id,target,predict_proba
0,2070,1105426,0.0,0.083592
1,2070,1092937,1.0,0.577079


In [536]:
def items_by_proba(data, user_id):
    return data[data['user_id'] == user_id].sort_values(['predict_proba'], ascending=False).item_id.tolist()

In [537]:
# result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
# result_lvl_2.columns=['user_id', 'actual']
# result_lvl_2.head(2)

In [538]:
result_lvl_2 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [539]:
result_lvl_2['proba_recomend'] = result_lvl_2['user_id'].apply(lambda x: items_by_proba(for_predict, x))


In [540]:
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,proba_recomend
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[8293439, 8293439, 8293439, 9655212, 9655212, ..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[898847, 847241, 1110244, 862725, 9416729, 936..."


будем жить без холодного старта. поэтому иннер.

In [541]:
result_lvl_2 = result_lvl_2.merge(own_recomend, on=['user_id'], how='inner')


In [542]:
result_lvl_2.rename(columns={'candidates':'own_recommend'}, inplace=True)

presicion на трейне

In [560]:
for lvl in ['proba_recomend', 'own_recommend']:
    print('precision', lvl, result_lvl_2.apply(lambda row: precision_at_k(row[lvl], row['actual'], k=5), axis=1).mean())

precision proba_recomend 0.19730608453320947
precision own_recommend 0.2109614491407339


In [561]:
for lvl in ['proba_recomend', 'own_recommend']:
    print('recall', lvl, result_lvl_2.apply(lambda row: recall_at_k(row[lvl], row['actual'], k=5), axis=1).mean())

recall proba_recomend 0.018848187719341346
recall own_recommend 0.021264263774254562


а вроде лучше же должно было стать =))) перепроверил, вроде все верно. может это связано с тем что для own_recommend некоторые товары пришли из top_50 популярных. так как для них я пееределал возможность холодного старта, с помощью top50 популярных товаров.

теперь тест

In [545]:
users_val_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique())
users_val_lvl_2.columns = ['user_id']

# Пока только warm start
train_users_2 = data_train_lvl_1['user_id'].unique()
users_val_lvl_2 = users_val_lvl_2[users_val_lvl_2['user_id'].isin(train_users_2)]

users_val_lvl_2['candidates'] = users_val_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
own_recomend_test = users_val_lvl_2.copy()

In [546]:
targets_lvl_3 = data_val_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_3['target'] = 1  # тут только покупки 

targets_lvl_3 = users_lvl_2.merge(targets_lvl_3, on=['user_id', 'item_id'], how='left')

targets_lvl_3['target'].fillna(0, inplace= True)
targets_lvl_3.drop('flag', axis=1, inplace=True)
for_predict_test = targets_lvl_2.copy()

In [547]:
targets_lvl_3

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1092937,0.0
2,2070,917033,0.0
3,2070,950202,0.0
4,2070,944588,0.0
...,...,...,...
109753,1745,9419525,0.0
109754,1745,1076769,0.0
109755,1745,1023196,0.0
109756,1745,968359,0.0


In [548]:
targets_lvl_3 = targets_lvl_3.merge(item_features, on='item_id', how='left')
targets_lvl_3 = targets_lvl_3.merge(user_features, on='user_id', how='left')

targets_lvl_3.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1092937,0.0,1089,MEAT-PCKGD,National,LUNCHMEAT,BOLOGNA,16OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [549]:
X_test = targets_lvl_3.drop('target', axis=1)
y_test = targets_lvl_3[['target']]

In [550]:
test_preds = lgb.predict_proba(X_train)

In [551]:
for_predict_test['predict_proba'] = test_preds[:,1]

In [552]:
result_lvl_3 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_3.columns=['user_id', 'actual']
result_lvl_3.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [553]:
result_lvl_3['proba_recomend'] = result_lvl_3['user_id'].apply(lambda x: items_by_proba(for_predict_test, x))


In [554]:
result_lvl_3 = result_lvl_3.merge(own_recomend_test, on=['user_id'], how='inner')


In [555]:
result_lvl_3.rename(columns={'candidates':'own_recommend'}, inplace=True)

In [558]:
for lvl in ['proba_recomend', 'own_recommend']:
    print('precision', lvl, result_lvl_3.apply(lambda row: precision_at_k(row[lvl], row['actual'], k=5), axis=1).mean())

precision proba_recomend 0.12000000000000001
precision own_recommend 0.17236648701616858


In [559]:
for lvl in ['proba_recomend', 'own_recommend']:
    print('recall', lvl, result_lvl_3.apply(lambda row: recall_at_k(row[lvl], row['actual'], k=5), axis=1).mean())

recall proba_recomend 0.012339072322878144
recall own_recommend 0.02154902967940795


In [None]:
на val2 все тоже очень плохо. непонимаю почему так происходит. времени добавить фичей нету к сожалению. буду думать что тут не так.