# Homework 6. Двухуровневые модели рекомендаций


# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [527]:
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')

# Process features dataset

In [528]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [529]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [530]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [531]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [8]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


# Prefilter items

In [532]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [533]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (803208, 13) Users: 2494 Items: 5001
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Init/train recommender

In [534]:
recommender = MainRecommender(data_train_matcher)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




## Задание 1.
### Измеряем recall@k

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_matcher: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

In [12]:
ACTUAL_COL = 'actual'

In [13]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [14]:
# # сырой и простой пример как можно обернуть в функцию
def evalRecall(df_result, target_col_name, recommend_model, N=50):
    result_col_name = recommend_model.__name__.replace('get_', '')
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N=N))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N), axis=1).mean()

In [15]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [16]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [17]:
%%time
for model in [recommender.get_own_recommendations, recommender.get_als_recommendations, 
              recommender.get_similar_items_recommendation, recommender.get_similar_users_recommendation]:
    print(f"Model {model.__name__.replace('get_', '')} result: {evalRecall(result_eval_matcher, USER_COL, model)}")

Model own_recommendations result: 0.06610641265447958
Model als_recommendations result: 0.05195119583686378
Model similar_items_recommendation result: 0.025560835796067008
Model similar_users_recommendation result: 0.00816944263782052
CPU times: user 27min 12s, sys: 12min 14s, total: 39min 26s
Wall time: 35min 26s


**Лучший результат у own_recommendations, similar_users_recommendation с заданным подходом (самый покупаемый товар у каждого из пяти самых похожих пользователей) достаточно плох. Будем использовать own_recommendations - быстрый и наиболее качественный отбор кандидатов.**

### B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
### C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

In [19]:
for k in [20, 50, 100, 200, 500]:
    print(f"k={k} result: {evalRecall(result_eval_matcher, USER_COL, recommender.get_own_recommendations, N=k)}")

k=20 result: 0.03950560931238893
k=50 result: 0.06610641265447958
k=100 result: 0.09855560339294622
k=200 result: 0.1386213494692769
k=500 result: 0.18640458463171924


**При k = 100 получаем оптимальный прирост единиц метрики, приходящейся на k. При k > 200 прирост метрики замедляется, большое число кандидатов также может ухудшить результаты ранжирования и увеличить время выдачи рекомендации**

In [37]:
result_eval_matcher['final_own_recommendation'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=100))

**Precision@5 для выбранного числа кандидатов**

In [50]:
result_eval_matcher.apply(lambda row: precision_at_k(row['final_own_recommendation'], row[ACTUAL_COL], k=5), axis=1).mean()

0.1767549976754978

# Ranking part

## Подготовка данных для трейна

In [535]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [536]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=100))

In [537]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [538]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [539]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [540]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


### Check warm start

In [541]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (215100, 2) Users: 2151 Items: 4564


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [542]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

In [543]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


#### Не хватает нулей в датасете, поэтому добавляем наших кандитатов в качество нулей

In [544]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [545]:
df_ranker_train.target.value_counts()

0.0    199962
1.0     11934
Name: target, dtype: int64

In [546]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


(!) На каждого юзера 100 item_id-кандидатов

In [547]:
df_ranker_train['target'].mean()

0.05632008154943935

## Задание 2.
#### Обучите модель 2-ого уровня, при этом:
#### - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
#### - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_ranker
#### - Вырос ли precision@5 при использовании двухуровневой модели?

In [548]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее кол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

### Фичи юзера

**Средний чек юзера**

In [549]:
average_check = data_train_ranker.groupby('user_id').agg({'basket_id': 'nunique', 'sales_value': 'sum'}).reset_index()
average_check.rename(columns={'basket_id': 'baskets_sum', 'sales_value': 'sales_sum'}, inplace=True)
average_check['average_check'] = average_check['sales_sum'] / average_check['baskets_sum']
average_check.head()

Unnamed: 0,user_id,baskets_sum,sales_sum,average_check
0,1,7,341.78,48.825714
1,2,4,190.29,47.5725
2,4,3,119.39,39.796667
3,6,16,329.0,20.5625
4,7,5,187.65,37.53


In [550]:
df_ranker_train = df_ranker_train.merge(average_check[['user_id', 'average_check']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581


**Частотность покупок (раз в месяц)**

Для простоты считаем, что в месяце ровно 4 недели.

In [551]:
purchases_freq = data_train_ranker.groupby('user_id').agg({'week_no': 'nunique', 'basket_id': 'nunique'}).reset_index()
purchases_freq['purchases_freq'] = purchases_freq['basket_id'] / (4 * purchases_freq['week_no'])
purchases_freq.head()

Unnamed: 0,user_id,week_no,basket_id,purchases_freq
0,1,6,7,0.291667
1,2,3,4,0.333333
2,4,3,3,0.25
3,6,6,16,0.666667
4,7,4,5,0.3125


In [552]:
df_ranker_train = df_ranker_train.merge(purchases_freq[['user_id', 'purchases_freq']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667


### Фичи товара

**Цена товара**

In [553]:
data_train_ranker['price'] = data_train_ranker['sales_value'] / (np.maximum(data_train_ranker['quantity'], 1))
data_train_ranker.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,1.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0,0.99
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0,1.77
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0,1.67
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0,2.5


In [554]:
mean_good_price = data_train_ranker.groupby('item_id').agg({'price': 'mean'}).reset_index()
mean_good_price.head()

Unnamed: 0,item_id,price
0,28116,0.33
1,28117,0.34
2,28143,0.33
3,28186,0.79
4,28304,0.53


In [555]:
df_ranker_train = df_ranker_train.merge(mean_good_price, on='item_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq,price
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,3.99
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,10.99


**Кол-во покупок в неделю**

In [556]:
item_purchases_per_week = data_train_ranker.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
item_purchases_per_week['item_purchases_freq'] = item_purchases_per_week['quantity'] / item_purchases_per_week['week_no']
item_purchases_per_week.head(2)

Unnamed: 0,item_id,week_no,quantity,item_purchases_freq
0,28116,1,1,1.0
1,28117,1,1,1.0


In [557]:
df_ranker_train = df_ranker_train.merge(item_purchases_per_week[['item_id', 'item_purchases_freq']], on='item_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq,price,item_purchases_freq
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,3.99,1.666667
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,10.99,1.0


### Фичи user-item

**(Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)**

In [558]:
data_train_ranker = data_train_ranker.merge(item_features, on = 'item_id', how = 'left')

In [559]:
mean_category_check = data_train_ranker.groupby(['user_id', 'department']).agg({'price': 'mean'}).reset_index()
mean_category_check.rename(columns={'price': 'mean_category_check'}, inplace=True)
mean_category_check.head()

Unnamed: 0,user_id,department,mean_category_check
0,1,,0.0
1,1,DELI,2.4375
2,1,DRUG GM,4.143846
3,1,GROCERY,2.268656
4,1,MEAT-PCKGD,3.101667


In [560]:
df_ranker_train = df_ranker_train.merge(mean_category_check, on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq,price,item_purchases_freq,mean_category_check
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,3.99,1.666667,3.596667
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,10.99,1.0,1.628849


In [561]:
df_ranker_train['mean_category_check_price'] = df_ranker_train['mean_category_check'] - df_ranker_train['price']
df_ranker_train.drop(columns=['mean_category_check'], inplace=True)
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq,price,item_purchases_freq,mean_category_check_price
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,3.99,1.666667,-0.393333
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,10.99,1.0,-9.361151
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,,,
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,,,
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,14.355581,1.791667,7.99,2.2,-5.403448


**(Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)**

**(Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)**

In [562]:
user_category_buy_per_week = data_train_ranker.groupby(['user_id', 'department']).agg({'week_no': ['count', 'nunique']}).reset_index()
user_category_buy_per_week['user_category_buy_per_week'] = user_category_buy_per_week['week_no']['count'] / user_category_buy_per_week['week_no']['nunique']
user_category_buy_per_week.drop(columns=['week_no'], inplace=True)
user_category_buy_per_week.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,user_id,department,user_category_buy_per_week
,,,
0.0,1.0,,1.0
1.0,1.0,DELI,2.0
2.0,1.0,DRUG GM,3.25
3.0,1.0,GROCERY,18.6
4.0,1.0,MEAT-PCKGD,2.0


In [563]:
all_users_category_buy_per_week = data_train_ranker.groupby(['department']).agg({'week_no': ['count', 'nunique']}).reset_index()
all_users_category_buy_per_week['all_users_category_buy_per_week'] = all_users_category_buy_per_week['week_no']['count'] / all_users_category_buy_per_week['week_no']['nunique']
all_users_category_buy_per_week.drop(columns=['week_no'], inplace=True)
all_users_category_buy_per_week.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,department,all_users_category_buy_per_week
,,
0.0,,126.0
1.0,AUTOMOTIVE,1.0
2.0,CHEF SHOPPE,10.5
3.0,CNTRL/STORE SUP,2.0
4.0,COSMETICS,83.166667


In [564]:
user_category_buy_per_week = user_category_buy_per_week.merge(all_users_category_buy_per_week, on='department', how='left')
user_category_buy_per_week['user_category_buy_per_week_1'] = user_category_buy_per_week['user_category_buy_per_week'] - user_category_buy_per_week['all_users_category_buy_per_week']
user_category_buy_per_week['user_category_buy_per_week_2'] = user_category_buy_per_week['user_category_buy_per_week'] / user_category_buy_per_week['all_users_category_buy_per_week']
user_category_buy_per_week = user_category_buy_per_week[['user_id', 'user_category_buy_per_week_1', 'user_category_buy_per_week_2', 'department']]
user_category_buy_per_week.head()

Unnamed: 0,user_id,user_category_buy_per_week_1,user_category_buy_per_week_2,department
,,,,
0.0,1.0,-125.0,0.007937,
1.0,1.0,-638.5,0.003123,DELI
2.0,1.0,-3084.25,0.001053,DRUG GM
3.0,1.0,-18354.733333,0.001012,GROCERY
4.0,1.0,-1074.666667,0.001858,MEAT-PCKGD


In [565]:
df_ranker_train = df_ranker_train.merge(user_category_buy_per_week, on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_freq,price,item_purchases_freq,mean_category_check_price,"(user_category_buy_per_week_1, )","(user_category_buy_per_week_2, )"
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,Unknown,1,None/Unknown,14.355581,1.791667,3.99,1.666667,-0.393333,-639.0,0.002342
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,Unknown,1,None/Unknown,14.355581,1.791667,10.99,1.0,-9.361151,-18350.166667,0.001261


In [566]:
df_ranker_train.rename(columns={('user_category_buy_per_week_1', '') : "user_category_buy_per_week_1",
                                ('user_category_buy_per_week_2', '') : "user_category_buy_per_week_2"},
                      inplace=True) 

**Обучение**

In [568]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [569]:
cat_feats = X_train.columns[2:-7].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

In [570]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(**kwargs)




In [571]:
df_ranker_predict = df_ranker_train.copy()

In [572]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

## Подведем итоги

Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от own_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [573]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [574]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=100))

CPU times: user 5.08 s, sys: 331 ms, total: 5.41 s
Wall time: 5.63 s


In [575]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, 5), key=lambda x: x[1], reverse=True)

[('own_rec', 0.14411764705882238)]

## Eval re-ranked matched result on test dataset
Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [576]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [577]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [578]:
print(*sorted(calc_precision(result_eval_ranker, 5), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.1636553524804163)
('own_rec', 0.14411764705882238)


  return flags.sum() / len(recommended_list)


**После ранжирования результат существенно улучшился, при этом необходимо будет дополнительно:**

1. Обработать пропуски в датасете для ранжирования
2. Попробовать нагенерировать другие фичи для user, item, user-item
3. Подобрать гиперпараметры модели / попробовать другую модель
4. Собрать все в пайплайн