# Курсовой проект


# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Read data

In [3]:
# Load the training interactions, the item and user feature tables, and the test set.
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')
# NOTE(review): unlike the three files above, this one is read from the working
# directory rather than ./data/ — confirm the path is intentional.
df_test = pd.read_csv('retail_test1.csv')

# Process features dataset

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [5]:
# Normalise column names: lower-case every label, then map each table's own
# key column onto the shared ITEM_COL / USER_COL names.
for features, key_name, shared_name in (
    (item_features, 'product_id', ITEM_COL),
    (user_features, 'household_key', USER_COL),
):
    features.columns = list(map(str.lower, features.columns))
    features.rename(columns={key_name: shared_name}, inplace=True)

# Split dataset for train, eval, test

In [6]:
# Hold out 3 weeks to validate the first (matching) model and to train the second
# (ranking) model; the final test is done on a separate dataset.
TRAIN_RANKER_WEEKS = 3

In [7]:
# Training data for the matching model: everything before the hold-out window.
data_train_matcher = data[data['week_no'] < data['week_no'].max() - TRAIN_RANKER_WEEKS]

# Validation data for the matching model: the hold-out window itself.
# NOTE(review): `>= max - TRAIN_RANKER_WEEKS` keeps week numbers max-3 .. max, i.e. up to
# TRAIN_RANKER_WEEKS + 1 distinct values if week_no is an integer — confirm this is intended.
data_val_matcher = data[data['week_no'] >= data['week_no'].max() - TRAIN_RANKER_WEEKS]

# Training data for the ranking model.
data_train_ranker = data_val_matcher.copy()  # Kept separate for clarity: later changes will make the two frames differ

In [8]:
def print_stats_data(df_data, name_df):
    """Print a label followed by the shape and distinct user/item counts of a frame.

    df_data -- frame that contains the USER_COL and ITEM_COL columns.
    name_df -- label printed on its own line before the statistics.
    """
    print(name_df)
    n_users = df_data[USER_COL].nunique()
    n_items = df_data[ITEM_COL].nunique()
    print(f"Shape: {df_data.shape} Users: {n_users} Items: {n_items}")

In [9]:
# Report size, user count and item count for every split.
for frame, label in (
    (data_train_matcher, 'train_matcher'),
    (data_val_matcher, 'val_matcher'),
    (data_train_ranker, 'train_ranker'),
    (df_test, 'test'),
):
    print_stats_data(frame, label)

train_matcher
Shape: (2278490, 12) Users: 2499 Items: 86865
val_matcher
Shape: (118314, 12) Users: 2042 Items: 24329
train_ranker
Shape: (118314, 12) Users: 2042 Items: 24329
test
Shape: (88734, 12) Users: 1885 Items: 20497


# Prefilter items

#### Лучшие результаты на предварительном прогоне показал вариант с топ-3000.

In [10]:
# Distinct item count before prefiltering, kept for the report printed below.
n_items_before = data_train_matcher['item_id'].nunique()

# Prefilter the matcher's training data with take_n_popular=3000.
# NOTE(review): the recorded output reports 3001 items afterwards, so prefilter_items
# presumably keeps one extra item id on top of the 3000 — confirm in utils.
data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=3000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 3001


# Make cold-start to warm-start

In [11]:
# Keep only "warm" users, i.e. users present in the matcher's training data
# (removes the cold-start cases from validation, ranker training and test).
# unique() deduplicates the ids once, so each isin() below is checked against the
# distinct users instead of one entry per training row; the selected rows are the same.
common_users = data_train_matcher.user_id.unique()

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
df_test = df_test[df_test.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(df_test,'test')

train_matcher
Shape: (863782, 13) Users: 2496 Items: 3001
val_matcher
Shape: (118282, 12) Users: 2040 Items: 24325
train_ranker
Shape: (118282, 12) Users: 2040 Items: 24325
test
Shape: (88665, 12) Users: 1883 Items: 20492


# Init/train recommender

#### По результатам экспериментов лучшие результаты были у own_recommendations, в user_item_matrix - сумма покупки, взвешивание с кастомными параметрами bm25_weight(K1=200, B=0.5)

In [12]:
recommender = MainRecommender(data_train_matcher)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3001.0), HTML(value='')))




In [13]:
ACTUAL_COL = 'actual'

In [14]:
# Ground truth for the matcher: one row per user holding the array of distinct
# items that user has in the validation data.
result_eval_matcher = (
    data_val_matcher
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .rename(ACTUAL_COL)
    .reset_index()
)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [15]:
# Rough, minimal example of how the evaluation can be wrapped in a function
def evalRecall(df_result, target_col_name, recommend_model, N=50):
    """Store a model's recommendations in ``df_result`` and return the mean recall@N.

    df_result        -- frame with the ACTUAL_COL column; it is modified in place:
                        a new column with the recommendations is added.
    target_col_name  -- column whose values are passed to ``recommend_model``.
    recommend_model  -- callable invoked as ``recommend_model(value, N=N)``; the
                        result column is named after it with 'get_' removed.
    N                -- number of recommendations requested and the recall cut-off.
    """
    result_col_name = recommend_model.__name__.replace('get_', '')

    def recommend(target):
        return recommend_model(target, N=N)

    def row_recall(row):
        return recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N)

    df_result[result_col_name] = df_result[target_col_name].apply(recommend)
    per_row_recall = df_result.apply(row_recall, axis=1)
    return per_row_recall.mean()

In [16]:
def calc_recall(df_data, top_k):
    """Yield ``(column name, mean recall@top_k)`` for each recommendation column.

    Every column from the third one onwards is scored row by row against
    ACTUAL_COL with ``recall_at_k``; the per-row values are then averaged.
    """
    recommendation_cols = df_data.columns[2:]
    for col_name in recommendation_cols:
        def row_recall(row, col=col_name):
            return recall_at_k(row[col], row[ACTUAL_COL], k=top_k)

        mean_recall = df_data.apply(row_recall, axis=1).mean()
        yield col_name, mean_recall

In [17]:
def calc_precision(df_data, top_k):
    """Yield ``(column name, mean precision@top_k)`` for each recommendation column.

    Every column from the third one onwards is scored row by row against
    ACTUAL_COL with ``precision_at_k``; the per-row values are then averaged.
    """
    recommendation_cols = df_data.columns[2:]
    for col_name in recommendation_cols:
        def row_precision(row, col=col_name):
            return precision_at_k(row[col], row[ACTUAL_COL], k=top_k)

        mean_precision = df_data.apply(row_precision, axis=1).mean()
        yield col_name, mean_precision

# Ranking part

## Подготовка данных для трейна

In [18]:
# One row per distinct user of the ranker's training data
df_match_candidates = pd.DataFrame({USER_COL: data_train_ranker[USER_COL].unique()})

#### Отбираем 50 кандидатов - показало лучший результат

In [19]:
# Collect the first-stage (matcher) candidates: request N=50 own recommendations per user
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [20]:
# Unroll the per-user candidate lists into one item per row: each list becomes a
# row of columns, stack() turns those columns into a second index level, and
# dropping that level leaves the original row index repeated once per candidate.
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
# The Series name becomes the column name when it is joined back.
df_items.name = 'item_id'

In [21]:
# Replace the list column with the unrolled items; join() aligns on the row index,
# so every user row is repeated once per candidate item.
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

### Check warm start

In [22]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (102000, 2) Users: 2040 Items: 2835


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [23]:
# Start the ranker's training set from the (user, item) pairs of data_train_ranker.
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # only purchases here, so every row is a positive

In [24]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2277416,338,840173,1
2277417,338,1037348,1
2277418,338,5592737,1
2277419,338,7441679,1
2277420,338,7442317,1


#### Не хватает нулей в датасете, поэтому добавляем наших кандидатов в качестве нулей

In [25]:
# Label the matcher's candidates: the left merge keeps every (user, item) candidate
# and leaves target = NaN where the pair is absent from the purchases.
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# Remove duplicate (user, item) pairs
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

# Candidates without a purchase become negatives. The result is assigned back rather
# than calling fillna(inplace=True) on the selected column: that form is chained
# assignment and does not update the frame when pandas copy-on-write is active.
df_ranker_train['target'] = df_ranker_train['target'].fillna(0)

In [26]:
df_ranker_train.target.value_counts()

0.0    92808
1.0     8231
Name: target, dtype: int64

In [27]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,338,13002975,1.0
2,338,1026118,1.0


(!) На каждого юзера 50 item_id-кандидатов

In [28]:
df_ranker_train['target'].mean()

0.08146359326596661

## Генерим фичи (взяты из вебинара + из ДЗ 6)

In [29]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,,,,,,
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,,,,,,


In [30]:
# Build a combined dataset for the first level (matching): matcher train + matcher validation rows.
# NOTE(review): data_train_matcher went through prefilter_items while data_val_matcher did not
# (13 vs 12 columns in the recorded output), so the two parts are not filtered alike — confirm intended.
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

#### Фичи из вебинара (результат с ними лучше, чем только с собственно сгенерированными)

In [31]:
# Aggregate item-level and user-level features over the combined matcher data and
# attach them to the ranker's training set.
#
# Columns are selected with groupby(...)[column] instead of GroupBy.agg('<column>'),
# which only worked because agg() resolves a string through attribute lookup.
# Each distinct aggregate is computed once and reused.

# Denominators of the "per week" / "per basket" features.
n_weeks = df_join_train_matcher.week_no.nunique()
n_baskets = df_join_train_matcher.basket_id.nunique()

by_item = df_join_train_matcher.groupby(ITEM_COL)
by_user = df_join_train_matcher.groupby(USER_COL)

item_quantity = by_item['quantity'].sum()
user_quantity = by_user['quantity'].sum()
item_rows = by_item[USER_COL].count()  # transaction rows per item
user_rows = by_user[USER_COL].count()  # transaction rows per user

# (join key, named feature series), listed in the order the columns are appended.
# 'user_quantity_per_baskter' keeps its original spelling: it is a column name.
aggregated_features = [
    (ITEM_COL, by_item['sales_value'].sum().rename('total_item_sales_value')),
    (ITEM_COL, item_quantity.rename('total_quantity_value')),
    (ITEM_COL, item_rows.rename('item_freq')),
    (USER_COL, user_rows.rename('user_freq')),
    (USER_COL, by_user['sales_value'].sum().rename('total_user_sales_value')),
    (ITEM_COL, (item_quantity / n_weeks).rename('item_quantity_per_week')),
    (USER_COL, (user_quantity / n_weeks).rename('user_quantity_per_week')),
    (ITEM_COL, (item_quantity / n_baskets).rename('item_quantity_per_basket')),
    (USER_COL, (user_quantity / n_baskets).rename('user_quantity_per_baskter')),
    (ITEM_COL, (item_rows / n_baskets).rename('item_freq_per_basket')),
    (USER_COL, (user_rows / n_baskets).rename('user_freq_per_basket')),
]

# Each series is indexed by its key, so `on=key` matches the frame's column
# against the series' index level of the same name.
for key, feature in aggregated_features:
    df_ranker_train = df_ranker_train.merge(feature, how='left', on=key)

### Фичи юзера

**Средний чек юзера**

In [32]:
# Average check per user: total sales value divided by the number of distinct baskets.
average_check = (
    df_join_train_matcher
    .groupby('user_id')
    .agg({'basket_id': 'nunique', 'sales_value': 'sum'})
    .rename(columns={'basket_id': 'baskets_sum', 'sales_value': 'sales_sum'})
    .reset_index()
)
average_check['average_check'] = average_check['sales_sum'].div(average_check['baskets_sum'])
average_check.head()

Unnamed: 0,user_id,baskets_sum,sales_sum,average_check
0,1,73,2559.4,35.060274
1,2,41,1258.55,30.696341
2,3,39,1154.06,29.591282
3,4,30,924.1,30.803333
4,5,33,561.71,17.021515


In [33]:
df_ranker_train = df_ranker_train.merge(average_check[['user_id', 'average_check']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,81,325,1308.75,1.305263,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,721,325,1308.75,7.568421,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875


**Частотность покупок (раз в месяц)**

Для простоты считаем, что в месяце ровно 4 недели.

In [34]:
# Per user: number of distinct weeks with purchases and number of distinct baskets.
purchases_freq = df_join_train_matcher.groupby('user_id').agg({'week_no': 'nunique', 'basket_id': 'nunique'}).reset_index()
# NOTE(review): this divides baskets-per-active-week by 4. With a 4-week month, a
# "purchases per month" rate would be 4 * baskets / weeks — confirm the intended formula.
purchases_freq['purchases_freq'] = purchases_freq['basket_id'] / (4 * purchases_freq['week_no'])
purchases_freq.head()

Unnamed: 0,user_id,week_no,basket_id,purchases_freq
0,1,64,73,0.285156
1,2,34,41,0.301471
2,3,32,39,0.304688
3,4,26,30,0.288462
4,5,23,33,0.358696


In [35]:
df_ranker_train = df_ranker_train.merge(purchases_freq[['user_id', 'purchases_freq']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,325,1308.75,1.305263,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875,0.510204
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,325,1308.75,7.568421,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875,0.510204


### Фичи товара

**Цена товара**

In [36]:
# Unit price of each transaction row. np.maximum(quantity, 1) keeps the divisor at
# least 1, so rows with quantity 0 do not divide by zero.
df_join_train_matcher['price'] = df_join_train_matcher['sales_value'] / (np.maximum(df_join_train_matcher['quantity'], 1))
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19
12,1364,26984896261,1,999999,1,2.99,31742,-0.4,1520,1,0.0,0.0,2.99
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5
32,1172,26985025264,1,999999,1,3.59,396,0.0,946,1,0.0,0.0,3.59


In [37]:
mean_good_price = df_join_train_matcher.groupby('item_id').agg({'price': 'mean'}).reset_index()
mean_good_price.head()

Unnamed: 0,item_id,price
0,29512,0.99
1,30356,2.39
2,32392,1.39
3,32439,1.88
4,34873,1.99


In [38]:
df_ranker_train = df_ranker_train.merge(mean_good_price, on='item_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq,price
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,1308.75,1.305263,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875,0.510204,4.12018
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,1308.75,7.568421,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875,0.510204,3.376886


**Кол-во покупок в неделю**

In [39]:
# Per item: total quantity sold divided by the number of distinct weeks in which it was sold.
item_purchases_per_week = (
    df_join_train_matcher
    .groupby('item_id')
    .agg({'week_no': 'nunique', 'quantity': 'sum'})
    .reset_index()
    .assign(item_purchases_freq=lambda stats: stats['quantity'] / stats['week_no'])
)
item_purchases_per_week.head(2)

Unnamed: 0,item_id,week_no,quantity,item_purchases_freq
0,29512,1,1,1.0
1,30356,1,1,1.0


In [40]:
df_ranker_train = df_ranker_train.merge(item_purchases_per_week[['item_id', 'item_purchases_freq']], on='item_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq,price,item_purchases_freq
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,1.305263,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875,0.510204,4.12018,2.952381
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,7.568421,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875,0.510204,3.376886,7.648936


### Фичи user-item

**(Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)**

In [41]:
df_join_train_matcher = df_join_train_matcher.merge(item_features, on = 'item_id', how = 'left')

In [42]:
# Mean transaction price per (user, department) pair.
mean_category_check = (
    df_join_train_matcher
    .groupby(['user_id', 'department'])['price']
    .mean()
    .rename('mean_category_check')
    .reset_index()
)
mean_category_check.head()

Unnamed: 0,user_id,department,mean_category_check
0,1,DELI,3.338621
1,1,DRUG GM,3.57125
2,1,GROCERY,3.066459
3,1,MEAT,3.971667
4,1,MEAT-PCKGD,3.08061


In [43]:
df_ranker_train = df_ranker_train.merge(mean_category_check, on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq,price,item_purchases_freq,mean_category_check
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875,0.510204,4.12018,2.952381,3.733478
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875,0.510204,3.376886,7.648936,2.422162


In [44]:
# User-item feature: the user's mean price in the item's department minus the item's mean price.
df_ranker_train['mean_category_check_price'] = df_ranker_train['mean_category_check'] - df_ranker_train['price']
# The intermediate column is only needed to build the difference above.
df_ranker_train.drop(columns=['mean_category_check'], inplace=True)
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq,price,item_purchases_freq,mean_category_check_price
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,4.168421,0.000658,0.002101,0.00043,0.001724,13.0875,0.510204,4.12018,2.952381,-0.386702
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,4.168421,0.003814,0.002101,0.003824,0.001724,13.0875,0.510204,3.376886,7.648936,-0.954724
2,338,831628,0.0,2949,MEAT,National,BEEF,CHOICE BEEF,,,...,4.168421,0.004371,0.002101,0.003156,0.001724,13.0875,0.510204,4.766751,9.471264,-1.033273
3,338,959737,1.0,2343,GROCERY,National,BEERS/ALES,BEERALEMALT LIQUORS,12 OZ,,...,4.168421,0.000812,0.002101,0.000743,0.001724,13.0875,0.510204,13.328571,2.25,-10.144454
4,338,1082627,0.0,2,GROCERY,National,SOFT DRINKS,SOFT DRINKS 20PK&24PK CAN CARB,12 OZ,,...,4.168421,0.000875,0.002101,0.000764,0.001724,13.0875,0.510204,7.615139,2.291667,-4.431021


**(Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)**

**(Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)**

In [45]:
# Per (user, department): number of transaction rows ('count') and number of distinct
# weeks ('nunique'). Passing a list of aggregations produces two-level column labels,
# hence the ['week_no']['count'] / ['week_no']['nunique'] indexing below.
user_category_buy_per_week = df_join_train_matcher.groupby(['user_id', 'department']).agg({'week_no': ['count', 'nunique']}).reset_index()
# Rows per active week for this user in this department.
user_category_buy_per_week['user_category_buy_per_week'] = user_category_buy_per_week['week_no']['count'] / user_category_buy_per_week['week_no']['nunique']
# Drop both helper columns stored under the 'week_no' top-level label.
user_category_buy_per_week.drop(columns=['week_no'], inplace=True)
user_category_buy_per_week.head()

Unnamed: 0,user_id,department,user_category_buy_per_week
,,,
0.0,1.0,DELI,1.26087
1.0,1.0,DRUG GM,2.0
2.0,1.0,GROCERY,5.0
3.0,1.0,MEAT,1.0
4.0,1.0,MEAT-PCKGD,2.0


In [46]:
# Per department over all users: number of transaction rows ('count') and number of
# distinct weeks ('nunique'); the list of aggregations yields two-level column labels.
all_users_category_buy_per_week = df_join_train_matcher.groupby(['department']).agg({'week_no': ['count', 'nunique']}).reset_index()
# Total rows per active week for the department (a total over users, not a per-user mean).
all_users_category_buy_per_week['all_users_category_buy_per_week'] = all_users_category_buy_per_week['week_no']['count'] / all_users_category_buy_per_week['week_no']['nunique']
# Drop both helper columns stored under the 'week_no' top-level label.
all_users_category_buy_per_week.drop(columns=['week_no'], inplace=True)
all_users_category_buy_per_week.head()

Unnamed: 0,department,all_users_category_buy_per_week
,,
0.0,,76.0
1.0,AUTOMOTIVE,1.0
2.0,CHEF SHOPPE,10.0
3.0,CNTRL/STORE SUP,2.5
4.0,COSMETICS,96.25


In [47]:
# Attach the department-wide rate to every (user, department) row.
user_category_buy_per_week = user_category_buy_per_week.merge(all_users_category_buy_per_week, on='department', how='left')
# Feature 1: user's rate minus the department-wide rate.
user_category_buy_per_week['user_category_buy_per_week_1'] = user_category_buy_per_week['user_category_buy_per_week'] - user_category_buy_per_week['all_users_category_buy_per_week']
# Feature 2: user's rate divided by the department-wide rate.
user_category_buy_per_week['user_category_buy_per_week_2'] = user_category_buy_per_week['user_category_buy_per_week'] / user_category_buy_per_week['all_users_category_buy_per_week']
# Keep only the merge keys and the two derived features.
user_category_buy_per_week = user_category_buy_per_week[['user_id', 'user_category_buy_per_week_1', 'user_category_buy_per_week_2', 'department']]
user_category_buy_per_week.head()

Unnamed: 0,user_id,user_category_buy_per_week_1,user_category_buy_per_week_2,department
,,,,
0.0,1.0,-345.897025,0.003632,DELI
1.0,1.0,-348.4,0.005708,DRUG GM
2.0,1.0,-3825.789474,0.001305,GROCERY
3.0,1.0,-612.663158,0.00163,MEAT
4.0,1.0,-604.136842,0.0033,MEAT-PCKGD


In [48]:
df_ranker_train = df_ranker_train.merge(user_category_buy_per_week, on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,average_check,purchases_freq,price,item_purchases_freq,mean_category_check_price,"(user_category_buy_per_week_1, )","(user_category_buy_per_week_2, )"
0,338,13002975,1.0,2843,MEAT,National,BEEF,RIBS,,,...,0.002101,0.00043,0.001724,13.0875,0.510204,4.12018,2.952381,-0.386702,-611.740081,0.003134
1,338,1026118,1.0,2,PRODUCE,National,TOMATOES,ROMA TOMATOES (BULK/PKG),25 LB,,...,0.002101,0.003824,0.001724,13.0875,0.510204,3.376886,7.648936,-0.954724,-788.433956,0.003019


In [49]:
# The two per-week features arrive under tuple labels (name, '') because their source
# frame had two-level columns (see the recorded output of the preceding merge);
# rename them to plain string names.
df_ranker_train.rename(columns={('user_category_buy_per_week_1', '') : "user_category_buy_per_week_1",
                                ('user_category_buy_per_week_2', '') : "user_category_buy_per_week_2"},
                      inplace=True) 

#### В данных остались некоторые пропуски (в категориальных фичах), но их простое заполнение модой ухудшало общий результат. Теоретически можно было бы попробовать заполнение через similar_users, предполагая, что пользователи с похожими покупками имеют одинаковый возраст, семейное положение и т.д. Здесь не реализовано.

**Обучение**

In [50]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

Приводим категориальные фичи к типу `category` (OHE здесь не выполняется)

In [51]:
# Categorical features are taken by position: columns 2..14 of X_train, i.e. the
# item and user attribute columns that follow user_id and item_id (listed in the
# output below). NOTE(review): this slice breaks silently if the column order changes.
cat_feats = X_train.columns[2:15].tolist()
# Cast them to the pandas 'category' dtype.
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

#### Параметры модели подобраны

In [52]:
# Second-level model: a binary classifier over (user, candidate item) rows.
# NOTE(review): `categorical_column` is presumably accepted as an alias of
# LightGBM's categorical_feature parameter — confirm against the LightGBM docs.
lgb = LGBMClassifier(objective='binary',
                     num_leaves=30,
                     max_depth=8,
                     n_estimators=600,
                     learning_rate=0.15,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

# Class probabilities for the same rows the model was fitted on.
train_preds = lgb.predict_proba(X_train)

In [53]:
df_ranker_predict = df_ranker_train.copy()

In [54]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

## Подведем итоги

Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандидатах от own_recommendations, что является тренировочным сетом, и теперь наша задача — предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [55]:
# Ground truth for the final evaluation: one row per test user holding the array
# of distinct items that user has in the test data.
result_eval_ranker = (
    df_test
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .rename(ACTUAL_COL)
    .reset_index()
)
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


## Eval matching on test dataset

In [56]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=50))

CPU times: user 3.84 s, sys: 254 ms, total: 4.1 s
Wall time: 4.15 s


In [57]:
# Measure precision@5 of the matching model alone, to see how much the ranking step changes the metric

sorted(calc_precision(result_eval_ranker, 5), key=lambda x: x[1], reverse=True)

[('own_rec', 0.15092936802973875)]

## Eval re-ranked matched result on test dataset

In [58]:
def rerank(user_id, N=5):
    """Return up to N candidate item ids for a user, highest predicted probability first.

    user_id -- value looked up in the USER_COL column of df_ranker_predict.
    N       -- maximum number of items to return; defaults to 5, the cut-off that
               was previously hard-coded, so existing calls behave as before.

    Returns a list of item ids. It is empty when the user has no rows in
    df_ranker_predict and shorter than N when the user has fewer candidates.
    """
    user_rows = df_ranker_predict[df_ranker_predict[USER_COL] == user_id]
    top_rows = user_rows.sort_values('proba_item_purchase', ascending=False).head(N)
    return top_rows.item_id.tolist()

In [59]:
# Re-ranked recommendations for every test user
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(rerank)

In [60]:
print(*sorted(calc_precision(result_eval_ranker, 5), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.21887666473653541)
('own_rec', 0.15092936802973875)


In [61]:
# Keep the user id and the final re-ranked list, exposing the list as 'recommendations'.
# `columns=` is required: a bare mapping passed to rename() is applied to the index
# labels, which left the column under its old name.
save_to_csv = result_eval_ranker[['user_id', 'reranked_own_rec']].rename(columns={'reranked_own_rec': 'recommendations'})

In [62]:
save_to_csv.to_csv('recommendations.csv')

## Выводы

Двухуровневая модель дала прирост > 0.065 в абсолютных единицах и на 45 % в относительных при сравнении с базовой моделью.

Направления улучшения:
1. Корректное заполнение пропусков.
2. OHE-Encoding.
3. Генерация дополнительных бизнес-фич.
4. Анализ других моделей для второго уровня (CatBoostClassifier).
5. Улучшение базовой модели MainRecommender для улучшения метрики recall.
6. Упаковка всего в пайплайн для подготовки в продакшн.