## Проект

1. [Загрузка данных](#load_data)
2. [Полезные функции](#useful_functions)
3. [Подготовка данных](#data_preparation)
4. [Построение модели первого уровня](#first_level_model)
5. [Генерация признаков для модели второго уровня](#features_engineering)
6. [Построение модели второго уровня](#second_level_model)
7. [Рекомендации для тестового датасета](#final_recommendations)

### 1. Загрузка данных <a class="anchor" id="load_data"></a>

Загрузим необходимые библиотеки и данные.

In [41]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

from gensim.models import Word2Vec

# Модель второго уровня
from catboost import CatBoostClassifier

import os, sys
sys.path.insert(1, os.getcwd() + '/src/')

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [42]:
data = pd.read_csv('webinar_8/course_project/data/retail_train.csv')
item_features = pd.read_csv('webinar_8/course_project/data/product.csv')
user_features = pd.read_csv('webinar_8/course_project/data/hh_demographic.csv')

### 2. Полезные функции <a class="anchor" id="useful_functions"></a>

Опишем функции, которые будут использоваться в дальнейшем.

In [43]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")
    
def make_recommendations(df_result, recommend_model, N_PREDICT=50, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

def calc_recall(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def calc_precision(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

def get_scores(df_result, recommend_model, N_PREDICT=50, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

### 3. Подготовка данных <a class="anchor" id="data_preparation"></a>

Проведем переименования определенных признаков и приведем названия к единообразному формату.

In [44]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

Разделим выборку на обучающий, валидационный и тестовый датасеты.

In [45]:
VAL_MATCHER_WEEKS = 5
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [46]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2136728, 12) Users: 2498 Items: 84180
val_matcher
Shape: (141762, 12) Users: 2097 Items: 25770
train_ranker
Shape: (141762, 12) Users: 2097 Items: 25770
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


Проведем префильтрацию данных.

In [47]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=10000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 84180 to 10001


Оставим только пользователей, которые встречаются в тренировочном датасете, чтобы избежать проблемы холодного старта.

In [48]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (872777, 13) Users: 2496 Items: 10001
val_matcher
Shape: (141737, 12) Users: 2095 Items: 25768
train_ranker
Shape: (141737, 12) Users: 2095 Items: 25768
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


### 4. Построение модели первого уровня <a class="anchor" id="first_level_model"></a>

Создадим экземляр класса MainRecommender.

In [49]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

In [50]:
ACTUAL_COL = 'actual'
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [51]:
N_PREDICT = 100

result_eval_matcher['own_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_own_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['als_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_als_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['bm25_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_bm25_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['tfidf_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_tfidf_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['cosine_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_cosine_recommendations, 
                                                      N_PREDICT=N_PREDICT)

In [52]:
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,als_rec,bm25_rec,tfidf_rec,cosine_rec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[940947, 856942, 10149640, 865456, 1004906, 55...","[1029743, 878285, 865456, 871570, 1006184, 872...","[1127831, 940947, 916122, 844179, 1004906, 878...","[916122, 1127831, 844179, 866211, 1029743, 102...","[6391068, 877391, 9297615, 5577022, 5582712, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[8090521, 1075368, 1040807, 5569230, 940947, 1...","[930118, 5569845, 5569230, 1106523, 12301109, ...","[1044078, 844179, 5569230, 916122, 8090521, 11...","[916122, 844179, 1029743, 1004906, 1044078, 55...","[8090521, 1103898, 978332, 1076580, 911974, 10..."


In [53]:
TOPK_RECALL = 100
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.1277189626164173),
 ('tfidf_rec', 0.12683248535219407),
 ('als_rec', 0.12391194179926528),
 ('cosine_rec', 0.12390732213587671),
 ('bm25_rec', 0.11469747149566534)]

Как видим лучшие результаты показывает модель на основе предыдущих покупок пользователя.

### 5. Генерация признаков для модели второго уровня <a class="anchor" id="features_engineering"></a>

Cформируем датасет для построения модели ранжирования, сгенерируем новые признаки и построим модель, которая будет работать с отобранными товарами для построения окончательных рекомендаций. 

In [54]:
# взяли пользователей из трейна для ранжирования

df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

df_match_candidates['candidates'] = make_recommendations(df_match_candidates, 
                                                         recommender.get_own_recommendations, 
                                                         N_PREDICT=N_PREDICT)

df_match_candidates['candidates_scores'] = get_scores(df_match_candidates, 
                                                      recommender.get_own_scores, 
                                                      N_PREDICT=N_PREDICT)

df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().\
            reset_index(level=1, drop=True)
df_scores = df_match_candidates.apply(lambda x: pd.Series(x['candidates_scores']), axis=1).stack().\
            reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop(['candidates', 'candidates_scores'], axis=1).join(df_items)

df_match_candidates['item_score'] = df_scores

df_match_candidates.head()

Unnamed: 0,user_id,item_id,item_score
0,1827,907631,57976.974666
0,1827,940947,42612.148521
0,1827,5568378,40380.408152
0,1827,854405,39497.493468
0,1827,1029743,28444.228691


In [55]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target
0,1827,907631,57976.974666,0.0
1,1827,940947,42612.148521,0.0
2,1827,5568378,40380.408152,0.0
3,1827,854405,39497.493468,0.0
4,1827,1029743,28444.228691,1.0


In [56]:
df_ranker_train.target.value_counts()

0.0    192109
1.0     14645
Name: target, dtype: int64

In [57]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,,,,,,
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,,,,,,


Сгенерируем новые признаки и добавим их к датасету для обучения.

In [58]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
data_department = data_train_ranker.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,1827,40702967646,601,891141,2,2.73,33923,0.0,7,87,0.0,0.0,PRODUCE
1,496,40739402373,603,891141,1,1.83,445,0.0,2226,87,0.0,0.0,PRODUCE


In [59]:
# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,,,,,,,2.208947
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,,,,,,,2.208947


In [60]:
# Количество покупок в каждой категории и средная сумма покупки в каждой категории для пользователя
users_sales_department = data_department.groupby([USER_COL, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)
users_sales_department['avg_transaction_category'] = users_sales_department['sales_value']\
                                                    /users_sales_department['n_sold_category']
users_sales_department.drop(columns=['sales_value'], inplace=True)

df_ranker_train = df_ranker_train.merge(
    users_sales_department, on=[USER_COL, 'department'], how='left')
df_ranker_train['Missing n_sold_category'] = 0
df_ranker_train.loc[df_ranker_train['n_sold_category'].isna(), 'Missing n_sold_category'] = 1
df_ranker_train['n_sold_category'].fillna(0, inplace=True)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,,,,,,,2.208947,25.0,2.2408,0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,,,,,,,2.208947,2.0,1.49,0


In [61]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
department_sales.tail(2)

n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

# Количество покупок юзером конкретной категории в неделю
users_department = data_department.groupby([USER_COL, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_ranker_train = df_ranker_train.merge(department_sales, on='department', how='left')
df_ranker_train = df_ranker_train.merge(users_department, on=[USER_COL, 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,,,,,2.208947,25.0,2.2408,0,2.541433,5.0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,,,,,2.208947,2.0,1.49,0,3.82821,0.4


In [62]:
# Цена
items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)

# Количество покупок товара в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks


df_ranker_train = df_ranker_train.merge(items_sales[[ITEM_COL,'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

df_ranker_train['Missing price'] = 0
df_ranker_train.loc[df_ranker_train['price'].isna(), 'Missing price'] = 1
df_ranker_train['price'].fillna(0, inplace=True)

df_ranker_train['Missing quantity per week'] = 0
df_ranker_train.loc[df_ranker_train['quantity_per_week'].isna(), 'Missing quantity per week'] = 1
df_ranker_train['quantity_per_week'].fillna(0, inplace=True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.985366,16.4,0,0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,2.208947,2.0,1.49,0,3.82821,0.4,2.650096,20.8,0,0
2,1827,5568378,40380.408152,0.0,544,GROCERY,National,BAG SNACKS,TORTILLA/NACHO CHIPS,12.5 OZ,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.381258,30.2,0,0
3,1827,854405,39497.493468,0.0,397,MEAT-PCKGD,National,MEAT - MISC,BREAST - BONELESS(IQF),40 OZ,...,2.208947,2.0,1.49,0,3.82821,0.4,6.779474,11.4,0,0
4,1827,1029743,28444.228691,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.424173,198.4,0,0


In [63]:
# Количество уникальных магазинов, в которых продавался товар
items_stores = data_department.groupby(ITEM_COL)['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores'}, inplace=True)
df_ranker_train = df_ranker_train.merge(items_stores, on=ITEM_COL, how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,25.0,2.2408,0,2.541433,5.0,2.985366,16.4,0,0,41.0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,2.0,1.49,0,3.82821,0.4,2.650096,20.8,0,0,49.0


In [64]:
# Среднее количество транзакций клиента в неделю
users_transactions = data_department.groupby(USER_COL)[ITEM_COL].count().reset_index()
users_transactions.rename(columns={'item_id': 'n_transactions_per_week'}, inplace=True)
users_transactions['n_transactions_per_week'] /= n_weeks


df_ranker_train = df_ranker_train.merge(users_transactions, on=USER_COL, how='left')

df_ranker_train.tail(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week
206752,1745,841443,3106.164464,0.0,1087,SEAFOOD-PCKGD,National,SEAFOOD - FROZEN,FRZN BRD STICK/PORTON,24.5 OZ,...,,1,5.810053,,0.0,0.0,1,1,,0.6
206753,1745,949120,3083.968051,0.0,1318,GROCERY,National,PICKLE/RELISH/PKLD VEG,PICKLES,24 OZ,...,2.663333,0,2.541433,0.6,0.0,0.0,1,1,,0.6


In [65]:
# Средний чек
users_sales = data_train_ranker.groupby(USER_COL)['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,0,2.541433,5.0,2.985366,16.4,0,0,41.0,6.0,2.798
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,0,3.82821,0.4,2.650096,20.8,0,0,49.0,6.0,2.798


In [66]:
# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby([USER_COL, 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby(USER_COL)['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,2.541433,5.0,2.985366,16.4,0,0,41.0,6.0,2.798,2.4
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,3.82821,0.4,2.650096,20.8,0,0,49.0,6.0,2.798,2.4


Построим признак, отражающий средний интервал между покупками пользователя.

In [67]:
users_days = data_department.groupby(USER_COL)['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[606, 608, 610, 620, 622, 632]"
1,2,"[608, 614, 620, 622]"
2,4,"[605, 617, 627]"
3,6,"[603, 607, 610, 611, 616, 619, 620, 624, 627, ..."
4,7,"[606, 610, 614, 623, 629]"


In [68]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_ranker_train = df_ranker_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,5.0,2.985366,16.4,0,0,41.0,6.0,2.798,2.4,7.0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,0.4,2.650096,20.8,0,0,49.0,6.0,2.798,2.4,7.0


Построим признак, в котором будет закодировано место товара в пяти последних покупках клиента.

In [69]:
users_items = data_train_ranker.groupby(USER_COL)[ITEM_COL].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[5577022, 8293439, 9526676, 9527558, 10149640]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
3,6,"[1099058, 895268, 1017061, 1082185, 1119051]"
4,7,"[9837501, 12524016, 13072715, 13987153, 13987338]"


In [70]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_ranker_train['Last5sales'] = df_ranker_train[[USER_COL, ITEM_COL]].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,2.985366,16.4,0,0,41.0,6.0,2.798,2.4,7.0,0
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,2.650096,20.8,0,0,49.0,6.0,2.798,2.4,7.0,0


Построим модель Word2Vec для получения эмбеддингов товаров, а затем посчитаем длину вектора, описывающего каждый товар, и расстояние от word2vec эмбеддинга товара до среднего эмбеддинга товаров, купленных юзером.

In [71]:
df_ = data_train_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
df_.head()

Unnamed: 0,user_id,item_id
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [72]:
purchases = []

for user in df_['user_id']:
    purchases.append([str(item) for item in df_[df_['user_id'] == user].item_id.values[0]])
    
print(f"Total # of Sessions: {len(purchases)}")

Total # of Sessions: 2095


In [73]:
w2v_model = Word2Vec(min_count=1, vector_size=100, sg=1, workers=3)
w2v_model.build_vocab(purchases, progress_per=100)
w2v_model.train(purchases, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(1209222, 1214930)

In [74]:
def word2vec_len(itemid):
    try:
        return sum([i**2 for i in w2v_model.wv[str(itemid)]])
    except:
        return -1

df_ranker_train['Word2Vec_length'] = df_ranker_train[ITEM_COL].apply(lambda x: word2vec_len(x))
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,16.4,0,0,41.0,6.0,2.798,2.4,7.0,0,7.754568
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,20.8,0,0,49.0,6.0,2.798,2.4,7.0,0,5.292544


In [75]:
def avg_word2vec(items):
    return sum([w2v_model.wv[str(item)] for item in items]) / len(items)

df_['Avg_Word2Vec'] = df_[ITEM_COL].apply(avg_word2vec)
df_.head()

Unnamed: 0,user_id,item_id,Avg_Word2Vec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[-0.101323694, 0.18119045, 0.20122087, 0.02709..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[-0.089792326, 0.2482764, 0.19895303, 0.067954..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[-0.09244502, 0.22552814, 0.20331387, 0.038374..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98...","[-0.084583975, 0.22696072, 0.1969174, 0.045245..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[-0.09863084, 0.21351701, 0.21236424, 0.027541..."


In [76]:
def get_w2v_distance(x, df=df_):
    avg_w2v = df.loc[df_[USER_COL] == x[0], 'Avg_Word2Vec'].item()
    try:
        return sum((w2v_model.wv[str(x[1])] - avg_w2v) ** 2)
    except:
        return -1
    
df_ranker_train['Word2Vec_distance_from_avg'] = df_ranker_train[[USER_COL, ITEM_COL]].\
                                                apply(get_w2v_distance, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length,Word2Vec_distance_from_avg
0,1827,907631,57976.974666,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,...,0,0,41.0,6.0,2.798,2.4,7.0,0,7.754568,3.749243
1,1827,940947,42612.148521,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,...,0,0,49.0,6.0,2.798,2.4,7.0,0,5.292544,1.855993


### 6. Построение модели второго уровня <a class="anchor" id="second_level_model"></a>

Теперь обучим модель ранжирования и посчитаем метрики.

In [77]:
X_train = df_ranker_train.drop(['target', 
                                'Missing n_sold_category', 
                                'n_sold_category_user_week', 
                                'mean_sales_value_category',], axis=1)
y_train = df_ranker_train['target']

In [78]:
cat_feats = ['manufacturer', 
             'department', 
             'brand', 
             'commodity_desc',
             'sub_commodity_desc',
             'curr_size_of_product',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
             'kid_category_desc',
             'Missing price',
             'Missing quantity per week',
             'Last5sales',
            ]


for column in cat_feats:
    X_train[column].fillna(0, inplace=True)
    
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [79]:
%%time
cb = CatBoostClassifier(learning_rate=0.1,
                        max_depth=12,
                        n_estimators=500,
                        random_state=42, 
                        cat_features=cat_feats, 
                        silent=False)

cb.fit(X_train, y_train)

train_preds = cb.predict_proba(X_train)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.5463991	total: 668ms	remaining: 5m 33s
1:	learn: 0.4593857	total: 921ms	remaining: 3m 49s
2:	learn: 0.3922666	total: 1.6s	remaining: 4m 25s
3:	learn: 0.3465231	total: 2.31s	remaining: 4m 46s
4:	learn: 0.3138859	total: 2.61s	remaining: 4m 18s
5:	learn: 0.2792883	total: 3.69s	remaining: 5m 3s
6:	learn: 0.2533568	total: 4.75s	remaining: 5m 34s
7:	learn: 0.2368378	total: 5.37s	remaining: 5m 30s
8:	learn: 0.2254653	total: 6.37s	remaining: 5m 47s
9:	learn: 0.2155984	total: 7.3s	remaining: 5m 57s
10:	learn: 0.2080455	total: 8.16s	remaining: 6m 2s
11:	learn: 0.2024231	total: 9.12s	remaining: 6m 10s
12:	learn: 0.1988072	total: 10.2s	remaining: 6m 20s
13:	learn: 0.1947724	total: 11.1s	remaining: 6m 26s
14:	learn: 0.1932724	total: 11.2s	remaining: 6m 3s
15:	learn: 0.1902265	total: 12.1s	remaining: 6m 4s
16:	learn: 0.1884176	total: 13s	remaining: 6m 9s
17:	learn: 0.1866174	total: 14s	remaining: 6m 15s
18:	learn: 0.1852144	total: 14.9s	remaining: 6m 16s
19:	learn: 0.1847484	total: 15s	r

157:	learn: 0.1303635	total: 2m 28s	remaining: 5m 22s
158:	learn: 0.1300820	total: 2m 29s	remaining: 5m 20s
159:	learn: 0.1298321	total: 2m 30s	remaining: 5m 19s
160:	learn: 0.1292162	total: 2m 31s	remaining: 5m 18s
161:	learn: 0.1288727	total: 2m 32s	remaining: 5m 18s
162:	learn: 0.1285483	total: 2m 33s	remaining: 5m 16s
163:	learn: 0.1283428	total: 2m 34s	remaining: 5m 15s
164:	learn: 0.1281675	total: 2m 34s	remaining: 5m 14s
165:	learn: 0.1278910	total: 2m 35s	remaining: 5m 13s
166:	learn: 0.1277002	total: 2m 36s	remaining: 5m 12s
167:	learn: 0.1272375	total: 2m 37s	remaining: 5m 11s
168:	learn: 0.1270294	total: 2m 38s	remaining: 5m 10s
169:	learn: 0.1268416	total: 2m 39s	remaining: 5m 9s
170:	learn: 0.1264960	total: 2m 40s	remaining: 5m 9s
171:	learn: 0.1262525	total: 2m 41s	remaining: 5m 8s
172:	learn: 0.1259197	total: 2m 42s	remaining: 5m 7s
173:	learn: 0.1255224	total: 2m 43s	remaining: 5m 6s
174:	learn: 0.1252158	total: 2m 44s	remaining: 5m 5s
175:	learn: 0.1250189	total: 2m 45

310:	learn: 0.0939054	total: 5m 10s	remaining: 3m 8s
311:	learn: 0.0938173	total: 5m 11s	remaining: 3m 7s
312:	learn: 0.0936595	total: 5m 12s	remaining: 3m 6s
313:	learn: 0.0935067	total: 5m 13s	remaining: 3m 5s
314:	learn: 0.0933550	total: 5m 14s	remaining: 3m 4s
315:	learn: 0.0932699	total: 5m 15s	remaining: 3m 3s
316:	learn: 0.0930932	total: 5m 16s	remaining: 3m 2s
317:	learn: 0.0928463	total: 5m 17s	remaining: 3m 1s
318:	learn: 0.0927481	total: 5m 17s	remaining: 3m
319:	learn: 0.0923208	total: 5m 18s	remaining: 2m 59s
320:	learn: 0.0922537	total: 5m 19s	remaining: 2m 58s
321:	learn: 0.0921765	total: 5m 20s	remaining: 2m 57s
322:	learn: 0.0920876	total: 5m 21s	remaining: 2m 56s
323:	learn: 0.0919654	total: 5m 22s	remaining: 2m 55s
324:	learn: 0.0914982	total: 5m 23s	remaining: 2m 54s
325:	learn: 0.0914001	total: 5m 25s	remaining: 2m 53s
326:	learn: 0.0912464	total: 5m 25s	remaining: 2m 52s
327:	learn: 0.0909903	total: 5m 26s	remaining: 2m 51s
328:	learn: 0.0905150	total: 5m 27s	rema

464:	learn: 0.0695384	total: 7m 43s	remaining: 34.9s
465:	learn: 0.0694070	total: 7m 44s	remaining: 33.9s
466:	learn: 0.0692703	total: 7m 45s	remaining: 32.9s
467:	learn: 0.0692030	total: 7m 46s	remaining: 31.9s
468:	learn: 0.0690725	total: 7m 46s	remaining: 30.9s
469:	learn: 0.0689900	total: 7m 47s	remaining: 29.9s
470:	learn: 0.0687428	total: 7m 48s	remaining: 28.9s
471:	learn: 0.0685796	total: 7m 49s	remaining: 27.9s
472:	learn: 0.0684039	total: 7m 50s	remaining: 26.9s
473:	learn: 0.0683000	total: 7m 51s	remaining: 25.9s
474:	learn: 0.0682397	total: 7m 52s	remaining: 24.9s
475:	learn: 0.0680422	total: 7m 53s	remaining: 23.9s
476:	learn: 0.0677601	total: 7m 54s	remaining: 22.9s
477:	learn: 0.0677148	total: 7m 55s	remaining: 21.9s
478:	learn: 0.0675272	total: 7m 56s	remaining: 20.9s
479:	learn: 0.0674147	total: 7m 57s	remaining: 19.9s
480:	learn: 0.0671361	total: 7m 58s	remaining: 18.9s
481:	learn: 0.0671116	total: 7m 59s	remaining: 17.9s
482:	learn: 0.0669597	total: 8m	remaining: 16.

In [80]:
fi = pd.DataFrame(cb.feature_importances_, index=X_train.columns, columns=['importance'])
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
n_sold_category,11.739362
avg_transaction_category,10.223615
item_score,6.996947
n_transactions_per_week,5.606859
price,5.601787
Word2Vec_distance_from_avg,5.012876
item_id,4.487601
user_id,4.347251
avg_basket_department,4.181837
sub_commodity_desc,4.078552


Оценим качество построенной модели с помощью метрики precision@5 на валидационном датасете.

In [81]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [82]:
N_PREDICT = 100
TOPK_PRECISION = 5

result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker['own_rec'] = make_recommendations(result_eval_ranker, 
                                                     recommender.get_own_recommendations, N_PREDICT=N_PREDICT)

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.24029411764705885)]

In [83]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].\
                                            apply(lambda user_id: rerank(user_id, df_ranker_predict))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2965333333333333)
('own_rec', 0.24029411764705885)


  return flags.sum() / len(recommended_list)


Как видим с помощью модели второго уровня удалось значительно поднять метрики для построенных рекомендаций.

### 7. Рекомендации для тестового датасета <a class="anchor" id="final_recommendations"></a>

Теперь построим предсказания для итогового датасета.

In [84]:
data_test = pd.read_csv('retail_test1.csv')
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [85]:
df_test_candidates = pd.DataFrame(data_test[USER_COL].unique())
df_test_candidates.columns = [USER_COL]

df_test_candidates['recommendations'] = df_test_candidates[USER_COL].\
                                        apply(lambda user_id: rerank(user_id, df_ranker_predict))

In [94]:
df_test_candidates.to_csv('recommendations.csv', index=False)
df_test_candidates.head()

Unnamed: 0,user_id,recommendations
0,1340,"[912704, 1037840, 1087755, 6602685, 971646]"
1,588,"[1106523, 907631, 866211, 919534, 1041796]"
2,2070,"[865456, 12810391, 933067, 913210, 916122]"
3,1602,"[1082185, 1070820, 944836, 1104343, 848319]"
4,447,"[899624, 1051211, 5569327, 866488, 1031864]"
