<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1.-Загрузка-данных-" data-toc-modified-id="1.-Загрузка-данных--1"><span class="toc-item-num">1&nbsp;&nbsp;</span>1. Загрузка данных <a class="anchor" id="load_data" rel="nofollow"></a></a></span></li><li><span><a href="#2.-Вспомогательные-функции" data-toc-modified-id="2.-Вспомогательные-функции-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>2. Вспомогательные функции</a></span></li><li><span><a href="#3.-Подготовка-данных" data-toc-modified-id="3.-Подготовка-данных-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>3. Подготовка данных</a></span></li><li><span><a href="#4.-Построение-модели-первого-уровня" data-toc-modified-id="4.-Построение-модели-первого-уровня-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>4. Построение модели первого уровня</a></span></li><li><span><a href="#5.-Генерация-признаков-для-модели-второго-уровня" data-toc-modified-id="5.-Генерация-признаков-для-модели-второго-уровня-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>5. Генерация признаков для модели второго уровня</a></span></li><li><span><a href="#6.-Построение-модели-второго-уровня-" data-toc-modified-id="6.-Построение-модели-второго-уровня--6"><span class="toc-item-num">6&nbsp;&nbsp;</span>6. Построение модели второго уровня <a class="anchor" id="second_level_model" rel="nofollow"></a></a></span></li><li><span><a href="#7.-Рекомендации-для-тестового-датасета" data-toc-modified-id="7.-Рекомендации-для-тестового-датасета-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>7. Рекомендации для тестового датасета</a></span></li></ul></div>

### 1. Загрузка данных <a class="anchor" id="load_data"></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from implicit import als

from gensim.models import Word2Vec

from catboost import CatBoostClassifier

import os, sys
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [3]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

### 2. Вспомогательные функции

In [4]:
item_col = 'item_id'
user_col = 'user_id'
    
def make_recommendations(df_result, recommend_model, n_predict=50, user_col='user_id'):
    return df_result[user_col].apply(lambda x: recommend_model(x, N=n_predict))

def calc_precision(df_data, top_k, actual_col='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[actual_col], k=top_k), axis=1).mean()
        
def rerank(user_id, df, user_col='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[user_col]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

def get_scores(df_result, recommend_model, n_predict=50, user_col='user_id'):
    return df_result[user_col].apply(lambda x: recommend_model(x, N=n_predict))

### 3. Подготовка данных

Проведем переименования определенных признаков и приведем названия к единообразному формату.

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': item_col}, inplace=True)
user_features.rename(columns={'household_key': user_col }, inplace=True)

Разделим выборку на обучающий, валидационный и тестовый датасеты.

In [6]:
train_weeks = 6
val_weeks = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (train_weeks + val_weeks)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (train_weeks + val_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_weeks))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - val_weeks]

In [7]:
dfs = {
    'train_matcher': data_train_matcher,
    'val_matcher': data_val_matcher,
    'train_ranker': data_train_ranker,
    'val_ranker': data_val_ranker
}

In [8]:
for key, value in dfs.items():
    print(key)
    print(f"Shape: {value.shape} Users: {value[user_col].nunique()} Items: {value[item_col].nunique()}")
    print()

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685

val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649

train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649

val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329



Проведем префильтрацию данных.

In [9]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


Оставим только пользователей, которые встречаются в тренировочном датасете

In [10]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]


for key, value in dfs.items():
    print(key)
    print(f"Shape: {value.shape} Users: {value[user_col].nunique()} Items: {value[item_col].nunique()}")
    print()

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685

val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649

train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649

val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329



### 4. Построение модели первого уровня

Создадим экземляр класса MainRecommender.

In [11]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [12]:
actual_col = 'actual'
result_eval_matcher = data_val_matcher.groupby(user_col)[item_col].unique().reset_index()
result_eval_matcher.columns=[user_col, actual_col]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [13]:
n_predict = 100

result_eval_matcher['own_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_own_recommendations, 
                                                      n_predict=n_predict)

result_eval_matcher['als_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_als_recommendations, 
                                                      n_predict=n_predict)

result_eval_matcher['bm25_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_bm25_recommendations, 
                                                      n_predict=n_predict)

In [14]:
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,als_rec,bm25_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[940947, 1004906, 865456, 856942, 10149640, 10...","[1082212, 823721, 5569374, 10149640, 965766, 1...","[916122, 1004906, 1127831, 844179, 866211, 878..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[8090521, 5569230, 1106523, 1075368, 916122, 1...","[5569230, 1021324, 8090521, 940947, 916122, 10...","[916122, 5569230, 844179, 1044078, 1004906, 10..."


### 5. Генерация признаков для модели второго уровня

Cформируем датасет для построения модели ранжирования, сгенерируем новые признаки и построим модель, которая будет работать с отобранными товарами для построения окончательных рекомендаций. 

In [15]:
# взяли пользователей из трейна для ранжирования

df_match_candidates = pd.DataFrame(data_train_ranker[user_col].unique())
df_match_candidates.columns = [user_col]


df_match_candidates['candidates'] = make_recommendations(df_match_candidates, 
                                                         recommender.get_own_recommendations, 
                                                         n_predict=n_predict)


df_match_candidates['candidates_scores'] = get_scores(df_match_candidates, 
                                                      recommender.get_own_scores, 
                                                      n_predict=n_predict)

df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().\
            reset_index(level=1, drop=True)
df_scores = df_match_candidates.apply(lambda x: pd.Series(x['candidates_scores']), axis=1).stack().\
            reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop(['candidates', 'candidates_scores'], axis=1).join(df_items)

df_match_candidates['item_score'] = df_scores

df_match_candidates.head()

Unnamed: 0,user_id,item_id,item_score
0,2070,1029743,98746.436459
0,2070,913210,88619.808098
0,2070,5569374,65887.813512
0,2070,838186,57590.136172
0,2070,1127831,56727.92847


In [16]:
df_ranker_train = data_train_ranker[[user_col, item_col]].copy()
df_ranker_train['target'] = 1  # только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[user_col, item_col], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[user_col, item_col])

df_ranker_train['target'].fillna(0, inplace= True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target
0,2070,1029743,98746.436459,0.0
1,2070,913210,88619.808098,1.0
2,2070,5569374,65887.813512,0.0
3,2070,838186,57590.136172,1.0
4,2070,1127831,56727.92847,0.0


In [17]:
df_ranker_train.target.value_counts()

0.0    194875
1.0     16046
Name: target, dtype: int64

In [18]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


Сгенерируем новые признаки и добавим их к датасету для обучения.

In [19]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
data_department = data_train_ranker.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


In [20]:
# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby(user_col)[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173


In [21]:
# Количество покупок в каждой категории и средная сумма покупки в каждой категории для пользователя
users_sales_department = data_department.groupby([user_col, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)
users_sales_department['avg_transaction_category'] = users_sales_department['sales_value']\
                                                    /users_sales_department['n_sold_category']
users_sales_department.drop(columns=['sales_value'], inplace=True)

df_ranker_train = df_ranker_train.merge(
    users_sales_department, on=[user_col, 'department'], how='left')
df_ranker_train['missing_n_sold_category'] = 0
df_ranker_train.loc[df_ranker_train['n_sold_category'].isna(), 'missing_n_sold_category'] = 1
df_ranker_train['n_sold_category'].fillna(0, inplace=True)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,missing_n_sold_category
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549,0
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549,0


In [22]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
department_sales.tail(2)

n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

# Количество покупок юзером конкретной категории в неделю
users_department = data_department.groupby([user_col, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_ranker_train = df_ranker_train.merge(department_sales, on='department', how='left')
df_ranker_train = df_ranker_train.merge(users_department, on=[user_col, 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5


In [23]:
# Цена
items_sales = data_department.groupby(item_col)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)

# Количество покупок товара в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks


df_ranker_train = df_ranker_train.merge(items_sales[[item_col,'price', 'quantity_per_week']],
                                        on=item_col, how='left')

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,2.336216,211.833333
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,3.976777,20.166667
2,2070,5569374,65887.813512,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,...,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,4.116522,3.833333
3,2070,838186,57590.136172,1.0,1790,GROCERY,National,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,...,1,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,3.99,9.333333
4,2070,1127831,56727.92847,0.0,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,...,1,None/Unknown,0.035173,16.0,1.649375,0,2.314031,2.666667,3.911265,27.666667


In [24]:
# Количество уникальных магазинов, в которых продавался товар
items_stores = data_department.groupby(item_col)['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores'}, inplace=True)
df_ranker_train = df_ranker_train.merge(items_stores, on=item_col, how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,2.336216,211.833333,110.0
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,None/Unknown,0.035173,213.0,1.461549,0,2.531969,35.5,3.976777,20.166667,54.0


In [25]:
# Среднее количество транзакций клиента в неделю
users_transactions = data_department.groupby(user_col)[item_col].count().reset_index()
users_transactions.rename(columns={'item_id': 'n_transactions_per_week'}, inplace=True)
users_transactions['n_transactions_per_week'] /= n_weeks


df_ranker_train = df_ranker_train.merge(users_transactions, on=user_col, how='left')

df_ranker_train.tail(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_price,n_sold_category,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week
210919,1745,1070820,26048.034628,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.794,3.0,2.663333,0,2.531969,0.5,2.273415,68.333333,84.0,0.5
210920,1745,1127831,25777.552607,0.0,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,...,2.794,0.0,,1,2.314031,,3.911265,27.666667,56.0,0.5


In [26]:
# Средний чек
users_sales = data_train_ranker.groupby(user_col)['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,213.0,1.461549,0,2.531969,35.5,2.336216,211.833333,110.0,34.0,3.025931
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,213.0,1.461549,0,2.531969,35.5,3.976777,20.166667,54.0,34.0,3.025931


In [27]:
# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby([user_col, 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby(user_col)['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_transaction_category,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,1.461549,0,2.531969,35.5,2.336216,211.833333,110.0,34.0,3.025931,1.790698
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,1.461549,0,2.531969,35.5,3.976777,20.166667,54.0,34.0,3.025931,1.790698


Построим признак, отражающий средний интервал между покупками пользователя.

In [28]:
users_days = data_department.groupby(user_col)['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[597, 606, 608, 610, 620, 622, 632]"
1,2,"[608, 614, 620, 622]"
2,4,"[605, 617, 627]"
3,6,"[597, 598, 603, 607, 610, 611, 616, 619, 620, ..."
4,7,"[606, 610, 614, 623, 629]"


In [29]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_ranker_train = df_ranker_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,missing_n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,0,2.531969,35.5,2.336216,211.833333,110.0,34.0,3.025931,1.790698,1.444444
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,0,2.531969,35.5,3.976777,20.166667,54.0,34.0,3.025931,1.790698,1.444444


Построим признак, в котором будет закодировано место товара в пяти последних покупках клиента.

In [30]:
users_items = data_train_ranker.groupby(user_col)[item_col].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[5577022, 8293439, 9526676, 9527558, 10149640]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
3,6,"[1099058, 895268, 1017061, 1082185, 1119051]"
4,7,"[9837501, 12524016, 13072715, 13987153, 13987338]"


In [31]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_ranker_train['last_5_sales'] = df_ranker_train[[user_col, item_col]].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,last_5_sales
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.531969,35.5,2.336216,211.833333,110.0,34.0,3.025931,1.790698,1.444444,0
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,2.531969,35.5,3.976777,20.166667,54.0,34.0,3.025931,1.790698,1.444444,0


Построим модель Word2Vec для получения эмбеддингов товаров, а затем посчитаем длину вектора, описывающего каждый товар, и расстояние от word2vec эмбеддинга товара до среднего эмбеддинга товаров, купленных юзером.

In [32]:
df_ = data_train_ranker.groupby(user_col)[item_col].unique().reset_index()
df_.head()

Unnamed: 0,user_id,item_id
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [33]:
purchases = []

for user in df_['user_id']:
    purchases.append([str(item) for item in df_[df_['user_id'] == user].item_id.values[0]])
    
print(f"Total # of Sessions: {len(purchases)}")

Total # of Sessions: 2151


In [34]:
w2v_model = Word2Vec(min_count=1, vector_size=100, sg=1, workers=3)
w2v_model.build_vocab(purchases)
w2v_model.train(purchases, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(1412911, 1418460)

In [35]:
def word2vec_len(itemid):
    try:
        return sum([i**2 for i in w2v_model.wv[str(itemid)]])
    except:
        return -1

df_ranker_train['Word2Vec_length'] = df_ranker_train[item_col].apply(lambda x: word2vec_len(x))
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category_user_week,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,last_5_sales,Word2Vec_length
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,35.5,2.336216,211.833333,110.0,34.0,3.025931,1.790698,1.444444,0,8.762437
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,35.5,3.976777,20.166667,54.0,34.0,3.025931,1.790698,1.444444,0,6.106701


In [36]:
def avg_word2vec(items):
    return sum([w2v_model.wv[str(item)] for item in items]) / len(items)

df_['avg_word2vec'] = df_[item_col].apply(avg_word2vec)
df_.head()

Unnamed: 0,user_id,item_id,avg_word2vec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[-0.11209027, 0.19833992, 0.115546286, 0.00687..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[-0.09091413, 0.2487776, 0.095322825, 0.017014..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[-0.0974842, 0.21670204, 0.099092655, 0.008806..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[-0.106671475, 0.22247633, 0.106515184, 0.0114..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[-0.12720148, 0.21417151, 0.11428878, -0.00454..."


In [37]:
def get_w2v_distance(x, df=df_):
    avg_w2v = df.loc[df_[user_col] == x[0], 'avg_word2vec'].item()
    try:
        return sum((w2v_model.wv[str(x[1])] - avg_w2v) ** 2)
    except:
        return -1
    
df_ranker_train['word2vec_distance_from_avg'] = df_ranker_train[[user_col, item_col]].\
                                                apply(get_w2v_distance, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,price,quantity_per_week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,last_5_sales,Word2Vec_length,word2vec_distance_from_avg
0,2070,1029743,98746.436459,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.336216,211.833333,110.0,34.0,3.025931,1.790698,1.444444,0,8.762437,4.986946
1,2070,913210,88619.808098,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,...,3.976777,20.166667,54.0,34.0,3.025931,1.790698,1.444444,0,6.106701,2.530061


### 6. Построение модели второго уровня <a class="anchor" id="second_level_model"></a>

Теперь обучим модель ранжирования и посчитаем метрики.

In [38]:
X_train = df_ranker_train.drop(['target', 
                                'missing_n_sold_category', 
                                'n_sold_category_user_week', 
                                'mean_sales_value_category',], axis=1)
y_train = df_ranker_train['target']

In [39]:
cat_feats = ['manufacturer', 
             'department', 
             'brand', 
             'commodity_desc',
             'sub_commodity_desc',
             'curr_size_of_product',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
             'kid_category_desc',
             'last_5_sales',
            ]


for column in cat_feats:
    X_train[column].fillna(0, inplace=True)
    
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [40]:
%%time
cb = CatBoostClassifier(learning_rate=0.1,
                        max_depth=10,
                        n_estimators=100,
                        random_state=42, 
                        cat_features=cat_feats, 
                        silent=False)

cb.fit(X_train, y_train)

train_preds = cb.predict_proba(X_train)

0:	learn: 0.5688726	total: 205ms	remaining: 20.3s
1:	learn: 0.4771405	total: 312ms	remaining: 15.3s
2:	learn: 0.4001963	total: 437ms	remaining: 14.1s
3:	learn: 0.3524875	total: 553ms	remaining: 13.3s
4:	learn: 0.3241909	total: 602ms	remaining: 11.4s
5:	learn: 0.2911653	total: 722ms	remaining: 11.3s
6:	learn: 0.2755928	total: 767ms	remaining: 10.2s
7:	learn: 0.2542446	total: 874ms	remaining: 10s
8:	learn: 0.2422407	total: 933ms	remaining: 9.43s
9:	learn: 0.2306505	total: 1.04s	remaining: 9.4s
10:	learn: 0.2230853	total: 1.16s	remaining: 9.35s
11:	learn: 0.2171845	total: 1.27s	remaining: 9.3s
12:	learn: 0.2131758	total: 1.38s	remaining: 9.26s
13:	learn: 0.2095645	total: 1.51s	remaining: 9.3s
14:	learn: 0.2078292	total: 1.56s	remaining: 8.83s
15:	learn: 0.2053612	total: 1.68s	remaining: 8.8s
16:	learn: 0.2030424	total: 1.79s	remaining: 8.75s
17:	learn: 0.2010822	total: 1.91s	remaining: 8.71s
18:	learn: 0.1990530	total: 2.03s	remaining: 8.65s
19:	learn: 0.1981270	total: 2.13s	remaining: 8.

In [41]:
fi = pd.DataFrame(cb.feature_importances_, index=X_train.columns, columns=['importance'])
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
avg_transaction_category,26.120464
n_sold_category,15.639314
n_transactions_per_week,9.200169
item_score,7.976595
price,6.739225
department,4.550257
quantity_per_week,4.449867
word2vec_distance_from_avg,3.487894
Word2Vec_length,3.061242
commodity_desc,2.596142


Оценим качество построенной модели с помощью метрики precision@5 на валидационном датасете.

In [42]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [43]:
n_predict = 100
TOPK_PRECISION = 5

result_eval_ranker = data_val_ranker.groupby(user_col)[item_col].unique().reset_index()
result_eval_ranker.columns=[user_col, actual_col]
result_eval_ranker['own_rec'] = make_recommendations(result_eval_ranker, 
                                                     recommender.get_own_recommendations, n_predict=n_predict)

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.23225490196078435)]

In [44]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[user_col].\
                                            apply(lambda user_id: rerank(user_id, df_ranker_predict))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.27582245430809404)
('own_rec', 0.23225490196078435)


C помощью модели второго уровня удалось поднять метрики для построенных рекомендаций.

### 7. Рекомендации для тестового датасета

In [48]:
data_test = pd.read_csv('data/retail_test1.csv')
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [49]:
df_test_candidates = pd.DataFrame(data_test[user_col].unique())
df_test_candidates.columns = [user_col]

df_test_candidates['recommendations'] = df_test_candidates[user_col].\
                                        apply(lambda user_id: rerank(user_id, df_ranker_predict))

In [50]:
df_test_candidates.to_csv('recommendations.csv', index=False)
df_test_candidates.head()

Unnamed: 0,user_id,recommendations
0,1340,"[912704, 1037840, 1087755, 1075368, 906864]"
1,588,"[1106523, 933835, 1050851, 866211, 1068719]"
2,2070,"[1029743, 913210, 999270, 933067, 12810391]"
3,1602,"[1070820, 1082185, 1098844, 1029743, 1044259]"
4,447,"[1029743, 926905, 1106523, 1075368, 899624]"
