### HW lesson 6

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [4]:
VAL_LVL1_WEEKS = 6
VAL_LVL2_WEEKS = 3

In [5]:
data_train_lvl1 = data[data['week_no'] < data['week_no'].max() - (VAL_LVL1_WEEKS + VAL_LVL2_WEEKS)]

data_val_lvl1 = data[(data['week_no'] >= data['week_no'].max() - (VAL_LVL1_WEEKS + VAL_LVL2_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_LVL2_WEEKS))]


data_train_lvl2 = data_val_lvl1.copy()  

data_val_lvl2 = data[data['week_no'] >= data['week_no'].max() - VAL_LVL2_WEEKS]

In [6]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data['user_id'].nunique()} Items: {df_data['item_id'].nunique()}")

In [7]:
print_stats_data(data_train_lvl1,'train_matcher - data lvl1')
print_stats_data(data_val_lvl1,'val_matcher - data lvl1')
print_stats_data(data_train_lvl2,'train_ranker - data lvl2')
print_stats_data(data_val_lvl2,'val_ranker - data lvl2')

train_matcher - data lvl1
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher - data lvl1
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker - data lvl2
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker - data lvl2
Shape: (118314, 12) Users: 2042 Items: 24329


In [8]:
data_train_lvl1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


#### Префильтрация items

In [9]:
n_items_before = data_train_lvl1['item_id'].nunique()

data_train_lvl1 = prefilter_items(data_train_lvl1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [10]:
data_train_lvl1.user_id.values

array([2375, 1364, 1364, ...,  856,  856,  856], dtype=int64)

In [11]:
common_users = list(set(data_train_lvl1.user_id.values)&(set(data_val_lvl1.user_id.values))\
                    &set(data_val_lvl2.user_id.values))

data_train_lvl1 = data_train_lvl1[data_train_lvl1.user_id.isin(common_users)]
data_val_lvl1 = data_val_lvl1[data_val_lvl1.user_id.isin(common_users)]
data_train_lvl2 = data_train_lvl2[data_train_lvl2.user_id.isin(common_users)]
data_val_lvl2 = data_val_lvl2[data_val_lvl2.user_id.isin(common_users)]

print_stats_data(data_train_lvl1,'train_matcher - data lvl1')
print_stats_data(data_val_lvl1,'val_matcher - data lvl1')
print_stats_data(data_train_lvl2,'train_ranker - data lvl2')
print_stats_data(data_val_lvl2,'val_ranker - data lvl2')

train_matcher - data lvl1
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher - data lvl1
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker - data lvl2
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker - data lvl2
Shape: (115989, 12) Users: 1915 Items: 24042


#### recall@k

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?



In [12]:
ACTUAL_COL = 'actual'

def make_recommendations(df_result, recommend_model, N=50, user_col='user_id'):
    return df_result[user_col].apply(lambda x: recommend_model(x, N=N))

# расчет recall@k
def calc_recall_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, round((df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()), 4)

# расчет precision@k
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [13]:
result_eval_lvl1 = data_val_lvl1.groupby('user_id')['item_id'].unique().reset_index()
result_eval_lvl1.columns=['user_id', 'actual']
result_eval_lvl1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [14]:
%%time

models = {'als_rec': recommender.get_als_recommendations,
          'own_rec': recommender.get_own_recommendations, 
          'similar_item_rec': recommender.get_similar_items_recommendation, 
          'similar_user_rec': recommender.get_similar_users_recommendation}

for col_name, model in models.items():
    result_eval_lvl1[col_name] = make_recommendations(result_eval_lvl1, model)

NameError: name 'recommender' is not defined

In [15]:
result_eval_lvl1.head(8)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
2,7,"[836281, 843306, 845294, 914190, 920456, 93886..."
3,8,"[868075, 886787, 945611, 1005186, 1008787, 101..."
4,9,"[883616, 1029743, 1039126, 1051323, 1082772, 1..."
5,13,"[6544236, 822407, 908317, 1056775, 1066289, 11..."
6,14,"[917277, 981760, 878234, 925514, 986394, 10220..."
7,15,"[996016, 1014509, 1044404, 1087353, 976199, 10..."


Переходим к расчету recall@k

### Recall@50

In [16]:
top_k_recall = 50

In [17]:
sorted(calc_recall_at_k(result_eval_lvl1, top_k_recall), key=lambda x: x[1],reverse=True)

[]

**Задание 2.**

Обучите модель 2-ого уровня, при этом:

- Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар

- Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_ranker

- Вырос ли precision@5 при использовании двухуровневой модели?

In [18]:
N_PREDICT = 100

In [23]:
USER_COL = 'user_id'
ITEM_COL = 'item_id'

df_lvl2_candidates = pd.DataFrame(data_train_lvl2[USER_COL].unique())
df_lvl2_candidates.columns = [USER_COL]

In [24]:
df_lvl2_candidates.head(2)

Unnamed: 0,user_id
0,2070
1,2021


In [26]:
df_lvl2_candidates.tail(5)

Unnamed: 0,user_id
1910,1446
1911,1784
1912,436
1913,1697
1914,1745


In [27]:
df_train_lvl2 = data_train_lvl2[[USER_COL, ITEM_COL]].copy()
df_train_lvl2['target'] = 1  # тут только покупки 

In [28]:
df_train_lvl2.head(3)



Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1


In [30]:
df_train_lvl2.target.value_counts()

1    163261
Name: target, dtype: int64

In [31]:
df_train_lvl2.tail(5)

Unnamed: 0,user_id,item_id,target
2282320,222,1120741,1
2282321,462,993339,1
2282322,462,995242,1
2282323,462,10180324,1
2282324,462,12731714,1


In [32]:
df_train_lvl2['target'].mean()

1.0

In [33]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [34]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [35]:
df_train_lvl2 = df_train_lvl2.merge(item_features, on='item_id', how='left')
df_train_lvl2 = df_train_lvl2.merge(user_features, on='user_id', how='left')

df_train_lvl2.head(4)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1.0,None/Unknown
1,2021,840361,1,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,,,,,,
2,2021,856060,1,170,GROCERY,National,CANNED JUICES,FRUIT DRINKS: CANNED & GLASS (,128 OZ,,,,,,,
3,2021,869344,1,69,GROCERY,Private,FRZN VEGETABLE/VEG DSH,FRZN BAGGED VEGETABLES - PLAIN,16 OZ,,,,,,,


In [36]:
# Средний чек

users_sales = data_train_lvl2.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()

users_sales['avg_transaction'] = users_sales['sales_value'] / users_sales['quantity']

df_train_lvl2 = df_train_lvl2.merge(users_sales[['user_id', 'avg_transaction']], on='user_id', how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1.0,None/Unknown,0.035173
1,2021,840361,1,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,,,,,,,1.195946


In [37]:
data_department = data_train_lvl2.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


In [38]:
# Количество покупок в каждой категории

users_sales_by_department = data_department.groupby([USER_COL, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_by_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)

# Средняя сумма покупки в каждой категории
users_sales_by_department['avg_users_transaction_by_category'] = users_sales_by_department['sales_value']\
                                                    /users_sales_by_department['n_sold_category']

In [39]:
users_sales_by_department.drop(columns=['sales_value'], inplace=True)

In [40]:
df_train_lvl2 = df_train_lvl2.merge(users_sales_by_department, on=[USER_COL, 'department'], how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1.0,None/Unknown,0.035173,213,1.461549
1,2021,840361,1,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,,,,,,,1.195946,58,0.824828


In [41]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'common_mean_sales_value_by_category'}, inplace=True)
department_sales.tail(2)

df_train_lvl2 = df_train_lvl2.merge(department_sales, on='department', how='left')


In [42]:
data_department['week_no'].max() - data_department['week_no'].min() + 1

6

In [43]:
# Рассчитываем количество недель в датасете
n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

users_department = data_department.groupby([USER_COL, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_train_lvl2 = df_train_lvl2.merge(users_department, on=[USER_COL, 'department'], how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,...,50-74K,Unknown,Unknown,1.0,None/Unknown,0.035173,213,1.461549,2.535227,35.5
1,2021,840361,1,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,...,,,,,,1.195946,58,0.824828,2.535227,9.666667


In [44]:
# Цена рассчитывается как 
data_department.loc[data_department['sales_value'] == 0, 'sales_value'].count()

1440

In [45]:
# 1) Цена

items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)


# 2) Количество покупок в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks

In [46]:
items_sales.head(2)

Unnamed: 0,item_id,sales_value,quantity,price,quantity_per_week
0,28116,0.33,1,0.33,0.166667
1,28117,0.34,1,0.34,0.166667


In [47]:
df_train_lvl2 = df_train_lvl2.merge(items_sales[[ITEM_COL,'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

In [48]:
# если нет sales_value, т е нет цены
df_train_lvl2['Missing price'] = 0
df_train_lvl2.loc[df_train_lvl2['price'].isna(), 'Missing price'] = 1
df_train_lvl2['price'].fillna(0, inplace=True)

df_train_lvl2['Missing quantity per week'] = 0
df_train_lvl2.loc[df_train_lvl2['quantity_per_week'].isna(), 'Missing quantity per week'] = 1
df_train_lvl2['quantity_per_week'].fillna(0, inplace=True)

df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,...,None/Unknown,0.035173,213,1.461549,2.535227,35.5,1.0,1.666667,0,0
1,2021,840361,1,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,...,,1.195946,58,0.824828,2.535227,9.666667,1.002065,76.666667,0,0


In [49]:
X_train = df_train_lvl2.drop('target', axis=1)
y_train = df_train_lvl2[['target']]

In [50]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,2070,1019940,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,U,...,None/Unknown,0.035173,213,1.461549,2.535227,35.5,1.0,1.666667,0,0
1,2021,840361,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ,,,...,,1.195946,58,0.824828,2.535227,9.666667,1.002065,76.666667,0,0


In [51]:
X_train.dtypes

user_id                                  int64
item_id                                  int64
manufacturer                             int64
department                              object
brand                                   object
commodity_desc                          object
sub_commodity_desc                      object
curr_size_of_product                    object
age_desc                                object
marital_status_code                     object
income_desc                             object
homeowner_desc                          object
hh_comp_desc                            object
household_size_desc                     object
kid_category_desc                       object
avg_transaction                        float64
n_sold_category                          int64
avg_users_transaction_by_category      float64
common_mean_sales_value_by_category    float64
n_sold_category_user_week              float64
price                                  float64
quantity_per_

In [52]:
cat_feats = X_train.columns[2:15].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [53]:
X_train.dtypes

user_id                                   int64
item_id                                   int64
manufacturer                           category
department                             category
brand                                  category
commodity_desc                         category
sub_commodity_desc                     category
curr_size_of_product                   category
age_desc                               category
marital_status_code                    category
income_desc                            category
homeowner_desc                         category
hh_comp_desc                           category
household_size_desc                    category
kid_category_desc                      category
avg_transaction                         float64
n_sold_category                           int64
avg_users_transaction_by_category       float64
common_mean_sales_value_by_category     float64
n_sold_category_user_week               float64
price                                   

In [54]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=13,
                     n_estimators=900,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)


In [55]:
df_lvl2_predict = df_train_lvl2.copy()
df_lvl2_predict['proba_item_purchase'] = train_preds[:,1]

In [56]:
df_lvl2_predict.loc[df_lvl2_predict['user_id']==2070].sort_values('proba_item_purchase', ascending=False)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,proba_item_purchase
0,2070,1019940,1,1232,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.000000,1.666667,0,0,1.000000e-15
121380,2070,834103,1,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.000000,9.833333,0,0,1.000000e-15
95883,2070,15741861,1,194,GROCERY,National,DINNER MXS:DRY,SKILLET DINNERS,5.8 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.989130,3.833333,0,0,1.000000e-15
95884,2070,16220947,1,5111,CHEF SHOPPE,National,UNKNOWN,APPLES WEDGE/SLICES,8.25 OZ,45-54,...,0.035173,1,3.240000,2.869167,0.166667,2.462857,1.166667,0,0,1.000000e-15
95885,2070,16729296,1,794,GROCERY,National,COLD CEREAL,ALL FAMILY CEREAL,18 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,3.105882,2.833333,0,0,1.000000e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66373,2070,989101,1,69,GROCERY,Private,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,GAL,45-54,...,0.035173,213,1.461549,2.535227,35.500000,0.784667,20.000000,0,0,1.000000e-15
66374,2070,997128,1,1603,GROCERY,National,BAG SNACKS,SALAD DRESSING MIXES,1 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.490000,5.666667,0,0,1.000000e-15
66375,2070,1092026,1,103,GROCERY,National,SOFT DRINKS,SFT DRNK 2 LITER BTL CARB INCL,2 LTR,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.118145,53.000000,0,0,1.000000e-15
66376,2070,834103,1,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,...,0.035173,213,1.461549,2.535227,35.500000,1.000000,9.833333,0,0,1.000000e-15


In [57]:
result_eval_lvl2 = data_val_lvl2.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_lvl2.columns=[USER_COL, ACTUAL_COL]
result_eval_lvl2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [59]:
# Функция для переранжирования

def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

In [60]:
result_eval_lvl2['reranked_own_rec_lightgbm'] = result_eval_lvl2[USER_COL].apply(lambda user_id: rerank(user_id, df_lvl2_predict))

In [61]:
TOPK_PRECISION = 5
# рассчитаем precision только модели матчинга (1 уровня -  ALS, get_own_recommendations), 
# чтобы понимать влияение ранжирования (2 уровня) на метрики

print(*sorted(calc_precision_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_lightgbm', 0.1836118363794582)


In [62]:
#аналошично с recall
print(*sorted(calc_recall_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_lightgbm', 0.0291)


In [63]:
ctb = LGBMClassifier(objective='binary',
                     max_depth=13,
                     n_estimators=900,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

ctb.fit(X_train, y_train)

train_preds = ctb.predict_proba(X_train)

  return f(*args, **kwargs)


In [64]:
df_lvl2_predict = df_train_lvl2.copy()
df_lvl2_predict['proba_item_purchase'] = train_preds[:,1]

In [65]:
result_eval_lvl2['reranked_own_rec_catb'] = result_eval_lvl2[USER_COL].apply(lambda user_id: rerank(user_id, df_lvl2_predict))

In [66]:
print(*sorted(calc_precision_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_lightgbm', 0.1836118363794582)
('reranked_own_rec_catb', 0.1836118363794582)
