### Финальный проект

Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ)

Целевая метрика - money precision @ 5. Порог для уcпешной сдачи проекта money precision @ 5 > 20%

Бизнес ограничения в топ-5 товарах:
- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
- 2 новых товара (юзер никогда не покупал)
- 1 дорогой товар, > 7 долларов
- Все товары из разных категорий (категория - sub_commodity_desc)
- Стоимость каждого рекомендованного товара > 1 доллара

    Будет public тестовый датасет, на котором вы сможете измерять метрику
    Также будет private тестовый датасет для измерения финального качества
    НЕ обязательно использовать 2-ух уровневые рекоммендательные системы в проекте
    Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)



### Импорт модулей

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from statistics import mean

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items, get_targets_sec_level, extend_new_user_features, extend_new_item_features, \
extend_user_item_new_features, get_important_features, get_popularity_recommendations, \
postfilter_items, get_final_recomendations
from src.recommenders import MainRecommender

from tqdm import tqdm
tqdm.pandas()

In [2]:
%load_ext autoreload

### Загрузка данных и разделение на train и test

In [4]:
data = pd.read_csv('raw_data/retail_train.csv')
data_test = pd.read_csv('raw_data/retail_test1.csv')

item_features = pd.read_csv('raw_data/product.csv')
user_features = pd.read_csv('raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# Количество рекомендаций
N=150 

VAL_SIZE = 5

train_1 = data[data['week_no'] < data['week_no'].max() - (VAL_SIZE)]
val = data[data['week_no'] >= data['week_no'].max() - (VAL_SIZE)]

train_2 = val.copy()

### Предварительная фильтрация данных

In [5]:
n_items_before = train_1['item_id'].nunique()
train_1 = prefilter_items(train_1, item_features=item_features, take_n_popular= 3000)
n_items_after = train_1['item_id'].nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 3001


### Обучаем модель первого уровня

In [6]:
recommender = MainRecommender(train_1)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))




#### Эмбеддинги

In [7]:
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

### Добавляем новые фичи

In [8]:
train = extend_user_item_new_features(train_2, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,19_y,mean_time,age,income,children,avr_bask,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0
1,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0
2,843,40955282722,622,865891,2,2.0,364,0.0,19,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002148,0.008069,0.0
3,843,40955282722,622,883404,2,1.76,364,-0.82,19,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001922,0.003097,0.0
4,843,40955282722,622,904375,4,4.0,364,0.0,19,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002148,0.008069,0.0


In [9]:
X_train = train.drop(['target'], axis=1)
y_train = train[['target']]

In [10]:
cat_features=[]
for col in X_train.columns:
    if(X_train[col].dtype == np.object):
          cat_features.append(col)
            
X_train[cat_features + ['user_id', 'item_id']] = X_train[cat_features + ['user_id', 'item_id']].astype('category')

In [12]:
test = extend_user_item_new_features(data_test, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
X_test = test.drop(['target'], axis=1)
y_test = test[['target']]
X_test[cat_features + ['user_id', 'item_id']] = X_test[cat_features + ['user_id', 'item_id']].astype('category')

### Инициализируем lgb чтобы определить наиболее важные фичи

In [14]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_features)
important_features = get_important_features(lgb, X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


### Обучаем модель второго уровня

In [16]:
lgb = LGBMClassifier(
    objective='binary',
    max_depth=7,
    categorical_feature=cat_features
)
lgb.fit(X_train[important_features], y_train)

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


LGBMClassifier(boosting_type='gbdt',
        categorical_feature=['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'marital_status_code', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc'],
        class_weight=None, colsample_bytree=1.0, importance_type='split',
        learning_rate=0.1, max_depth=7, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [17]:
preds = lgb.predict(X_test[important_features])
test_preds_proba = lgb.predict_proba(X_test[important_features])[:, 1]

### Финальная фильтрация данных

In [18]:
result = get_final_recomendations(X_test, test_preds_proba, data, train_1, item_features)

100%|██████████████████████████████████████████████████████████████████████████████| 2499/2499 [19:55<00:00,  2.16it/s]


In [19]:
price = train_1.groupby('item_id')['price'].mean().reset_index()

### Money precision @ k 

In [20]:
final_result = result.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], price), axis=1).mean()

In [21]:
final_result

0.37643065538963527

### Сохраняем предсказания

In [22]:
result.drop('actual', axis=1, inplace=True)

In [23]:
result.to_csv('recommendations.csv', index=False)