# Рекомендательные системы

## Урок 2. Бейзлайны и детерминированные алгоритмы item-item

### Домашняя работа к уроку 2

In [1]:
# !pip install implicit

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

#### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж.
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [3]:
# Load the transaction log from disk and preview its first two rows.
data = pd.read_csv(filepath_or_buffer='data/retail_train.csv')
data.head(n=2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
# Basic size of the dataset: distinct users, distinct items, and row count.
users = data['user_id'].nunique()
items = data['item_id'].nunique()
interactions = len(data)

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [5]:
# Share of the user x item grid covered by transaction rows. Uses the
# `interactions` count instead of a pasted literal so the figure tracks
# whatever data is actually loaded.
# NOTE(review): `interactions` is a row count, so repeat purchases of the same
# item by the same user are counted more than once — confirm that is intended.
interactions / (users * items)

0.010770291654185115

In [6]:
# Time-based hold-out: the most recent weeks go to the test set, everything
# earlier to the training set.
# NOTE(review): the boundary `max - test_size_weeks` is inclusive on the test
# side, so if week_no holds consecutive integers the test set spans
# test_size_weeks + 1 distinct weeks — confirm this is intended.
test_size_weeks = 3

data_train = data.loc[data['week_no'].lt(data['week_no'].max() - test_size_weeks)]
data_test = data.loc[data['week_no'].ge(data['week_no'].max() - test_size_weeks)]

In [7]:
# Ground truth per user: the distinct items each user bought in the test
# period, stored in the `actual` column.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


In [8]:
# Inspect the full list of test-period items for the first user in `result`.
result['actual'].iloc[0]

array([  821867,   834484,   856942,   865456,   889248,   907957,
         914190,   943316,   951954,   954486,   958046,   962568,
         969231,   971585,   979707,   986947,   990656,   995242,
        1004906,  1005186,  1042083,  1050310,  1060819,  1062002,
        1064441,  1069733,  1074612,  1082185,  1131115,  1132771,
        6534544, 13876341, 15971874, 17178953,   883616,   917704,
         931860,   961554,  1002032,  1031190,  8090541,  8293439,
        9297615,  9527329, 15926712,  1049998,   861272,   869465,
         877373,   908213,   933913,   940947,   945809,   959316,
         978974,  1031697,  1041796,  1048918,  1081189,  1101422,
        1115576,  1122428,  1132231,  1132814,  5577022,  8091601,
        9296986,  9677939, 10356149, 13417048, 15741823, 15830875],
      dtype=int64)

In [9]:
# How many users appear in the test split, and how many of them have no
# history in the training split.
test_users = len(result)
new_test_users = len(set(data_test['user_id']).difference(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


In [10]:
# Preview the first five rows of the training split.
data_train.head(n=5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [11]:
# Per-item sampling weights for the weighted random recommender.
# The task asks for probability proportional to the logarithm of an item's
# total sales, so the summed sales are log-transformed before normalising.
# log1p (rather than log) keeps the weights non-negative: plain log would be
# negative for totals below 1 and -inf for a total of 0, neither of which is a
# valid probability. Negative totals are clipped to 0 for the same reason.
# The column keeps the name `sales_value` because that is what
# weighted_random_recommendation reads as the probability column.
popularity = data_train.groupby('item_id')['sales_value'].sum().reset_index()
popularity['sales_value'] = np.log1p(popularity['sales_value'].clip(lower=0))
popularity['sales_value'] = popularity['sales_value'] / popularity['sales_value'].sum()

In [12]:
def weighted_random_recommendation(items_weights, popularity, n=5):
    """Sample ``n`` distinct items at random, weighted by popularity.

    Candidate items are the distinct ``item_id`` values of ``items_weights``.
    Each candidate is drawn with probability proportional to its
    ``sales_value`` in ``popularity``. Weights are matched to candidates by
    ``item_id`` rather than by row position, so the two frames may list items
    in different orders.

    Args:
        items_weights: DataFrame with an ``item_id`` column; its distinct
            values form the candidate pool.
        popularity: DataFrame with ``item_id`` and ``sales_value`` columns
            holding a non-negative weight per item. Weights do not have to be
            normalised; candidates missing from this frame get weight 0.
        n: Number of distinct items to return.

    Returns:
        A list of ``n`` item ids.

    Raises:
        ValueError: If no candidate has a positive weight, or (raised by
            ``np.random.choice``) if fewer than ``n`` candidates have a
            non-zero weight or a weight is negative.
    """
    candidates = np.asarray(items_weights['item_id'].unique())

    # Look weights up by item id: unique() yields first-appearance order, which
    # generally differs from the row order of `popularity`.
    weight_by_item = popularity.groupby('item_id')['sales_value'].sum()
    weights = weight_by_item.reindex(candidates).fillna(0).to_numpy(dtype=float)

    total = weights.sum()
    if not total > 0:
        raise ValueError('popularity assigns no positive weight to the candidate items')

    # np.random.choice requires probabilities that sum to 1.
    recs = np.random.choice(candidates, size=n, replace=False, p=weights / total)

    return recs.tolist()

In [13]:
%%time

# Draw five weighted-random items for every test user.
# The per-item `popularity` table is passed as the candidate source instead of
# the full transaction log: its item_id column is already unique and sits in
# the same row order as its weights, so ids and probabilities line up, and the
# function does not have to deduplicate every transaction row on each call.
result['random_recommendation'] = result['user_id'].apply(
    lambda _: weighted_random_recommendation(popularity, popularity, n=5)
)
result.head(2)

Wall time: 1min 1s


Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[880333, 8160393, 6424232, 13134187, 956681]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[9705548, 1060415, 9931475, 854483, 9526951]"


#### Задание 2. Расчет метрик

Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?