## Homework 2

### Подготовка

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split

# Функции из 1-ого вебинара
import os, sys   
from metrics import precision_at_k, recall_at_k

# Базовые модели
from baseline_models import random_recommendation, popularity_recommendation

In [2]:
data = pd.read_csv('data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [5]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    items = items_weights['item_id']
    weights = items_weights['weight']
    recs = np.random.choice(items, size=n, p=weights, replace=False)
    
    return recs.tolist()

In [6]:
# получаем список самых популярных товаров
popular = data.groupby('item_id')['sales_value'].sum().reset_index()
popular.sort_values('sales_value', ascending=False, inplace=True)
# обнуляем веса "по умолчанию"
popular['weight'] = 0
# избегаем отрицательных весов, фильтруя значения меньше, чем основание логарифма
popular['weight'] = np.log2(popular['sales_value'], where=popular['sales_value'] > 2)
popular['weight'] = popular['weight'] / np.sum(popular['weight'])
popular.head(5)

Unnamed: 0,item_id,sales_value,weight
56233,6534178,467993.62,5.7e-05
56193,6533889,42645.75,4.6e-05
29195,1029743,37981.91,4.6e-05
56228,6534166,31298.96,4.5e-05
35054,1082185,27291.02,4.4e-05


In [7]:
%%time

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: 
                                                                   weighted_random_recommendation(popular, n=5))

CPU times: user 1.77 s, sys: 6.05 ms, total: 1.78 s
Wall time: 1.78 s


In [8]:
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[15572855, 1126746, 883665, 13512301, 1044619]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[959015, 1119667, 13007434, 894232, 12605078]"


### Расчет метрик
Рассчитаем Precision@5 для каждого алгоритма.

In [9]:
# загружаем предварительно сохраненные результаты из вебинара
result_loaded = pd.read_csv('predictions_basic.csv')
result_loaded.drop(columns=['actual'], inplace=True)
result_loaded['user_id'] = result_loaded['user_id'].astype(dtype=np.int64)
result_loaded.head(2)

Unnamed: 0,user_id,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[8248916, 1126995, 2455202, 1081189, 6981521]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,"[16769451, 15830893, 1801587, 1082597, 614588]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"


In [10]:
# поскольку рекомендации импортируются как строки, преобразуем их в список целых чисел
for model in result_loaded.columns[1:]:
    result_loaded[model] = result_loaded[model].apply(lambda x: list(map(int, x[1:-1].replace(',', '').split())))

In [11]:
# объединим загруженные результаты с результатом weighted_random_recommendation
result = result.merge(result_loaded, on='user_id')
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[15572855, 1126746, 883665, 13512301, 1044619]","[8248916, 1126995, 2455202, 1081189, 6981521]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[959015, 1119667, 13007434, 894232, 12605078]","[16769451, 15830893, 1801587, 1082597, 614588]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"


In [12]:
def show_metrics(recommendation, metric):
    """Функция расчета метрик
    
    Input
    -----
    recommendation: pd.DataFrame
        Датафрейм со столбцами user_id, actual, любое количество наименований_моделей_рекомендаций
    metric: function
        Функция расчета метрики         
    """
    results = {model: recommendation.apply(lambda row: metric(row[model], row['actual']), axis=1).mean() 
           for model in recommendation.columns[2:]}

    results = pd.DataFrame(results, index = ['precision@5'])
    
    return results.T.sort_values(by=['precision@5'], ascending=False)

In [13]:
# проверяем метрики
show_metrics(result, precision_at_k)

  return flags.sum() / len(recommended_list)


Unnamed: 0,precision@5
own_purchases,0.219858
popular_recommendation,0.15524
tfidf,0.138981
itemitem,0.136827
cosine,0.132909
weighted_random_recommendation,0.001175
random_recommendation,0.000784


Лучший результат у own_purchases, weighted_random_recommendation немного лучше random_recommendation.

### Задание 2. Улучшение бейзлайнов и ItemItem

**Попробуйте улучшить бейзлайны, считая случайный на топ-5000 товаров**

In [14]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [15]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [16]:
%%time
result['random_recommendation_top_5000'] = result['user_id'].apply(lambda x: 
                                                                   random_recommendation(top_5000, n=5))

CPU times: user 759 ms, sys: 3.33 ms, total: 762 ms
Wall time: 765 ms


In [17]:
# проверяем метрики
show_metrics(result, precision_at_k)

  return flags.sum() / len(recommended_list)


Unnamed: 0,precision@5
own_purchases,0.219858
popular_recommendation,0.15524
tfidf,0.138981
itemitem,0.136827
cosine,0.132909
random_recommendation_top_5000,0.005877
weighted_random_recommendation,0.001175
random_recommendation,0.000784


Результат действительно улучшился

**Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.**

In [18]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 6666

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [19]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,6666,202291,397896,420647,480014,545926,707683,731106,818980,819063,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

**Обучаем модели для K от 2 до 11. Результат для K=1 не проверяем, т.к. это аналог own_purchases**

In [21]:
k_step = np.arange(2, 12)

In [22]:
%%time
itemitem_models_list = []
for k in k_step:
    model = ItemItemRecommender(K=k, num_threads=4) # K - кол-во ближайших соседей

    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
              show_progress=True)
    itemitem_models_list.append((k, model))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 12.8 s, sys: 170 ms, total: 12.9 s
Wall time: 13 s


In [23]:
for k, model in itemitem_models_list:
    result['itemitem_K_' + str(k)] = result['user_id'].\
        apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                        model.recommend(userid=userid_to_id[x], 
                                        user_items=sparse_user_item,   # на вход user-item matrix
                                        N=5, 
                                        filter_already_liked_items=False, 
                                        filter_items=[itemid_to_id[6666]], # фильтруем непопулярные товары
                                        recalculate_user=True)])

In [24]:
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,random_recommendation_top_5000,itemitem_K_2,itemitem_K_3,itemitem_K_4,itemitem_K_5,itemitem_K_6,itemitem_K_7,itemitem_K_8,itemitem_K_9,itemitem_K_10,itemitem_K_11
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[15572855, 1126746, 883665, 13512301, 1044619]","[8248916, 1126995, 2455202, 1081189, 6981521]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]","[977432, 12349795, 1070803, 6034964, 1046465]","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[959015, 1119667, 13007434, 894232, 12605078]","[16769451, 15830893, 1801587, 1082597, 614588]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]","[8015407, 829001, 836286, 935993, 1044724]","[1082185, 1098066, 6534178, 826249, 1127831]","[1082185, 981760, 1098066, 6534178, 826249]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]"


**Посчитаем все метрики**

In [25]:
# проверяем метрики
show_metrics(result, precision_at_k)

  return flags.sum() / len(recommended_list)


Unnamed: 0,precision@5
own_purchases,0.219858
itemitem_K_2,0.219303
itemitem_K_3,0.218976
itemitem_K_4,0.167287
itemitem_K_10,0.166112
itemitem_K_11,0.166112
itemitem_K_8,0.164838
itemitem_K_9,0.164153
itemitem_K_7,0.160039
itemitem_K_6,0.155534


**По мере увеличения K результат ухудшается**