In [1]:
import os

import pandas as pd
import numpy as np
from scipy import sparse
from implicit.bpr import BayesianPersonalizedRanking

Класс `DataLoader` загружает таблицы из CSV-файлов, лежащих в каталоге с данными

In [2]:
class DataLoader:
    """Reads named CSV tables that live under a single root directory."""

    def __init__(self, root: str) -> None:
        """Remember the directory that holds the ``<table_name>.csv`` files.

        Args:
            root: Path prefix joined with ``/`` in front of every table file name.
        """
        self._root = root

    def load_table(self, table_name: str) -> pd.DataFrame:
        """Load ``<root>/<table_name>.csv`` into a DataFrame.

        Args:
            table_name: File name without the ``.csv`` extension.

        Returns:
            The parsed table, read with pandas' default CSV options.
        """
        csv_path = "{}/{}.csv".format(self._root, table_name)
        return pd.read_csv(csv_path)

In [3]:
# All tables for this notebook are read from the local "./data" directory.
data_loader = DataLoader(root="./data")

# Load the train split first, then the test split, in one unpacking step.
train_purchases, test_purchases = (
    data_loader.load_table(split_name) for split_name in ("train", "test")
)

# Preview the first rows of both splits.
display(train_purchases.head(), test_purchases.head())

Unnamed: 0,customer_id,product_id
0,1705131,120554
1,1624457,120555
2,1481210,120556
3,1788337,120557
4,443708,120558


Unnamed: 0,customer_id,product_id
0,1788295,120570
1,1567045,120578
2,1782630,120586
3,1788165,120658
4,1666277,120659


## Train
В качестве бейзлайн-модели обучим Bayesian Personalized Ranking (BPR)

In [4]:
# Baseline recommender: BPR with 5 latent factors, trained for 100 iterations.
# NOTE(review): no random seed is passed here, so the fitted model and the
# score reported further down may differ between runs — confirm whether a
# fixed seed is wanted for reproducibility.
model = BayesianPersonalizedRanking(factors=5, iterations=100)

Преобразуем таблицу действий в разреженную матрицу *item-user* и обучим на ней модель

In [5]:
# Every observed purchase contributes an implicit-feedback weight of 1.0.
interaction_weights = np.ones(train_purchases.customer_id.size, dtype=np.float32)

# Item-user orientation: row index = product_id, column index = customer_id.
# No explicit shape is given, so it is inferred from the largest ids present.
item_user_index = (train_purchases.product_id, train_purchases.customer_id)

train_csr = sparse.coo_matrix((interaction_weights, item_user_index)).tocsr()

# Fit the baseline model on the item-user matrix.
model.fit(train_csr)

100%|████████████████████████████████████████████████| 100/100 [00:01<00:00, 80.70it/s, correct=91.49%, skipped=26.68%]


## Evaluate

В качестве основной метрики будем использовать *MeanAveragePrecision@5*

In [6]:
def apk(actual, predicted, k=10):
    """Compute the average precision at k for a single ranked list.

    Args:
        actual: Collection of relevant items (list, set, numpy array, ...).
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off; only the first ``k`` predictions are scored.

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when there are no relevant
        items or when ``k`` is not positive.
    """
    # Decide the degenerate cases up front. ``len(...) == 0`` is used instead
    # of ``not actual`` because the truth value of a multi-element numpy array
    # is ambiguous and raises ValueError.
    if actual is None or len(actual) == 0:
        return 0.0
    # A non-positive cut-off scores nothing; without this guard k == 0 would
    # end in a division by zero below.
    if k <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    counted = []  # relevant items already rewarded, so repeats are not counted twice

    for rank, item in enumerate(predicted, start=1):
        if item in actual and item not in counted:
            counted.append(item)
            num_hits += 1.0
            # Precision at this rank, accumulated only at ranks that hit.
            score += num_hits / rank

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """Mean average precision at k over paired relevant and predicted lists.

    Args:
        actual: Sequence of relevant-item collections, one per user.
        predicted: Sequence of ranked recommendation lists, aligned with ``actual``.
        k: Cut-off forwarded to ``apk``.

    Returns:
        The ``np.mean`` of the per-user ``apk`` scores.
    """
    per_user_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_user_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_user_scores)

Вспомогательные функции для получения рекомендаций

In [7]:
def recommend(model, user_ids, item_users, n=10):
    """Collect the top-``n`` recommended item ids for each requested user.

    Args:
        model: Fitted recommender exposing
            ``recommend(userid=..., user_items=..., N=...)`` and returning an
            iterable whose entries carry the item id at index 0.
        user_ids: Iterable of user ids to produce recommendations for.
        item_users: Sparse item-user matrix; it is transposed to user-item
            form before being handed to the model.
        n: Number of recommendations requested per user. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A list with one list of item ids per entry of ``user_ids``, in the
        same order.
    """
    # Transpose once, outside the loop; the model expects user-item rows.
    user_items = item_users.T.tocsr()
    return [
        [
            entry[0]
            for entry in model.recommend(userid=user_id, user_items=user_items, N=n)
        ]
        for user_id in user_ids
    ]


def get_customer_purchases(purchases):
    """Group purchased product ids by customer.

    Args:
        purchases: DataFrame with ``customer_id`` and ``product_id`` columns.

    Returns:
        A pair ``(customer_ids, product_id_lists)`` of equal-length lists,
        where ``product_id_lists[i]`` holds the products bought by
        ``customer_ids[i]``.
    """

    def _as_list(product_ids):
        return product_ids.values.tolist()

    per_customer = (
        purchases.groupby("customer_id")["product_id"]
        .apply(_as_list)
        .reset_index()
        .rename(columns={"product_id": "product_ids"})
    )
    customer_ids = per_customer["customer_id"].tolist()
    product_id_lists = per_customer["product_ids"].tolist()
    return customer_ids, product_id_lists

Оценка модели

In [8]:
# Ground truth: test customers and the products each of them bought.
ids, purchases = get_customer_purchases(purchases=test_purchases)

# Recommendations for those customers from the model fitted on the train matrix.
recommendations = recommend(model=model, user_ids=ids, item_users=train_csr)

# MAP@5 of the recommendations against the test purchases.
score = mapk(actual=purchases, predicted=recommendations, k=5)
score

0.34945372866127583

## Assignment

Некоторые пользователи совершали покупки и на другой платформе, и у нас есть история их покупок

In [9]:
# Purchase history from the other platform, read from "external.csv" under
# the loader's root directory.
external_purchases = data_loader.load_table("external")

# Preview the first rows.
display(external_purchases.head())

Unnamed: 0,customer_id,product_id
0,5496085,362171
1,5544270,362172
2,5190646,362173
3,1289883,362174
4,803540,362175


Известно также, как сопоставить customer_id двух платформ

In [11]:
# Mapping between customer ids of the two platforms, read from
# "ids_matching.csv" under the loader's root directory.
customer_id_mapping = data_loader.load_table("ids_matching")
# Preview the first rows.
display(customer_id_mapping.head())

Unnamed: 0,source_customer_id,external_customer_id
0,17878,18
1,512572,19
2,141324,24
3,727278,26
4,581438,28


## Solution