In [42]:
import pandas as pd
import numpy as np
import implicit

import scipy

import catboost

from implicit.evaluation import ndcg_at_k, mean_average_precision_at_k, precision_at_k

from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle

import matplotlib.pyplot as plt

import rectools

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics, MeanInvUserFreq
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel, ImplicitALSWrapperModel, EASEModel
from rectools.model_selection import TimeRangeSplitter, cross_validate

In [59]:
data = pd.read_csv('recs.csv')

In [60]:
data.head()

Unnamed: 0,sale_date,user_id,Gender,Age,R,F,M,item_id,Qty,Amount,Product_Name,Product_Line_1,Color,Discount,Base_Price
0,2023-04-08,452030256342,U,52.0,1,1,2,147003,1.0,1230.0,Паддл Крик Рислинг Розе,ВИНО,РОЗОВОЕ,0.0,1640.0
1,2022-01-11,722758534602,U,0.0,0,0,0,147367,12.0,4020.0,Альма Романа Пино Гриджо,ВИНО,БЕЛОЕ,0.0,370.0
2,2022-07-25,416284110335,Ж,0.0,2,5,5,146069,2.0,2616.0,Просекко Супериоре Вальдоббьяд,ШАМПАНСКОЕ,БЕЛОЕ,0.0,1640.0
3,2022-11-02,846586656243,U,0.0,0,0,0,118402,6.0,8755.5,Ле Риме (Тоскана) в п/у,ВИНО,БЕЛОЕ,4714.5,1780.0
4,2022-02-13,228487079590,U,0.0,3,2,4,144236,1.0,1549.0,Божоле-Вилляж,ВИНО,КРАСНОЕ,0.0,1740.0


In [61]:
interactions = data[['sale_date', 'user_id', 'item_id', 'Qty']]

In [62]:
interactions.sort_values(by='sale_date', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions.sort_values(by='sale_date', inplace=True)


In [63]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2936535 entries, 2309021 to 2607094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   sale_date  object 
 1   user_id    int64  
 2   item_id    int64  
 3   Qty        float64
dtypes: float64(1), int64(2), object(1)
memory usage: 112.0+ MB


In [64]:
interactions.sale_date = pd.to_datetime(interactions.sale_date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions.sale_date = pd.to_datetime(interactions.sale_date)


In [65]:
# Train/test

In [66]:
max_date = interactions['sale_date'].max()
min_date = interactions['sale_date'].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")

min дата в interactions: 2022-01-01 00:00:00
max дата в interactions: 2024-03-27 00:00:00


In [69]:
train = interactions[(interactions['sale_date'] < max_date - pd.Timedelta(days=120))]
test = interactions[(interactions['sale_date'] >= max_date - pd.Timedelta(days=120))]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (2376623, 4)
test: (559912, 4)


In [70]:
train.head()

Unnamed: 0,sale_date,user_id,item_id,Qty
2309021,2022-01-01,791652193564,138415,1.0
2521819,2022-01-01,914509930443,147061,1.0
1315784,2022-01-01,663944080364,138421,1.0
688323,2022-01-01,114605570770,138416,2.0
367133,2022-01-01,262499958887,135283,1.0


In [13]:
client_mapping = {client_id: i for i, client_id in enumerate(train['user_id'].unique())}
inverse_client_mapping = {i: client_id for client_id, i in client_mapping.items()}

sku_mapping = {sku: i for i, sku in enumerate(train['item_id'].unique())}
inverse_sku_mapping = {i: sku for sku, i in sku_mapping.items()}

In [14]:
users = np.array(train['user_id'].map(client_mapping))
items = np.array(train['item_id'].map(sku_mapping))
ratings = np.array(train['Qty'])

In [15]:
user_item_matrix = scipy.sparse.csr_matrix((ratings, (users, items)))

In [16]:
als = implicit.als.AlternatingLeastSquares(factors = 10, \
      iterations = 10, use_gpu = False, calculate_training_loss = False, regularization = 0.1)

  check_blas_config()


In [17]:
als.fit(user_item_matrix)

  0%|          | 0/10 [00:00<?, ?it/s]

In [35]:
train.head()

Unnamed: 0,sale_date,user_id,item_id,Qty
2309021,2022-01-01,791652193564,138415,1.0
2521819,2022-01-01,914509930443,147061,1.0
1315784,2022-01-01,663944080364,138421,1.0
688323,2022-01-01,114605570770,138416,2.0
367133,2022-01-01,262499958887,135283,1.0


In [37]:
als.recommend(client_mapping[791652193564],
              N=10, user_items=user_item_matrix, 
              filter_already_liked_items=False)

(array([   0,   23,  474,   10,    8, 4565, 4706,    3,  115,  459],
       dtype=int32),
 array([0.40615636, 0.28894925, 0.2229215 , 0.19292818, 0.16321415,
        0.14012612, 0.13277313, 0.1222131 , 0.11138402, 0.10844271],
       dtype=float32))

In [38]:
# rectools

In [71]:
interactions.rename(columns={'sale_date':Columns.Datetime, 'Qty':Columns.Weight}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions.rename(columns={'sale_date':Columns.Datetime, 'Qty':Columns.Weight}, inplace=True)


In [72]:
%%time
dataset = Dataset.construct(interactions_df=interactions)

CPU times: user 325 ms, sys: 40.5 ms, total: 365 ms
Wall time: 364 ms


In [73]:
# Take few simple models to compare
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
    "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1)),
    "als" : ImplicitALSWrapperModel(AlternatingLeastSquares(factors = 10, iterations = 10, use_gpu = False, calculate_training_loss = False, regularization = 0.1)),
    "ease" : EASEModel()
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "prec@1": Recall(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10)
}

K_RECS = 10

In [74]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [75]:
%%time

# For each fold generate train and test part of dataset
# Then fit every model, generate recommendations and calculate metrics

cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

CPU times: user 16min 39s, sys: 2h 9min 54s, total: 2h 26min 34s
Wall time: 5min 29s


In [76]:
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop(columns="i_split")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,prec@1,prec@1,prec@10,prec@10,recall@10,recall@10,novelty,novelty,serendipity,serendipity
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
random,0.001997,0.000233,0.000325,4e-05,0.001997,0.000233,13.109249,0.007145,1.9e-05,7e-06
popular,0.078602,0.018139,0.011093,0.002247,0.078602,0.018139,4.528365,0.002519,4e-06,1e-06
most_raited,0.04224,0.008069,0.005979,0.000904,0.04224,0.008069,5.389964,0.017026,4e-06,1e-06
tfidf_k=5,0.044091,0.006021,0.00618,0.000693,0.044091,0.006021,9.678062,0.026134,0.000217,3.6e-05
tfidf_k=10,0.042692,0.007254,0.005995,0.000723,0.042692,0.007254,9.556698,0.038297,0.000156,1.5e-05
bm25_k=10_k1=0.05_b=0.1,0.102244,0.014059,0.014495,0.001689,0.102244,0.014059,5.08203,0.011382,0.000153,1.6e-05
als,0.08737,0.015792,0.012522,0.001735,0.08737,0.015792,5.760624,0.035605,0.000103,4e-06
ease,0.002448,0.000662,0.00035,7.7e-05,0.002448,0.000662,12.250559,0.025046,5.3e-05,1.9e-05


In [77]:
random_model = RandomModel(random_state=42)

In [78]:
random_model.fit(dataset)

<rectools.models.random.RandomModel at 0x7f8ae8905db0>

In [79]:
tfidf = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5))

In [80]:
tfidf.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f8ae8907a00>

In [81]:
bm_model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1))

In [82]:
bm_model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f8ae8907fd0>

In [83]:
#test

In [84]:
test = test[test.user_id.isin(train.user_id.unique())]

In [85]:
random_test_recos = random_model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=150,
    filter_viewed=True
)

In [86]:
tfidf_test_recos = tfidf.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=150,
    filter_viewed=True
)

In [87]:
bm_test_recos = bm_model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=150,
    filter_viewed=True
)

In [88]:
metrics_name = {
    'MAP': MAP,
    'Precision': Precision,
    'Recall' : Recall,
    'MIUF': MeanInvUserFreq
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [91]:
two_recos = random_test_recos.merge(tfidf_test_recos, on = ['user_id', 'item_id'], how='left', suffixes=['_random', '_tfidf'])

In [92]:
two_recos = two_recos.merge(bm_test_recos, on = ['user_id', 'item_id'], how='left')

In [95]:
two_recos.fillna(0,inplace=True)

In [107]:
two_recos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12378900 entries, 0 to 12378899
Data columns (total 9 columns):
 #   Column        Dtype  
---  ------        -----  
 0   user_id       int64  
 1   item_id       int64  
 2   score_random  int64  
 3   rank_random   int64  
 4   score_tfidf   float64
 5   rank_tfidf    float64
 6   score         float64
 7   rank          float64
 8   avg_rank      float64
dtypes: float64(5), int64(4)
memory usage: 850.0 MB


In [106]:
two_recos['avg_rank'] = (two_recos.rank_random + two_recos.rank_tfidf + two_recos['rank']) // 3

In [113]:
two_recos['rank'] = two_recos['avg_rank'].astype(np.int32) + 1

In [114]:
metric_values = calc_metrics(metrics, two_recos, test[['user_id', 'item_id']], train[['user_id', 'item_id']])

In [115]:
metric_values

{'Precision@1': 0.0,
 'Recall@1': 0.0,
 'Precision@5': 0.0,
 'Recall@5': 0.0,
 'Precision@10': 0.0,
 'Recall@10': 0.0,
 'MAP@1': 0.0,
 'MAP@5': 0.0,
 'MAP@10': 0.0,
 'MIUF@1': 13.212664736275894,
 'MIUF@5': 13.21324943148537,
 'MIUF@10': 13.207370350283457}

In [116]:
# Похоже пока я шифровал данные, накосячил(( У Вас в вашей работе на этих метриках не должны быть нули