# Initialization

In [2]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [4]:
items = pd.read_parquet("./goodsread/items.par")
events = pd.read_parquet("./goodsread/events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

In [5]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = list(set(users_train) & set(users_test))

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


In [5]:
cold_users = set(users_test) - set(common_users)

print(len(cold_users)) 

2365


In [18]:
top_pop_start_date = pd.to_datetime("2015-01-01").date()

item_popularity = events_train \
    .query("started_at >= @top_pop_start_date") \
    .groupby(["item_id"]).agg(users=("user_id", "nunique"), avg_rating=("rating", "mean")).reset_index()
item_popularity["popularity_weighted"] = item_popularity["users"] * item_popularity["avg_rating"]

# сортируем по убыванию взвешенной популярности
item_popularity = item_popularity.sort_values(by="popularity_weighted",ascending=False).reset_index()

# выбираем первые 100 айтемов со средней оценкой avg_rating не меньше 4
top_k_pop_items = item_popularity.loc[item_popularity["avg_rating"] >= 4][0:100]
top_k_pop_items

Unnamed: 0,index,item_id,users,avg_rating,popularity_weighted
2,32387,18007564,20207,4.321275,87320.0
3,32623,18143977,19462,4.290669,83505.0
4,30695,16096824,16770,4.301014,72128.0
5,2,3,15139,4.706057,71245.0
7,3718,38447,14611,4.232770,61845.0
...,...,...,...,...,...
131,19596,2767052,4361,4.413437,19247.0
133,32835,18293427,4674,4.092640,19129.0
134,378,3636,4667,4.098564,19128.0
135,33611,18966819,4361,4.374914,19079.0


In [7]:
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("item_id")[["author", "title", "genre_and_votes", "publication_year"]], 
    on="item_id"
    ).sort_values(by="popularity_weighted",ascending=False).copy()

with pd.option_context('display.max_rows', 100):
    display(top_k_pop_items[["item_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_weighted", "genre_and_votes"]])

Unnamed: 0,item_id,author,title,publication_year,users,avg_rating,popularity_weighted,genre_and_votes
0,18007564,Andy Weir,The Martian,2014.0,20207,4.321275,87320.0,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,18143977,Anthony Doerr,All the Light We Cannot See,2014.0,19462,4.290669,83505.0,"{'Historical-Historical Fiction': 13679, 'Fict..."
2,16096824,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2015.0,16770,4.301014,72128.0,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
3,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0,15139,4.706057,71245.0,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
4,38447,Margaret Atwood,The Handmaid's Tale,1998.0,14611,4.23277,61845.0,"{'Fiction': 15424, 'Classics': 9937, 'Science ..."
5,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,1999.0,13043,4.632447,60421.0,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict..."
6,11235712,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)",2012.0,14348,4.179189,59963.0,"{'Young Adult': 10539, 'Fantasy': 9237, 'Scien..."
7,17927395,Sarah J. Maas,A Court of Mist and Fury (A Court of Thorns an...,2016.0,12177,4.73064,57605.0,"{'Fantasy': 10186, 'Romance': 3346, 'Young Adu..."
8,18692431,"Nicola Yoon, David Yoon","Everything, Everything",2015.0,14121,4.071454,57493.0,"{'Young Adult': 5175, 'Romance': 3234, 'Contem..."
9,5,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,2004.0,11890,4.770143,56717.0,"{'Fantasy': 49784, 'Young Adult': 15393, 'Fict..."


# === Знакомство: первые персональные рекомендации

In [8]:
cold_users_events_with_recs = events_test[events_test["user_id"].isin(cold_users)].merge(top_k_pop_items[["avg_rating","item_id"]], on="item_id", how="left")
cold_users_events_with_recs

Unnamed: 0,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id,avg_rating
0,6900,2017-10-09,2017-10-13,True,4,False,1361610,
1,12555,2017-09-21,2017-10-11,True,3,False,1361610,
2,25899336,2017-09-12,2017-09-17,True,4,True,1361610,4.427261
3,21936809,2017-08-20,2017-08-24,True,4,True,1361610,
4,6952,2017-09-18,2017-09-20,True,3,False,1361610,
...,...,...,...,...,...,...,...,...
9667,252499,2017-09-30,2017-10-06,True,4,False,1178502,
9668,51113,2017-09-25,2017-10-07,True,4,False,1253160,
9669,16181775,2017-09-24,2017-09-25,True,3,False,1253160,
9670,10210,2017-09-16,2017-09-24,True,5,False,1253160,


In [9]:

cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isnull()
cold_user_recs = cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx][["user_id", "item_id", "rating", "avg_rating"]]
print("количество холдных пользователей прочитавших книги из top100 рекомендаций",len(cold_user_recs))
cold_user_recs.head()

количество холдных пользователей прочитавших книги из top100 рекомендаций 1912


Unnamed: 0,user_id,item_id,rating,avg_rating
2,1361610,25899336,4,4.427261
5,1338996,16096824,5,4.301014
8,1338996,18692431,5,4.071454
9,1338996,28763485,2,4.194663
15,1276025,38447,5,4.23277


In [10]:
print(len(cold_user_recs)/len(cold_users))

0.8084566596194503


In [11]:
# посчитаем метрики рекомендаций
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"], squared=False)
mae = mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2)) 

0.78 0.62


In [22]:
cold_users_hit_ratio = cold_users_events_with_recs.groupby("user_id").agg(hits=("avg_rating", lambda x: (~x.isnull()).mean()))

print(f"Доля пользователей без релевантных рекомендаций: {(cold_users_hit_ratio == 0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio[cold_users_hit_ratio != 0].mean().iat[0]:.2f}") 

Доля пользователей без релевантных рекомендаций: 0.59
Среднее покрытие пользователей: 0.44


# === Базовые подходы: коллаборативная фильтрация

sparsity= количество всех ячеек / количество пустых ячеек

 - Количество всех ячеек это количество пользователей * количество объъектов
 - количество всех оценок в таблице events - это непустое значение на пересечении пользователь-книга

In [54]:
sparsity = (1 - len(events)/(events['user_id'].nunique() * events['item_id'].nunique())) * 100
print("sparsity =",sparsity,"%")

sparsity = 99.92890618975984 %


In [8]:
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set) 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe03b13e140>

In [9]:
import joblib

joblib.dump(svd_model, 'models/svd_recsys.joblib') 

['models/svd_recsys.joblib']

In [12]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae)

RMSE: 0.8289
MAE:  0.6474
0.8288711689059135 0.647437483750257


In [38]:
from surprise import NormalPredictor

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set)

mae_rand = accuracy.mae(random_predictions)
print(mae_rand)

MAE:  1.0018
1.0017726877569562


# Факультативно

Удалите из events события для редких айтемов — таких, с которыми взаимодействовало менее N пользователей. Возьмите небольшое N, например 2–3 пользователя. Получите рекомендации, посчитайте метрики, оцените, как они изменились. Подумайте, с чем могут быть связаны такие изменения.

In [27]:
events["count_events_with_item"] = events.groupby(["item_id"])["user_id"].transform("count")
events = events[events["count_events_with_item"] >= 3].copy()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = list(set(users_train) & set(users_test))

print(len(users_train), len(users_test), len(common_users))

428212 123177 120808


In [29]:
events["count_events_with_item"] = events.groupby(["item_id"])["user_id"].transform("count")
less_readers = events[events["count_events_with_item"] == 3].copy()
less_readers["user_id"][:10]

8807     1160232
27461    1274944
28863    1222949
32033    1227233
34221    1407657
34753    1063891
37581    1331968
38506    1267828
41042    1262466
53937    1397124
Name: user_id, dtype: int64

In [58]:
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set) 

surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)

RMSE: 0.8287
MAE:  0.6473


In [30]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):

    """ возвращает n рекомендаций для user_id """
    
    # получим список идентификаторов всех книг
    all_items = set(events['item_id'].unique())
        
    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(events.query("user_id == @user_id")['item_id'].unique())
        
        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)
    
    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    
    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=["item_id", "score"]) 

In [31]:
#несколько холодных
list(cold_users)[:7]

[1073158, 1114119, 1204233, 1024010, 1294347, 1384460, 1286158]

In [33]:
# несколько тренированных
events_train['user_id'].sample(n=7).to_list()

[1365189, 1287316, 1403873, 1358849, 1334171, 1251065, 1317281]

In [37]:
rec_list = get_recommendations_svd(1317281, items, events, svd_model).merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
rec_list

Unnamed: 0,item_id,score,author,title,genre_and_votes
0,54741,4.798998,Quino,Toda Mafalda,"{'Sequential Art-Comics': 157, 'Humor': 47, 'S..."
1,24814,4.783015,Bill Watterson,It's a Magical World: A Calvin and Hobbes Coll...,"{'Sequential Art-Comics': 680, 'Humor': 381, '..."
2,7864312,4.761605,José Antonio Cotrina,La sombra de la luna (El ciclo de la luna roja...,"{'Fantasy': 20, 'Young Adult': 7}"
3,323355,4.761081,Joseph Smith Jr.,The Book of Mormon: Another Testament of Jesus...,"{'Religion': 896, 'Nonfiction': 486, 'Christia..."
4,24812,4.745999,Bill Watterson,The Complete Calvin and Hobbes,"{'Sequential Art-Comics': 867, 'Humor': 378, '..."


In [38]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
#user_id = events_train['user_id'].sample().iat[0]
user_id = 1317281

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations) 

user_id: 1317281
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
49,Jenny Han,To All the Boys I've Loved Before (To All the ...,2016-08-27,2016-08-31,3,"{'Young Adult': 5919, 'Romance': 4965, 'Contem..."
50,Rachel Hawkins,"Rebel Belle (Rebel Belle, #1)",2016-08-24,2016-08-27,3,"{'Young Adult': 1336, 'Fantasy': 1234, 'Fantas..."
51,Renée Ahdieh,The Wrath and the Dawn (The Wrath and the Dawn...,2016-05-11,2016-05-12,4,"{'Fantasy': 5942, 'Young Adult': 3448, 'Romanc..."
52,Laini Taylor,Daughter of Smoke & Bone (Daughter of Smoke & ...,2016-07-03,2016-07-15,3,"{'Fantasy': 11681, 'Young Adult': 7110, 'Roman..."
53,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)",2016-11-28,2016-12-04,4,"{'Young Adult': 10539, 'Fantasy': 9237, 'Scien..."
54,Victoria Aveyard,"Red Queen (Red Queen, #1)",2016-08-08,2016-08-10,4,"{'Fantasy': 10750, 'Young Adult': 6597, 'Scien..."
55,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2017-02-16,2017-02-20,5,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
56,"Amie Kaufman, Meagan Spooner","Their Fractured Light (Starbound, #3)",2016-04-27,2016-04-28,4,"{'Science Fiction': 576, 'Young Adult': 498, '..."
57,Cassandra Clare,"Clockwork Angel (The Infernal Devices, #1)",2016-08-08,2016-08-21,4,"{'Fantasy': 10151, 'Young Adult': 7424, 'Fanta..."
58,"Morgan Matson, Морган Мэтсон",The Unexpected Everything,2016-06-07,2016-06-10,5,"{'Contemporary': 1466, 'Young Adult': 1244, 'R..."


Рекомендации


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,54741,4.798998,Quino,Toda Mafalda,"{'Sequential Art-Comics': 157, 'Humor': 47, 'S..."
1,24814,4.783015,Bill Watterson,It's a Magical World: A Calvin and Hobbes Coll...,"{'Sequential Art-Comics': 680, 'Humor': 381, '..."
2,7864312,4.761605,José Antonio Cotrina,La sombra de la luna (El ciclo de la luna roja...,"{'Fantasy': 20, 'Young Adult': 7}"
3,323355,4.761081,Joseph Smith Jr.,The Book of Mormon: Another Testament of Jesus...,"{'Religion': 896, 'Nonfiction': 486, 'Christia..."
4,24812,4.745999,Bill Watterson,The Complete Calvin and Hobbes,"{'Sequential Art-Comics': 867, 'Humor': 378, '..."


# === Базовые подходы: контентные рекомендации

In [30]:
import scipy
import sklearn.preprocessing

In [44]:
# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] =  item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])


ValueError: y contains previously unseen labels: [682, 1978, 2115, 3326, 5036, 6837, 6996, 7292, 7619, 8824, 10457, 10518, 10822, 11671, 11719, 13358, 13822, 14427, 15501, 15616, 16409, 17886, 21835, 24150, 24520, 24867, 24880, 25935, 27472, 28425, 32850, 33530, 33577, 33886, 34289, 34811, 35402, 35514, 35728, 35899, 35988, 36555, 38419, 42908, 43112, 43885, 44755, 45632, 46002, 47533, 47601, 49045, 51008, 51138, 53086, 53811, 54111, 58069, 59647, 60431, 61168, 62177, 62482, 63024, 64984, 65716, 66553, 67329, 68204, 70288, 71706, 71788, 72481, 72999, 73687, 74696, 75506, 76525, 77555, 77998, 80015, 80483, 81049, 81051, 83491, 87371, 87578, 87747, 87956, 88332, 88868, 90381, 91802, 92743, 95447, 95657, 97294, 97652, 98475, 99966, 101014, 101887, 102239, 104418, 104665, 105639, 107338, 107469, 108314, 108924, 109774, 110519, 113985, 114597, 116231, 119804, 120405, 123168, 123544, 123683, 123706, 124983, 129956, 131338, 131843, 133284, 136456, 139620, 144204, 146656, 147635, 148513, 148938, 153594, 153806, 153910, 156597, 157456, 157801, 158417, 158479, 158685, 158969, 159444, 159709, 161057, 162130, 164236, 164723, 164947, 165002, 165208, 165449, 165861, 169671, 170844, 172490, 173574, 173590, 174682, 177413, 179191, 179436, 180806, 181256, 181259, 181286, 181776, 183673, 184866, 185152, 188338, 197458, 198898, 199707, 199712, 201381, 202224, 208723, 209936, 211439, 212332, 212993, 216420, 217355, 221817, 223145, 226151, 229122, 231532, 231768, 232674, 236183, 236363, 238514, 238846, 240950, 241101, 243117, 243389, 243604, 246574, 247819, 251180, 252459, 254324, 254501, 254659, 256959, 260337, 260482, 260659, 263091, 264205, 267533, 269751, 270658, 274052, 278299, 279119, 279647, 280021, 281152, 281493, 285941, 286491, 293313, 293486, 293499, 296523, 297909, 298236, 299689, 300451, 302431, 302699, 302727, 303502, 304889, 307962, 309455, 310026, 321544, 322006, 322253, 324655, 327322, 329084, 332100, 332510, 333082, 334258, 334517, 335940, 336025, 336717, 336895, 338046, 339031, 339227, 341992, 342371, 343192, 344199, 344351, 344580, 345167, 348634, 349583, 350358, 353157, 353303, 355335, 359638, 359759, 363046, 363331, 365331, 365976, 367679, 372850, 372862, 376922, 382419, 384458, 386480, 390786, 391850, 392873, 393091, 394729, 396422, 396587, 398863, 399260, 405980, 406341, 406853, 409014, 409589, 409637, 411746, 411992, 413085, 414146, 414147, 415032, 417007, 420474, 420753, 422732, 426000, 426215, 426840, 428368, 430996, 435804, 437993, 440201, 440358, 441046, 441644, 442673, 445426, 448696, 449575, 452939, 462883, 464036, 464916, 465870, 474777, 476384, 482021, 483646, 484094, 484122, 485815, 492887, 493746, 495305, 496123, 496131, 496638, 496910, 498664, 501710, 507590, 510022, 510029, 510537, 511152, 511613, 522726, 533639, 535116, 536302, 536451, 541924, 543203, 544508, 546570, 547102, 547879, 547943, 548699, 552106, 554491, 554672, 559625, 565836, 565837, 568175, 570132, 572084, 574027, 574128, 575405, 579039, 586416, 588343, 588508, 591822, 591965, 592714, 595304, 596687, 598495, 601735, 603992, 608787, 609160, 609518, 610426, 612426, 614768, 614863, 615955, 620755, 622156, 622340, 623811, 624988, 630561, 631591, 634067, 636647, 639648, 639833, 647685, 655739, 659578, 660280, 662803, 666447, 669852, 671039, 674213, 679465, 684911, 689328, 694321, 700157, 700262, 708526, 708921, 710750, 712636, 714214, 723558, 724935, 727270, 727281, 727831, 730959, 732141, 739589, 741966, 742772, 743812, 748350, 752348, 754354, 755107, 755627, 757196, 761126, 761218, 761306, 764816, 768952, 773788, 775338, 777459, 780726, 784113, 793065, 801878, 807609, 809248, 809428, 809515, 809635, 811439, 813198, 813721, 816462, 817168, 826472, 830899, 831446, 836475, 840096, 845250, 846941, 851927, 853825, 862873, 863250, 866402, 866983, 867517, 868698, 869969, 870485, 872468, 873304, 874762, 878079, 878460, 882744, 883079, 883801, 887980, 888674, 889071, 891608, 894881, 898056, 900426, 906340, 921818, 927244, 932049, 933562, 934205, 935436, 936880, 937106, 939467, 940202, 946836, 947362, 953256, 957282, 964427, 969369, 974193, 980010, 980468, 982068, 982250, 983294, 983569, 989097, 989667, 989862, 991849, 1000931, 1004096, 1011451, 1017264, 1017301, 1017308, 1017906, 1019653, 1019742, 1020302, 1020580, 1030044, 1031422, 1034995, 1037138, 1052659, 1054845, 1057588, 1058615, 1058726, 1061931, 1070107, 1073530, 1074849, 1076760, 1078408, 1082318, 1092158, 1094401, 1095061, 1105476, 1107821, 1108888, 1115866, 1120193, 1124563, 1127539, 1136210, 1144121, 1148113, 1149607, 1153951, 1155278, 1158376, 1160588, 1163880, 1164589, 1167033, 1168440, 1169051, 1169133, 1179976, 1187448, 1191721, 1192530, 1197929, 1198981, 1202625, 1206158, 1206930, 1207063, 1212774, 1216545, 1223054, 1223164, 1230340, 1232796, 1235535, 1236193, 1242506, 1244160, 1244267, 1246570, 1251125, 1254247, 1254684, 1255731, 1256185, 1273773, 1273995, 1282886, 1286268, 1287272, 1288360, 1290757, 1291491, 1293991, 1296964, 1298130, 1300391, 1304028, 1307000, 1307109, 1312205, 1320170, 1321148, 1341454, 1347765, 1355993, 1357658, 1366132, 1372415, 1373792, 1374250, 1385471, 1387399, 1387654, 1387869, 1394909, 1406099, 1407232, 1410294, 1412302, 1420126, 1429666, 1433867, 1435992, 1437831, 1440075, 1440203, 1453991, 1455289, 1456450, 1458993, 1459217, 1471786, 1477512, 1492378, 1498723, 1499163, 1501622, 1510538, 1516736, 1522679, 1523598, 1527654, 1534750, 1539150, 1547361, 1547379, 1556130, 1559509, 1559895, 1561175, 1563075, 1570989, 1578586, 1579805, 1583018, 1585431, 1586116, 1587995, 1590423, 1591114, 1597669, 1598040, 1603101, 1616126, 1616596, 1619837, 1621217, 1626311, 1632396, 1641944, 1644076, 1648540, 1654709, 1657368, 1672391, 1685075, 1690088, 1691909, 1697732, 1700000, 1702058, 1712096, 1717269, 1728551, 1739710, 1740282, 1747576, 1749940, 1755776, 1760765, 1763612, 1785300, 1791460, 1796726, 1798329, 1801073, 1803701, 1813077, 1817308, 1821730, 1829115, 1829889, 1832682, 1833668, 1837710, 1846479, 1851655, 1856301, 1857192, 1869060, 1873536, 1877024, 1883336, 1906946, 1914652, 1924847, 1930216, 1932424, 1933005, 1938237, 1941443, 1945842, 1946568, 1958493, 1960591, 1970023, 1970036, 1973931, 1981046, 1993801, 2003722, 2008039, 2008298, 2011772, 2014000, 2017109, 2030441, 2040225, 2044979, 2065589, 2077035, 2079563, 2093404, 2099596, 2101630, 2104061, 2110473, 2147058, 2150355, 2153110, 2166165, 2174010, 2184593, 2187466, 2195443, 2216225, 2218352, 2223353, 2224739, 2228149, 2229030, 2253212, 2255284, 2260330, 2281052, 2290798, 2302536, 2314849, 2325248, 2337398, 2344801, 2358849, 2358870, 2368651, 2372440, 2378672, 2389938, 2422554, 2431812, 2433478, 2437698, 2452563, 2471719, 2474204, 2490758, 2495056, 2501638, 2512131, 2525348, 2578867, 2581254, 2584045, 2587444, 2594322, 2608296, 2610233, 2613071, 2617033, 2617843, 2618807, 2620105, 2646383, 2652089, 2658042, 2669775, 2681467, 2687495, 2699898, 2703773, 2706530, 2721469, 2728741, 2729814, 2737089, 2745692, 2784363, 2785686, 2788968, 2790570, 2791515, 2808430, 2809742, 2812137, 2838797, 2842871, 2855751, 2864577, 2877861, 2878910, 2884299, 2930753, 2961799, 2968562, 2972344, 2980509, 2980972, 2986286, 2986392, 2989344, 3003723, 3004197, 3009998, 3010034, 3044630, 3046463, 3048530, 3060904, 3078550, 3083278, 3084002, 3085912, 3086224, 3112450, 3119662, 3130076, 3133038, 3143533, 3169954, 3182869, 3188302, 3195668, 3209087, 3217673, 3230480, 3239750, 3242413, 3248297, 3257215, 3260634, 3265294, 3271565, 3277674, 3290525, 3326540, 3359010, 3359011, 3359180, 3366149, 3378965, 3388877, 3389592, 3399843, 3408524, 3425252, 3427390, 3486266, 3495748, 3505944, 3544225, 3545049, 3550665, 3676719, 3696025, 3804289, 3811882, 3820389, 3852838, 3854232, 3855578, 3916451, 3954298, 3955077, 3973924, 3986062, 4041119, 4066338, 4079518, 4086725, 4110127, 4133354, 4135492, 4163378, 4187533, 4211037, 4222676, 4276937, 4310522, 4335756, 4428533, 4457727, 4468819, 4469138, 4495772, 4496091, 4525593, 4540242, 4608571, 4642752, 4703259, 4734404, 4744368, 4745993, 4756029, 4775967, 4776314, 4785973, 4893543, 4926564, 4956883, 5043759, 5098020, 5167648, 5184603, 5198729, 5278069, 5280587, 5290189, 5296227, 5385951, 5415929, 5460223, 5606499, 5758016, 5822252, 5851400, 5853934, 5973099, 5985254, 5994923, 5998379, 6008809, 6011972, 6060899, 6091142, 6095455, 6099731, 6116682, 6120839, 6122521, 6124415, 6125135, 6135488, 6148816, 6150382, 6172169, 6192376, 6202745, 6203395, 6217765, 6218054, 6218681, 6224528, 6249600, 6261151, 6261155, 6261738, 6272048, 6296341, 6307432, 6316158, 6316959, 6324742, 6340149, 6341778, 6353882, 6359649, 6361692, 6365216, 6365996, 6375672, 6376522, 6376720, 6377037, 6381379, 6386306, 6399288, 6401032, 6410978, 6417058, 6420507, 6428155, 6431633, 6443318, 6449448, 6452663, 6456111, 6467249, 6467772, 6467853, 6474415, 6476887, 6477793, 6478602, 6494782, 6506701, 6508077, 6517322, 6531968, 6536261, 6540959, 6544386, 6549230, 6553175, 6555412, 6559244, 6559902, 6566667, 6589572, 6593318, 6599175, 6606469, 6609548, 6609626, 6615723, 6633142, 6633898, 6643581, 6644604, 6647780, 6649318, 6653469, 6657074, 6661002, 6665361, 6690630, 6693015, 6693402, 6723491, 6729300, 6757337, 6759454, 6765667, 6773503, 6774007, 6781006, 6795627, 6806633, 6829959, 6858346, 6860652, 6869000, 6882188, 6887333, 6901097, 6903603, 6924458, 6925670, 6926398, 6931439, 6944881, 6949613, 6957921, 6963165, 6964002, 6968555, 6969367, 6969961, 6974489, 6979147, 6989670, 6989790, 6992777, 6998357, 7001018, 7036570, 7046916, 7052308, 7052663, 7056082, 7056921, 7058894, 7077877, 7079397, 7080507, 7097183, 7097697, 7104435, 7117109, 7118086, 7130701, 7131681, 7135797, 7140220, 7140583, 7148737, 7151447, 7153802, 7159536, 7163871, 7179154, 7187861, 7192913, 7205446, 7224904, 7227391, 7245795, 7247842, 7254244, 7268250, 7280472, 7290754, 7293149, 7294912, 7312149, 7313711, 7331942, 7346106, 7401958, 7405508, 7405775, 7417871, 7430226, 7438807, 7443012, 7452300, 7453538, 7453869, 7456726, 7493011, 7494451, 7502082, 7505126, 7529869, 7530028, 7530046, 7553076, 7599243, 7600897, 7602367, 7650671, 7663033, 7663384, 7673244, 7683565, 7685687, 7693839, 7706518, 7706939, 7733054, 7748196, 7764176, 7767448, 7767483, 7769217, 7775594, 7791679, 7794043, 7799139, 7824057, 7827689, 7858034, 7860099, 7863604, 7869294, 7869869, 7882740, 7885894, 7890530, 7894437, 7901368, 7904207, 7931350, 7933377, 7935992, 7940630, 7943300, 7944140, 7948881, 7959424, 7968644, 7969396, 7983432, 7985834, 7995199, 8009281, 8026111, 8035533, 8041711, 8043282, 8065164, 8076962, 8080740, 8099597, 8104355, 8105187, 8125657, 8134092, 8144969, 8146077, 8163356, 8164800, 8189310, 8234195, 8241088, 8273636, 8323386, 8339150, 8340846, 8347054, 8363907, 8373331, 8419235, 8424017, 8429602, 8433695, 8434205, 8462504, 8464587, 8466114, 8467071, 8491419, 8493643, 8503543, 8508126, 8516589, 8525299, 8528702, 8532869, 8542611, 8545134, 8548042, 8573409, 8586573, 8591792, 8602369, 8609337, 8620962, 8639835, 8646050, 8662494, 8666090, 8694331, 8696564, 8706170, 8711937, 8724779, 8726184, 8738626, 8747626, 8797920, 8802774, 8834274, 8892943, 8895306, 8924846, 8926934, 8928799, 8935302, 8935883, 8963238, 8964735, 8978195, 8982561, 8985054, 9003854, 9009637, 9009787, 9064116, 9069150, 9085456, 9085861, 9086623, 9097523, 9137763, 9155291, 9187584, 9247442, 9257332, 9260669, 9265069, 9268693, 9271388, 9284095, 9297264, 9310626, 9317110, 9343373, 9347470, 9365567, 9378834, 9379496, 9386880, 9394189, 9400516, 9436327, 9439936, 9452244, 9454543, 9459078, 9477539, 9478546, 9480176, 9484851, 9487892, 9499774, 9505036, 9508678, 9535775, 9537041, 9562093, 9608958, 9630431, 9633873, 9634643, 9635098, 9640884, 9642101, 9643445, 9646747, 9654364, 9661294, 9662110, 9675221, 9678752, 9692308, 9692475, 9692847, 9699011, 9706777, 9707901, 9721891, 9722241, 9733795, 9736277, 9753960, 9754997, 9761341, 9766649, 9768203, 9768204, 9774086, 9790396, 9791117, 9792028, 9832370, 9846485, 9849568, 9856722, 9858605, 9860359, 9866666, 9867109, 9879625, 9893701, 9895478, 9895634, 9963724, 9991170, 10029438, 10042932, 10043433, 10046509, 10046958, 10053157, 10053339, 10067884, 10075354, 10081805, 10100352, 10104846, 10114213, 10118917, 10119527, 10128608, 10147914, 10148600, 10151228, 10166619, 10166746, 10176320, 10180109, 10202672, 10221138, 10230238, 10255180, 10288614, 10299092, 10299349, 10318844, 10330982, 10331318, 10334012, 10335647, 10341915, 10347690, 10360191, 10360228, 10380819, 10386060, 10392429, 10399969, 10419436, 10425098, 10426130, 10430822, 10478391, 10514748, 10515837, 10638323, 10639954, 10660836, 10673335, 10684386, 10711520, 10725481, 10762984, 10771667, 10775714, 10779168, 10779433, 10783162, 10783349, 10786002, 10791451, 10792858, 10798003, 10812135, 10815662, 10830932, 10836748, 10844109, 10845991, 10869707, 10870347, 10886612, 10887160, 10888884, 10946420, 10948240, 10957730, 10962216, 10973495, 10982182, 10985449, 10991547, 10995431, 10998596, 11011623, 11021753, 11052045, 11071275, 11074322, 11079823, 11092328, 11093191, 11093223, 11093722, 11098153, 11104728, 11127604, 11129953, 11155935, 11156050, 11175689, 11187932, 11191602, 11203057, 11217236, 11231128, 11234478, 11236063, 11262178, 11281778, 11290671, 11299997, 11300451, 11316721, 11319441, 11324664, 11328737, 11344621, 11365223, 11377709, 11380159, 11388049, 11390271, 11393301, 11399934, 11402993, 11403577, 11405802, 11412145, 11415233, 11420521, 11421122, 11421250, 11423066, 11426715, 11428799, 11429971, 11434279, 11447962, 11448751, 11452342, 11466078, 11468048, 11473888, 11474158, 11487159, 11490714, 11495264, 11495351, 11495514, 11511865, 11520860, 11530656, 11582010, 11635834, 11653832, 11660106, 11675913, 11680962, 11682486, 11693145, 11733898, 11733930, 11736965, 11753084, 11759990, 11766885, 11767918, 11781164, 11785661, 11803545, 11813450, 11819303, 11822403, 11836711, 11858222, 11871888, 11882662, 11903141, 11913095, 11914390, 11921194, 11926607, 11967493, 11973176, 11978094, 11978250, 11979226, 11986000, 12004035, 12010541, 12017037, 12042088, 12043511, 12058035, 12059096, 12067894, 12069290, 12073835, 12078346, 12078495, 12086913, 12087835, 12093000, 12122222, 12144652, 12162274, 12162809, 12180551, 12195724, 12204117, 12213062, 12213159, 12240399, 12260599, 12260642, 12262889, 12275635, 12279098, 12285174, 12287628, 12315661, 12329684, 12349761, 12352651, 12376304, 12388631, 12389611, 12389880, 12393684, 12396699, 12397415, 12409097, 12412151, 12445556, 12451003, 12453199, 12464361, 12465583, 12467417, 12470048, 12471652, 12488384, 12489539, 12539225, 12548144, 12576713, 12585217, 12588163, 12589190, 12600638, 12609519, 12626444, 12630771, 12634575, 12648447, 12651506, 12654904, 12662207, 12675648, 12679402, 12680575, 12712998, 12752947, 12756638, 12757485, 12795738, 12805244, 12833536, 12837554, 12846322, 12862773, 12868904, 12881550, 12888674, 12889647, 12890787, 12899350, 12901649, 12901781, 12905462, 12911577, 12915451, 12922039, 12934115, 12935254, 12955887, 12962555, 12964018, 12977584, 12987831, 12988400, 12991060, 12993530, 12998104, 13001692, 13008316, 13012275, 13027097, 13044955, 13053752, 13055310, 13056609, 13060189, 13074597, 13077220, 13088316, 13107443, 13108416, 13116203, 13140783, 13142811, 13155430, 13161686, 13162096, 13165171, 13182986, 13183318, 13222282, 13222508, 13235427, 13239849, 13253736, 13260238, 13265443, 13276458, 13283620, 13347796, 13354222, 13354919, 13356913, 13358016, 13358039, 13366869, 13367570, 13373822, 13381880, 13386020, 13392814, 13405109, 13405375, 13414323, 13414407, 13415801, 13416281, 13418211, 13420948, 13421433, 13422656, 13423353, 13424287, 13431658, 13442922, 13445123, 13445216, 13446555, 13447981, 13449000, 13449285, 13451872, 13456241, 13459256, 13459791, 13480580, 13480718, 13486488, 13486762, 13488113, 13488155, 13490663, 13504041, 13504368, 13506171, 13506458, 13509957, 13510745, 13512970, 13513468, 13514519, 13515533, 13517610, 13519812, 13530500, 13537184, 13539149, 13539158, 13540892, 13551716, 13552145, 13552223, 13555269, 13556450, 13556706, 13556937, 13558344, 13559814, 13563391, 13565990, 13568261, 13568546, 13571879, 13576195, 13578634, 13580660, 13581290, 13583266, 13586756, 13590988, 13599403, 13599565, 13601086, 13603299, 13605091, 13605258, 13605858, 13607345, 13607560, 13616989, 13617968, 13624274, 13627457, 13627782, 13629830, 13629892, 13630113, 13631536, 13633367, 13633444, 13634335, 13634908, 13638968, 13641013, 13643831, 13650334, 13721291, 13723504, 13742264, 13797103, 13807518, 13826601, 13831122, 13974798, 14058818, 14059751, 14061823, 14288454, 14288932, 14289645, 14290346, 14421103, 14423057, 14426996, 14429156, 14435212, 14461892, 14495404, 14578857, 14581838, 14622971, 14743932, 14760825, 14789333, 14908514, 14921905, 15082835, 15232939, 15307498, 15400107, 15468163, 15524270, 15701521, 15702273, 15702593, 15703319, 15704281, 15704515, 15704567, 15711694, 15711722, 15713473, 15714640, 15716445, 15719561, 15723025, 15723445, 15723882, 15725708, 15729965, 15733160, 15734528, 15735609, 15738256, 15743461, 15743742, 15744064, 15745258, 15745506, 15746808, 15748083, 15752536, 15754100, 15756420, 15757292, 15757694, 15758934, 15762116, 15764025, 15765674, 15767282, 15767465, 15768664, 15769772, 15771745, 15774058, 15774754, 15780507, 15780713, 15781041, 15781230, 15781600, 15786529, 15796469, 15803562, 15803571, 15804148, 15805400, 15810471, 15810878, 15813648, 15820072, 15823588, 15827382, 15828667, 15831257, 15832573, 15835831, 15836079, 15838162, 15850976, 15851096, 15861292, 15894829, 15898243, 15954655, 15958209, 15980313, 15980748, 15982604, 15982935, 15982975, 15985583, 15987008, 15988798, 15988913, 15991079, 15992526, 15995436, 15995446, 15995770, 15996855, 15997064, 15997131, 16000608, 16001302, 16003058, 16005542, 16006233, 16029550, 16029985, 16034400, 16034729, 16037555, 16038973, 16039814, 16044540, 16047983, 16048211, 16048485, 16049402, 16050504, 16050736, 16051846, 16054376, 16058646, 16059057, 16061093, 16064647, 16065949, 16068219, 16068357, 16077006, 16077824, 16080113, 16080744, 16081836, 16085359, 16093278, 16093810, 16098441, 16100383, 16100486, 16100516, 16111209, 16114675, 16115374, 16116139, 16117160, 16118030, 16120233, 16120634, 16123143, 16123199, 16126646, 16131056, 16133434, 16134839, 16135735, 16138637, 16139570, 16143103, 16147232, 16148944, 16152057, 16152258, 16152314, 16152797, 16152816, 16157099, 16159648, 16160232, 16163229, 16166088, 16166100, 16166103, 16172345, 16173781, 16173854, 16173923, 16186085, 16189172, 16192890, 16194369, 16210820, 16217804, 16218406, 16219204, 16219585, 16240720, 16241426, 16242171, 16251278, 16277353, 16280717, 16281835, 16282347, 16283340, 16300547, 16316508, 16381931, 16457571, 16514719, 16615568, 16700289, 17085296, 17125217, 17152940, 17155528, 17155995, 17156642, 17157183, 17157524, 17162950, 17163274, 17179167, 17183284, 17203883, 17204850, 17206278, 17208956, 17213964, 17214485, 17230551, 17231232, 17238573, 17244702, 17250780, 17250975, 17252074, 17252254, 17253976, 17255662, 17257015, 17260659, 17268689, 17280880, 17280934, 17282934, 17284737, 17286033, 17286431, 17288137, 17296266, 17300028, 17303389, 17303408, 17303541, 17305670, 17306084, 17311105, 17311603, 17316196, 17317588, 17320075, 17331340, 17335450, 17339474, 17340017, 17341747, 17341790, 17352923, 17372897, 17373746, 17379661, 17387313, 17406952, 17408055, 17408922, 17409068, 17410811, 17414948, 17450513, 17451252, 17454163, 17457963, 17469800, 17554307, 17559004, 17560038, 17564643, 17570395, 17571528, 17611929, 17621426, 17656749, 17661770, 17664771, 17667332, 17667354, 17667587, 17668125, 17673620, 17676690, 17692997, 17694149, 17695243, 17697959, 17706483, 17706900, 17725038, 17727104, 17785991, 17789080, 17789633, 17791633, 17792558, 17792642, 17801961, 17823022, 17823473, 17828257, 17830325, 17832472, 17833176, 17833808, 17834453, 17836088, 17838388, 17839432, 17841297, 17848362, 17851695, 17852083, 17855087, 17861717, 17871711, 17883157, 17886138, 17886564, 17895869, 17903381, 17905761, 17909684, 17910459, 17911079, 17911103, 17913073, 17915750, 17925553, 17927412, 17931653, 17932619, 17934006, 17934849, 17936909, 17946659, 17949078, 17975912, 17976031, 17977479, 17978278, 17980131, 17982508, 17984095, 17993295, 17999747, 18001649, 18002005, 18003556, 18004471, 18039077, 18040841, 18041750, 18042652, 18042658, 18042708, 18043104, 18043192, 18044947, 18044954, 18045377, 18068668, 18072883, 18080358, 18085024, 18104566, 18112657, 18113200, 18132276, 18132803, 18134178, 18134565, 18134856, 18135901, 18138487, 18140699, 18141125, 18142171, 18145294, 18148722, 18150272, 18154705, 18161417, 18163019, 18164207, 18168352, 18169372, 18169955, 18170438, 18173687, 18176624, 18176666, 18177004, 18177306, 18179259, 18187149, 18187409, 18189375, 18215028, 18215033, 18217413, 18219356, 18221469, 18224118, 18224451, 18229453, 18232090, 18242339, 18243368, 18246803, 18251020, 18251073, 18272743, 18274597, 18301840, 18302739, 18304351, 18305217, 18308630, 18310809, 18325163, 18332302, 18334644, 18335916, 18342940, 18343587, 18346461, 18350989, 18370024, 18373829, 18374102, 18383197, 18384116, 18386010, 18387010, 18392905, 18392983, 18394000, 18397855, 18399828, 18401395, 18430056, 18454521, 18459685, 18461360, 18478373, 18481773, 18487297, 18488392, 18489000, 18490665, 18493146, 18499908, 18513841, 18515789, 18519955, 18525356, 18588815, 18593399, 18604211, 18621220, 18626461, 18630143, 18630964, 18634690, 18641060, 18658637, 18659364, 18662420, 18667780, 18680604, 18681071, 18684874, 18690744, 18698406, 18718606, 18740588, 18746598, 18753178, 18755640, 18758062, 18758780, 18769784, 18770284, 18771046, 18774663, 18802141, 18805911, 18813841, 18816539, 18849550, 18851126, 18873112, 18877657, 18928810, 18930938, 18935284, 18935633, 19004928, 19022707, 19065214, 19098092, 19124921, 19145978, 19162170, 19164437, 19177809, 19181243, 19189462, 19221972, 19225358, 19231716, 19236503, 19250100, 19258731, 19297208, 19308365, 19308397, 19321261, 19325752, 19481739, 19500522, 19505123, 19540203, 19546932, 19571104, 19598707, 19628308, 19628506, 19904043, 20033413, 20317261, 20319997, 20336320, 20411784, 20445451, 20451857, 20517362, 20536379, 20543348, 20554536, 20560219, 20567018, 20580333, 20589303, 20622797, 20632430, 20635765, 20642360, 20643214, 20653540, 20660316, 20694818, 20711061, 20734410, 20736462, 20757657, 20757675, 20758493, 20758497, 20758512, 20763060, 20785104, 20813238, 20815448, 20826742, 20860778, 20883640, 20931358, 20934963, 20957248, 20990952, 21211419, 21312846, 21399540, 21403405, 21404079, 21409704, 21415082, 21451485, 21455492, 21469302, 21483104, 21485648, 21539244, 21541107, 21543957, 21685602, 21788254, 21803389, 21805511, 21818422, 21840306, 21841630, 21847032, 21857790, 21899701, 21899705, 21914422, 21939203, 21939589, 21948661, 21971818, 22003007, 22008448, 22014031, 22032673, 22037732, 22038488, 22043782, 22053329, 22059778, 22096979, 22146939, 22175890, 22236956, 22240106, 22242097, 22246200, 22246225, 22267178, 22301647, 22324898, 22337223, 22358119, 22364474, 22372018, 22437126, 22450158, 22455886, 22456030, 22461942, 22471386, 22525680, 22555545, 22588031, 22594223, 22616552, 22664583, 22671084, 22681862, 22698732, 22709296, 22736720, 22739190, 22741863, 22751611, 22772589, 22791973, 22814408, 22816028, 22816793, 22820427, 22844784, 22853617, 22877940, 22883285, 22914299, 22915011, 22916001, 22962771, 23118840, 23125407, 23157034, 23163605, 23174601, 23199351, 23203786, 23209889, 23248185, 23270305, 23324262, 23342284, 23348685, 23353850, 23391529, 23437606, 23453680, 23472563, 23651235, 23767964, 24421941, 24563061, 24789250, 35408327, 36381037]

In [44]:
events_train['item_id_enc'].max()

43304

In [50]:
events_train['user_id'].nunique() * events_train['item_id'].nunique() / 1024**3

15.26221003383398

In [10]:
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8) 

In [56]:
import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3 

0.26360040064901114

In [57]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train) 

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [03:08<00:00,  3.78s/it]


In [89]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations 

In [119]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train['user_id'].sample().iat[0]
print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes","item_id"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_als(user_item_matrix_train, als_model, user_id, user_encoder, item_encoder,include_seen = True)
user_recommendations = user_recommendations.merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
user_recommendations["seen"] = [  any(user_history["item_id"] == i)  for i in user_recommendations["item_id"]]
display(user_recommendations)

user_id: 1180851
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes,item_id
70,Stephen King,Gerald's Game,2017-05-06,2017-05-25,5,"{'Horror': 2871, 'Fiction': 1032, 'Thriller': ...",32692
71,"Stephen King, William Olivier Desmond, Maria T...",Dreamcatcher,2011-12-30,2016-04-27,5,"{'Horror': 2966, 'Fiction': 980, 'Science Fict...",11570
72,Stephen King,Cell,2017-06-05,2017-06-07,4,"{'Horror': 3732, 'Fiction': 1179, 'Horror-Zomb...",10567
73,Stephen King,The Tommyknockers,2016-04-30,2016-05-13,4,"{'Horror': 3264, 'Fiction': 1039, 'Science Fic...",17660
74,Stephen King,The Gunslinger,2015-12-04,2015-12-07,5,"{'Fantasy': 11423, 'Fiction': 4008, 'Horror': ...",43615
75,Stephen King,The Shining,2016-06-06,2016-06-08,5,"{'Horror': 14597, 'Fiction': 5659, 'Thriller':...",11588
76,Stephen King,Misery,2017-05-15,2017-05-16,5,"{'Horror': 7772, 'Fiction': 2953, 'Thriller': ...",10614
77,"J.R.R. Tolkien, Peter S. Beagle","The Two Towers (The Lord of the Rings, #2)",2016-12-27,2017-01-01,5,"{'Fantasy': 28091, 'Fiction': 6763, 'Classics'...",15241
78,J.K. Rowling,Harry Potter and the Deathly Hallows (Harry Po...,2017-06-18,2017-06-20,5,"{'Fantasy': 46667, 'Young Adult': 15403, 'Fict...",136251
79,J.R.R. Tolkien,The Fellowship of the Ring (The Lord of the Ri...,2016-12-19,2016-12-27,5,"{'Fantasy': 38907, 'Classics': 10145, 'Fiction...",34


Рекомендации


Unnamed: 0,item_id_enc,score,item_id,author,title,genre_and_votes,seen
0,33950,1.397805,16130549,Stephen King,Doctor Sleep,"{'Horror': 5287, 'Fiction': 1848, 'Thriller': ...",False
1,28851,1.252845,10644930,Stephen King,11/22/63,"{'Fiction': 4864, 'Historical-Historical Ficti...",False
2,37068,1.221485,18775247,Stephen King,"Mr. Mercedes (Bill Hodges Trilogy, #1)","{'Fiction': 1854, 'Thriller': 1706, 'Horror': ...",False
3,1391,1.18991,11588,Stephen King,The Shining,"{'Horror': 14597, 'Fiction': 5659, 'Thriller':...",True
4,32205,1.083562,13596166,"Stephen King, Glen Orbik",Joyland,"{'Horror': 1392, 'Mystery': 1308, 'Fiction': 1...",False


In [120]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100) 

In [121]:
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [122]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.to_parquet("goodsread/als_recommendations.parquet") 

In [123]:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
) 

In [124]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg 

In [125]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))

In [126]:
print(ndcg_at_5_scores.mean()) 

0.9756221596154557


In [131]:
ndcg_at_5_scores.groupby("user_id").count()

user_id
1000006    1
1000007    0
1000019    0
1000020    0
1000023    1
          ..
1430558    0
1430569    0
1430573    0
1430578    1
1430584    0
Length: 48121, dtype: int64

преобразуем значения в genre_and_votes из текстового представления в тип в Python

In [25]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)

In [26]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v["genre_and_votes"]
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items)

genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,Fantasy,6764972,0.149511
1,Fiction,6364872,0.140668
35,Classics,3409787,0.075359
16,Young Adult,3241849,0.071647
30,Romance,2387940,0.052775
3,Nonfiction,1706935,0.037725
14,Historical-Historical Fiction,1518864,0.033568
18,Mystery,1353528,0.029914
22,Science Fiction,1198823,0.026495
29,Fantasy-Paranormal,850477,0.018796


Функция строит матрицу вида «книга-жанр». матрицы содержат множество пропусков, то есть являются разреженными. Чтобы сэкономить память при работе с ними, используем sparse-формат.

In [50]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [34]:
items = items.sort_values(by="item_id_enc")

In [31]:
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [14]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres,user_items)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [15]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

In [16]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.185241
38,Classics,3414934,0.103879
25,Fantasy,6850060,0.072447
5,Nonfiction,1737406,0.050865
24,Science Fiction,1218917,0.04092


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

similarity_scores

array([0.50714765, 0.49770536, 0.52433355, ..., 0.64894348, 0.03112624,
       0.29805756])

In [18]:
# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = similarity_scores.argsort()[::-1][:k]

In [19]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
393210,"G.K. Chesterton, Jonathan Lethem",The Man Who Was Thursday: A Nightmare,"{'Fiction': 1257, 'Classics': 929, 'Mystery': 469, 'Fantasy': 293, 'Philosophy': 156, 'Literatur..."
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


# === Базовые подходы: валидация

In [20]:
als_recommendations =  pd.read_parquet("goodsread/als_recommendations.parquet")

##### Cобытия в тестовой выборке и рекомендации для одних и тех же пользователей разметим признаками:
 - gt (ground truth): объект есть в тестовой выборке;
 - pr (predicted): объект есть в рекомендациях.

##### Теперь разметим признаки бинарной классификации:
 - TP: объект есть и в тестовой выборке, и в рекомендациях (истинная рекомендация),
 - FP: объекта нет в тестовой выборке, но он есть в рекомендациях (ложноположительная рекомендация),
 - FN: объект есть в тестовой выборке, но его нет в рекомендациях (ложноотрицательная рекомендация)

In [6]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [21]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=5)

events_recs_for_binary_metrics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


Unnamed: 0,user_id,item_id,gt,score,pr,tp,fp,fn
0,1196635,18467802,True,,False,False,False,True
1,1188739,10799,True,,False,False,False,True
2,1001879,13206828,True,,False,False,False,True
3,1001879,13206900,True,,False,False,False,True
4,1001879,13206760,True,,False,False,False,True
...,...,...,...,...,...,...,...,...
1030116,1430584,29056083,False,0.462653,True,False,True,False
1030117,1430584,17167166,False,0.456820,True,False,True,False
1030118,1430584,17927395,False,0.443951,True,False,True,False
1030119,1430584,16096824,False,0.443670,True,False,True,False


In [7]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall


In [22]:
compute_cls_metrics(events_recs_for_binary_metrics)

(0.007657661313228862, 0.014173346821289545)

In [23]:
events_recs_for_binary_metrics_k10 = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=10)

compute_cls_metrics(events_recs_for_binary_metrics_k10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


(0.008682632300788003, 0.03116118372682361)

# === Двухстадийный подход: метрики

In [24]:
als_recommendations

Unnamed: 0,user_id,item_id,score
0,1000000,3,0.960111
1,1000000,15881,0.878722
2,1000000,5,0.855399
3,1000000,6,0.826737
4,1000000,2,0.789068
...,...,...,...
43058095,1430584,13206760,0.098786
43058096,1430584,3228917,0.098126
43058097,1430584,5129,0.097948
43058098,1430584,8520610,0.097732


In [25]:
cov_items = als_recommendations["item_id"].nunique()/len(items)
print(f"{cov_items:.2f}") 

0.09


In [27]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(events_train, on=["user_id", "item_id"], how="left")
als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations = als_recommendations.sort_values(by="score",ascending=False)
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# посчитаем средний novelty
novelty_5.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["read"] = True


0.6077012223019596

# === Двухстадийный подход: модель

In [8]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels
events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy()
events_labels["user_id"].nunique()

99849

In [31]:
events_labels.head(3)

Unnamed: 0,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id,user_id_enc,item_id_enc,gt
84,18467802,2017-09-01,2017-09-22,True,1,False,1196635,196635,36588,True
257,10799,2017-08-06,2017-10-14,True,3,False,1188739,188739,1262,True
273,18658071,2017-09-11,2017-09-15,True,2,True,1001879,1879,36848,True


In [12]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations = pd.read_parquet("goodsread/candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("goodsread/candidates/training/content_recommendations.parquet")

candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer") 
len(candidates)

82993094

In [30]:
candidates.head(3)

Unnamed: 0,user_id,item_id,als_score,cnt_score
0,1000000,3,0.972557,0.920225
1,1000000,15881,0.890201,0.90574
2,1000000,5,0.86585,0.918026


In [13]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id", "item_id", "target"]], 
                              on=["user_id", "item_id"],how="left")
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

In [14]:
# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ])

len(candidates_for_train)

213708

In [11]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6526057	total: 77.9ms	remaining: 1m 17s
100:	learn: 0.5118959	total: 1.93s	remaining: 17.2s
200:	learn: 0.5111710	total: 3.87s	remaining: 15.4s
300:	learn: 0.5105208	total: 5.83s	remaining: 13.5s
400:	learn: 0.5100174	total: 7.74s	remaining: 11.6s
500:	learn: 0.5095747	total: 9.64s	remaining: 9.6s
600:	learn: 0.5091600	total: 11.5s	remaining: 7.66s
700:	learn: 0.5087803	total: 13.5s	remaining: 5.77s
800:	learn: 0.5084220	total: 15.5s	remaining: 3.85s
900:	learn: 0.5080930	total: 17.4s	remaining: 1.91s
999:	learn: 0.5078081	total: 19.3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fa0b84c6a40>

In [15]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("goodsread/candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("goodsread/candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer"
    )


In [16]:

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
print(len(candidates_to_rank))

14517152


In [21]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank[candidates_to_rank["rank"] <= max_recommendations_per_user]

In [22]:
len(final_recommendations)

7519400

In [27]:
events_inference = pd.concat([events_train, events_labels])

In [26]:
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}") 

Common users: 75194
precision: 0.006, recall: 0.015


# === Двухстадийный подход: построение признаков

In [18]:
item_popularity.head(3)

Unnamed: 0,index,item_id,users,avg_rating,popularity_weighted
0,35011,22557272,40690,3.788965,154173.0
1,37701,29056083,25785,3.801784,98029.0
2,32387,18007564,20207,4.321275,87320.0


In [19]:
items["age"] = 2018-items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")
items = items.merge(item_popularity[["avg_rating","item_id"]], on="item_id")


In [20]:
candidates_for_train = candidates_for_train.merge(items[["age","avg_rating","item_id"]],on="item_id")
candidates_to_rank = candidates_to_rank.merge(items[["age","avg_rating","item_id"]],on="item_id")

In [125]:
candidates_to_rank["age"].median()

7.0

In [21]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=("item_id", "count"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
candidates_for_train.head(3)

Unnamed: 0,user_id,item_id,als_score,cnt_score,target,age,avg_rating,reading_years,books_read,rating_avg,rating_std,books_per_year
0,1000006,29868610,0.286715,,1,,3.981891,1.820671,17.0,4.294118,0.685994,9.337218
1,1000145,29868610,0.181936,,1,,3.981891,1.97399,14.0,3.714286,0.61125,7.092233
2,1003889,29868610,0.68805,,1,,3.981891,6.023272,109.0,3.963303,0.870575,18.096477


In [22]:
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = get_user_features(events_inference)
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")

In [29]:
candidates_for_train["books_read"].median()

32.0

In [40]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = genres_top_columns + [genres_others_column]

In [45]:
# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame(all_items_genres_csr[:, genres_top_idx].toarray(), columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

In [46]:
#items.drop(genre_columns,axis=1,inplace=True)
item_genres

Unnamed: 0,item_id,genre_23,genre_1,genre_35,genre_16,genre_30,genre_3,genre_14,genre_18,genre_22,genre_29,genre_others
0,0,0.524988,0.148026,0.014064,0.170655,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.142267
1,1,0.621931,0.174786,0.000000,0.203283,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
2,2,0.514586,0.154140,0.022797,0.153916,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.154561
3,3,0.518708,0.148702,0.015347,0.160382,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.156860
4,4,0.623564,0.176369,0.000000,0.200067,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
38909,38909,0.000000,0.000000,0.000000,0.000000,0.343750,0.0,0.0,0.0,0.000000,0.000000,0.656250
38910,38910,0.106952,0.000000,0.000000,0.000000,0.160428,0.0,0.0,0.0,0.000000,0.272727,0.459893
38911,38911,0.000000,0.000000,0.000000,0.000000,0.444444,0.0,0.0,0.0,0.000000,0.000000,0.555556
38912,38912,0.000000,0.000000,0.000000,0.000000,0.478873,0.0,0.0,0.0,0.000000,0.000000,0.521127


In [47]:
# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

In [48]:
def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = get_user_genres(events_train, items, genre_columns)
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

In [49]:
user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")

In [50]:
genres.loc[genres["name"] == "Romance"]

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,Romance,2387940,0.052775


In [108]:
candidates_for_train["genre_34"].median().round(2)

0.04

In [51]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'avg_rating', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6481365	total: 96.7ms	remaining: 1m 36s
100:	learn: 0.4723175	total: 2.65s	remaining: 23.5s
200:	learn: 0.4646599	total: 5.12s	remaining: 20.4s
300:	learn: 0.4594849	total: 7.61s	remaining: 17.7s
400:	learn: 0.4550096	total: 10.5s	remaining: 15.7s
500:	learn: 0.4512060	total: 13.1s	remaining: 13.1s
600:	learn: 0.4475758	total: 15.6s	remaining: 10.4s
700:	learn: 0.4441345	total: 18.1s	remaining: 7.74s
800:	learn: 0.4408613	total: 20.7s	remaining: 5.13s
900:	learn: 0.4378002	total: 23.1s	remaining: 2.54s
999:	learn: 0.4349111	total: 25.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fa8594c15d0>

In [146]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

In [147]:
candidates_to_rank["cb_score"] = predictions[:,1]

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [148]:
final_recommendations["user_id"].nunique()

75194

In [149]:
final_recommendations.to_parquet("goodsread/final_recommendations_feat.parquet")

## Валидация модели с признаками

In [9]:
final_recommendations = pd.read_parquet("goodsread/final_recommendations_feat.parquet")

In [10]:
# для экономии ресурсов оставим события только тех пользователей, 
# для которых следует оценить рекомендации
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}") 

Common users: 75194
precision: 0.012, recall: 0.030


In [52]:
feature_importance = pd.DataFrame(cb_model.get_feature_importance(), 
    index=features, 
    columns=["fi"])
feature_importance

Unnamed: 0,fi
als_score,29.213024
cnt_score,4.030751
age,17.548847
avg_rating,19.390739
reading_years,4.232399
books_read,5.918692
rating_avg,2.418142
rating_std,1.875278
books_per_year,2.664648
genre_23,1.33515


In [53]:
feature_importance = feature_importance.sort_values(by="fi",ascending=False)

print(feature_importance ) 

                       fi
als_score       29.213024
avg_rating      19.390739
age             17.548847
books_read       5.918692
reading_years    4.232399
cnt_score        4.030751
books_per_year   2.664648
rating_avg       2.418142
rating_std       1.875278
genre_others     1.451886
genre_23         1.335150
genre_30         1.308121
genre_1          1.274381
genre_16         1.258079
genre_14         1.156725
genre_35         1.050188
genre_18         1.025278
genre_3          1.016062
genre_22         1.008271
genre_29         0.823340


In [54]:
cb_model.save_model("models/cb_feature.pkl")