In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

In [3]:
# Load the two input tables from parquet: the event log and the item table.
events = pd.read_parquet("./goodsread/events.par")
items = pd.read_parquet("./goodsread/items.par")

In [4]:
# Global time split: events whose started_at is before this date form the
# training set; every other event goes to the test set.
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
# Take explicit copies: new columns are assigned to both frames later, and
# writing into a boolean-mask slice of `events` raises SettingWithCopyWarning.
events_train = events[train_test_global_time_split_idx].copy()
events_test = events[~train_test_global_time_split_idx].copy()

# Unique users seen in train and in test.
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# Users that appear in both train and test.
common_users = list(set(users_train) & set(users_test))

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


In [5]:
import scipy
import sklearn.preprocessing

In [None]:
# Re-encode user ids from their raw values to the sequence 0, 1, 2, ...
# The encoder is fitted on all events, so train and test share one id space.
user_encoder = sklearn.preprocessing.LabelEncoder().fit(events["user_id"])

# Re-encode item ids from their raw values to the sequence 0, 1, 2, ...
# The encoder is fitted on the item table.
item_encoder = sklearn.preprocessing.LabelEncoder().fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])

# Attach both encoded ids to each split (user id first, then item id).
for split_events in (events_train, events_test):
    split_events["user_id_enc"] = user_encoder.transform(split_events["user_id"])
    split_events["item_id_enc"] = item_encoder.transform(split_events["item_id"])

In [8]:
# Preview the first rows of the training events.
events_train.head()

Unnamed: 0,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id,user_id_enc,item_id_enc
0,22034,2015-07-12,2015-07-17,True,5,False,1229132,229132,2460
1,22318578,2015-06-07,2015-08-09,True,5,True,1229132,229132,38691
2,22551730,2015-06-24,2015-07-11,True,4,True,1229132,229132,38867
3,22816087,2015-09-27,2015-11-04,True,5,True,1229132,229132,39109
5,17910054,2015-03-04,2015-07-28,True,3,False,1229132,229132,35638


In [7]:
# The user-item matrix is very large, so it is stored in sparse CSR format.
# Rows are encoded user ids, columns are encoded item ids, values are ratings
# stored as int8.
# NOTE(review): scipy sums duplicate (row, col) entries when building from
# coordinates — confirm each user/item pair appears at most once in events_train.
user_item_matrix_train = scipy.sparse.csr_matrix(
    (
        events_train["rating"],
        (events_train["user_id_enc"], events_train["item_id_enc"]),
    ),
    dtype=np.int8,
)

In [9]:
# Fit an ALS (alternating least squares) model on the sparse user-item matrix.
from implicit.als import AlternatingLeastSquares

# 50 latent factors, 50 training iterations, regularization 0.05;
# random_state=0 pins the seed passed to the model.
# NOTE(review): fit() is given a users-by-items matrix here — confirm this matches
# the orientation expected by the installed implicit version.
als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [03:04<00:00,  3.70s/it]


In [14]:
# Number of neighbours to keep for every item.
max_similar_items = 10

# Encoded ids of all items that occur in events_train.
train_item_ids_enc = events_train["item_id_enc"].unique()

# Query the fitted ALS model for the items most similar to each train item.
# similar_items also returns the query item itself as the best match; it is
# filtered out later, so one extra neighbour is requested here.
similar_items = als_model.similar_items(
    train_item_ids_enc,
    N=1 + max_similar_items,
)

In [15]:
# First step towards a tabular format: split the model output into its two
# parts, the neighbour ids and the matching similarity scores.
sim_item_item_ids_enc, sim_item_scores = similar_items

In [16]:
# Sanity check: the neighbour ids and the scores should each have one entry per queried item.
print(len(train_item_ids_enc),len(sim_item_item_ids_enc), len(sim_item_scores))

41474 41474 41474


In [17]:
# One row per queried item; the neighbour ids and their scores are kept as
# list-valued columns for now.
similar_items = pd.DataFrame(
    dict(
        item_id_enc=train_item_ids_enc,
        sim_item_id_enc=sim_item_item_ids_enc.tolist(),
        score=sim_item_scores.tolist(),
    )
)

In [18]:
# Inspect the table: each row holds one item with its list of neighbour ids and list of scores.
similar_items

Unnamed: 0,item_id_enc,sim_item_id_enc,score
0,2460,"[2460, 2458, 806, 2459, 12528, 1147, 7852, 618...","[0.9999999403953552, 0.9224898815155029, 0.874..."
1,38691,"[38691, 39575, 40111, 25112, 32177, 34430, 367...","[1.0000001192092896, 0.9343445897102356, 0.930..."
2,38867,"[38867, 38023, 38951, 5992, 3865, 10539, 28584...","[1.0, 0.9388757348060608, 0.9345316886901855, ..."
3,39109,"[39109, 37674, 39384, 40645, 17054, 36002, 394...","[0.9999998211860657, 0.9593728184700012, 0.947..."
4,35638,"[35638, 37837, 41337, 39997, 31205, 25389, 324...","[1.0000001192092896, 0.9470844268798828, 0.944..."
...,...,...,...
41469,17937,"[17937, 32880, 36575, 36896, 38973, 493, 31807...","[1.0, 0.9274682402610779, 0.9085670709609985, ..."
41470,34066,"[34066, 40654, 40650, 39718, 5085, 13566, 3581...","[1.0, 0.8410629630088806, 0.8410628437995911, ..."
41471,43151,"[43151, 19601, 29207, 39803, 35742, 18425, 231...","[1.000000238418579, 0.7168519496917725, 0.6817..."
41472,11649,"[11649, 9021, 25953, 19690, 20545, 13696, 1757...","[1.0000001192092896, 0.6946579217910767, 0.669..."


In [37]:
# Flatten the per-item neighbour lists into [item_id_enc, sim_item_id_enc, score]
# triples, one per (item, neighbour) pair. The three columns are walked in
# lockstep with zip instead of iterrows(), which builds a Series for every row.
out = [
    [item_id_enc, sim_id, sim_score]
    for item_id_enc, sim_ids, sim_scores in zip(
        similar_items["item_id_enc"],
        similar_items["sim_item_id_enc"],
        similar_items["score"],
    )
    for sim_id, sim_score in zip(sim_ids, sim_scores)
]

In [None]:
# Build the long-format table from the triples collected in `out`.
similar_items_test = pd.DataFrame(out, columns=['item_id_enc', 'sim_item_id_enc', 'score'])

In [40]:
# Inspect the long-format table: one row per (item, similar item) pair.
similar_items_test

Unnamed: 0,item_id_enc,sim_item_id_enc,score
0,2460,2460,1.000000
1,2460,2458,0.922490
2,2460,806,0.874765
3,2460,2459,0.873763
4,2460,12528,0.850654
...,...,...,...
456209,38365,37490,0.534920
456210,38365,23306,0.515321
456211,38365,35631,0.507710
456212,38365,23687,0.496325


In [41]:
# Cast the neighbour id column to int and the score column to float.
similar_items_test = similar_items_test.astype(
    {"sim_item_id_enc": "int", "score": "float"}
)

In [42]:
# Map the encoded ids back to the original item ids, then drop the encoded
# columns, which are no longer needed.
similar_items_test = similar_items_test.assign(
    item_id_1=item_encoder.inverse_transform(similar_items_test["item_id_enc"]),
    item_id_2=item_encoder.inverse_transform(similar_items_test["sim_item_id_enc"]),
).drop(columns=["item_id_enc", "sim_item_id_enc"])

In [44]:
# Remove the self-pairs (an item listed as similar to itself) and show the result.
similar_items = similar_items_test.loc[
    lambda pairs: pairs["item_id_1"] != pairs["item_id_2"]
]
similar_items

Unnamed: 0,score,item_id_1,item_id_2
1,0.922490,22034,22026
2,0.874765,22034,6882
3,0.873763,22034,22028
4,0.850654,22034,364089
5,0.835730,22034,9827
...,...,...,...
456209,0.534920,21847032,19904043
456210,0.515321,21847032,6167746
456211,0.507710,21847032,17908487
456212,0.496325,21847032,6349976


In [45]:
# Spot check: the neighbours stored for the item with id 7126.
similar_items.query("item_id_1==7126")

Unnamed: 0,score,item_id_1,item_id_2
25873,0.948725,7126,7190
25874,0.940997,7126,24280
25875,0.930144,7126,1953
25876,0.925066,7126,58696
25877,0.91634,7126,38296
25878,0.916015,7126,2932
25879,0.913951,7126,7184
25880,0.911433,7126,387749
25881,0.909872,7126,7733
25882,0.909454,7126,30597


In [47]:
# Persist the similar-items table as a parquet file.
similar_items.to_parquet("./goodsread/similar_items.parquet") 

In [48]:
def print_sim_items(item_id, similar_items):
    """Display an item and the items listed as similar to it.

    Two tables are shown through ``display``: first the row(s) of the global
    ``items`` frame whose ``item_id`` equals ``item_id``, then the rows of
    ``similar_items`` with ``item_id_1 == item_id``, each extended with the
    descriptive columns of the neighbour referenced by ``item_id_2``.

    Args:
        item_id: Value matched against ``items["item_id"]`` and
            ``similar_items["item_id_1"]``.
        similar_items: DataFrame with at least ``item_id_1`` and ``item_id_2``
            columns; its other columns are passed through unchanged.

    Returns:
        None. Output is produced only through ``display``.
    """
    described_columns = ["item_id", "author", "title", "genre_and_votes", "average_rating", "ratings_count"]

    # The query item itself.
    display(items.loc[items["item_id"] == item_id, described_columns])

    # Its neighbours, joined to the item descriptions on the neighbour's id.
    neighbours = similar_items.loc[similar_items["item_id_1"] == item_id]
    item_info = items[described_columns].set_index("item_id")
    display(neighbours.merge(item_info, left_on="item_id_2", right_index=True))

In [50]:
# Example: show the item with id 3 together with its stored neighbours.
print_sim_items(3, similar_items)

Unnamed: 0,item_id,author,title,genre_and_votes,average_rating,ratings_count
1584855,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad...",4.45,4765497


Unnamed: 0,score,item_id_1,item_id_2,author,title,genre_and_votes,average_rating,ratings_count
10297,0.986763,3,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict...",4.38,1821802
10298,0.974947,3,5,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,"{'Fantasy': 49784, 'Young Adult': 15393, 'Fict...",4.53,1876252
10299,0.95439,3,6,"J.K. Rowling, Mary GrandPré",Harry Potter and the Goblet of Fire (Harry Pot...,"{'Fantasy': 48257, 'Young Adult': 15483, 'Fict...",4.53,1792561
10300,0.934225,3,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Order of the Phoenix (Har...,"{'Fantasy': 46485, 'Young Adult': 15194, 'Fict...",4.47,1766895
10301,0.922894,3,1,J.K. Rowling,Harry Potter and the Half-Blood Prince (Harry ...,"{'Fantasy': 46400, 'Young Adult': 15083, 'Fict...",4.54,1713866
10302,0.907875,3,136251,J.K. Rowling,Harry Potter and the Deathly Hallows (Harry Po...,"{'Fantasy': 46667, 'Young Adult': 15403, 'Fict...",4.62,1784684
10303,0.861305,3,8388506,"Bruno Nogueira, João Quadros","Tubo de Ensaio, Parte II","{'Humor': 4, 'Humor-Comedy': 1}",3.26,39
10304,0.861305,3,6379485,"Bruno Nogueira, João Quadros",Tubo de Ensaio,"{'Humor': 5, 'Humor-Comedy': 2}",3.27,44
10305,0.838405,3,7904207,Jim Henry,Antiquity Calais: Standing at Armageddon (The ...,,4.61,16
10306,0.737723,3,8226034,Hans Scherfig,Frydenholm,"{'Historical-Historical Fiction': 3, 'Fiction'...",4.06,98
