# Задача

Создать рекомендательную систему для пользователей библиотеки с учетом семантики

# Условия задачи

Участникам необходимо для каждого из 16 753 пользователей сделать
подборку из 20 рекомендаций.

Порядок рекомендаций не учитывается, но
очень важно, чтобы рекомендации основывались на интересе пользователя
и были ему релевантны. Обратите внимание, что тестирующая система
принимает только те решения, в которых содержится не более 20
рекомендаций для одного пользователя. Уникальных документов – 354 355.

Участники получают 3 таблицы: users.csv, items.csv,
train_transactions.csv.

Users.csv содержит информацию о читателях, где каждый читатель имеет свой
уникальный номер читательского билета (chb).

Таблица items.csv, содержит описание документов, которые доступны всем читателям,
каждый документ имеет уникальный системный номер (sys_numb).

Таблица train_transactions.csv устанавливает связь между
users-items, показывает наличие взаимодействия читателя с документом.

# Описание данных

##### users.csv:
chb – полный номер читательского билета
age – возраст читателя
gender – пол читателя
chit_type – тип читателя

##### items.csv:
sys_numb – системный номер документа
title – название документа
author – автор документа
izd – издательство
year_izd – год издания
bbk – ББК документа

##### train_transactions.csv:
chb - полный номер читательского билета
sys_numb – системный номер документа
date_1 – дата выдачи
is_real – был ли выдан заказ
type – тип книговыдачи (книговыдача/скачивание)
source – источник (один из трёх онлайн-просмотрщиков)
is_printed – печатный/электронный документ


In [17]:
import re

import numpy as np
import pandas as pd

import roman

import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

from yellowbrick.cluster import KElbowVisualizer

import torch
import transformers

import catboost

import faiss

from sklearn.manifold import TSNE

from tqdm.notebook import tqdm

In [18]:
# tokenizer = transformers.BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')
# model = transformers.BertModel.from_pretrained('DeepPavlov/rubert-base-cased-conversational').to('cuda:0')

try:
	items_title_embeddings = pd.read_csv('items_title_embeddings.csv')
except:
	pass

## Шаг 1. Загрузим и посмотрим на данные

In [19]:
users = pd.read_csv(
	'../data/users.csv',
	sep=';',
	index_col=None,
	dtype={'age': str, 'chb': str, 'chit_type': str, 'gender': str}
)

items = pd.read_csv(
	'../data/items.csv',
	sep=';',
	index_col=None,
	dtype={'author': str, 'bbk': str, 'izd': str, 'sys_numb': str, 'title': str, 'year_izd': str}
)

train_transactions = pd.read_csv(
	'../data/train_transactions_extended.csv',
	sep=';',
	index_col=None,
	dtype={'chb': str, 'date_1': str, 'is_printed': str, 'is_real': str, 'source': str, 'sys_numb': str, 'type': str}
)

users.name = 'users'
items.name = 'items'
train_transactions.name = 'train_transactions'

all_data = [users, items, train_transactions]

Руками заполним пропуски в некоторых данных

In [20]:
# у пользователя с индексом 9681 в данных полях стоит значение "отсутствует", заменим его на самое частотное значение поля
users.loc[9681, 'gender'] = users['gender'].value_counts().index[0]
users.loc[9681, 'age'] = users['age'].value_counts().index[0]
users.loc[9681, 'chit_type'] = users['chit_type'].value_counts().index[0]
users.loc[8978, 'age'] = users['age'].value_counts()[0]

# книга без заполненных данных, ее некому будет рекомендовать
items.loc[45417, 'year_izd'] = '2016'
# заполним пропуски в поле bbk
items['bbk'] = items['bbk'].fillna('отсутствует')

# поля "none" и "отсутствует" по сути одно и то же, так что заменим none на отсутствует
items['title'] = items['title'].apply(lambda author: 'отсутствует' if 'none' in author else author)
items['author'] = items['author'].apply(lambda author: 'отсутствует' if 'none' in author else author)
items['izd'] = items['izd'].apply(lambda author: 'отсутствует' if 'none' in author else author)
items['year_izd'] = items['year_izd'].apply(lambda author: 'отсутствует' if 'none' in author else author)
items['bbk'] = items['bbk'].apply(lambda author: 'отсутствует' if 'none' in author else author)

# удаляем квадратные скобки из годов
items['year_izd'].apply(lambda year: year.replace('[', '').replace(']', '')).value_counts()

# создаем фичу определяющую является ли кника учебником или нет
items['is_textboot'] = items['title'].apply(lambda title: int(max([x in title for x in ['учеб', 'Учеб']])))

# типы "скачивание" и "скачка" одинаковые, приводим к единому виду "скачивание"
train_transactions['type'] = train_transactions['type'].apply(lambda type: 'скачивание' if 'скачка' in type else type)

Немного магии с годом издания книги

In [21]:
def is_roman_number(year):
	if '۱۱۴۵' in year:
		return 'отсутствует'

	pattern = r'\d{4}|[I|V|X|L|C|D|M]+'
	res = re.findall(pattern, year)
	if len(res) > 0:
		if len(res) > 1:
			return res[1]
		else:
			return res[0]
	else:
		pattern = r'[а-я]|[А-Я]|[a-z]|[A-Z]'
		res = re.findall(pattern, year)
		if len(res) > 0:
			return 'отсутствует'
		else:
			return year

def from_roman_to_int(year):
	try:
		return str(roman.fromRoman(year))
	except:
		return year

def validate(year):
	try:
		if int(year) >= 2022 or int(year) <= 1000:
			return 'отсутствует'
		else:
			return year
	except:
		return year


def parse(year):
	try:
		return year.split(' ')[0]
	except:
		return year


items['year_izd'] = (items['year_izd']
					.apply(is_roman_number)
					.apply(from_roman_to_int)
					.apply(
						lambda year: year
						.replace('?', '')
						.replace('.', '', 2)
						.replace('-', '0', 2)
						.replace('-', '')
						.replace('[', '')
						.replace(']', '')
					)
					.apply(parse)
					.apply(validate)
					.apply(
						lambda year: year
						.replace('.', '0')
						.replace('–', '0')
						)
					)

Приводим в порядок информацию об издательстве

In [22]:
def check_izdat(izdat):
	if '[б. и.]' in izdat:
		return 'отсутствует'
	elif 'Б. и.' in izdat:
		return 'отсутствует'
	return izdat.replace('[', '').replace(']', '')

items['izd'] = items['izd'].apply(check_izdat)

Приводим в порядок информацию об авторах (увеличивает число дублей среди авторов на 1132 (было 181734, стало 182866)

In [23]:
def get_only_author(author):
	pattern = r'\D+'
	return ' '.join(re.findall(pattern, author)).replace(',', '').replace('-', '')

items['author'] = items['author'].apply(get_only_author)

Создадим таблицу с данными, какие клиенты какие книги уже брали, чтобы их не рекомендовать

In [24]:
books_readed_by_clients = train_transactions.groupby('chb')['sys_numb'].apply(set)

In [25]:
for data in all_data:
	print(f'{"="*30}{data.name}{"="*30}')
	display(data.sample(7))



Unnamed: 0,chb,age,gender,chit_type
13277,200000874782,23,female,нет данных
326,200000931229,43,female,нет данных
462,200001036530,27,female,нет данных
4190,200001074502,27,female,нет данных
3761,100000821399,60,female,normal
7366,100001136654,45,female,echb
7435,200001167948,23,male,нет данных




Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk,is_textboot
77658,RSL01007849285,Модель синтеза текста формулы изобретения : ди...,Шереметьева Светлана Олеговна,отсутствует,1985,отсутствует,0
235753,RSL01009858590,Король-Солнце Людовик XIV и его прекрасные дамы,Сотникова Наталия Николаевна,Алгоритм,2018,"Т3(4Фр)432-8Людовик XIV 21,02",0
69736,RSL01005062073,Теоретические основы выявления суггестивного п...,Карлик Надежда Анатольевна,отсутствует,2013,"Ш5(2=Р)5-357,0",0
205995,RSL01004257481,Договорная работа на предприятии : практически...,Беляева Ольга Александровна,ИНФРА-М,2009,"Х623.2,07",0
122138,RSL01009501529,Разработка математических моделей и программно...,Корчагин Сергей Алексеевич,отсутствует,2017,отсутствует,0
107503,RSL01002571934,История зарубежной литературы второй половины ...,Пучкова Галина Арсентьевна,АГПИ,2004,Ш5(0)53-32я73-1,1
159759,RSL07000477625,отсутствует,отсутствует,отсутствует,отсутствует,отсутствует,0




Unnamed: 0,chb,sys_numb,date_1,is_real,type,source,is_printed
29723,100001031585,RSL01009949468,2022-02-14,yes,,,True
149349,300000758346,RSL01003714001,2021-04-09,yes,скачивание,dlib.rsl.ru,False
94522,200000980479,RSL01002795732,2021-09-18,yes,книговыдача,2DL.Viewer,False
169651,300000881229,RSL01004577873,2021-09-04,yes,,,True
86985,200000944971,RSL01004430414,2021-03-27,yes,книговыдача,2DL.Viewer,False
95591,200000986001,RSL01010836657,2021-12-15,yes,книговыдача,2DL.Viewer,False
348,100000704313,RSL01003821728,2021-01-12,yes,скачивание,dlib.rsl.ru,False


In [26]:
print(f"Кол-во пользователей: {len(train_transactions['chb'].unique())}")
print(f"Кол-во документов в истории пользователей: {len(train_transactions['sys_numb'].unique())}")
print(f"Общее кол-во документов: {len(items['sys_numb'].unique())}")

Кол-во пользователей: 16753
Кол-во документов в истории пользователей: 194666
Общее кол-во документов: 354355


## Подготовим признаки в таблицах


In [27]:
users_data_encoder = OrdinalEncoder()
users_encode_columns = ['gender', 'chit_type']
users[users_encode_columns] = users_data_encoder.fit_transform(users[users_encode_columns])
users['age'] = users['age'].astype('int')
users.head()

Unnamed: 0,chb,age,gender,chit_type
0,300001020830,21,0.0,2.0
1,300001113642,36,0.0,2.0
2,300001148466,46,0.0,2.0
3,300001117011,22,0.0,2.0
4,200001038094,24,0.0,0.0


In [28]:
# scaler = StandardScaler()
# kmean = KMeans()
# visualiser = KElbowVisualizer(kmean, k=(2, 30))
# visualiser.fit(scaler.fit_transform(users[['age', 'gender', 'chit_type']]))
# visualiser.show()

In [29]:
kmean_pipe = make_pipeline(
	StandardScaler(),
	KMeans(n_clusters=6)
)
users['user_kmean_class'] = kmean_pipe.fit_predict(users[['age', 'gender', 'chit_type']])
users.sample(7)

Unnamed: 0,chb,age,gender,chit_type,user_kmean_class
5208,300000942455,21,0.0,1.0,0
3373,300000870574,32,1.0,2.0,3
13747,200001084639,32,0.0,0.0,2
3651,100000836139,38,1.0,1.0,3
4146,100001052307,65,0.0,2.0,4
7010,300001124819,54,1.0,2.0,1
10349,300001107233,16,1.0,0.0,2


In [30]:
items_data_encoder = OrdinalEncoder()
items_encode_columns = ['author', 'izd', 'bbk']
items[items_encode_columns] = items_data_encoder.fit_transform(items[items_encode_columns])
items.sample(7)

Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk,is_textboot
300379,RSL01010890679,Предатели в русской истории : 1000 лет коварст...,36047.0,31585.0,2021,39764.0,0
210074,RSL01008855725,"Инвестиции, строительство, недвижимость как ма...",171478.0,33897.0,2017,62179.0,0
151665,RSL01008048781,Правовая природа интеллектуальных прав по зако...,56222.0,33897.0,2015,130777.0,0
127070,RSL07000426814,отсутствует,171478.0,33897.0,отсутствует,130777.0,0
134815,RSL07000473415,отсутствует,171478.0,33897.0,отсутствует,130777.0,0
225171,RSL01010572217,"""Элефант"" : тяжелое штурмовое орудие Фердинанд...",70845.0,32749.0,2021,77175.0,0
291595,RSL01009554720,"Национальные модели хозяйствования : интересы,...",6965.0,26388.0,2017,57322.0,0


In [31]:
# visualiser = KElbowVisualizer(kmean, k=(2, 15))
# visualiser.fit(scaler.fit_transform(items[items_encode_columns + ['is_textboot']]))
# visualiser.show()

In [32]:
kmean_pipe = make_pipeline(
	StandardScaler(),
	KMeans(n_clusters=5)
)

items['item_kmean_class'] = kmean_pipe.fit_predict(items[['author', 'izd', 'bbk', 'is_textboot']])
items.sample(7)

Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk,is_textboot,item_kmean_class
56288,RSL01004603312,Тезаурусное представление терминов нефтегазово...,149959.0,33897.0,2010,94027.0,0,0
170500,RSL01001599687,Бои за историю : Сб. ст.,150570.0,19844.0,1991,130777.0,0,0
305038,RSL01006540357,Конкурентная разведка : сущность и способы осу...,123376.0,29959.0,2012,57806.0,1,3
40187,RSL01003451172,Истоки и становление массовой культуры в Росси...,82973.0,33897.0,2008,35332.0,0,1
14192,RSL01003384910,Социальные аспекты эпидемии и профилактики ВИЧ...,20997.0,11672.0,2007,33664.0,0,4
214223,RSL01004584372,Степан Петрович Яремич / сост. В. П. Третьяков...,170900.0,33897.0,2009,107499.0,0,0
15135,RSL02000021233,отсутствует,171478.0,33897.0,отсутствует,130777.0,0,0


## Создание эмбеддингов названий книг


In [33]:
# print(f'Максимальная длинна токена: {items["title"].apply(lambda row: row.split()).apply(len).sort_values(ascending=False).values[0]}')

Сгенерим эмбеддинги описаний


In [34]:
# batch_size = 1
#
# vector = items['title'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=151))
# # применим padding к векторам
# n = len(max(vector, key=len))
# # англ. вектор с отступами
# padded = np.array([i + [0]*(n - len(i)) for i in vector.values])
#
# # создадим маску для важных токенов
# attention_mask = np.where(padded != 0, 1, 0)
#
# embeddings = []
# for i in tqdm(range(padded.shape[0] // batch_size)):
# 	# преобразуем данные
# 	batch = torch.LongTensor(padded[batch_size*i : batch_size*(i+1)]).to('cuda:0')
# 	# преобразуем маску
# 	attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i : batch_size*(i+1)]).to('cuda:0')
# 	with torch.no_grad():
# 		batch_embeddings = model(batch, attention_mask=attention_mask_batch)
#
# 	# преобразуем элементы методом numpy() к типу numpy.array
# 	embeddings.append(batch_embeddings[0][:,0,:].cpu().numpy())
#
# features = pd.DataFrame(np.concatenate(embeddings))

In [35]:
# features.to_csv('items_title_embeddings.csv', index=False)

In [36]:
items = items.drop('title', axis=1)
items = pd.concat(
	[
		items,
		items_title_embeddings
	],
	axis=1
)
items.sample(7)

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
235609,RSL01002959118,22665.0,22534.0,2006,94741.0,1,3,-0.477946,0.879065,-0.820496,...,0.300607,0.247886,-1.590933,-0.422365,0.825829,0.089491,-0.063888,-1.640378,0.729632,-0.880682
275401,RSL01002136967,171478.0,10854.0,1986,130777.0,1,3,-0.01057,0.054546,-0.274104,...,-0.102245,0.248315,-1.028383,-0.208678,1.199478,0.364111,-0.592717,-0.728675,0.403801,-0.213276
353486,RSL01004348327,119286.0,1690.0,2009,91903.0,0,4,-0.546297,0.149983,-0.422349,...,-0.017053,0.344494,-0.599477,-0.302137,-0.334257,-0.086596,-0.524603,-0.430878,0.546444,-1.389565
335406,RSL01002819388,171478.0,33897.0,2005,74070.0,1,3,-0.009973,0.863742,-0.5428,...,-0.05499,0.557439,-1.366467,-0.077067,0.463039,-0.306563,0.248377,-0.795569,0.195602,-0.030963
86721,RSL01008583146,82972.0,33897.0,2017,130777.0,0,2,-0.229861,-0.471015,-0.658167,...,0.722205,0.519452,-1.288758,-0.66225,1.151839,-0.12839,0.086169,-0.255075,0.931593,-0.725854
283282,RSL01004869282,171478.0,8704.0,2011,69096.0,1,3,-0.409682,-0.511083,0.379434,...,0.383868,1.217856,-1.696886,-1.169554,1.496962,0.325771,-0.296277,0.004606,0.653681,-0.821276
8110,RSL07000362949,171478.0,33897.0,отсутствует,130777.0,0,0,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302


In [37]:
# train_transactions = train_transactions.drop('date_1', axis=1)

transactions_data_encoder = OrdinalEncoder()
transactions_encode_columns = ['is_real', 'type', 'source', 'is_printed']
train_transactions[transactions_encode_columns] = transactions_data_encoder.fit_transform(train_transactions[transactions_encode_columns])
train_transactions.sample(7)

Unnamed: 0,chb,sys_numb,date_1,is_real,type,source,is_printed
50980,200000789223,RSL01004952412,2021-05-12,1.0,0.0,1.0,1.0
71089,200000872413,RSL01008795621,2021-04-08,1.0,1.0,0.0,0.0
186618,300000936104,RSL02000023023,2021-02-04,1.0,1.0,0.0,0.0
53234,200000792956,RSL01000736387,2021-01-19,1.0,0.0,1.0,1.0
63797,200000864451,RSL01003605288,2021-04-10,1.0,2.0,2.0,0.0
47822,200000782373,RSL01003478688,2021-05-06,1.0,2.0,2.0,0.0
100827,200001007313,RSL02000013914,2021-02-13,1.0,1.0,0.0,0.0


In [38]:
most_viewed = train_transactions.groupby(['chb', 'sys_numb']).count().rename({'date_1': 'count'}, axis=1)['count'].reset_index().sort_values(by='count', ascending=False)

most_viewed = most_viewed[most_viewed['count'] >= 4]
most_viewed

Unnamed: 0,chb,sys_numb,count
213726,300001090427,RSL07000449201,44
86257,200000987642,RSL01004357029,42
80884,200000957291,RSL01006724299,35
144763,300000850004,RSL01010175347,35
144698,300000850004,RSL01002745675,26
...,...,...,...
14562,100000929463,RSL01001159547,4
55142,200000846112,RSL01003311865,4
177886,300000996405,RSL01010535294,4
66215,200000877628,RSL01009796677,4


In [39]:
predp = train_transactions.groupby(['chb'])['sys_numb'].apply(list)

items[items['sys_numb'].isin(predp.loc['100000711986'])].groupby(['izd', 'author'])['sys_numb'].apply(list).reset_index()

Unnamed: 0,izd,author,sys_numb
0,31521.0,135204.0,[RSL01003738918]
1,33897.0,68139.0,[RSL01000301324]
2,33897.0,104409.0,[RSL01003829301]
3,33897.0,123490.0,[RSL01004011405]
4,33897.0,124733.0,"[RSL01002895371, RSL01002903839]"
5,33897.0,137902.0,[RSL01006644591]
6,36026.0,18140.0,[RSL01003556103]
7,36093.0,135204.0,[RSL01003716900]


In [40]:
predp

chb
100000641403     [RSL01004206702, RSL01000769304, RSL01004211574]
100000644359    [RSL01009800093, RSL01003557352, RSL0101058731...
100000665127    [RSL01003947258, RSL01003276143, RSL0101011743...
100000676191     [RSL60000203658, RSL01005076342, RSL60000305661]
100000679200    [RSL01003462568, RSL01010248423, RSL0100328901...
                                      ...                        
300001170610    [RSL01009427881, RSL01009427881, RSL0100390718...
300001172682                     [RSL01005084974, RSL01003305563]
300001172795    [RSL01003681873, RSL01003629006, RSL0100356731...
300001173062    [RSL01002899944, RSL01002899944, RSL0100281651...
400001035059     [RSL01002298169, RSL01002632325, RSL01003421323]
Name: sys_numb, Length: 16753, dtype: object

Создадим индекс для айтемов

In [41]:
items['year_izd'] = items['year_izd'].replace({'отсутствует': 0, '': 0})
items.head()

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
0,RSL01008600016,93911.0,32580.0,2016,72799.0,0,1,-0.829435,0.707242,-0.909075,...,-0.27968,0.087414,-0.621644,-0.145256,0.9148,-0.219325,-0.093396,-0.731793,1.235529,-0.886894
1,RSL01004304880,48327.0,33897.0,2006,130777.0,0,2,-0.173611,-0.045968,-0.636685,...,0.641574,0.170896,-0.585657,-0.311282,0.669547,0.145333,0.132103,-0.610703,0.408084,-0.817821
2,RSL07000461043,171478.0,33897.0,0,130777.0,0,0,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
3,RSL07000433335,171478.0,33897.0,0,130777.0,0,0,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
4,RSL01002419013,167650.0,22996.0,2004,122581.0,0,0,-0.161775,-0.028736,-0.701615,...,0.359039,0.212547,-1.460411,-0.149345,0.402753,-0.024701,-0.497746,-0.08693,0.472253,-0.625737


In [42]:
items_to_index = items.drop('sys_numb', axis=1)

items_index = faiss.IndexFlatL2(items_to_index.shape[1])
print(items_index.ntotal)  # пока индекс пустой

items_index.add(np.ascontiguousarray(items_to_index.to_numpy().astype('float32')))
print(items_index.ntotal)  # теперь в нем n векторов

 # расстояния, отсортированные по убыванию

0
354355


In [43]:
# сохраняем индекс в файл
#faiss.write_index(index, 'faiss_items_index.index')

## Объединим датасеты

In [44]:
full_data = pd.merge(pd.merge(train_transactions, users, on='chb', how='left'), items, on='sys_numb', how='left')
full_data.loc[full_data[full_data['year_izd'] == ''].index, 'year_izd'] = '0'
# full_data['year_izd'] = full_data['year_izd'].apply(lambda year: 0 if 'отсутствует' in year else year).astype('int')
full_data.sample(7)

Unnamed: 0,chb,sys_numb,date_1,is_real,type,source,is_printed,age,gender,chit_type,...,758,759,760,761,762,763,764,765,766,767
52499,200000789223,RSL01000907806,2021-02-09,1.0,0.0,1.0,1.0,67,0.0,1.0,...,0.169481,0.180098,-1.08056,-0.351517,0.487906,0.119887,0.287182,-0.453381,0.058641,-0.653946
216916,300001042075,RSL01001715505,2021-10-26,1.0,0.0,1.0,1.0,48,0.0,0.0,...,0.357168,-0.035538,-0.606147,-0.350501,-0.286356,-0.025331,0.272961,-0.216696,0.0192,-0.476531
130972,200001090885,RSL01004723182,2021-10-15,1.0,1.0,0.0,0.0,24,1.0,2.0,...,0.249712,0.588491,-1.007645,-1.019265,0.89159,0.314563,-0.208028,-0.6817,0.589731,-0.542481
133232,200001099715,RSL01008734006,2021-10-27,1.0,1.0,0.0,0.0,24,0.0,2.0,...,0.170834,0.437622,-1.114273,-0.661416,0.950592,0.336382,0.277352,0.191245,0.684618,-0.579157
172117,300000893684,RSL07000392045,2021-05-28,1.0,1.0,0.0,0.0,26,0.0,1.0,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
238381,300001080051,RSL01009550721,2021-04-23,1.0,0.0,1.0,1.0,24,0.0,2.0,...,-0.178667,0.98573,-1.09343,-0.336779,0.925864,-0.089196,0.231511,-0.244338,0.806588,-0.497898
209448,300001024606,RSL01004743862,2021-12-06,1.0,0.0,1.0,1.0,31,1.0,0.0,...,-0.307671,0.224934,-0.625844,-0.124343,0.955143,-0.049972,-0.170146,-0.339338,0.710404,-0.197472


In [45]:
full_data = full_data.drop_duplicates().reset_index(drop=True)

# Построим индексы


In [46]:
# full_data.to_csv('../data/full_data.csv', index=False)

In [47]:
# new_full_data = full_data.set_index(['chb', 'sys_numb'], drop=True).head()
users_id = full_data['chb']
items_id = full_data['sys_numb']
full_data = full_data.drop(['chb', 'sys_numb'], axis=1)

## Создадим матрицу взаимодействий пользователя


In [48]:
# user_iteractions = pd.DataFrame(
# 	np.empty(shape=(len(set(users['chb'].values)), len(set(items['sys_numb'].values))), dtype=np.int8),
# 	index=list(set(users['chb'].values)),
# 	columns=list(set(items['sys_numb'].values)),
# )
#
# for user in tqdm(list(set(users['chb'].values))):
# 	user_readed_books = list(set(train_transactions[train_transactions['chb'] == user]['sys_numb'].values))
# 	for read_book_user in user_readed_books:
# 		user_iteractions.loc[user, read_book_user] = 1
#
# from scipy.sparse import csr_matrix
# user_iteractions_csr = csr_matrix(user_iteractions.values)

## Попробуем решить задачу с помощью FAISS

In [49]:
full_data = full_data.drop('date_1', axis=1)

In [50]:
index = faiss.IndexFlatL2(full_data.shape[1])
print(index.ntotal)  # пока индекс пустой

index.add(np.ascontiguousarray(full_data.to_numpy().astype('float32')))
print(index.ntotal)  # теперь в нем n векторов

topn = 10

product_index_in_data = 4732
dist, same_embedding_indexes = index.search(
	np.ascontiguousarray(full_data.loc[product_index_in_data].to_numpy().astype('float32').reshape((1, -1))),
	topn)

print(same_embedding_indexes[0])  # индексы самых похожих векторов
print(dist)  # расстояния, отсортированные по убыванию

0
259566
[  4731   4730   4732 184708 200262 252876 176468 237421 164566 223609]
[[   0.        0.        0.     2872.8076 3131.7078 3741.814  3825.946
  4155.816  4277.8267 4334.979 ]]


In [51]:
import catboost

In [74]:
train_transactions['date_1'] = pd.to_datetime(train_transactions['date_1'])
# train_transactions.head()
train_transactions['month'] = train_transactions['date_1'].dt.to_period("M")

n_recs = train_transactions.groupby(['chb', 'month']).count().reset_index().groupby('chb').mean().astype('int').rename({'type': 'count'}, axis=1)['count']
n_recs[n_recs >= 20] = 20

n_recs.head(30)

chb
100000641403     1
100000644359    12
100000665127     4
100000676191     1
100000679200     1
100000681262    20
100000683677     2
100000689674     8
100000692449     7
100000693475     2
100000695045     5
100000697273     3
100000698405     6
100000702964     3
100000704313    20
100000705925    20
100000706981     9
100000711986     5
100000715413     2
100000717512     4
100000717612     5
100000718304    14
100000720098    20
100000722110     1
100000722645     1
100000728030    11
100000730783     4
100000733127     2
100000733883     1
100000737450     2
Name: count, dtype: int32

In [76]:
n_recs.loc['100000644359']

12

In [None]:
full_data = full_data.drop('date_1', axis=1)

In [82]:
res = []
n_rec = 80
# count_rec_res = 10  # сколько будет рекомендаций для пользователя
for chb in tqdm(set(users.chb.values)):
	count_rec_res = n_recs.loc[chb] # количество рекомендаций для пользователя
	count_rec = 0
	readed = books_readed_by_clients[str(chb)]

	# добавляем рекомендации из тех книг, которые пользователь часто берет
	mv_rec = most_viewed[most_viewed['chb'] == chb]['sys_numb']
	for numb in mv_rec:
		res.append([chb, numb])
		count_rec += 1
	if count_rec == count_rec_res:
		continue

	# собираем результаты для пользователя
	full_user_dist = []
	full_user_indexes = []

	for chb_index in users_id[users_id == str(chb)].index:
		# получаем рекомендацию
		dist, rec_indexes = index.search(
			np.ascontiguousarray(
				full_data.loc[chb_index].to_numpy().astype('float32')
				.reshape((1, -1))
			),
			n_rec
		)
		# сохраняем ее
		full_user_dist.append(dist[0])
		full_user_indexes.append(rec_indexes[0])

	# список списков преобразуем в одномерный список
	full_user_dist = np.array(full_user_dist).flatten()
	full_user_indexes = np.array(full_user_indexes).flatten()
	# преобразуем в Series и сортируем по distances
	rec_index = pd.Series(
		data=full_user_dist,
		index=full_user_indexes
	).sort_values()
	# отбираем дистанции больше 0 т.к. 0 соответствует самому себе
	rec_index = rec_index[rec_index > 0].index
	temp_items = []
	for same_index in rec_index:
		item = items_id.loc[same_index]
		if item not in temp_items:
			temp_items.append(item)
			if item not in list(readed):
				count_rec += 1
				res.append([chb, item])
			if count_rec == count_rec_res:
				break
	if count_rec < count_rec_res:

		# собираем результаты для пользователя
		full_user_dist = []
		full_user_indexes = []

		for item in items[items['sys_numb'].isin(train_transactions[train_transactions['chb'] == chb]['sys_numb'])]['sys_numb']:
			item_ind = items[items['sys_numb'] == item].index
			# получаем рекомендацию
			dist, rec_indexes = items_index.search(
				np.ascontiguousarray(
					items_to_index.loc[item_ind].to_numpy().astype('float32')
					.reshape((1, -1))
				),
				n_rec
			)
			# сохраняем ее
			full_user_dist.append(dist[0])
			full_user_indexes.append(rec_indexes[0])

		# список списков преобразуем в одномерный список
		full_user_dist = np.array(full_user_dist).flatten()
		full_user_indexes = np.array(full_user_indexes).flatten()

		# преобразуем в Series и сортируем по distances
		rec_index = pd.Series(
			data=full_user_dist,
			index=full_user_indexes
		).sort_values()
		# отбираем дистанции больше 0 т.к. 0 соответствует самому себе
		rec_index = rec_index[rec_index > 0].index

		for same_index in rec_index:
			item = items_id.loc[same_index]
			res.append([chb, item])
			count_rec += 1
			if count_rec == count_rec_res:
				break
	if count_rec < count_rec_res:
		print(f'{chb}: count rec {count_rec}')

  0%|          | 0/16753 [00:00<?, ?it/s]

300001061444: count rec 0
300001075805: count rec 1
300001083398: count rec 0
300001091095: count rec 0
200001078466: count rec 0
200001118680: count rec 0
200001064465: count rec 0
300000866641: count rec 0
300001124819: count rec 0
300001109363: count rec 0
300001154804: count rec 0
300001120762: count rec 0
100001060592: count rec 0
200001081782: count rec 0
300001072961: count rec 0
200001108542: count rec 0
200001009261: count rec 0
300001087468: count rec 0
200001066542: count rec 0
300001073457: count rec 0
300001058975: count rec 0
200001098458: count rec 0
200000780205: count rec 0
200001154460: count rec 0
200000962714: count rec 2
200001113141: count rec 0
200000979504: count rec 2
300000871957: count rec 0
300000912142: count rec 0
200001059329: count rec 0
300001122580: count rec 0
200001131626: count rec 0
200001081472: count rec 0
300001128185: count rec 0
200001167948: count rec 0
300000890992: count rec 0
200000916296: count rec 0
300001108543: count rec 0
200001036202

In [83]:
pd.DataFrame(res, columns=['chb', 'sys_numb']).to_csv('12th_iter.csv', sep=';', index=False)

In [115]:
puc = pd.DataFrame(res, columns=['chb', 'sys_numb'])

In [91]:
n_recs.loc['200000789223']

20

In [120]:
droppped = ['200000789223', '300000913764', '300000984428', '100001049235', '300000925634', '200000991383', '300000984072', '300000922909']

for drop_chb in droppped:
	n_rec_new = n_recs.loc[drop_chb]
	recs = puc[puc['chb'] == drop_chb].head(n_rec_new)['sys_numb'].to_list()
	index_to_drop = puc[puc['chb'] == drop_chb].index
	puc = puc.drop(index_to_drop)
	for rec in recs:
		puc = pd.concat(
			[puc, pd.DataFrame([rec], columns=['sys_numb'], index=[drop_chb])]
		)

200000789223 | 20
300000913764 | 20
300000984428 | 12
100001049235 | 14
300000925634 | 5
200000991383 | 3
300000984072 | 2
300000922909 | 1


In [117]:
puc.groupby('chb').count().sort_values(by='sys_numb', ascending=False)

Unnamed: 0_level_0,sys_numb
chb,Unnamed: 1_level_1
100001010750,20
300001135717,20
100001110210,20
300001005802,20
300000901358,20
...,...
200000924034,1
300001066647,1
300001066668,1
200000923601,1


In [119]:
puc.head()

Unnamed: 0,chb,sys_numb
0,100001137125,RSL01000591218
1,100001137125,RSL01002155431
2,100001137125,RSL01001500214
3,100001137125,RSL01004722252
4,200000795041,RSL01000254680


In [None]:
puc.to_csv('12th_iter.csv', sep=';', index=False)

In [152]:
new_res = items[items['sys_numb'].isin(train_transactions[train_transactions['chb'] == '200000928664']['sys_numb'])]
if new_res.shape[0] >= 4:
	gachi = new_res.groupby(['author', 'izd'])['sys_numb'].apply(list)
	for item in gachi:
		pass

for p in items[items['sys_numb'].isin(train_transactions[train_transactions['chb'] == '200000928664']['sys_numb'])]['sys_numb']:
	print(p)

RSL07000467545
RSL07000352880
RSL07000429548
RSL07000441643
RSL07000461678
RSL07000452801
RSL07000467995
RSL07000453468


In [41]:
pd.DataFrame(res, columns=['chb', 'sys_numb']).to_csv('11th_iter.csv', sep=';', index=False)


In [44]:
pof = pd.read_csv('10th_iter.csv', sep=';')

In [55]:
f1 = pof.groupby('chb')['sys_numb'].apply(list).apply(lambda x: x[:10])

In [91]:
n = pd.DataFrame(columns=['chb', 'sys_numb'])

for row in tqdm(f1.iteritems()):
	chb = row[0]
	sys_numbs = row[1]
	for numb in sys_numbs:
		n = pd.concat([n, pd.DataFrame({'chb': chb, 'sys_numb': numb}, index=[0])])


n.to_csv('11th_iter.csv', sep=';', index=False)

0it [00:00, ?it/s]

In [187]:
index_to_del = pof_new[pof_new['chb'] == '300000913764'].index[20:]
pof_new = pof_new.drop(index_to_del)

In [190]:
pof_new.to_csv('10th_iter.csv', sep=';', index=False)

In [175]:
items[items['sys_numb'].isin(train_transactions[train_transactions['chb'] == '200000789223']['sys_numb'])]

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
364,RSL01003371032,171478.0,20814.0,2007,122020.0,0,4,-0.364120,0.528713,0.835498,...,-0.307313,0.452469,-1.339303,-0.947004,0.477154,0.210417,0.377726,0.024643,1.081792,-0.112187
1307,RSL01006550496,77633.0,5191.0,2013,120292.0,0,3,-0.984972,-0.141799,-0.275390,...,0.432058,-0.008617,-0.986411,-0.542843,0.917930,-0.509416,-0.105579,-0.487578,0.917741,-0.901505
1421,RSL01008895400,130695.0,31932.0,2017,16925.0,0,0,-1.335677,-0.071682,-0.070710,...,0.006448,-0.000995,-0.251056,0.054474,0.161946,-0.036665,0.044311,0.091683,0.987661,-0.795135
5033,RSL01004123795,49841.0,2475.0,2008,126110.0,0,3,-0.775513,-0.229855,0.748087,...,0.810728,0.440202,-0.207567,-1.087967,-0.143804,0.804192,0.254146,-0.152970,-0.193239,-1.951387
7315,RSL01004405273,101811.0,2917.0,2009,95081.0,0,3,-0.791708,-0.050490,0.408101,...,0.384736,-0.293418,-0.947994,-0.681866,-0.298956,0.173642,-0.733505,-0.944530,0.816367,-1.428345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288797,RSL01001694290,13758.0,12759.0,1994,130777.0,0,1,-0.532385,-0.079973,0.239421,...,0.001189,-0.052073,-0.685655,-0.588953,0.377750,-0.016333,-0.163730,-0.050604,0.331661,-0.088319
288798,RSL01010778128,28083.0,30616.0,2021,34904.0,0,0,-0.987721,0.530896,-0.244406,...,0.122756,-0.220837,-0.474897,-0.438574,-0.178187,-0.165708,-0.075412,-1.137978,0.790419,-1.229690
288799,RSL01001801669,155433.0,20834.0,1998,130777.0,0,4,-0.466421,0.169697,0.383134,...,0.064017,-0.124158,-0.550101,-0.351466,0.327920,0.071172,0.040846,-0.114628,0.262458,-0.048342
288801,RSL01010548130,15877.0,25494.0,2021,74251.0,0,1,-1.547659,0.555516,0.133630,...,-0.403069,0.058203,-0.470700,-1.011322,1.347675,-0.348826,-0.279674,-1.151408,0.916590,-1.094023


In [200]:
pof_new[pof_new['chb'] == '300001075805']

Unnamed: 0,chb,sys_numb
313558,300001075805,RSL07000372451


In [189]:
pof_new.groupby('chb')['sys_numb'].apply(list).apply(len).sort_values()

chb
300001075805     1
200000979504     2
200000962714     2
300001142507     3
300001153491     5
                ..
200001035732    20
200001035842    20
200001035846    20
200001035934    20
400001035059    20
Name: sys_numb, Length: 16710, dtype: int64

In [196]:
# смотрим что уже порекомендованно
items[items['sys_numb'].isin(pof_new[pof_new['chb'] == '300001075805']['sys_numb'])]

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
4910,RSL07000372451,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302


In [191]:
# смотрим какие книги юзер брал
items[items['sys_numb'].isin(train_transactions[train_transactions['chb'] == '300001075805']['sys_numb'])]

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
2350,RSL07000415773,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
4910,RSL07000372451,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
79500,RSL07000372455,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
83065,RSL02000006654,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302


In [193]:
# смотрим такие же по автору и издательству
items[(items['author'] == 171478.0) & (items['izd'] == 33897.0)]

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
2,RSL07000461043,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.205070,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.043390,0.285153,-0.181302
3,RSL07000433335,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.205070,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.043390,0.285153,-0.181302
6,RSL07000467555,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.205070,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.043390,0.285153,-0.181302
9,RSL07000458490,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.205070,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.043390,0.285153,-0.181302
29,RSL07000355720,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.205070,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.043390,0.285153,-0.181302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354284,RSL01004011453,171478.0,33897.0,1912,130777.0,0,4,0.085555,0.324205,-0.370308,...,-0.690902,0.046693,-0.853451,0.250400,0.115962,-0.144393,-0.387870,-0.065081,0.408006,0.014711
354298,RSL01011034405,171478.0,33897.0,1883,130777.0,0,4,-0.371838,-0.013498,-0.942991,...,-0.041956,0.201893,-1.036587,0.045496,0.569462,0.616259,-0.857223,0.161010,0.274876,-1.087000
354309,RSL01000567490,171478.0,33897.0,1996,84034.0,0,4,-0.632071,0.518056,0.134784,...,0.053200,0.659948,-1.275459,-0.537373,-0.003366,-0.137675,0.111982,-1.333713,0.823611,-0.569724
354314,RSL01003970966,171478.0,33897.0,1902,130777.0,1,2,0.036629,-0.223110,-0.520931,...,0.133995,0.327780,-0.360977,-1.300547,0.627138,-0.201863,-0.015881,-0.713385,-0.038805,-1.314322


In [199]:
item_ind = 2350
# получаем рекомендацию
dist, rec_indexes = items_index.search(
	np.ascontiguousarray(
		items_to_index.loc[item_ind].to_numpy().astype('float32')
		.reshape((1, -1))
	),
	n_rec
)

items.loc[rec_indexes[0]]

Unnamed: 0,sys_numb,author,izd,year_izd,bbk,is_textboot,item_kmean_class,0,1,2,...,758,759,760,761,762,763,764,765,766,767
46,RSL07000411551,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
130,RSL02000010551,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
469,RSL07000435269,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
29,RSL07000355720,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
64,RSL07000353616,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,RSL07000380592,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
482,RSL07000350413,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
437,RSL07000484405,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302
125,RSL07000451038,171478.0,33897.0,0,130777.0,0,4,0.003803,0.204588,-0.20507,...,-0.079526,0.209965,-0.544201,-0.241722,-0.020239,-0.108107,-0.332226,0.04339,0.285153,-0.181302


In [None]:
# сохраняем индекс в файл
faiss.write_index(index, 'faiss_product_usage_index_without_title.index')

In [None]:
def get_recommendation():
	res = []
	n_rec = 100
	for chb in tqdm(set(users.chb.values)):
		count_rec = 0
		# получаем множество уже прочитанных книг
		readed = books_readed_by_clients[str(chb)]
		user_index = full_data[users_id['chb'] == chb]
		# формируем рекомендации
		_, same_embedding_indexes = index.search(
			np.ascontiguousarray(full_data.to_numpy().astype('float32')[user_index].reshape((1, -1))),
			n_rec
		)
		# для каждой рекомендации
		for same_index in same_embedding_indexes[0]:
			# проверяем нет ли ее в уже прочитанных
			# если нет, то добавляем в результат
			# если есть, то переходим к следующей рекомендации
			# print(same_index)
			item = user_full_data.loc[same_index, 'sys_numb']
			if item not in list(readed):
				count_rec += 1
				res.append([chb, item])
			if count_rec == 20:
				break
	return res

In [None]:
pd.DataFrame(get_recommendation(), columns=['chb', 'sys_numb']).to_csv('4rd_iter.csv', sep=';', index=False)