# Обучим и протестируем модель

In [14]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
import pickle

In [15]:
ratings = pd.read_csv('data/ratings.csv')
books = pd.read_csv('data/books.csv')
tags = pd.read_csv('data/tags.csv')
book_tags = pd.read_csv('data/book_tags.csv')

In [16]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [17]:
mapper = dict(zip(books.goodreads_book_id,books.book_id))

In [18]:
tags = pd.read_csv('data/tags_cleaned.csv')
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)]
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

In [19]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [20]:
ratings_coo = sparse.coo_matrix((ratings.rating,(ratings.user_id,ratings.book_id)))
feature_ratings  = sparse.coo_matrix(([1]*len(book_tags), (book_tags.id,book_tags.tag_id)))

Объявим вспомогательные константы для обучения модели:

In [None]:
#число потоков нашего процессора. Ставим 1, так как lightfm на macos ставится без OpenMP
NUM_THREADS = 1

#число параметров вектора 
NUM_COMPONENTS = 60

#число эпох обучения
NUM_EPOCHS = 10 

На этапе создания модели мы используем библиотеку LightFM, чтобы сделать матричное разложение (ALS) наших рейтингов книг и получить два набора векторов. 

In [None]:
#Создаём модель
model = LightFM(learning_rate=0.05, loss='warp', no_components=NUM_COMPONENTS)
 
#Разбиваем наш датасет на обучающую и тестовую выборки
train, test = random_train_test_split(ratings_coo, test_percentage=0.2, random_state=None)

#Обучаем модель
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS,item_features =feature_ratings)

In [None]:
import datetime
print(datetime.datetime.now())

2021-05-01 19:05:26.581316


Протестируем модель

In [None]:
#Тестируем нашу модель
precision_score = precision_at_k(
                     model,
                     test,
                     num_threads=NUM_THREADS,
                     k=10,
                     item_features=feature_ratings).mean()
 
recall_score = recall_at_k(model,
                     test,
                     num_threads=NUM_THREADS,
                     k=10,
                     item_features=feature_ratings).mean()

print(recall_score, precision_score)

0.04067587521181742 0.0878126


Сохраним модель

In [None]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)

# Добавим эмбеддинги к модели и посмотрим, что получилось

In [29]:
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [30]:
# Достаём эбмеддинги
item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

In [31]:
import nmslib

In [32]:
#Создаём наш граф для поиска
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
#Начинаем добавлять наши книги в граф
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

2021-07-07 08:34:15.435 M                   = 16
2021-07-07 08:34:15.436 indexThreadQty      = 8
2021-07-07 08:34:15.438 efConstruction      = 200
2021-07-07 08:34:15.438 maxM			          = 16
2021-07-07 08:34:15.438 maxM0			          = 32
2021-07-07 08:34:15.438 mult                = 0.360674
2021-07-07 08:34:15.445 skip_optimized_index= 0
2021-07-07 08:34:15.446 delaunay_type       = 2
2021-07-07 08:34:15.448 Set HNSW query-time parameters:
2021-07-07 08:34:15.448 ef(Search)         =20
2021-07-07 08:34:15.448 algoType           =2
2021-07-07 08:34:15.999 
The vector space is CosineSimilarity
2021-07-07 08:34:16.000 Vector length=60
2021-07-07 08:34:16.002 searchMethod			  = 3
2021-07-07 08:34:16.003 Making optimized index
2021-07-07 08:34:16.010 Finished making optimized index
2021-07-07 08:34:16.010 Maximum level = 3
2021-07-07 08:34:16.010 Total memory allocated for optimized index+data: 3 Mb


In [33]:
#Вспомогательная функция для поиска по графу
def nearest_books_nms(book_id, index, n=10):
    nn = index.knnQuery(item_embeddings[book_id], k=n)
    return nn

Найдем id книги 1984

In [37]:
books[books.original_title.str.find('The Silence of the Lambs')>=0].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


Теперь найдем все похожие книги и посмотрим на них

In [40]:
nbm = nearest_books_nms(209, nms_idx)[0]

In [41]:
books[books.book_id.isin(nbm)][['authors', 'original_title']]

Unnamed: 0,authors,original_title
208,Thomas Harris,The Silence of the Lambs
273,"Mario Puzo, Robert Thompson, Peter Bart",The Godfather
430,Thomas Harris,Red Dragon
525,Jeff Lindsay,Darkly Dreaming Dexter
767,Dennis Lehane,Shutter Island
1175,Dennis Lehane,Mystic River
1484,James Ellroy,The Black Dahlia
1801,Thomas Harris,Hannibal
5312,Scott B. Smith,A Simple Plan
9792,Patricia Highsmith,Strangers on a Train


Сохраним эмбеддинги

In [None]:
with open('item_embeddings.pkl', 'wb') as file:
    pickle.dump(item_embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

# Прототип на Streamlit

In [21]:
import streamlit as st
import numpy as np
import pandas as pd
import lightfm as lf
import nmslib
import pickle
import scipy.sparse as sparse

In [22]:
def nearest_books_nms(book_id, index, n=10):
    """Функция для поиска ближайших соседей, возвращает построенный индекс"""
    nn = index.knnQuery(item_embeddings[book_id], k=n)
    return nn

In [23]:
def get_names(index):
    """
    input - idx of books
    Функция для возвращения имени книг
    return - list of names
    """
    names = []
    for idx in index:
        names.append('Book name:  {} '.format(
            name_mapper[idx]) + '  Book Author: {}'.format(author_mapper[idx]))
    return names

In [24]:
def read_files(folder_name='data'):
    """
    Функция для чтения файлов + преобразование к  нижнему регистру
    """
    ratings = pd.read_csv(folder_name+'/ratings.csv')
    books = pd.read_csv(folder_name+'/books.csv')
    books['title'] = books.title.str.lower()
    return ratings, books 

In [25]:
def make_mappers():
    """
    Функция для создания отображения id в title
    """
    name_mapper = dict(zip(books.book_id, books.title))
    author_mapper = dict(zip(books.book_id, books.authors))

    return name_mapper, author_mapper

In [26]:
def load_embeddings():
    """
    Функция для загрузки векторных представлений
    """
    with open('item_embeddings.pkl', 'rb') as f:
        item_embeddings = pickle.load(f)

    # Тут мы используем nmslib, чтобы создать наш быстрый knn
    nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
    nms_idx.addDataPointBatch(item_embeddings)
    nms_idx.createIndex(print_progress=True)
    return item_embeddings,nms_idx

In [27]:
#Загружаем данные
ratings, books  = read_files(folder_name='data') 
name_mapper, author_mapper = make_mappers()
item_embeddings, nms_idx = load_embeddings()

2021-07-07 08:34:00.151 M                   = 16
2021-07-07 08:34:00.151 indexThreadQty      = 8
2021-07-07 08:34:00.153 efConstruction      = 200
2021-07-07 08:34:00.153 maxM			          = 16
2021-07-07 08:34:00.155 maxM0			          = 32
2021-07-07 08:34:00.155 mult                = 0.360674
2021-07-07 08:34:00.155 skip_optimized_index= 0
2021-07-07 08:34:00.155 delaunay_type       = 2
2021-07-07 08:34:00.155 Set HNSW query-time parameters:
2021-07-07 08:34:00.161 ef(Search)         =20
2021-07-07 08:34:00.161 algoType           =2
2021-07-07 08:34:00.686 
The vector space is CosineSimilarity
2021-07-07 08:34:00.686 Vector length=60
2021-07-07 08:34:00.686 searchMethod			  = 3
2021-07-07 08:34:00.686 Making optimized index
2021-07-07 08:34:00.696 Finished making optimized index
2021-07-07 08:34:00.696 Maximum level = 3
2021-07-07 08:34:00.703 Total memory allocated for optimized index+data: 3 Mb


---

In [28]:
# Вводим строку для поиска книг
title = st.text_input('Book Name', '')
title = title.lower()

#Наш поиск по книгам
output = books[books.title.str.contains(title) > 0]

#Выбор книги из списка
option = st.selectbox('Which book?', output['title'].values)

#Выводим книгу
'You selected: ', option

#Ищем рекомендации
val_index = output[output['title'].values == option].id
index = nearest_books_nms(val_index, nms_idx, 5)

#Выводим рекомендации к ней
'Most simmilar books are: '
st.write('', get_names(index[0])[1:])

AttributeError: 'DataFrame' object has no attribute 'id'