Создано на основе https://github.com/zygmuntz/goodbooks-10k/

In [12]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
 
# Load each dataset only when it is not already present in the notebook
# namespace, so re-running this cell does not re-read the CSV archives.
if 'ratings' not in globals():
    ratings = pd.read_csv('data/ratings.csv.zip', low_memory=False)

if 'books' not in globals():
    books = pd.read_csv('data/books.csv.zip', low_memory=False)

if 'tags' not in globals():
    tags = pd.read_csv('data/tags_cleaned.csv.zip', low_memory=False)

if 'book_tags' not in globals():
    book_tags = pd.read_csv('data/book_tags.csv.zip')

In [13]:
# Lookup table: Goodreads book id -> book_id used in the books table.
mapper = dict(zip(books.goodreads_book_id, books.book_id))

# Keep only rows whose tag appears in the cleaned tag list. The explicit
# .copy() makes the result an independent frame, so adding a column below
# does not write into a slice of the original (SettingWithCopyWarning).
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()

# Vectorised dict lookup instead of a per-row Python lambda.
book_tags['id'] = book_tags.goodreads_book_id.map(mapper)

# A dict-based map yields NaN for unknown keys; fail loudly, as the original
# mapper[x] lookup did, rather than carrying NaN ids forward.
if book_tags['id'].isna().any():
    raise KeyError('book_tags contains goodreads_book_id values that are missing from books')

In [14]:
# Interaction matrix: rows are user_id, columns are book_id, values are ratings.
ratings_coo = sparse.coo_matrix(
    (ratings.rating, (ratings.user_id, ratings.book_id))
)

# Item-feature matrix: a 1 at (book id, tag_id) for every book/tag pair.
feature_ratings = sparse.coo_matrix(
    (np.ones(len(book_tags), dtype=int), (book_tags.id, book_tags.tag_id))
)
# print(ratings_coo)
# print(feature_ratings)

In [15]:
# Split the dataset into training and test sets.
# A fixed seed makes the split (and the metrics computed from it)
# reproducible across kernel restarts; random_state=None drew a new split
# on every run.
SPLIT_SEED = 42
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=np.random.RandomState(SPLIT_SEED),
)


In [16]:
# Number of threads passed to LightFM for training
NUM_THREADS = 8

# Number of components (dimensions) of each embedding vector
NUM_COMPONENTS = 30

# Number of training epochs
NUM_EPOCHS = 10

# Seed for the model's random state, so the fitted model does not start from
# a different random state on every run.
# NOTE(review): with num_threads > 1 small run-to-run differences may remain;
# confirm against the LightFM documentation if exact reproducibility matters.
MODEL_SEED = 42

# Create the model
model = LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=NUM_COMPONENTS,
    random_state=MODEL_SEED,
)

# Train the model on the training interactions, using tags as item features
model = model.fit(
    train,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
    item_features=feature_ratings,
)


In [None]:
# Evaluate the model on the test interactions.
# NOTE(review): train_interactions is not passed to either metric, so items a
# user already rated in the training set are not excluded from the ranking —
# confirm this is intended.
prec_score = precision_at_k(
    model,
    test,
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings,
).mean()

# The result is stored as recall_score rather than recall_at_k: rebinding the
# imported function's name to a float made this cell fail with
# "object is not callable" on any re-run.
recall_score = recall_at_k(
    model,
    test,
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings,
).mean()

print(recall_score, prec_score)

In [None]:
# Extract the item biases and item embeddings from the trained model,
# passing the same item-feature matrix that was used for training.

item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

In [None]:
# Display the item embeddings returned by get_item_representations.
item_embeddings

In [8]:
import pickle

# Persist the item embeddings to disk with the highest available pickle protocol.
with open('item_embeddings.pickle', mode='wb') as embeddings_file:
    pickle.dump(
        item_embeddings,
        embeddings_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [9]:
import nmslib


def _build_cosine_hnsw_index(vectors):
    """Create an nmslib HNSW index over ``vectors`` using cosine similarity."""
    # Create the search graph
    index = nmslib.init(space='cosinesimil', method='hnsw')
    # Add all vectors to the graph, then build it (progress is printed)
    index.addDataPointBatch(vectors)
    index.createIndex(print_progress=True)
    return index


# Search index over the book embeddings
nms_idx = _build_cosine_hnsw_index(item_embeddings)

In [10]:
# Helper function for querying the search graph
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Query ``index`` for the ``n`` nearest neighbours of one book.

    Args:
        book_id: Position of the query book in the embedding matrix.
        index: Object exposing ``knnQuery(vector, k=...)``.
        n: Number of neighbours to request.
        embeddings: Indexable collection of embedding vectors. When omitted,
            the notebook-level ``item_embeddings`` is used, which keeps
            existing three-argument calls working unchanged.

    Returns:
        Whatever ``index.knnQuery`` returns, unmodified. The notebook's
        callers take element ``[0]`` of it as the neighbour ids.
    """
    if embeddings is None:
        # Fall back to the global so the original call signature still works.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

In [11]:
# Books whose original title contains the literal text '1984'
# (rows with a missing title never match).
books[
    books.original_title.str.contains('1984', regex=False, na=False)
].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [12]:
# book_id of the query book; named query_book_id so the builtin id() is not
# shadowed for the rest of the session.
query_book_id = 846

# Element [0] of the query result is used as the neighbour ids.
nbm = nearest_books_nms(query_book_id, nms_idx)[0]

# isin() keeps the row order of `books`, so the table below is not sorted by
# similarity to the query book.
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
13,14,7613,7613,2207778,896,452284244,9780452000000.0,George Orwell,1945.0,Animal Farm: A Fairy Story,...,1881700,1982987,35472,66854,135147,433432,698642,648912,https://images.gr-assets.com/books/1424037542m...,https://images.gr-assets.com/books/1424037542s...
47,48,4381,4381,1272463,507,307347974,9780307000000.0,Ray Bradbury,1953.0,Fahrenheit 451,...,570498,1176240,30694,28366,64289,238242,426292,419051,https://images.gr-assets.com/books/1351643740m...,https://images.gr-assets.com/books/1351643740s...
54,55,5129,5129,3204877,515,60929871,9780061000000.0,Aldous Huxley,1932.0,Brave New World,...,1022601,1079135,20095,26367,60328,219895,389379,383166,https://images.gr-assets.com/books/1487389574m...,https://images.gr-assets.com/books/1487389574s...
288,289,76620,76620,1357456,193,038039586X,9780380000000.0,Richard Adams,1972.0,Watership Down,...,292426,308373,10399,9158,15767,52906,102093,128449,https://images.gr-assets.com/books/1405136931m...,https://images.gr-assets.com/books/1405136931s...
374,375,1852,1852,3252320,1384,439227143,9780439000000.0,Jack London,1903.0,The Call of the Wild,...,223932,248795,6770,6366,16636,62853,90382,72558,https://images.gr-assets.com/books/1452291694m...,https://images.gr-assets.com/books/1452291694s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
2489,2490,721012,721012,18197624,302,014043769X,9780140000000.0,"Geoffrey Crayon, Washington Irving, William L....",1819.0,"The Sketch Book of Geoffrey Crayon, Gent",...,42952,45496,437,449,2192,11813,16501,14541,https://images.gr-assets.com/books/1309285607m...,https://images.gr-assets.com/books/1309285607s...
4591,4592,90192,90192,1483780,636,393924769,9780394000000.0,"Nathaniel Hawthorne, Robert S. Levine",1851.0,The House of the Seven Gables,...,23797,28715,1651,1373,3640,9601,8899,5202,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
6154,6155,989313,989313,19248724,7,1593080212,9781593000000.0,"Joseph Conrad, A. Michael Matin",1899.0,,...,18873,19392,192,1061,1793,4477,5759,6302,https://images.gr-assets.com/books/1328851164m...,https://images.gr-assets.com/books/1328851164s...


In [13]:
# Books whose original title contains the literal text
# 'The Silence of the Lambs' (rows with a missing title never match).
books[
    books.original_title.str.contains('The Silence of the Lambs', regex=False, na=False)
].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [14]:
# book_id of the query book; named query_book_id so the builtin id() is not
# shadowed for the rest of the session.
query_book_id = 209

# Element [0] of the query result is used as the neighbour ids.
nbm = nearest_books_nms(query_book_id, nms_idx)[0]

# isin() keeps the row order of `books`, so the table below is not sorted by
# similarity to the query book.
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...
430,431,28877,28877,925503,191,525945563,9780526000000.0,Thomas Harris,1981.0,Red Dragon,...,194013,205433,3309,3012,7790,43235,80662,70734,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
767,768,21686,21686,1234227,134,038073186X,9780381000000.0,Dennis Lehane,2003.0,Shutter Island,...,113718,124032,6990,1636,4727,22089,49875,45705,https://images.gr-assets.com/books/1329269081m...,https://images.gr-assets.com/books/1329269081s...
981,982,40024,40024,2266643,70,812976142,9780813000000.0,Caleb Carr,1994.0,The Alienist,...,96981,100908,4026,1798,4571,18715,37572,38252,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
1484,1485,21704,21704,434,93,446698873,9780447000000.0,James Ellroy,1987.0,The Black Dahlia,...,61412,65404,1944,1999,4999,17641,23340,17425,https://images.gr-assets.com/books/1387048173m...,https://images.gr-assets.com/books/1387048173s...
1801,1802,32418,32418,2992500,132,99297701,9780099000000.0,Thomas Harris,1999.0,Hannibal,...,57569,63555,2098,2166,5811,17220,20844,17514,https://images.gr-assets.com/books/1327356556m...,https://images.gr-assets.com/books/1327356556s...
3405,3406,18402,18402,2164481,77,034549038X,9780345000000.0,Matthew Pearl,2003.0,The Dante Club,...,31035,33728,2206,1822,4320,11905,10640,5041,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
4421,4422,32416,32416,46673,94,385339410,9780385000000.0,Thomas Harris,2006.0,Hannibal Rising,...,22767,25973,1317,1468,3733,8087,7174,5511,https://images.gr-assets.com/books/1394208690m...,https://images.gr-assets.com/books/1394208690s...
5312,5313,21727,21727,593515,46,307279952,9780307000000.0,Scott B. Smith,1993.0,A Simple Plan,...,18628,19650,986,478,1086,4239,7690,6157,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
