In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
import gradio as gr
from collections import defaultdict



In [2]:
from surprise import Dataset as SurpriseDataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split as surprise_train_test_split

In [3]:
ratings_path = "data/inputs/ratings.csv"
books_path = "data/inputs/books.csv"

In [4]:
ratings = pd.read_csv(ratings_path)
books =  pd.read_csv(books_path)

Теги (жанри)

In [5]:
book_tags = pd.read_csv("data/inputs/book_tags.csv")
tags = pd.read_csv("data/inputs/tags.csv")

In [6]:
# Об'єднання
book_tags = book_tags.merge(tags, on="tag_id", how="left")
book_tags = book_tags.merge(books[["book_id", "goodreads_book_id"]], on="goodreads_book_id", how="inner")

# Залишаємо лише найчастіші теги
book_tags = book_tags[book_tags["count"] > 10]

# Створюємо mapping: genre_name -> [book_id1, book_id2, ...]

tag_to_books = defaultdict(list)
for _, row in book_tags.iterrows():
    tag = row['tag_name'].lower()
    tag_to_books[tag].append((row['book_id'], row['count']))

In [7]:
# список популярних жанрів (знадобиться для інтер-у Gradio)
top_genres = (
    book_tags["tag_name"]
    .value_counts()
    .loc[lambda s: s > 50]
    .head(30)
    .index
    .tolist()
)

In [8]:
print(ratings.shape)
ratings.head(3)

(5976479, 3)


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5


In [9]:
books.head(3)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...


In [10]:
len(set(ratings['book_id']).intersection(set(books['book_id'])))

10000

In [11]:
df = ratings.merge(books, on='book_id', how='inner')
df = df[df['user_id'] < 500]

In [12]:

authorDf = df[['authors', 'book_id']].drop_duplicates()

itemFeatureAssignments = [] # item, feature tuples
itemFeatureList = [] # list of unique item feature values
for ii in range(len(authorDf)):
    itemFeatureAssignments.append((authorDf['book_id'].iloc[ii], 
                          authorDf['authors'].iloc[ii].split(", ")))
    itemFeatureList.extend(authorDf['authors'].iloc[ii].split(", "))

itemFeatureList = set(itemFeatureList)

itemFeatureAssignments[:10]

[(258, ['Carlos Ruiz Zafón', 'Lucia Graves']),
 (4081, ['Tom Wolfe']),
 (260, ['Dale Carnegie']),
 (9296, ['Alice  Miller', 'Ruth Ward']),
 (2318, ['Thomas J. Stanley', 'William D. Danko']),
 (26, ['Dan Brown']),
 (315, ['Spencer Johnson', 'Kenneth H. Blanchard']),
 (33, ['Arthur Golden']),
 (301, ['Joseph Conrad']),
 (2686, ['W. Chan Kim', 'Renée Mauborgne'])]

In [13]:
# Make LightFM dataset
dataset = Dataset()
dataset.fit(users = df['user_id'],
            items = df['book_id'],
            item_features = itemFeatureList,
           )
item_features = dataset.build_item_features(itemFeatureAssignments)

In [14]:
trainDf, testDf = train_test_split(df, train_size=0.8)
train, _ = dataset.build_interactions(list(zip(trainDf['user_id'], trainDf['book_id'])))
test, _ = dataset.build_interactions(list(zip(testDf['user_id'], testDf['book_id'])))

In [15]:
numEpochs = 1

model = LightFM(loss='warp', 
          learning_rate=0.05,
          random_state = 1)
model.fit(train, 
          epochs=numEpochs, 
          item_features=item_features)

<lightfm.lightfm.LightFM at 0x169de61d0>

In [23]:
lfm_precision = precision_at_k(model, test, item_features=item_features, k=10).mean()
lfm_recall = recall_at_k(model, test, item_features=item_features, k=10).mean()

In [24]:
# SVD, KNN

reader = Reader(rating_scale=(df['rating'].min(), df['rating'].max()))
surprise_data = SurpriseDataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)
train_surp, test_surp = surprise_train_test_split(surprise_data, test_size=0.2, random_state=42)

train_df_surp = pd.DataFrame(train_surp.all_ratings(), columns=['uid', 'iid', 'rating'])
train_df_surp['uid'] = train_df_surp['uid'].astype(int)
train_df_surp['iid'] = train_df_surp['iid'].astype(int)
test_df_surp = pd.DataFrame(test_surp, columns=['uid', 'iid', 'rating'])

In [25]:
def precision_recall_at_k(model, train_df, test_df, k=10, threshold=4.0):
    user_rated_items = defaultdict(set)
    for _, row in train_df.iterrows():
        user_rated_items[int(row['uid'])].add(int(row['iid']))
    
    user_metrics = defaultdict(list)
    for uid in test_df['uid'].unique():
        seen = user_rated_items.get(uid, set())
        all_items = set(df['book_id'].unique())
        unseen_items = list(all_items - seen)
        preds = [model.predict(uid, iid) for iid in unseen_items]
        preds.sort(key=lambda x: x.est, reverse=True)
        top_k = preds[:k]

        relevant = set(test_df[(test_df['uid'] == uid) & (test_df['rating'] >= threshold)]['iid'])
        recommended = set([int(p.iid) for p in top_k])
        hits = len(recommended & relevant)

        if len(recommended) > 0:
            prec = hits / k
            rec = hits / len(relevant) if len(relevant) > 0 else 0
            user_metrics[uid] = [prec, rec]

    precisions = [v[0] for v in user_metrics.values()]
    recalls = [v[1] for v in user_metrics.values()]
    return np.mean(precisions), np.mean(recalls)

In [26]:
# --- 5. SVD ---
svd = SVD()
svd.fit(train_surp)
svd_prec, svd_rec = precision_recall_at_k(svd, train_df_surp, test_df_surp)

# --- 6. KNN ---
knn = KNNBasic(sim_options={'user_based': False})
knn.fit(train_surp)
knn_prec, knn_rec = precision_recall_at_k(knn, train_df_surp, test_df_surp)

# --- 7. Порівняння ---
print("=== Порівняльні метрики Precision@5 / Recall@5 ===")
print(f"LightFM:  precision = {lfm_precision:.4f}, recall = {lfm_recall:.4f}")
print(f"SVD:      precision = {svd_prec:.4f},  recall = {svd_rec:.4f}")
print(f"KNN:      precision = {knn_prec:.4f},  recall = {knn_rec:.4f}")

Computing the msd similarity matrix...
Done computing similarity matrix.
=== Порівняльні метрики Precision@5 / Recall@5 ===
LightFM:  precision = 0.0731, recall = 0.0343
SVD:      precision = 0.0124,  recall = 0.0110
KNN:      precision = 0.0004,  recall = 0.0005


In [21]:
print((test_df_surp['rating'] >= 4.0).sum())

6896


In [15]:
# Мапінг з оригінального user_id / book_id до внутрішніх індексів
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

# Зворотні мапінги
inv_user_id_map = {v: k for k, v in user_id_map.items()}
inv_item_id_map = {v: k for k, v in item_id_map.items()}

book_id_to_title = dict(zip(books['book_id'], books['title']))

In [16]:
def recommend_books_for_user(user_id, model, dataset, item_features, N=5):
    if user_id not in user_id_map:
        return f"Користувача з user_id={user_id} не знайдено."

    internal_uid = user_id_map[user_id]
    all_items = list(item_id_map.values())
    scores = model.predict(internal_uid, all_items, item_features=item_features)

    top_item_indices = np.argsort(-scores)[:N]
    top_book_ids = [inv_item_id_map[i] for i in top_item_indices]
    top_titles = [book_id_to_title.get(book_id, f"ID {book_id}") for book_id in top_book_ids]

    return top_titles

In [17]:
recommend_books_for_user(42, model, dataset, item_features)


['The Catcher in the Rye',
 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
 "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',
 'The Kite Runner']

In [18]:
print("Precision@5:", precision_at_k(model, test, item_features=item_features, k=5).mean())
print("Recall@5:", recall_at_k(model, test, item_features=item_features, k=5).mean())

Precision@5: 0.07134269
Recall@5: 0.016387978418262117


# Інтерфейс

In [19]:
def get_books_read_by_user(user_id):
    if user_id not in user_id_map:
        return []
    return df[df['user_id'] == user_id]['book_id'].map(book_id_to_title).dropna().unique().tolist()

In [20]:
def gradio_recommender_with_history(input_user_id):
    try:
        user_id = int(input_user_id)
        read_books = get_books_read_by_user(user_id)
        recommendations = recommend_books_for_user(user_id, model, dataset, item_features, N=5)
        return (
            "\n".join(read_books) if read_books else "–",
            "\n".join(recommendations) if isinstance(recommendations, list) else recommendations
        )
    except:
        return "–", "Введіть коректний числовий user_id."


In [21]:
def recommend_by_genre(selected_genres, top_n=5):
    selected_genres = [g.lower() for g in selected_genres]
    book_candidates = defaultdict(int)

    for genre in selected_genres:
        for book_id, count in tag_to_books.get(genre, []):
            if book_id in book_id_to_title:
                book_candidates[book_id] += count  # накопичуємо значущість

    if not book_candidates:
        return "Нічого не знайдено."

    # Сортуємо книги за сумарною значущістю (count по обраних жанрах)
    top_books = sorted(book_candidates.items(), key=lambda x: -x[1])[:top_n]
    return "\n".join(f"{book_id_to_title.get(bid, str(bid))} (відповідність жанру: {score})"
                     for bid, score in top_books)

In [22]:
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Персоналізована система рекомендацій книг")

    with gr.Tab("Існуючий користувач"):
        user_input = gr.Textbox(label="User ID", placeholder="наприклад, 42")
        read_output = gr.Textbox(label="Прочитані книги")
        rec_output = gr.Textbox(label="Рекомендовані книги")

        recommend_btn = gr.Button("Отримати рекомендації")

        def full_recommend(user_id):
            read, rec = gradio_recommender_with_history(user_id)
            return read, rec

        recommend_btn.click(fn=full_recommend, inputs=user_input, outputs=[read_output, rec_output])
        
    with gr.Tab("Новий користувач"):
        genre_input = gr.CheckboxGroup(choices=top_genres, label="Оберіть жанри")
        genre_rec_output = gr.Textbox(label="Популярні книги за жанрами")
        genre_btn = gr.Button("Підібрати")
    
        genre_btn.click(fn=recommend_by_genre, inputs=genre_input, outputs=genre_rec_output)


demo.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


