In [10]:
import pandas as pd

# Load the CSVs
books = pd.read_csv("books.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")


In [9]:
books.head()


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [8]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [7]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [11]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [12]:
book_tags_merged = book_tags.merge(tags, on="tag_id")

In [13]:
book_tags_merged

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currently-reading
4,1,33114,12716,young-adult
...,...,...,...,...
999907,33288638,21303,7,neighbors
999908,33288638,17271,7,kindleunlimited
999909,33288638,1126,7,5-star-reads
999910,33288638,11478,7,fave-author


In [14]:
top_tags = (
    book_tags_merged
    .sort_values(['goodreads_book_id', 'count'], ascending=[True, False])
    .groupby('goodreads_book_id')['tag_name']
    .apply(lambda x: ', '.join(x.head(3)))
    .reset_index()
    .rename(columns={'tag_name': 'top_tags'})
)

In [15]:
books_with_tags = books.merge(top_tags, on='goodreads_book_id', how='left')

In [16]:
print(books_with_tags[['title', 'authors', 'top_tags']].head())

                                               title  \
0            The Hunger Games (The Hunger Games, #1)   
1  Harry Potter and the Sorcerer's Stone (Harry P...   
2                            Twilight (Twilight, #1)   
3                              To Kill a Mockingbird   
4                                   The Great Gatsby   

                       authors                                   top_tags  
0              Suzanne Collins  favorites, currently-reading, young-adult  
1  J.K. Rowling, Mary GrandPré                to-read, favorites, fantasy  
2              Stephenie Meyer            young-adult, fantasy, favorites  
3                   Harper Lee               classics, favorites, to-read  
4          F. Scott Fitzgerald               classics, favorites, fiction  


In [19]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
     ---------------------------------------- 8.9/8.9 MB 6.5 MB/s eta 0:00:00
Collecting scipy>=1.8.0
  Downloading scipy-1.16.0-cp311-cp311-win_amd64.whl (38.6 MB)
     ---------------------------------------- 38.6/38.6 MB 5.7 MB/s eta 0:00:00
Collecting joblib>=1.2.0
  Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
     -------------------------------------- 307.7/307.7 kB 3.2 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.7.1 scipy-1.16.0 threadpoolctl-3.6.0



[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [21]:
books_with_tags['top_tags'] = books_with_tags['top_tags'].fillna('')

In [22]:
vectorizer = TfidfVectorizer(max_features=100)
tag_vectors = vectorizer.fit_transform(books_with_tags['top_tags'])

In [23]:
numeric_features = books_with_tags[['average_rating', 'original_publication_year']].fillna(0)
scaler = MinMaxScaler()
scaled_numeric = scaler.fit_transform(numeric_features)

In [25]:
from scipy.sparse import hstack

feature_matrix = hstack([tag_vectors, scaled_numeric])

print( feature_matrix.shape)

(10000, 102)


In [29]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=5, metric='cosine')  # cosine similarity works better for tags
knn.fit(feature_matrix)


0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [28]:

user_mood = "sad"
user_genres = ["romance", "emotional"]
user_pace = "slow"

user_input_tags = f"{user_mood}, {user_pace}, " + ", ".join(user_genres)

print("User Input Tags:", user_input_tags)


User Input Tags: sad, slow, romance, emotional


In [31]:
user_tag_vector = vectorizer.transform([user_input_tags])

user_numeric = scaler.transform([[4.0, 2010]])

user_feature = hstack([user_tag_vector, user_numeric])



In [32]:

distances, indices = knn.kneighbors(user_feature)

print(" Top 5 Book Recommendations:\n")
for idx in indices[0]:
    print("-", books_with_tags.iloc[idx]['title'], "by", books_with_tags.iloc[idx]['authors'])


 Top 5 Book Recommendations:

- Entwined with You (Crossfire, #3) by Sylvia Day
- One with You (Crossfire, #5) by Sylvia Day
- The Shop on Blossom Street (Blossom Street, #1) by Debbie Macomber
- Mr. Perfect by Linda Howard
- Dream Man by Linda Howard


In [33]:
from sklearn.metrics import pairwise_distances

distances, _ = knn.kneighbors(feature_matrix, n_neighbors=5)

mean_distance = distances.mean()
print(f"📐 Mean Cosine Distance to 5 Nearest Neighbors: {mean_distance:.4f}")


📐 Mean Cosine Distance to 5 Nearest Neighbors: 0.0032


In [34]:
import pickle

with open("knn_model.pkl", "wb") as f:
    pickle.dump(knn, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

books_with_tags.to_pickle("books_with_tags.pkl")

print("✅ All pickle files created successfully.")


✅ All pickle files created successfully.
