<a href="https://colab.research.google.com/github/uzayri001/BookRecommendation/blob/main/BookRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-04-04 12:20:59--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-04-04 12:20:59 (108 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
# Both files are ';'-separated and ISO-8859-1 encoded. header=0 combined with
# names= replaces the header row found in the file with the names given here.

# Book catalogue: ISBN, title and author, all read as strings.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

# Ratings: one row per (user, ISBN) pair with its rating value.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [29]:
# Attach each rating's title and author by joining on the shared ISBN.
# The join is an inner join (pandas default), so ratings whose ISBN is missing
# from df_books are dropped.
df = df_ratings.merge(df_books, on='isbn')
print(df.head())

     user        isbn  rating  \
0  276725  034545104X     0.0   
1  276726  0155061224     5.0   
2  276727  0446520802     0.0   
3  276729  052165615X     3.0   
4  276729  0521795028     6.0   

                                               title           author  
0                               Flesh Tones: A Novel       M. J. Rose  
1                                   Rites of Passage       Judith Rae  
2                                       The Notebook  Nicholas Sparks  
3                                     Help!: Level 1    Philip Prowse  
4  The Amsterdam Connection : Level 4 (Cambridge ...      Sue Leather  


In [31]:
# add a totalRatingCount column to track how many total ratings a book has
# Rows without a title cannot be grouped by title, so they are removed first.
combineBookRating = df.dropna(subset=['title'])

# One row per title with the number of (non-null) ratings recorded for it.
bookRatingCount = (
    combineBookRating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)

# Left join so every rating row keeps its place and gains its title's count.
rating_with_totalRatingCount = combineBookRating.merge(bookRatingCount, on='title', how='left')
print(rating_with_totalRatingCount.head())

     user        isbn  rating  \
0  276725  034545104X     0.0   
1  276726  0155061224     5.0   
2  276727  0446520802     0.0   
3  276729  052165615X     3.0   
4  276729  0521795028     6.0   

                                               title           author  \
0                               Flesh Tones: A Novel       M. J. Rose   
1                                   Rites of Passage       Judith Rae   
2                                       The Notebook  Nicholas Sparks   
3                                     Help!: Level 1    Philip Prowse   
4  The Amsterdam Connection : Level 4 (Cambridge ...      Sue Leather   

   totalRatingCount  
0                60  
1                14  
2               650  
3                 1  
4                 1  


In [32]:
# keep only ratings of popular books: titles with at least 100 ratings
ratings_threshold = 100
popularBooks = rating_with_totalRatingCount[
    rating_with_totalRatingCount['totalRatingCount'] >= ratings_threshold
]
print(popularBooks.head())

     user        isbn  rating                title           author  \
2  276727  0446520802     0.0         The Notebook  Nicholas Sparks   
6  276744  038550120X     7.0      A Painted House     JOHN GRISHAM   
7  276746  0425115801     0.0            Lightning   Dean R. Koontz   
8  276746  0449006522     0.0  Manhattan Hunt Club        JOHN SAUL   
9  276746  0553561618     0.0        Dark Paradise        TAMI HOAG   

   totalRatingCount  
2               650  
6               838  
7               274  
8               111  
9               142  


In [35]:
# add a column with the number of ratings per user
# The count is taken over popularBooks, i.e. it only includes each user's
# ratings of books that passed the popularity filter.
userRatingCount = (
    popularBooks
    .groupby('user')['rating']
    .count()
    .rename('totalUserRatingCount')
    .reset_index()
)

# Left join so every popular-book rating row gains its user's count.
totalUserRatingCount = popularBooks.merge(userRatingCount, on='user', how='left')
print(totalUserRatingCount.head())

     user        isbn  rating                title           author  \
0  276727  0446520802     0.0         The Notebook  Nicholas Sparks   
1  276744  038550120X     7.0      A Painted House     JOHN GRISHAM   
2  276746  0425115801     0.0            Lightning   Dean R. Koontz   
3  276746  0449006522     0.0  Manhattan Hunt Club        JOHN SAUL   
4  276746  0553561618     0.0        Dark Paradise        TAMI HOAG   

   totalRatingCount  totalUserRatingCount  
0               650                     1  
1               838                     1  
2               274                     4  
3               111                     4  
4               142                     4  


In [38]:
# keep only ratings from bookworms: users with at least 200 ratings
ratings_threshold_user = 200
filtered_table = totalUserRatingCount[
    totalUserRatingCount['totalUserRatingCount'] >= ratings_threshold_user
]
print(filtered_table.head())

      user        isbn  rating  \
4063  6575  0060173289     0.0   
4064  6575  0060198133     8.0   
4065  6575  0060502258     8.0   
4066  6575  0060915544     0.0   
4067  6575  0060916508     0.0   

                                                  title              author  \
4063   Divine Secrets of the Ya-Ya Sisterhood : A Novel       Rebecca Wells   
4064                        Five Quarters of the Orange       Joanne Harris   
4065  The Divine Secrets of the Ya-Ya Sisterhood: A ...       Rebecca Wells   
4066                                     The Bean Trees  Barbara Kingsolver   
4067              Their Eyes Were Watching God: A Novel  Zora Neale Hurston   

      totalRatingCount  totalUserRatingCount  
4063               130                   206  
4064               207                   206  
4065               376                   206  
4066               389                   206  
4067               100                   206  


In [41]:
# create pivot table to use in kNN
# One row per title, one column per user. When a user has several ratings for
# the same title they are averaged (pivot_table's default aggregation), and
# title/user pairs without any rating are filled with 0.
rating_pivot_table = pd.pivot_table(
    filtered_table,
    values='rating',
    index='title',
    columns='user',
).fillna(0)
print(rating_pivot_table.head())

user                 6575      11676   16795   21014   23768   31315   35857   \
title                                                                           
1984                    0.0  3.333333     8.0     0.0     0.0     0.0     0.0   
1st to Die: A Novel     0.0  9.000000     9.0     0.0     0.0     0.0     0.0   
24 Hours                0.0  4.000000     0.0     0.0     0.0     0.0     0.0   
2nd Chance              0.0  7.500000     0.0     0.0     0.0    10.0     0.0   
4 Blondes               0.0  0.000000     0.0     0.0     0.0     0.0     0.0   

user                 35859   36606   43246   ...  230522  231210  232131  \
title                                        ...                           
1984                    0.0     0.0     0.0  ...     0.0     0.0     0.0   
1st to Die: A Novel     7.0     0.0     4.5  ...     0.0     0.0     4.0   
24 Hours                0.0     0.0     0.0  ...     0.0     0.0     0.0   
2nd Chance              7.5     0.0     9.0  ...    

In [43]:
# convert from pivot table to matrix
# The pivot table's values are stored as a SciPy CSR sparse matrix; row order
# matches rating_pivot_table.index.
rating_matrix = csr_matrix(rating_pivot_table.to_numpy())

In [72]:
# Nearest-neighbour index over the title rows of rating_matrix, using
# brute-force search with cosine distance.
model = NearestNeighbors(algorithm='brute', metric='cosine')
model.fit(rating_matrix)

In [81]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommendations = 5):
  """Return the books whose rating vectors are closest to `book`.

  Relies on the notebook-level `rating_pivot_table` (titles as index) and the
  fitted nearest-neighbour `model`.

  Args:
    book: Title to look up; must be present in `rating_pivot_table.index`.
    n_recommendations: Maximum number of neighbouring titles to return
      (default 5, matching the previous fixed behaviour).

  Returns:
    `[book, [[title, distance], ...]]` where the inner list is ordered from
    the farthest to the nearest neighbour and never contains `book` itself.

  Raises:
    KeyError: if `book` is not a title in `rating_pivot_table`.
    ValueError: if `n_recommendations` is smaller than 1.
  """
  if n_recommendations < 1:
    raise ValueError(f"n_recommendations must be at least 1, got {n_recommendations!r}")
  if book not in rating_pivot_table.index:
    raise KeyError(f"Book title not found in the rating table: {book!r}")

  query_index = rating_pivot_table.index.get_loc(book)
  query_vector = rating_pivot_table.iloc[query_index, :].values.reshape(1, -1)

  # Ask for one extra neighbour because the query book is normally returned as
  # its own nearest neighbour; never ask for more rows than the table holds.
  n_neighbors = min(n_recommendations + 1, rating_pivot_table.shape[0])
  distances, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

  # Drop the query book by index rather than by position: when distances tie,
  # the query is not guaranteed to be the first neighbour returned.
  neighbours = [
      [rating_pivot_table.index[idx], dist]
      for idx, dist in zip(indices.flatten(), distances.flatten())
      if idx != query_index
  ][:n_recommendations]

  # Results are reported farthest-first.
  return [book, neighbours[::-1]]

In [82]:
# Example query: show the recommendations returned for one title.
books = get_recommends(book="The Queen of the Damned (Vampire Chronicles (Paperback))")
print(books)

['The Queen of the Damned (Vampire Chronicles (Paperback))', [['The Tao of Pooh', np.float32(0.3591017)], ["The Plains of Passage (Earth's Children (Paperback))", np.float32(0.3591017)], ['The Vampire Lestat (Vampire Chronicles, Book II)', np.float32(0.32453173)], ['The Great Gatsby', np.float32(0.3242008)], ['WLD ACCORDNG GARP', np.float32(0.14250702)]]]


In [64]:
# Show the recommendations for the title that the challenge test checks below.
books = get_recommends(book="Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check; it prints a pass or fail message.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['The English Patient', np.float64(21.354156494140625)], ['Five Days in Paris', np.float64(21.189620971679688)], ['I Know Why the Caged Bird Sings', np.float64(21.14237403869629)], ['Zoya', np.float64(20.880613327026367)], ['Unspeakable', np.float64(20.760540008544922)]]]
You haven't passed yet. Keep trying!


In [83]:
from sklearn.metrics.pairwise import cosine_distances

# Spot check: cosine distance between the rating vectors of two specific titles.
book1 = "Where the Heart Is (Oprah's Book Club (Paperback))"
book2 = "I'll Be Seeing You"

# Selecting with a one-element list keeps each row two-dimensional (1 x n_users),
# which is the shape cosine_distances expects.
vec1 = rating_pivot_table.loc[[book1]].to_numpy()
vec2 = rating_pivot_table.loc[[book2]].to_numpy()

# cosine_distances returns a 1x1 matrix here; take its single entry.
distance = cosine_distances(vec1, vec2)[0, 0]

print(f"Distance between '{book1}' and '{book2}': {distance}")

Distance between 'Where the Heart Is (Oprah's Book Club (Paperback))' and 'I'll Be Seeing You': 0.45980000495910645
