In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-05-28 14:53:59--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-05-28 14:54:00 (87.4 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [31]:
# import csv data into dataframes
# Both files are read as ';'-separated ISO-8859-1 text. Passing header=0
# together with `names` replaces the header row found in the file with the
# short column names used by the rest of the notebook.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=['isbn', 'title', 'author'],
    names=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

# Ratings: one row per (user, isbn) rating event.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=['user', 'isbn', 'rating'],
    names=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [32]:
# Number of rating rows per user and per ISBN, most frequent first.
userCounts, isbnCounts = (
    df_ratings[column].value_counts() for column in ('user', 'isbn')
)

In [33]:
# Show both count tables, one after the other.
print(userCounts, isbnCounts, sep="\n")

user
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
119573        1
276706        1
276697        1
276679        1
276676        1
Name: count, Length: 105283, dtype: int64
isbn
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
              ... 
0671883917       1
0743257502       1
0767409752       1
0785263195       1
0802141358       1
Name: count, Length: 340556, dtype: int64


In [34]:
# Keep a rating only when its user has at least 200 ratings AND its book has
# at least 100 ratings, using the userCounts / isbnCounts tables built earlier.
df_ratings = df_ratings.loc[
    df_ratings['user'].isin(userCounts.loc[userCounts.ge(200)].index)
    & df_ratings['isbn'].isin(isbnCounts.loc[isbnCounts.ge(100)].index)
]

In [35]:
# Inspect the ratings frame (after the user/book count filters when the cells are run in order).
print(df_ratings)

           user        isbn  rating
1456     277427  002542730X    10.0
1469     277427  0060930535     0.0
1471     277427  0060934417     0.0
1474     277427  0061009059     9.0
1484     277427  0140067477     0.0
...         ...         ...     ...
1147304  275970  0804111359     0.0
1147436  275970  140003065X     0.0
1147439  275970  1400031346     0.0
1147440  275970  1400031354     0.0
1147441  275970  1400031362     0.0

[49781 rows x 3 columns]


In [36]:
# Book-by-user rating matrix: one row per ISBN, one column per user.
# pivot_table aggregates repeated (isbn, user) pairs with its default (mean);
# combinations with no rating come back as NaN and are replaced by 0.
df_table = pd.pivot_table(
    df_ratings,
    values='rating',
    index='isbn',
    columns='user',
).fillna(0)
print(df_table)

user        254     2276    2766    2977    3363    4017    4385    6242    \
isbn                                                                         
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060173289     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...            ...     ...     ...     ...     ...     ...     ...     ...   
1573227331     0.0     0.0     0.0     0.0     0.0     0.0     0.0     6.0   
1573229326     0.0     0.0     0.0     0.0     0.0     0.0     0.0     6.0   
1573229571     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1592400876     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1878424319     0.0     0.0     0.0     0.0     0.0     0.0     0

In [37]:
# Relabel the rows of df_table from ISBN to book title.
# The guard makes the cell safe to re-run: once the index already holds titles,
# looking them up as ISBNs again would turn every label into NaN.
if df_table.index.name == 'isbn':
    # One title per ISBN; dropping repeated ISBNs keeps the lookup unambiguous
    # so the relabelled index always has exactly one entry per table row.
    title_by_isbn = df_books.drop_duplicates(subset='isbn').set_index('isbn')['title']
    # ISBNs that are absent from df_books map to NaN (same as a left join).
    df_table.index = df_table.index.map(title_by_isbn).rename('title')
print(df_table)

user                                                254     2276    2766    \
title                                                                        
Politically Correct Bedtime Stories: Modern Tal...     0.0     0.0     0.0   
Angels                                                 0.0     0.0     0.0   
The Boy Next Door                                      0.0     0.0     0.0   
Men Are from Mars, Women Are from Venus: A Prac...     0.0     0.0     0.0   
Divine Secrets of the Ya-Ya Sisterhood : A Novel       0.0     0.0     0.0   
...                                                    ...     ...     ...   
About a Boy                                            0.0     0.0     0.0   
How to Be Good                                         0.0     0.0     0.0   
About a Boy (Movie Tie-In)                             0.0     0.0     0.0   
Eats, Shoots &amp; Leaves: The Zero Tolerance A...     0.0     0.0     0.0   
The Four Agreements: A Practical Guide to Perso...     0.0     0

In [38]:
# function to return recommended books - this will be tested

def get_recommends(book = ""):
    """Recommend the five books rated most similarly to `book`.

    Similarity is the cosine distance between rows of the module-level
    `df_table` (rows are books labelled by title, columns are users).

    Args:
        book: Title of the book to find neighbours for; must be a row label
            of `df_table`.

    Returns:
        ``[book, recommendations]`` where ``recommendations`` is a list of five
        ``[title, distance]`` pairs ordered from the largest distance to the
        smallest.

    Raises:
        KeyError: If `book` is not a row label of `df_table`.
    """
    n_recommendations = 5

    if book not in df_table.index:
        raise KeyError(f"{book!r} is not a title in the ratings table")

    # Selecting with a list always yields a 2-D frame, even when several ISBNs
    # share this title; the first matching row is used as the query vector.
    query = df_table.loc[[book]].values[:1]

    # Ask for one extra neighbour: the query row is part of the fitted data and
    # is expected to come back first, at distance 0 from itself.
    nbrs = NearestNeighbors(n_neighbors=n_recommendations + 1, metric="cosine").fit(df_table.values)
    distances, indices = nbrs.kneighbors(query)

    # Skip position 0 (the query itself) and list the rest farthest-first.
    neighbours = list(zip(indices[0][1:], distances[0][1:]))
    recommended_books = [[df_table.index[row], dist] for row, dist in reversed(neighbours)]

    return [book, recommended_books]

In [39]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
    """Check get_recommends against the challenge's reference answer.

    Prints a pass/fail message. The check passes when the returned title
    matches the query and the first two recommendations are in the expected
    list with distances within 0.05 of the expected values.
    """
    test_pass = True
    recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
    if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
        test_pass = False
    recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
    # Both checks run for each of the first two recommendations.
    for i in range(2):
        if recommends[1][i][0] not in recommended_books:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", np.float32(0.8016211)], ['The Weight of Water', np.float32(0.77085835)], ['The Surgeon', np.float32(0.7699411)], ['I Know This Much Is True', np.float32(0.7677075)], ['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
You passed the challenge! 🎉🎉🎉🎉🎉
