In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [5]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-09-20 10:12:40--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: â€˜book-crossings.zip.2â€™


2024-09-20 10:12:43 (8.74 MB/s) - â€˜book-crossings.zip.2â€™ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [44]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [7]:
# add your code here - consider creating a new cell for each section of code

df_books.info()
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271377 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [41]:
# clean users age df (has NULL)
# isbn: remove non numeric, but char O to number 0
df_ratings['isbn'] = df_ratings['isbn'].str.replace('O', '0')
df_ratings["isbn"] = df_ratings["isbn"].str.replace('[^0-9]', '', regex=True)

df_ratings = df_ratings[df_ratings['isbn'] != ""]

df_ratings['isbn'] = df_ratings['isbn'].astype('int')

df_books['isbn'] = df_books['isbn'].str.replace('[^0-9]', '', regex=True)
df_books = df_books[df_books['isbn'] != ""]
df_books['isbn'] = df_books['isbn'].astype('int')



In [297]:
# book_ratings = df_ratings.groupby('isbn')['rating'].agg(['mean', 'count'])
# book_ratings.rename(columns={'mean': 'avg_rating', 'count': 'num_ratings'}, inplace=True)
# df_books_with_ratings = pd.merge(df_books, book_ratings, left_on='isbn', right_index=True)


In [45]:
# # Filter out users and books with insufficient ratings
# count unique value of users
user_ratings_count = df_ratings['user'].value_counts()
# count unique value of books (isbn)
book_ratings_count = df_ratings['isbn'].value_counts()


# filter users with less than 200 reviews
df_ratings_filtered = df_ratings[df_ratings['user'].isin(user_ratings_count[user_ratings_count >= 200].index)]
print(df_ratings_filtered.shape)

# filter books with less than 100 reviews
df_ratings_filtered = df_ratings_filtered[df_ratings_filtered['isbn'].isin(book_ratings_count[book_ratings_count >= 100].index)]
print(df_ratings_filtered.shape)

# print (book_ratings)

# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "Where the Heart Is (Oprah's Book Club (Paperback))"])
# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "I'll Be Seeing You"])
# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "The Weight of Water"])
# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "The Surgeon"])
# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "I Know This Much Is True"])


# print (df_ratings_filtered.loc[df_ratings_filtered['title'] == "The Pilot's Wife : A Novel Tag: Author of the Weight of Water (Oprah's Book Club (Hardcover))"])

# we know num rataing does not matter

(527556, 3)
(49781, 3)


isbn
False    49479
True       441
Name: count, dtype: int64


In [29]:
books100 = df_ratings.isbn.value_counts()
books100 = books100.loc[books100 >= 100]
users200 = df_ratings.user.value_counts()
users200 = users200.loc[users200 >= 200]
df_toFit = df_ratings.loc[df_ratings.user.isin(users200.keys())]
df_toFit = df_toFit.loc[df_toFit.isbn.isin(books100.keys())]

df_toFit = df_toFit.pivot(index='isbn', columns='user', values='rating').fillna(0)


In [93]:
print(df_toFit)

user        254     2276    2766    2977    3363    4017    4385    6242    \
isbn                                                                         
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060173289     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...            ...     ...     ...     ...     ...     ...     ...     ...   
1573227331     0.0     0.0     0.0     0.0     0.0     0.0     0.0     6.0   
1573229326     0.0     0.0     0.0     0.0     0.0     0.0     0.0     6.0   
1573229571     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1592400876     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1878424319     0.0     0.0     0.0     0.0     0.0     0.0     0

In [48]:
# user_ids = df_ratings_filtered['user'].unique()
# book_ids = df_ratings_filtered['isbn'].unique()

# user_to_index = {user: index for index, user in enumerate(user_ids)}
# book_to_index = {book: index for index, book in enumerate(book_ids)}

# # user_to_index = {user: index for index, user in enumerate(user_ids)}
# # book_to_index = {book: index for index, book in enumerate(book_ids)}
# df_ratings_filtered['user'] = df_ratings_filtered['user'].map(user_to_index)
# df_ratings_filtered['isbn'] = df_ratings_filtered['isbn'].map(book_to_index)

user_item_matrix = df_ratings_filtered.pivot(index='isbn', columns='user', values='rating').fillna(0)



In [49]:
print(user_item_matrix.head())

user        254     2276    2766    2977    3363    4017    4385    6242    \
isbn                                                                         
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060173289     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

user        6251    6323    ...  274004  274061  274301  274308  274808  \
isbn                        ...                                           
002542730X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
006

In [282]:
# # Group ratings by ISBN and calculate average rating
# average_ratings = df_ratings.groupby('isbn')['rating'].mean()

# # Create a dictionary mapping ISBNs to average ratings
# rating_dict = average_ratings.to_dict()

# # Get unique ISBNs
# isbns = list(rating_dict.keys())

# # Create data and indices for the CSR matrix
# data = list(rating_dict.values())
# row_indices = [i for i in range(len(isbns))]  # Row indices correspond to ISBNs
# col_indices = [0] * len(isbns)  # All values are in the first column (average rating)

# # Create the CSR matrix
# isbn_rating_csr = csr_matrix((data, (row_indices, col_indices)), shape=(len(isbns), 1))

In [74]:

# NearestNeighbors
model = NearestNeighbors(metric='cosine',n_neighbors=5)
model.fit(user_item_matrix)


In [30]:
knn = NearestNeighbors(n_neighbors = 6, metric='cosine')
knn.fit(df_toFit)
distances, indices = knn.kneighbors(df_toFit)

In [88]:
def get_recommends(book = ""):
  try:
    distances, indices = model.kneighbors(user_item_matrix)
    isbn = df_books.isbn[df_books.loc[df_books.title == book].index[0]]
    row = user_item_matrix.index.tolist().index(isbn)
    neighborsBooks = indices[row]
    print (neighborsBooks)
  except:
    return ["Book requested not found", []]
 
  recommended_books = [book, []]
  for count, nn in enumerate(neighborsBooks):
    if (distances[row][count] == 0.0): 
      pass
    else: 
      recommended_books[1].append([df_books.loc[df_books.isbn == user_item_matrix.iloc[nn].name].title.values[0], distances[row][count]])
  recommended_books[1].reverse()
  return recommended_books

In [106]:
def get_recommends(book = ""):
  recommended_books = [book, []]
  try:
    book_id = df_books[df_books['title'] == book].index[0]
    isbn = df_books.iloc[book_id]['isbn']
    distances, indices = model.kneighbors(user_item_matrix.loc[isbn].values.reshape(1, -1), n_neighbors=6)
    # print (distances.reverse())
    # recommended_books.append(book)
    book_list = []
    for i in range(1, len(distances.flatten())):
      book_list.append([df_books.iloc[indices.flatten()[i]]['title'], distances.flatten()[i]])
    recommended_books[1].append(book_list)
      
    return recommended_books
  except:
    return ["Book requested not found", []]

In [107]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! ðŸŽ‰ðŸŽ‰ðŸŽ‰ðŸŽ‰ðŸŽ‰")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['Tu Nombre Escrito En El Agua (La Sonrisa Vertical)', 0.7234864], ['Airframe', 0.76770747], ['LÃƒ?Ã‚Â¶wenzahnwein. Roman.', 0.7699411], ['All He Ever Wanted: A Novel', 0.77085835], ['My Garden (Beginning Literacy)', 0.8016211]]]
You haven't passed yet. Keep trying!
