<a href="https://colab.research.google.com/github/uvezero/Book-Recommendation-KNN/blob/main/book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [4]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2022-12-30 19:22:26--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2022-12-30 19:22:26 (202 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [5]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [6]:
#MY CODE
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [7]:
df_books.head()


Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [8]:
df_ratings.user.count()
df_ratings.shape[0]
if df_ratings.shape[0]==df_ratings.user.count():
  print("No user voted twice on the same book")
else:
  print("Some users voted twice on the same book")



No user voted twice on the same book


In [9]:
df_ratings.isnull().sum()


user      0
isbn      0
rating    0
dtype: int64

In [10]:
nan=pd.isnull(df_books["author"])     #locate which book does not have author
df_books[nan]

Unnamed: 0,isbn,title,author
187700,9627982032,The Credit Suisse Guide to Managing Your Perso...,


In [11]:
#clean data and erase the book without author
df_books_clean=df_books.dropna(axis=0,subset=["author"])
df_books_clean.isnull().sum()  #succesfully clean


isbn      0
title     0
author    0
dtype: int64

Most books are not rated frequently. To ensure statistical significance, remove from the dataset users with less than 200 ratings and books with less than 100 ratings.


In [12]:
active_users=df_ratings.groupby('user').filter(lambda x: x['rating'].count()>=200)
popular_books=df_ratings.groupby('isbn').filter(lambda x: x['rating'].count()>=100)

an approach would be taking only the popular books from the active users however the number of books in that case would be reduced drastically to just 99 and it would be to of an specific dataset

In [13]:
dfbooks=popular_books.merge(df_books_clean, left_on='isbn', right_on='isbn')[['user', 'isbn', 'rating', 'title', 'author']]
dfbooks.head()

Unnamed: 0,user,isbn,rating,title,author
0,276727,446520802,0.0,The Notebook,Nicholas Sparks
1,278418,446520802,0.0,The Notebook,Nicholas Sparks
2,638,446520802,0.0,The Notebook,Nicholas Sparks
3,3363,446520802,0.0,The Notebook,Nicholas Sparks
4,7158,446520802,10.0,The Notebook,Nicholas Sparks


In [14]:
dfbooks.drop_duplicates(subset='isbn',keep='first').shape #727 books

(727, 5)

In [15]:
active_users.head()

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0


In [16]:
df_merge=pd.merge(dfbooks, active_users, how='inner', indicator=False)
df_merge.drop_duplicates(subset='user',keep='first').shape, df_merge.drop_duplicates(subset='isbn',keep='first').shape  # 888 users and 727 books

((888, 5), (727, 5))

In [17]:
df_unique=df_merge.drop_duplicates(['title','user'])

In [28]:
pivot_books= df_unique.pivot(index='title',columns='user', values='rating').fillna(0)
pivot_books

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
KNN = NearestNeighbors(algorithm='brute', metric='cosine')
KNN.fit(pivot_books.values)

NearestNeighbors(algorithm='brute', metric='cosine')

In [30]:
pivot_books.loc['A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash'].values.reshape(1,-1).shape
bool_id="Where the Heart Is (Oprah's Book Club (Paperback))"


In [191]:
#pivot_books.loc['A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash'].values.reshape(1,-1)
a=[]
distance, indices = KNN.kneighbors(pivot_books.loc[bool_id].values.reshape(1,-1), n_neighbors=6)
nearest_book=pivot_books.iloc[indices[0]].index.values

#[pivot_books.index[indices.flatten()[0]]]
#print(indices[0][0])
result=list(zip(nearest_book,distance[0]))
result[0]=result[0][0] #to erase the distance=0 from the given book
result.reverse()

a=result[-1:]+result[:-1]
a

["Where the Heart Is (Oprah's Book Club (Paperback))",
 ("I'll Be Seeing You", 0.8016211),
 ('The Weight of Water', 0.77085835),
 ('The Surgeon', 0.7699411),
 ('I Know This Much Is True', 0.7677075),
 ('The Lovely Bones: A Novel', 0.7234864)]

In [206]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  ''' 

  ---------------------
  
  '''
  #recommended_books=[]
  #Find K nearest neightbour
  distance, indices = KNN.kneighbors(pivot_books.loc[book].values.reshape(1,-1), n_neighbors=6)
  nearest_book=pivot_books.iloc[indices[0]].index.values
  #make list of books and distance
  result=list(zip(nearest_book,distance[0]))
  result[0]=result[0][0] #to erase the distance=0 from the given book
  result.reverse()
  recommended_books=result[-1:]+[result[:-1]]

  return recommended_books
get_recommends("1984")


['1984',
 [("The Hitchhiker's Guide to the Galaxy", 0.77445555),
  ("Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death",
   0.77405274),
  ('The Catcher in the Rye', 0.76593226),
  ('Animal Farm', 0.7618247),
  ("The Handmaid's Tale", 0.75083876)]]

A function that gives the correct result however I wanna test labeled way.

In [204]:
def get_recommends(book = ""):

  distances, indices = KNN.kneighbors(pivot_books.loc[book].values.reshape(1,-1), n_neighbors=6)
  nearest_book=pivot_books.iloc[indices[0]]

  recommended_books = []
  for x in reversed(range(6)):
      bookrecommended = [pivot_books.index[indices[0][x]], distances[0][x]]
      recommended_books.append(bookrecommended)
  recommended_books = [book, recommended_books]
  
  return recommended_books
get_recommends("1984")


['1984',
 [["The Hitchhiker's Guide to the Galaxy", 0.77445555],
  ["Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death",
   0.77405274],
  ['The Catcher in the Rye', 0.76593226],
  ['Animal Farm', 0.7618247],
  ["The Handmaid's Tale", 0.75083876],
  ['1984', 0.0]]]

In [197]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [("I'll Be Seeing You", 0.8016211), ('The Weight of Water', 0.77085835), ('The Surgeon', 0.7699411), ('I Know This Much Is True', 0.7677075), ('The Lovely Bones: A Novel', 0.7234864)]]
You passed the challenge! 🎉🎉🎉🎉🎉
