In [3]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
#In this challenge, you will create a book recommendation algorithm using K-Nearest Neighbors.
#You will use the Book-Crossings dataset. This dataset contains 1.1 million ratings (scale of 1-10) of 270,000 books by 90,000 users.
#After importing and cleaning the data, use NearestNeighbors from sklearn.neighbors to develop a model that shows books that are similar to a given book. 
#The Nearest Neighbors algorithm measures the distance to determine the “closeness” of instances.
#Create a function named get_recommends that takes a book title (from the dataset) as an argument and returns a list of 5 similar books with their distances from the book argument.

In [6]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip #I am commenting this out as for whatever reason it doesn't find wget in my conda env although from the actual vsc terminal it works...

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [7]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [None]:
#group ratings by user
df_ratings_grp_user = df_ratings["user"].value_counts(ascending=False).copy().reset_index()

#users with more than 200 ratings
df_ratings_cln_user = df_ratings_grp_user[df_ratings_grp_user["count"]>=200]


#group ratings by book
df_ratings_grp_isbn = df_ratings["isbn"].value_counts(ascending=False).copy().reset_index()

#books with more than 100 ratings
df_ratings_cln_isbn = df_ratings_grp_isbn[df_ratings_grp_isbn["count"]>=100]

print("df_ratings_cln_isbn: " + str(df_ratings_cln_isbn.shape[0]))
print("df_ratings_cln_user: " + str(df_ratings_cln_user.shape[0]))

df_ratings_filtered_by_user = df_ratings.apply(lambda row: row[df_ratings["user"].isin(df_ratings_cln_user["user"])])

print("df_ratings_filtered_by_user: " + str(df_ratings_filtered_by_user.shape[0] ))

df_ratings_filtered_by_user_and_isbn=df_ratings_filtered_by_user.apply(lambda row: row[df_ratings_filtered_by_user['isbn'].isin(df_ratings_cln_isbn["isbn"])])

print("df_ratings_filtered_by_user_and_isbn: " + str(df_ratings_filtered_by_user_and_isbn.shape[0] ))

print("remaining number of users: " + str(df_ratings_filtered_by_user_and_isbn["user"].nunique()))
print("remaining number of books: " + str(df_ratings_filtered_by_user_and_isbn["isbn"].nunique()))

#df_ratings_filtered_by_user_and_isbn
#df_ratings_filtered_by_user_and_isbn.value_counts("isbn")

df_ratings_cln_isbn: 731
df_ratings_cln_user: 905
df_ratings_filtered_by_user: 527556
df_ratings_filtered_by_user_and_isbn: 49781
remaining number of users: 888
remaining number of books: 731


isbn
0971880107    365
0316666343    272
0060928336    221
0440214041    218
0385504209    217
             ... 
0552998486     23
0451204530     23
0684833395     22
0091867770     19
0671027344     16
Name: count, Length: 731, dtype: int64

In [None]:
df_ratings_filtered_by_user_and_isbn_pivot = df_ratings_filtered_by_user_and_isbn.pivot_table(index="isbn",columns="user",values="rating").fillna(0)


#The next lines were frustrating as I tried joining first, but even with left outer join I ended up with the fewer rows that I had in the "left" dataframe. Hence I needed to swith to merge
df_merged_pivot = pd.merge(df_ratings_filtered_by_user_and_isbn_pivot, df_books, on="isbn", how="left")

df_merged_pivot.index = df_merged_pivot["title"]

#I need to remove some of the unnecessary columns as the model should just focus on ratings and not similarities in e.g. author or isbn 
#This can be sure be done more elegantly, but I ran out of patience :-) 

df_merged_pivot.drop("isbn", axis="columns", inplace=True) #See https://www.datacamp.com/tutorial/pandas-drop-column?utm_source=google&utm_medium=paid_search&utm_campaignid=19589720821&utm_adgroupid=157156375191&utm_device=c&utm_keyword=&utm_matchtype=&utm_network=g&utm_adpostion=&utm_creative=720362650048&utm_targetid=dsa-2218886984100&utm_loc_interest_ms=&utm_loc_physical_ms=9189719&utm_content=&utm_campaign=230119_1-sea~dsa~tofu_2-b2c_3-row-p1_4-prc_5-na_6-na_7-le_8-pdsh-go_9-nb-e_10-na_11-na-bfcm24&gad_source=1&gbraid=0AAAAADQ9WsFdvwSxXcrKpc8alX0qWrdxH&gclid=CjwKCAiArva5BhBiEiwA-oTnXUb9K0FoyPj0RNWG3MCiAViRMxVN54WCFxkWraMPr233QxiGoZdbPxoCHxYQAvD_BwE
df_merged_pivot.drop("title", axis="columns", inplace=True)
df_merged_pivot.drop("author", axis="columns", inplace=True)
df_merged_pivot.sort_index()

#df_merged_pivot = df_merged_pivot.sort_index() #I need to sort this as otherwise my recommendations come in the opposite order and the test could fail

Unnamed: 0_level_0,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,8.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
knn = NearestNeighbors(metric = "cosine", algorithm="auto", n_neighbors=5) #see https://scikit-learn.org/1.5/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
knn.fit(df_merged_pivot.values)

In [193]:
df_merged_pivot.loc["The Queen of the Damned (Vampire Chronicles (Paperback))"][:5]
#df_merged_pivot.iloc[2276]

254     0.0
2276    0.0
2766    0.0
2977    0.0
3363    0.0
Name: The Queen of the Damned (Vampire Chronicles (Paperback)), dtype: float32

In [194]:
test_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
dist, ind = knn.kneighbors([df_merged_pivot.loc[test_title]], n_neighbors=5)

print(dist)
print(ind)

[[0.         0.7234864  0.7677075  0.7699411  0.77085835]]
[[435 108  27 182 120]]


In [282]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):

  dist, ind = knn.kneighbors([df_merged_pivot.loc[book]], n_neighbors=6) #This has to be 6 in order to get the title we are looking for plus the five recommendations

  list_of_books_raw = df_merged_pivot.iloc[ind[0]].index.values

  #print(dist[0][4])
  #print(ind)


  recommended_books = list_of_books_raw
  #print(recommended_books)

  #This is ugly as hell but it serves the purpose FOR now...
  
  searched_book = recommended_books[0]

  #print(recommended_books[3])

  recommendations = [
                    [recommended_books[1], dist[0][1]],
                    [recommended_books[2], dist[0][2]],
                    [recommended_books[3], dist[0][3]],
                    [recommended_books[4], dist[0][4]],
                    [recommended_books[5], dist[0][5]]
                    ] 
  
  #print(recommendations)

  sorted_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True) #See https://stackoverflow.com/questions/4174941/how-to-sort-a-list-of-lists-by-a-specific-index-of-the-inner-list
  #print (sorted_recommendations)

  return_list = [searched_book, sorted_recommendations[0:4]] #I can only pass the test if there is just for recommendations in contrast to the instructions in the exercise

  #Now I need to sort the results by dist...

  #print(recommended_books)

  #print(return_list)

  return return_list

In [283]:
books = get_recommends(book="Where the Heart Is (Oprah's Book Club (Paperback))")

This is the return struckture I need to build now...
[
  'The Queen of the Damned (Vampire Chronicles (Paperback))',
  [
    ['Catch 22', 0.793983519077301], 
    ['The Witching Hour (Lives of the Mayfair Witches)', 0.7448656558990479], 
    ['Interview with the Vampire', 0.7345068454742432],
    ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.5376338362693787],
    ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.5178412199020386]
  ]
]

CAUTION: This does not fit to the test data. In the test there is one recommendation element less!

In [284]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.7677075]]]
You passed the challenge! 🎉🎉🎉🎉🎉
