In [None]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# get data files
# The shell commands below are left commented out. Presumably the two CSVs named
# further down are extracted from this archive; if they are not already in the
# working directory, run the commands once (prefix them with `!` inside a notebook).
#wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#unzip book-crossings.zip

# Relative paths of the CSV files that the loading cell passes to pd.read_csv.
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [None]:
# Load both CSV files into dataframes.
# The files are ';'-separated and read as ISO-8859-1. header=0 together with `names`
# replaces the header row of each file with the short column names used in this notebook.
df_books = pd.read_csv(
    books_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict.fromkeys(['isbn', 'title', 'author'], 'str'),
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [None]:
# Histograms (log-scaled counts) of how many ratings each user gave (left) and how
# many ratings each book received (right).
# Author's note: the original problem asks to drop users with < 200 ratings, which
# removes a huge amount of data; arguably the very-high-count outliers are the ones
# that should be removed instead. Likewise, dropping books with < 100 ratings
# removes a huge amount of data.
fig, axs = plt.subplots(nrows=1, ncols=2)
axs[0].hist(df_ratings['user'].value_counts(), bins=50, log=True)
axs[0].set(xlabel='number of ratings', ylabel='number of users')
axs[1].hist(df_ratings['isbn'].value_counts(), bins=50, log=True)
axs[1].set(xlabel='number of ratings', ylabel='number of books')
fig.tight_layout()
plt.show()

In [None]:
# Keep a rating only when BOTH conditions hold: its book (isbn) has at least 100
# ratings and its user has given at least 200 ratings. In other words, a rating is
# removed if its user has < 200 ratings OR its book has < 100 ratings.
# groupby(...)[column].transform('count') returns a Series shaped like the original
# column in which every row holds the count for that row's group, so it can be
# compared row by row. Both counts are taken on the unfiltered frame.
# NOTE: df_ratings is overwritten, so re-running this cell filters again using
# counts computed on the already-reduced data.
ratings_per_book = df_ratings.groupby(by='isbn')['isbn'].transform('count')
ratings_per_user = df_ratings.groupby(by='user')['user'].transform('count')
df_ratings = df_ratings[(ratings_per_book >= 100) & (ratings_per_user >= 200)]

In [None]:
# Attach the title and author columns to every rating by matching on isbn. merge()
# defaults to an inner join, so any rating whose isbn is not present in df_books is
# dropped from df_ratings.
# Only the three raw rating columns are fed into the merge, which keeps this cell
# re-runnable: a second run would otherwise merge title/author again and produce
# suffixed columns (title_x / title_y) that break the de-duplication below.
# Finally, keep only the first rating a user gave to a given title, even if the
# rows differ in isbn or author. This is not ideal but is expected by the problem.
# `subset` is a list because a tuple can also be read as a single column label.
df_ratings = (
    df_ratings[['user', 'isbn', 'rating']]
    .merge(right=df_books, on='isbn')
    .drop_duplicates(subset=['title', 'user'], keep='first')
)

In [None]:
# Build the title x user rating matrix: one row per title, one column per user, and
# the rating as the cell value. A (title, user) pair without a rating becomes 0.0,
# which is questionable because it is indistinguishable from an explicit 0 rating.
df = (
    df_ratings
    .pivot(index='title', columns='user', values='rating')
    .fillna(0.0)
)

In [None]:
# Find the 6 nearest neighbours of every title. Each row of df (a title's ratings
# across all users) is treated as a vector, and the query row itself is normally one
# of its own 6 neighbours. Cosine distance is used because the angle between rating
# vectors seems more appropriate for this problem than euclidean distance.
# n_jobs=-2 uses all processors but one for the search.
nbrs = NearestNeighbors(
    n_neighbors=6,
    metric='cosine',
    algorithm='brute',
    n_jobs=-2,
)
nbrs.fit(df)
distances, indices = nbrs.kneighbors(df)

In [62]:
# function to return recommended books - this will be tested
def get_recommends(book = ''):
    """Return up to five recommendations for ``book``.

    Relies on three notebook-level objects: ``df`` (indexed by title) and the
    ``indices`` / ``distances`` arrays returned by ``nbrs.kneighbors(df)``.

    Parameters
    ----------
    book : str
        Title to look up in ``df.index``.

    Returns
    -------
    list or str
        ``[book, [[title, distance], ...]]`` with the recommendations ordered
        from farthest to nearest, or the string ``'book {book} not found'``
        when the title is not in ``df.index``.
    """
    if book not in df.index:
        return f'book {book} not found'
    i0 = df.index.get_loc(book)
    # Exclude the queried book by row index rather than by assuming it sits at
    # neighbour position 0: when another title has the same distance, the query row
    # can appear at a later position and would otherwise be recommended to itself.
    neighbours = [
        [df.index[j], dist]
        for j, dist in zip(indices[i0], distances[i0])
        if j != i0
    ][:5]
    # reverse ordering (farthest first) is illogical but is expected to pass the challenge
    return [book, neighbours[::-1]]

In [None]:
# Run the recommender once on the title that the challenge test below also uses,
# and print the raw return value for inspection.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check defined above; it prints a pass/fail message.
test_book_recommendation()