# Import libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Get data files

In [None]:
import sys
import os

if 'google.colab' in sys.modules:
    !wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
    !unzip book-crossings.zip

else:
    import subprocess

    def runCommand(command, verbose=False):
        process = subprocess.run(command, stdout = subprocess.PIPE, stderr = subprocess.PIPE, text = True, shell = True)
        std_out, std_err = process.stdout, process.stderr
        if verbose:
            print(std_out.strip(), std_err)

    files = os.listdir()
    if not ('book-crossings.zip' in files):
        runCommand('wget "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"')
    if (('BX-Book-Ratings.csv' in files) or ('BX-Books.csv' in files)) or ('BX-Users.csv' in files):
        runCommand('rm BX-Book-Ratings.csv BX-Books.csv BX-Users.csv')
    runCommand('tar -xf book-crossings.zip')

# Preprocessing Data

## Creating the dataframes

In [2]:
# CSV files extracted from the Book-Crossing archive.
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

# Column layouts: every books column is read as a string; ratings use compact numeric dtypes.
book_columns = ['isbn', 'title', 'author']
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

# Both files are ';'-separated and read with ISO-8859-1 encoding; the header
# row of each file is replaced by the column names given here.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype={column: 'str' for column in book_columns},
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
)

In [3]:
# Sanity check: (rows, columns) of the books table.
df_books.shape

(271379, 3)

In [24]:
# Sanity check: (rows, columns) of the ratings table.
df_ratings.shape

(1149780, 3)

## Converting string data into numerical values

In [3]:
# Disabled experiment: de-duplicating books by title (shape printed before/after).
# print(df_books.shape)
# df_books = df_books.drop_duplicates(subset='title')
# print(df_books.shape)
# Title-case every author name in place -- presumably so that differently-cased
# spellings of one author map to a single value when authors are encoded below.
df_books['author'] = df_books['author'].str.title()

In [None]:
# Integer-encode authors: each distinct author value is assigned its position
# in `authors`, so authors[code] recovers the name from a code.
authors = df_books['author'].unique().tolist()
strToNum = {word: position for position, word in enumerate(authors)}

# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1); each
# value is still looked up with strToNum[x], and to_frame() keeps the result a
# one-column DataFrame named 'author' on the same index as df_books.
authorsToFit = df_books['author'].map(lambda x: strToNum[x]).to_frame()
authorsToFit.shape

In [None]:
# Per-ISBN number of rating rows and per-ISBN sum of ratings.
ratingRep = df_ratings.isbn.value_counts()
ratingsSum = df_ratings.groupby('isbn')['rating'].sum()
def mappingRatings(x):
  """Return the mean rating (sum / count) for ISBN `x`, or 0 if `x` has no ratings."""
  try:
    return ratingsSum[x]/ratingRep[x]
  except KeyError:
    # Only a missing ISBN is treated as "no ratings"; other errors now surface
    # instead of being swallowed by a bare except.
    return 0

# Vectorised form of applying mappingRatings to every ISBN: divide the two
# per-ISBN series once, look each book's ISBN up in the result, and use 0 for
# books that have no ratings. Avoids the deprecated DataFrame.applymap and a
# Python-level lookup per row.
meanRatingByIsbn = ratingsSum / ratingRep
rankingsToFit = df_books['isbn'].map(meanRatingByIsbn).fillna(0).to_frame('rating')
rankingsToFit.shape

In [None]:
# Feature frame for KNN: the encoded author column plus the mean-rating column,
# aligned on the shared row index.
df_toFit = authorsToFit.assign(rating=rankingsToFit['rating'])
print(df_toFit)

# KNN Algorithm

In [7]:
# add your code here - consider creating a new cell for each section of code
# Brute-force nearest-neighbour search returning 6 neighbours per query.
# No metric is passed, so scikit-learn's default metric applies.
knn = NearestNeighbors(n_neighbors = 6, algorithm = 'brute')

In [8]:
# Fit the neighbour search on the (author code, mean rating) feature frame.
knn.fit(df_toFit)

In [9]:
# Query with the same frame that was fitted: one row of neighbour distances and
# one row of neighbour row positions per book. get_recommends reads both arrays.
distances, indices = knn.kneighbors(df_toFit)

In [None]:
# Inspect the neighbour row positions returned by knn.kneighbors.
indices

In [None]:
# Inspect the neighbour distances returned by knn.kneighbors.
distances

In [93]:
# Inspect the mean-rating feature as a NumPy array.
df_toFit.rating.values

array([0.        , 4.92857143, 5.        , ..., 0.        , 0.        ,
       0.        ])

In [None]:

# Scatter of the two KNN features, one black dot per book. Axis labels and a
# title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(df_toFit.author.values, df_toFit.rating.values, '.k')
ax.set_xlabel('author (integer code)')
ax.set_ylabel('mean rating')
ax.set_title('Feature space used for KNN')
plt.show()

# Recommendation function

In [10]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the precomputed nearest neighbours of the book titled `book`.

  Reads the module-level `df_books`, `indices` and `distances`. The first row
  of `df_books` whose title equals `book` is used as the query.

  Returns:
    [book, [[neighbour_title, distance], ...]] in the order stored in
    `indices`. Neighbours at distance exactly 0.0 are omitted; this drops the
    query book itself, and also any other book with identical features.

  Raises:
    IndexError: if no row of `df_books` has the title `book`.
  """
  # Work with row *positions*, not index labels: `indices` and `distances` are
  # positional arrays, so labels only coincide with them while df_books keeps
  # its default RangeIndex (e.g. not after rows have been dropped).
  matches = np.flatnonzero((df_books['title'] == book).to_numpy())
  if matches.size == 0:
    raise IndexError(f"Title not found in df_books: {book!r}")
  row = matches[0]

  recommended_books = [book, []]
  for nn, dist in zip(indices[row], distances[row]):
    if dist == 0.0:
      continue
    recommended_books[1].append([df_books['title'].iloc[nn], dist])

  return recommended_books

In [12]:
# Show the recommendations for one sample title: the query title first, then
# one [title, distance] pair per line.
books = get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")
print(books[0])
for recommendation in books[1]:
  print(recommendation)

The Queen of the Damned (Vampire Chronicles (Paperback))
['Vittorio the Vampire: New Tales of the Vampires', 0.045221445184109046]
['The Tale of the Body Thief (Vampire Chronicles)', 0.07237386269785813]
['Interview With the Vampire', 0.08205128198792992]
['Cry to Heaven', 0.10256410275980717]
['Memnoch the Devil (Rice, Anne, Vampire Chronicles, 5th Bk.)', 0.11538461550625446]


In [14]:
# Rating feature of the query book (first row with a matching title).
query_title = "The Queen of the Damned (Vampire Chronicles (Paperback))"
query_row = df_books.loc[df_books.title == query_title].index[0]
print(df_toFit.rating[query_row])
print("")
# For each recommendation, take the ISBN of the first book with that title and
# print the feature rows of every book sharing that ISBN.
for entry in books[1]:
    first_isbn = df_books.loc[df_books.title == entry[0]].isbn.values[0]
    print(df_toFit.loc[df_books.isbn == first_isbn])

# Decode author code 480 back to the author name.
print(authors[480])

2.717948717948718

      author    rating
1133     480  1.949153
      author    rating
5845     480  2.790323
       author  rating
42383     480     2.8
      author    rating
6138     480  2.882353
       author    rating
19134     480  2.833333
Anne Rice


In [19]:
def checkInfo(recommended_books):
    """Print the df_toFit feature rows for each title in `recommended_books[1]`.

    Each entry of `recommended_books[1]` is a sequence whose first item is a
    title. The ISBN of the first df_books row with that title is looked up, and
    the df_toFit rows of all books sharing that ISBN are printed.
    """
    for entry in recommended_books[1]:
        title_rows = df_books.loc[df_books.title == entry[0]]
        first_isbn = title_rows.isbn.values[0]
        same_isbn = df_books.isbn == first_isbn
        print(df_toFit.loc[same_isbn])

# First set of titles: print their (author, rating) feature rows.
recommended_books = [[], [[title] for title in (
    "Catch 22",
    'The Witching Hour (Lives of the Mayfair Witches)',
    'Interview with the Vampire',
    'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
    'The Vampire Lestat (Vampire Chronicles, Book II)',
)]]
checkInfo(recommended_books)
print("")
# Second set: the four titles listed as expected recommendations in the final test.
recommended_books = [[], [[title] for title in (
    "I'll Be Seeing You",
    'The Weight of Water',
    'The Surgeon',
    'I Know This Much Is True',
)]]
checkInfo(recommended_books)


     author    rating
292     231  4.037037
      author    rating
1131     480  3.176955
      author    rating
2526     480  3.535573
      author    rating
2885     480  3.352332
     author    rating
608     480  3.777409

    author    rating
45      44  1.942857
     author    rating
252     111  3.025641
      author    rating
3703    1012  2.593985
      author    rating
3022     386  3.210744


# Final Test

In [16]:
# Show the raw recommendation structure for the title the test uses.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check get_recommends against the expected result for one fixed title.

  Passes only if the returned title matches the query and, for the first two
  recommendations, the title is one of the expected titles and the distance is
  within 0.05 of the expected distance at the same position. Prints a pass or
  fail message; returns nothing.
  """
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  # The first element must echo the query title.
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  # Only the first two recommendations are checked; fewer than two returned
  # entries would raise IndexError here.
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['Where the Heart Is: A Novel', 0.42026862033746376], ['The Honk and Holler Opening Soon', 0.810900938809324], ['The Honk and Holler Opening Soon', 0.8678876679029968], ["Manuel de Chasse Et de Peche A L'Usage Des Filles", 1.0748253208951728], ["The Girls' Guide to Hunting and Fishing", 1.2240963457083844]]]
You haven't passed yet. Keep trying!
