In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# getting data files
!wget -q https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip -q book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [3]:
# Load the books and ratings CSV files into dataframes.
# Both reads use the same encoding, separator and header setting, so those
# options are kept in one place; each file then supplies its own column -> dtype map,
# whose keys double as the column names to keep.
csv_read_options = {'encoding': "ISO-8859-1", 'sep': ";", 'header': 0}

book_column_types = {'isbn': 'str', 'title': 'str', 'author': 'str'}
df_books = pd.read_csv(
    books_filename,
    names=list(book_column_types),
    usecols=list(book_column_types),
    dtype=book_column_types,
    **csv_read_options
)

rating_column_types = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_column_types),
    usecols=list(rating_column_types),
    dtype=rating_column_types,
    **csv_read_options
)

In [4]:
# function to return book name taking in isbn
def isbn_converter(isbn_value):
    """Return the title stored in ``df_books`` for the given ISBN.

    Args:
        isbn_value: ISBN to look up in the ``isbn`` column of ``df_books``.

    Returns:
        The ``title`` of the first row whose ``isbn`` equals ``isbn_value``.

    Raises:
        IndexError: If no row of ``df_books`` has that ISBN. The message names
            the missing ISBN instead of the bare "index 0 is out of bounds"
            that indexing an empty selection would produce.
    """
    matching_titles = df_books.loc[df_books['isbn'] == isbn_value, 'title']
    if matching_titles.empty:
        raise IndexError(f"ISBN {isbn_value!r} not found in df_books")
    # Several rows may share an ISBN; the first one wins.
    return matching_titles.iloc[0]

In [5]:
# Keep only ratings given by users with at least 200 ratings to books with at
# least 100 ratings, to improve statistical significance.
# Both counts are taken over the full, unfiltered df_ratings.
user_rating_counts = df_ratings['user'].value_counts()
users_with_min_200_ratings = user_rating_counts.loc[user_rating_counts.ge(200)].index

book_rating_counts = df_ratings['isbn'].value_counts()
books_with_min_100_ratings = book_rating_counts.loc[book_rating_counts.ge(100)].index

# A rating row survives only when both its user and its book pass the thresholds.
from_frequent_user = df_ratings['user'].isin(users_with_min_200_ratings)
about_popular_book = df_ratings['isbn'].isin(books_with_min_100_ratings)
df_ratings_filtered = df_ratings.loc[from_frequent_user & about_popular_book]
print(df_ratings_filtered)

           user        isbn  rating
1456     277427  002542730X    10.0
1469     277427  0060930535     0.0
1471     277427  0060934417     0.0
1474     277427  0061009059     9.0
1484     277427  0140067477     0.0
...         ...         ...     ...
1147304  275970  0804111359     0.0
1147436  275970  140003065X     0.0
1147439  275970  1400031346     0.0
1147440  275970  1400031354     0.0
1147441  275970  1400031362     0.0

[49781 rows x 3 columns]


In [6]:
# Reshape the filtered ratings into a table with one row per ISBN and one column
# per user (missing ratings become 0), then wrap its values in a CSR sparse matrix.
# Note: despite its name, user_item_matrix is indexed by book (ISBN), not by user.
user_item_matrix = pd.pivot(
    df_ratings_filtered, index='isbn', columns='user', values='rating'
).fillna(0)
user_item_csr_matrix = csr_matrix(user_item_matrix.to_numpy())

In [7]:
# Configure a cosine-distance nearest-neighbour model (5 neighbours by default)
# and fit it on the sparse matrix, whose rows are the books.
knn_settings = {'n_neighbors': 5, 'metric': 'cosine'}
knn_model = NearestNeighbors(**knn_settings)
knn_model.fit(user_item_csr_matrix)

In [8]:
# function to print recommended books
def get_recommends(book = "", n_recommendations = 5):
    """Print the titles of the books whose rating rows are closest to ``book``.

    The title is resolved to an ISBN through ``df_books``, located as a row of
    ``user_item_matrix`` and passed to ``knn_model.kneighbors``. Each
    neighbour's title is printed on its own line, followed by the list of the
    corresponding distances in the order the model returned them.

    Args:
        book: Exact title as stored in ``df_books['title']``.
        n_recommendations: Number of neighbours to print (default 5, matching
            the previous fixed behaviour).

    Returns:
        None; the results are printed.

    Raises:
        ValueError: If ``n_recommendations`` is smaller than 1.
        IndexError: If no row of ``df_books`` has the given title.
        KeyError: If the title exists but none of its ISBNs is a row of
            ``user_item_matrix``.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")

    title_isbns = df_books.loc[df_books['title'] == book, 'isbn']
    if title_isbns.empty:
        raise IndexError(f"No book titled {book!r} in df_books")

    # A title can be listed under several ISBNs. Use the first one that is
    # actually a row of the matrix rather than blindly taking the first match,
    # which fails when that particular edition is not in the matrix.
    rated_isbns = [isbn for isbn in title_isbns if isbn in user_item_matrix.index]
    if not rated_isbns:
        raise KeyError(f"No ISBN for {book!r} is present in user_item_matrix")
    target_index = user_item_matrix.index.get_loc(rated_isbns[0])

    # One extra neighbour is requested because the query row normally matches
    # itself; never ask for more rows than the matrix holds.
    n_query = min(n_recommendations + 1, user_item_csr_matrix.shape[0])
    distances, indices = knn_model.kneighbors(user_item_csr_matrix[target_index], n_neighbors=n_query)

    # Drop the query row by position instead of assuming it is returned first:
    # a row at the same distance can be listed ahead of it.
    neighbours = [
        (row, distance)
        for row, distance in zip(indices.flatten(), distances.flatten())
        if row != target_index
    ][:n_recommendations]

    for row, _ in neighbours:
        isbn_value = user_item_matrix.index[row]
        try:
            print(isbn_converter(isbn_value))
        except IndexError:
            # ISBN has ratings but no entry in df_books: show the ISBN itself.
            print(isbn_value)

    print([distance for _, distance in neighbours])


# Example query: print the recommendations for one title.
get_recommends(book="Where the Heart Is (Oprah's Book Club (Paperback))")

The Lovely Bones: A Novel
I Know This Much Is True
The Surgeon
The Weight of Water
I'll Be Seeing You
[0.7234864, 0.7677075, 0.7699411, 0.77085835, 0.8016211]
