In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
df = pd.read_csv('datasets/books.csv',error_bad_lines=False)
df.head()

b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [42]:
df.shape

(11123, 12)

In [43]:
df['authors'].value_counts()['Douglas Adams']

14

In [23]:
#Remove any books not in english
df = df.loc[df['language_code'].isin(['eng','en-US','en-GB','en-CA'])].reset_index()

In [24]:
def popularityRecommender(df):
    #Define the minimum vote count
    minimum_vote_count = 0.75* df['ratings_count'].max()
    #Define C – the mean rating
    mean_rating = df['average_rating'].mean()

    df['weighted_rating'] = (((df['ratings_count']/(df['ratings_count']+minimum_vote_count))*df['average_rating'])+((minimum_vote_count/(df['ratings_count']+minimum_vote_count))*mean_rating))

    recommendations = df.sort_values(by = 'weighted_rating',ascending = False).head(5)
    return(recommendations)

In [25]:
top5 = popularityRecommender(df)
top5.head(5)

Unnamed: 0,index,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,weighted_rating
3,3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,4.185444
0,0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,4.172746
1,1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,4.146071
4169,4415,15881,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,0439064864,9780439064866,eng,341,2293963,34692,6/2/1999,Arthur A. Levine Books / Scholastic Inc.,4.126539
23,23,34,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien,4.36,0618346252,9780618346257,eng,398,2128944,13670,9/5/2003,Houghton Mifflin Harcourt,4.094953


In [26]:
top5Titles = df["title"].head(5).values
print(top5Titles)

['Harry Potter and the Half-Blood Prince (Harry Potter  #6)'
 'Harry Potter and the Order of the Phoenix (Harry Potter  #5)'
 'Harry Potter and the Chamber of Secrets (Harry Potter  #2)'
 'Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)'
 'Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5)']


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#Instantiate a new Vectorizer object 
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['title'] = df['title'].fillna('')

#This line transforms the description of the movies to the tfidf #matrix needed

tfidf_matrix = tfidf.fit_transform(df['title'])

distance_matrix = linear_kernel(tfidf_matrix)

indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [28]:
def ContentBasedRecommender(title, indices, distance_matrix):
    id_ = indices[title]
    distances = list(enumerate(distance_matrix[id_]))
    distances = sorted(distances, key=lambda x: x[1], reverse = True)
    distances = distances[1:6]
    recommendations = [distance[0] for distance in distances]
    return df['title'].iloc[recommendations]

In [29]:
ContentBasedRecommender("Poor People",indices,distance_matrix)

7292                  The Book of Other People
2872    The Working Poor: Invisible in America
3265                            All New People
9017                       A Man of the People
212              We Were Not Like Other People
Name: title, dtype: object

In [30]:
ContentBasedRecommender("Expelled from Eden: A William T. Vollmann Reader",indices,distance_matrix)

2315                 This Other Eden
7089                            Eden
2325    Who Was William Shakespeare?
5268                    West To Eden
1203                    East of Eden
Name: title, dtype: object

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
#Instantiate a new Vectorizer object 
cbr = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
#df['authors'] = df['authors'].fillna('')

#This line transforms the description of the movies to the tfidf #matrix needed

tfidf_matrix = cbr.fit_transform(df['authors'])

distance_matrix = cosine_similarity(tfidf_matrix)

#This line is apparently not redundant...
df = df.drop_duplicates(subset="authors")

indices = pd.Series(df.index, index=df['authors']).drop_duplicates(keep = "first", inplace=False)

In [33]:
def ContentBasedAuthorRecommender(author, indices, distance_matrix):
    id_ = indices[author]
    distances = list(enumerate(distance_matrix[id_]))
    distances = sorted(distances, key=lambda x: x[1], reverse = True)
    distances = distances[1:6]
    recommendations = [distance[0] for distance in distances]
    return df['authors'].iloc[recommendations]

In [34]:
ContentBasedAuthorRecommender("William Shakespeare",indices,distance_matrix)

661                     Dan Millman
2739           Sam Walton/John Huey
2741                 Barry Schwartz
3513                  Stuart McLean
3806    Ryū Murakami/Stephen Snyder
Name: authors, dtype: object