## Datasets

In [1]:
## Import libraries
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import re
# import seaborn as sns
import random
from scipy.spatial.distance import pdist, squareform

## Import datasets
ratings = pd.read_csv('../datasets/gb10k_ratings.csv')
books = pd.read_csv('../datasets/clean_books.csv')
books.drop(columns=['Unnamed: 0'], inplace=True)
books_tags = pd.read_csv('../datasets/clean_tags_books.csv')
books_tags.drop(columns=['Unnamed: 0'], inplace=True)

#### New dataset: books including genre

In [2]:
##Merge the tags with the rest of the books information
books_tags_all_info = pd.merge(books_tags, books, on='goodreads_book_id')

In [3]:
##Filter only the genre tags. Drop features not needed for this part of the analysis
books_with_category = books_tags_all_info[books_tags_all_info.tag_class == 'genre']
books_with_category.drop(columns=['tag_class', 'original_publication_year', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5'], inplace=True)

##Discard too broad tags (non-descriptive for genre-filtering purposes)
do_not_consider = ['contemporary', 'fiction', 'novel', 'series','adult', 'non-fiction', 'realistic', 'family']
books_with_category.loc[books_with_category.new_tag_name.isin(do_not_consider), 'new_tag_name'] = 'other'
books_with_category_exc = books_with_category[books_with_category.new_tag_name != 'other'].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [4]:
##Create category columns
books_with_category_exc['main_genre'] = 0
books_with_category_exc['sec_genre'] = 0
list_of_book_id = books_with_category_exc.goodreads_book_id.unique().tolist()
for i in list_of_book_id:
        select = books_with_category_exc[books_with_category_exc.goodreads_book_id == i]
        temp = select[['goodreads_book_id', 'new_tag_name', 'count']].sort_values(by='count', ascending=False).reset_index(drop=True)
        if temp.shape[0] >=2:
                books_with_category_exc.loc[books_with_category_exc.goodreads_book_id == i, 'main_genre'] = temp.loc[0, 'new_tag_name']
                books_with_category_exc.loc[books_with_category_exc.goodreads_book_id == i, 'sec_genre'] = temp.loc[1, 'new_tag_name']
        elif temp.shape[0] == 1:
                books_with_category_exc.loc[books_with_category_exc.goodreads_book_id == i, 'main_genre'] = temp.loc[0, 'new_tag_name']

In [5]:
books_with_category_exc.drop(columns=['count', 'new_tag_name'], inplace=True)
books_with_category_exc = books_with_category_exc.drop_duplicates()
books_with_category_exc['book_series'] = books_with_category_exc.book_series.fillna('')

In [6]:
books_with_category_original = books_with_category_exc.copy()
books_with_category_exc.to_csv('../datasets/work_datasets/books_with_category.csv')

## Basic recommendations

In [7]:
'''
Considering a recommendation based on the book weighted rating, where:
- nr is the number of ratings that the book has
- mv is the minimum votes required to be listed
- ar is the average rating of the book
- mr is the mean rating across the whole dataset
'''

# Create a function to calculate the weighted rating
def w_rating(row, mv, mr):
    nr = row.work_ratings_count
    ar = row.average_rating
    return (nr/(nr+mv) * ar) + (mv/(mv+nr) * mr)

# Create a function to calculate recommendations. Considers genre and avoids series repetition
def recommendation_with_category(dataset, mv, mr, genre=None):
    if genre ==None: 
        books_to_evaluate = dataset.copy()
    else:
        books_to_evaluate = dataset.copy()
        books_to_evaluate = books_to_evaluate[books_to_evaluate.main_genre == genre]
    books_to_evaluate = books_to_evaluate[books_to_evaluate.work_ratings_count >= mv]
    books_to_evaluate['w_rating'] = books_to_evaluate.apply(lambda x : w_rating(x, mv, mr), axis=1)
    books_to_evaluate = books_to_evaluate.sort_values(by='w_rating', ascending=False)
    grouped = books_to_evaluate[['book_title','book_series','w_rating', 'authors', 'main_genre']].set_index('book_series')
    grouped = grouped.loc[~grouped.index.duplicated(keep='first')]
    return grouped


##Set main parameters. The minimum votes threshold is set by the quantile (in this case, 0.9, meaning that the books recommended
##need to have more votes than 90% of the books
all_books = books_with_category_original.copy()
mv = all_books['work_ratings_count'].quantile(0.90)
mr = all_books['average_rating'].mean()

recommendations = recommendation_with_category(all_books, mv, mr)
recommendations.head(10)

Unnamed: 0_level_0,book_title,w_rating,authors,main_genre
book_series,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Harry Potter,Harry Potter and the Deathly Hallows,4.577047,"J.K. Rowling, Mary GrandPré",fantasy
A Song of Ice and Fire,A Storm of Swords,4.450089,George R.R. Martin,fantasy
The Kingkiller Chronicle,The Name of the Wind,4.445507,Patrick Rothfuss,fantasy
The Lord of the Rings,The Return of the King,4.422745,J.R.R. Tolkien,fantasy
,The Help,4.422252,Kathryn Stockett,historical
The Infernal Devices,Clockwork Princess,4.414067,Cassandra Clare,fantasy
Percy Jackson and the Olympians,The Last Olympian,4.412904,Rick Riordan,fantasy
A Court of Thorns and Roses,A Court of Mist and Fury,4.392844,Sarah J. Maas,fantasy
The Stormlight Archive,The Way of Kings,4.390141,Brandon Sanderson,fantasy
The Heroes of Olympus,The Mark of Athena,4.345951,Rick Riordan,fantasy


## User-based recommendations

In [8]:
## Create a sample group of 50 users that will serve to try the recommender
all_ratings = ratings.copy()
random.seed(42)
random_users=random.sample(list(all_ratings['user_id'].unique()),50)
my_test_group = all_ratings.loc[all_ratings['user_id'].isin(random_users)]
my_train_data = all_ratings.loc[~all_ratings['user_id'].isin(random_users)]

In [27]:
## Create a function that returns a list of 5 most similar users for the user given. Since the total amount of data in the 
## dataframes is very extense, we reduce the number of users taken into account to 1000

def give_me_closest_users(train_dataframe, my_user_df):
    
    ##Extract the user id of the dataframe with all its info for further reference
    my_user_id = my_user_df.user_id.unique()[0]
    
    ##Select a list of random ids from our train dataset to create the correlation matrix with our user
    random_train_ids = random.sample(list(train_dataframe['user_id'].unique()),1000)
    
    ##Create a dataframe with the rating information of the randomly selected users
    train_df = train_dataframe[train_dataframe['user_id'].isin(random_train_ids)]
    
    ##Merge the ratings of our user with all the ratings of the train dataframe
    train_df_and_user = pd.concat([train_df, my_user_df], ignore_index=True)
    
    ##Create a pivot table of the dataframe
    pv_data = train_df_and_user.pivot_table(index=['user_id'], columns=['book_id'], values=['rating'], fill_value=0) 
    
    ##Normalize the values of the dataframe
    pv_data = pv_data.div(pv_data.sum(axis=1), axis=0)
    
    ##Create a user matrix that measures the correlation between users
    user_matrix = pd.DataFrame(1/ (1 + squareform(pdist(pv_data, metric = 'Euclidean'))), columns = pv_data.index, index = pv_data.index)
    print(user_matrix)
    ##Return a list of the 5 more related users. The first correlation of the list is excluded as it will be our target user
    return list(user_matrix[my_user_id].sort_values(ascending=False)[1:6].index)


##Create a function that returns a with all the books read by the most similar users, minus the ones already read by
##the user, ordered by the average rating of the similar users
def give_books_id(ratings_df, my_user_df, similar_users):
    
    ##Extract all the books rated by the similar users
    recommended_books = ratings_df[ratings_df['user_id'].isin(similar_users)]
    
    ##Group all the books rated by the similar users and calculate the mean rating given by those users
    grouped = recommended_books.groupby('book_id', as_index=False).agg({'rating':'mean'})
    
    ##Create a list of the books read (rated) by our user
    list_of_books = my_user_df.book_id.tolist()
    
    ##Substract from all the books the ones already read (rated) by our user
    grouped = grouped.loc[~grouped['book_id'].isin(list_of_books)]
    
    ##Return the dataframe ordered by rating
    return (grouped.sort_values(by='rating', ascending=False).reset_index(drop=True))

##Create a function that returns a list of the books to recommend by the user. The amount of books to recommend can be selected.
def give_books_titles(all_books_df, recommended_books_id, number_of_output_books):
    if len(recommended_books_id) > number_of_output_books:
        i = number_of_output_books - 1
    else:
        i = len(recommended_books_id)
    list_of_books = (recommended_books_id.loc[:i]).book_id.to_list()
    recommended_books_titles = all_books_df[all_books_df.book_id.isin(list_of_books)]
    return recommended_books_titles[['book_id', 'book_title', 'authors', 'original_publication_year']]





In [28]:
## Test the recommender

##Create a function that selects a random user from our test data and returns all its ratings
def give_me_a_user():
    random_user_id = random.sample(list(my_test_group['user_id'].unique()),1)[0]
    random_user_df = my_test_group.loc[my_test_group.user_id == random_user_id]
    return random_user_df, random_user_id

## Get the information from a random test user
my_user_df, my_user_id = give_me_a_user()

## Get a list of 5 similar users
similar_users = give_me_closest_users(my_train_data, my_user_df)

## Get the ids of all the possible books to recommend 
recommended_books_id = give_books_id(all_ratings, my_user_df, similar_users)

## Get a dataframe with 10 titles to recommend
recommended_books_titles = give_books_titles(books, recommended_books_id, 10)

user_id     82        195       501       592       599       617       701    \
user_id                                                                         
82       1.000000  0.889747  0.880300  0.890547  0.886809  0.886659  0.891058   
195      0.889747  1.000000  0.879417  0.889561  0.882097  0.888373  0.892811   
501      0.880300  0.879417  1.000000  0.877524  0.872302  0.883709  0.880691   
592      0.890547  0.889561  0.877524  1.000000  0.885592  0.888113  0.892468   
599      0.886809  0.882097  0.872302  0.885592  1.000000  0.883767  0.883369   
...           ...       ...       ...       ...       ...       ...       ...   
53134    0.876151  0.873573  0.866932  0.874718  0.871569  0.872493  0.876733   
53143    0.876273  0.874637  0.867696  0.874218  0.870178  0.872731  0.880413   
53205    0.896474  0.894647  0.884787  0.896871  0.893502  0.897133  0.894461   
53234    0.875971  0.876480  0.866843  0.872892  0.872574  0.871107  0.875114   
53369    0.887500  0.879843 

In [29]:
similar_users

[51802, 43076, 40675, 33599, 22950]

### Testing the algorithm

In [30]:
## Function that returns all the rated books for a user
def books_from_user(user_id):
    book_ids_user = ratings[ratings.user_id == user_id]
    book_ids_user_list = book_ids_user.book_id.to_list()
    books_user = books_with_category_exc.loc[(books_with_category_exc.book_id.isin(book_ids_user_list))]
    return books_user.drop(columns=['goodreads_book_id', 'average_rating', 'work_ratings_count'])


## Function that compares the similarity of two users
def compare_user_books(user_1, user_2):
    user_1_books = set(books_from_user(user_1).book_title.to_list())
    user_2_books = set(books_from_user(user_2).book_title.to_list())
    common_books = user_1_books.intersection(user_2_books)
    return user_1_books, user_2_books, common_books

In [31]:
### Books recommended to the user
recommended_books_titles

Unnamed: 0,book_id,book_title,authors,original_publication_year
342,343,The Runaway Jury,John Grisham,1996
344,345,Eldest,Christopher Paolini,2005
446,447,Brisingr,Christopher Paolini,2008
712,713,Prodigy,Marie Lu,2013
1559,1560,Switched,Amanda Hocking,2010
1561,1562,Confess,Colleen Hoover,2015
1695,1696,Glass Houses,Rachel Caine,2006
4233,4234,Twilight,Meg Cabot,2005
6050,6051,United We Spy,Ally Carter,2013
6278,6279,Airborn,Kenneth Oppel,2004


In [39]:
### Books already read by the user
already_read_titles = books_from_user(my_user_id)
already_read_titles

Unnamed: 0,book_id,book_title,book_series,authors,main_genre,sec_genre
201,2,Harry Potter and the Sorcerer's Stone,Harry Potter,"J.K. Rowling, Mary GrandPré",fantasy,young adult
6901,903,Anthem,,Ayn Rand,science-fiction,classic
13303,87,Night,The Night Trilogy #1),"Elie Wiesel, Marion Wiesel",classic,history
26001,36,The Giver,The Giver,Lois Lowry,children,classic
109101,569,The Angel Experiment,Maximum Ride,James Patterson,young adult,fantasy
...,...,...,...,...,...,...
856695,655,Crossed,Matched,Ally Condie,young adult,young adult
868997,4033,The Moon and More,,Sarah Dessen,young adult,young adult
896695,928,The Raven Boys,The Raven Cycle,Maggie Stiefvater,fantasy,young adult
915895,460,Clockwork Princess,The Infernal Devices,Cassandra Clare,fantasy,young adult


In [37]:
##Compare both lists
set(recommended_books_titles.book_title.tolist()).intersection(set(already_read_titles.book_title.tolist()))

{'Twilight'}

In [38]:
the_books[the_books.book_title == 'Twilight']

Unnamed: 0,book_id,book_title,book_series,authors,main_genre,sec_genre
233901,3,Twilight,Twilight,Stephenie Meyer,young adult,fantasy


In [34]:
##Compare similar users

for i in similar_users:
    user_1_books, user_2_books, common_books = compare_user_books(my_user_id, i)
    print('User', my_user_id, 'rated', len(user_1_books), 'books, and user', i, 'rated', len(user_2_books), '. They have', len(common_books), 'books in common.')

User 21451 rated 136 books, and user 51802 rated 123 . They have 41 books in common.
User 21451 rated 136 books, and user 43076 rated 138 . They have 34 books in common.
User 21451 rated 136 books, and user 40675 rated 136 . They have 38 books in common.
User 21451 rated 136 books, and user 33599 rated 163 . They have 28 books in common.
User 21451 rated 136 books, and user 22950 rated 114 . They have 39 books in common.
