## Datasets

In [1]:
## Import libraries
import pandas as pd
import random
from scipy.spatial.distance import pdist, squareform
from collections import Counter

## Import datasets
ratings = pd.read_csv('../datasets/gb10k_ratings.csv')
books = pd.read_csv('../datasets/clean_books.csv')
books.drop(columns=['Unnamed: 0'], inplace=True)
books_tags = pd.read_csv('../datasets/clean_tags_books.csv')
books_tags.drop(columns=['Unnamed: 0'], inplace=True)

#### New dataset: books including genre

In [2]:
##Merge the tags with the rest of the books information
books_tags_all_info = pd.merge(books_tags, books, on='goodreads_book_id')

In [3]:
##Filter only the genre tags. Drop features not needed for this part of the analysis
books_with_genre_tag = books_tags_all_info[books_tags_all_info.tag_class == 'genre']
books_with_genre_tag.drop(columns=['tag_class', 'original_publication_year', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5'], inplace=True)

##Discard too broad tags (non-descriptive for genre-filtering purposes)
do_not_consider = ['contemporary', 'fiction', 'novel', 'series','adult', 'non-fiction', 'realistic', 'family']
books_with_genre_tag.loc[books_with_genre_tag.new_tag_name.isin(do_not_consider), 'new_tag_name'] = 'other'
books_with_category = books_with_genre_tag[books_with_genre_tag.new_tag_name != 'other'].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [4]:
##Create category columns
books_with_category['main_genre'] = 0
books_with_category['sec_genre'] = 0
list_of_book_id = books_with_category.goodreads_book_id.unique().tolist()
for i in list_of_book_id:
        select = books_with_category[books_with_category.goodreads_book_id == i]
        temp = select[['goodreads_book_id', 'new_tag_name', 'count']].sort_values(by='count', ascending=False).reset_index(drop=True)
        if temp.shape[0] >=2:
                books_with_category.loc[books_with_category.goodreads_book_id == i, 'main_genre'] = temp.loc[0, 'new_tag_name']
                books_with_category.loc[books_with_category.goodreads_book_id == i, 'sec_genre'] = temp.loc[1, 'new_tag_name']
        elif temp.shape[0] == 1:
                books_with_category.loc[books_with_category.goodreads_book_id == i, 'main_genre'] = temp.loc[0, 'new_tag_name']

In [5]:
books_with_category.drop(columns=['count', 'new_tag_name'], inplace=True)
books_with_category = books_with_category.drop_duplicates()
books_with_category['book_series'] = books_with_category.book_series.fillna('')

In [6]:
books_with_category_original = books_with_category.copy()
books_with_category.to_csv('../datasets/work_datasets/books_with_category.csv')

## Basic recommendations

In [7]:
'''
Considering a recommendation based on the book weighted rating, where:
- nr is the number of ratings that the book has
- mv is the minimum votes required to be listed
- ar is the average rating of the book
- mr is the mean rating across the whole dataset
'''

# Create a function to calculate the weighted rating
def w_rating(row, mv, mr):
    nr = row.work_ratings_count
    ar = row.average_rating
    return (nr/(nr+mv) * ar) + (mv/(mv+nr) * mr)

# Create a function to calculate recommendations. Considers genre and avoids series repetition
def recommendation_with_category(dataset, mv, mr, genre=None):
    if genre ==None: 
        books_to_evaluate = dataset.copy()
    else:
        books_to_evaluate = dataset.copy()
        books_to_evaluate = books_to_evaluate[books_to_evaluate.main_genre == genre]
    
    ## Choose the subset of books to evaluate
    books_to_evaluate = books_to_evaluate[books_to_evaluate.work_ratings_count >= mv]
    
    ## Calculate weighted rating and order the list of recommended books by that value
    books_to_evaluate['w_rating'] = books_to_evaluate.apply(lambda x : w_rating(x, mv, mr), axis=1)
    books_to_evaluate = books_to_evaluate.sort_values(by='w_rating', ascending=False)
    
    ## Take out from the dataframe books from the same series, keeping only the highest rated one
    grouped = books_to_evaluate[['book_title','book_series','w_rating', 'authors', 'main_genre']].set_index('book_series')
    grouped = grouped.loc[~grouped.index.duplicated(keep='first')]
    return grouped



##Set main parameters. The minimum votes threshold is set by the quantile (in this case, 0.9, meaning that the books recommended
##need to have more votes than 90% of the books
all_books = books_with_category_original.copy()
mv = all_books['work_ratings_count'].quantile(0.90)
mr = all_books['average_rating'].mean()

recommendations = recommendation_with_category(all_books, mv, mr)
recommendations.head(10)

Unnamed: 0_level_0,book_title,w_rating,authors,main_genre
book_series,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Harry Potter,Harry Potter and the Deathly Hallows,4.577047,"J.K. Rowling, Mary GrandPré",fantasy
A Song of Ice and Fire,A Storm of Swords,4.450089,George R.R. Martin,fantasy
The Kingkiller Chronicle,The Name of the Wind,4.445507,Patrick Rothfuss,fantasy
The Lord of the Rings,The Return of the King,4.422745,J.R.R. Tolkien,fantasy
,The Help,4.422252,Kathryn Stockett,historical
The Infernal Devices,Clockwork Princess,4.414067,Cassandra Clare,fantasy
Percy Jackson and the Olympians,The Last Olympian,4.412904,Rick Riordan,fantasy
A Court of Thorns and Roses,A Court of Mist and Fury,4.392844,Sarah J. Maas,fantasy
The Stormlight Archive,The Way of Kings,4.390141,Brandon Sanderson,fantasy
The Heroes of Olympus,The Mark of Athena,4.345951,Rick Riordan,fantasy


## User-based collaborative filtering

In [8]:
## Create a sample group of 50 users that will serve to try the recommender
all_ratings = ratings.copy()
random.seed(42)
random_users=random.sample(list(all_ratings['user_id'].unique()),50)
my_test_group = all_ratings.loc[all_ratings['user_id'].isin(random_users)]
my_train_data = all_ratings.loc[~all_ratings['user_id'].isin(random_users)]

In [9]:
## Create a function that returns a list of 5 most similar users for the user given. Since the total amount of data in the 
## dataframes is very extense, we reduce the number of users taken into account to 1000

def give_me_closest_users(train_dataframe, my_user_df):
    
    ##Extract the user id of the dataframe with all its info for further reference
    my_user_id = my_user_df.user_id.unique()[0]
    
    ##Select a list of random ids from our train dataset to create the correlation matrix with our user
    random_train_ids = random.sample(list(train_dataframe['user_id'].unique()),1000)
    
    ##Create a dataframe with the rating information of the randomly selected users
    train_df = train_dataframe[train_dataframe['user_id'].isin(random_train_ids)]
    
    ##Merge the ratings of our user with all the ratings of the train dataframe
    train_df_and_user = pd.concat([train_df, my_user_df], ignore_index=True)
    
    ##Create a pivot table of the dataframe
    pv_data = train_df_and_user.pivot_table(index=['user_id'], columns=['book_id'], values=['rating'], fill_value=0) 
    
    ##Normalize the values of the dataframe
    pv_data = pv_data.div(pv_data.sum(axis=1), axis=0)
    
    ##Create a user matrix that measures the correlation between users
    user_matrix = pd.DataFrame(1/ (1 + squareform(pdist(pv_data, metric = 'Euclidean'))), columns = pv_data.index, index = pv_data.index)

    ##Return a list of the 5 more related users. The first correlation of the list is excluded as it will be our target user
    return list(user_matrix[my_user_id].sort_values(ascending=False)[1:6].index)


##Create a function that returns list of books read by the most similar users, minus the ones already read by the user.
##The number of books will be indicated as a parameter
def give_books_id(ratings_df, my_user_df, similar_users, number_of_output_books):
    
    ##Extract all the books rated by the similar users
    recommended_books = ratings_df[ratings_df['user_id'].isin(similar_users)]
    
    ##Group all the books rated by the similar users and calculate the mean rating given by those users
    grouped = recommended_books.groupby('book_id', as_index=False).agg({'rating':'mean'})
    
    ##Create a list of the books read (rated) by our user
    list_of_books = my_user_df.book_id.tolist()
    
    ##Substract from all the books the ones already read (rated) by our user and order by rating
    grouped = grouped.loc[~grouped['book_id'].isin(list_of_books)]
    
    ##Select the amount of recommendations requested
    if len(grouped) > number_of_output_books:
        i = number_of_output_books
    else:
        i = len(recommended_books_id)
    final_df = (grouped.sort_values(by='rating', ascending=False).reset_index(drop=True))[0:i]
    return final_df

## Item-based collaborative filtering

*Note: since the creation of a correlation matrix among all books requires a long time, the code that does it has been commented in the following cell. The already created matrix is imported instead, though the code is still accessible for reference on how it was calculated*

In [10]:
# # Create the correlation matrix and export it to CSV
# books_rating_per_user = all_ratings.groupby(['book_id', 'user_id']).agg({'rating':'mean'}).reset_index()
# pv_books_rating_per_user = books_rating_per_user.pivot_table(index=['book_id'], columns=['user_id'], values=['rating'], fill_value=0)
# book_dist = pd.DataFrame(1/(1 + squareform(pdist(pv_books_rating_per_user, 'euclidean'))), index=pv_books_rating_per_user.index, columns=pv_books_rating_per_user.index)
# book_dist.to_csv('../datasets/correlation_matrix/book_distance.csv')

#Import books correlation matrix
books_corr = pd.read_csv('../datasets/correlation_matrix/book_distance.csv')

In [11]:
books_corr.drop(columns=['book_id'], inplace=True)

In [12]:
## Create a function that gets all the books rated by our user and searches the 5 more correlated books to each one of them. 
## From all the books returned, excluding the ones the user already rated, return the amount passed as a parameter.

def get_most_similar_book_ids(my_user_df, books_corr, number_of_output_books):
    
    ## Select the books to be considered
    if (my_user_df.rating == 5).sum() == 0 and (my_user_df.rating == 4).sum() == 0:
        selected_ratings = my_user_df[(my_user_df.rating == 3)]
    else:
        selected_ratings = my_user_df[(my_user_df.rating == 5) | (my_user_df.rating == 4)]
    
    ## Create a list with those books id
    user_book_ids = selected_ratings.book_id.unique().tolist()
    
    ## Look for similar books on our correlation matrix
    list_similar_books = []
    for book in user_book_ids:
        similar_books = list(books_corr.loc[book-1].sort_values(ascending=False)[1:6].index)
        list_similar_books.append(similar_books)
    
    ## Flatten the list obtained and remove duplicates. Convert the elements of the list to integers
    flat_list = list(set([item for nested_list in list_similar_books for item in nested_list]))
    int_list = list(map(int, flat_list))
    
    ## Remove the books already rated by the user
    all_user_book_ids = my_user_df.book_id.unique().tolist()
    final_list = [x for x in int_list if x not in all_user_book_ids]
    
    ##Select the amount of recommendations requested
    if len(final_list) > number_of_output_books:
        i = number_of_output_books
    else:
        i = len(final_list)    
    return final_list[:i]

## Run both recommenders

In [13]:
##Create a function that selects a random user from our test data and returns all its ratings
def give_me_a_user():
    random_user_id = random.sample(list(my_test_group['user_id'].unique()),1)[0]
    random_user_df = my_test_group.loc[my_test_group.user_id == random_user_id]
    return random_user_df, random_user_id

## Get the information from a random test user
my_user_df, my_user_id = give_me_a_user()

In [14]:
##Launch the UB-CF
## Get a list of 5 similar users
similar_users = give_me_closest_users(my_train_data, my_user_df)
## Get the ids of all the possible books to recommend 
recommended_books_ub_id = give_books_id(all_ratings, my_user_df, similar_users, 10)

In [15]:
##Launch the IB-CF
## Get the ids of all the possible similar books
recommended_books_ib_id = get_most_similar_book_ids(my_user_df, books_corr, 10)

### Checking the results

In [16]:
##Create a df with all the book info
def get_books_info(df_series):
    books_basic_info = books[['book_id', 'book_title', 'book_series', 'authors', 'original_publication_year']]
    books_genre = books_with_category[['book_id', 'main_genre', 'sec_genre']]
    books_to_check = pd.merge(df_series, books_basic_info, how='left', on='book_id')
    books_to_check = pd.merge(books_to_check, books_genre, how='left', on='book_id')
    return books_to_check

#### Books already rated by the user

In [17]:
pd.set_option('display.max_rows', None)
books_already_read = my_user_df.drop(columns=['user_id'])
books_already_read = books_already_read.sort_values(by='rating', ascending=False)
info_books = get_books_info(books_already_read.book_id)
books_already_read_final = pd.merge(books_already_read, info_books, how='left', on='book_id')
books_already_read_final

Unnamed: 0,book_id,rating,book_title,book_series,authors,original_publication_year,main_genre,sec_genre
0,25,5,Harry Potter and the Deathly Hallows,Harry Potter,"J.K. Rowling, Mary GrandPré",2007,fantasy,young adult
1,21,5,Harry Potter and the Order of the Phoenix,Harry Potter,"J.K. Rowling, Mary GrandPré",2003,fantasy,children
2,123,5,The Firm,Penguin Readers,John Grisham,1991,thriller,mystery
3,799,5,Watchers,,Dean Koontz,1987,thriller,horror
4,89,5,The Princess Bride,,William Goldman,1973,sci-fi-fantasy,young adult
5,184,5,Matilda,,"Roald Dahl, Quentin Blake",1988,young adult,young adult
6,2,5,Harry Potter and the Sorcerer's Stone,Harry Potter,"J.K. Rowling, Mary GrandPré",1997,fantasy,young adult
7,36,5,The Giver,The Giver,Lois Lowry,1993,children,classic
8,133,5,Anne of Green Gables,Anne of Green Gables,L.M. Montgomery,1908,young adult,classic
9,18,5,Harry Potter and the Prisoner of Azkaban,Harry Potter,"J.K. Rowling, Mary GrandPré, Rufus Beck",1999,fantasy,young adult


#### Books recommended by the User-Based Collaborative Filtering Algorithm

In [18]:
recommended_books_ub_final = get_books_info(recommended_books_ub_id.book_id)
recommended_books_ub_final

Unnamed: 0,book_id,book_title,book_series,authors,original_publication_year,main_genre,sec_genre
0,595,A Separate Peace,,John Knowles,1959,classic,young adult
1,1298,The Golem's Eye,Bartimaeus,Jonathan Stroud,2004,fantasy,young adult
2,1338,The Book of Mormon: Another Testament of Jesus...,,"Anonymous, Joseph Smith Jr.",1830,classic,history
3,289,Watership Down,Watership Down,Richard Adams,1972,classic,fantasy
4,136,Divine Secrets of the Ya-Ya Sisterhood,,Rebecca Wells,1996,chick lit,humor
5,295,11/22/63,,Stephen King,2011,historical,science-fiction
6,132,The Five People You Meet in Heaven,,Mitch Albom,2003,young adult,classic
7,131,The Grapes of Wrath,,John Steinbeck,1939,classic,classic
8,297,Cat's Cradle,,Kurt Vonnegut Jr.,1963,classic,science-fiction
9,302,Safe Haven,,Nicholas Sparks,2010,romance,chick lit


#### Books recommended by the Item-Based Collaborative Filtering Algorithm

In [19]:
recommended_books_ib_id_df = pd.DataFrame(recommended_books_ib_id, columns =['book_id']) 
recommended_books_ib_final = get_books_info(recommended_books_ib_id_df.book_id)
recommended_books_ib_final

Unnamed: 0,book_id,book_title,book_series,authors,original_publication_year,main_genre,sec_genre
0,9634,Annabel,Delirium,Lauren Oliver,2012,young adult,young adult
1,7751,Another Piece of My Heart,,Jane Green,2012,chick lit,romance
2,1894,Wedding Night,,Sophie Kinsella,2013,chick lit,romance
3,8444,Relentless,,Dean Koontz,2009,suspense,mystery
4,240,Inferno,Robert Langdon,Dan Brown,2013,mystery,thriller
5,9916,Shopaholic to the Rescue,Shopaholic,Sophie Kinsella,2015,chick lit,humor
6,2123,George's Marvellous Medicine,,"Roald Dahl, Quentin Blake",1981,children,children
7,1517,The Twelve,The Passage,Justin Cronin,2012,horror,science-fiction
8,2710,The Virgin's Lover,The Plantagenet and Tudor Novels,Philippa Gregory,2004,historical,chick lit
9,1268,Where We Belong,,Emily Giffin,2012,chick lit,romance


### Testing the algorithm

#### Check that the recommended lists do not include books already rated by the user

In [20]:
set_of_read_books = set(books_already_read.book_id.tolist())
set_of_ub_recommended_books = set(recommended_books_ub_id.book_id.tolist())
set_of_ib_recommended_books = set(recommended_books_ib_id_df.book_id.tolist())

In [21]:
##If UB recommeneded books are not repeated, the set will be empty
set_of_read_books.intersection(set_of_ub_recommended_books)

set()

In [22]:
##If IB recommeneded books are not repeated, the set will be empty
set_of_read_books.intersection(set_of_ib_recommended_books)

set()

#### Check that the users selected by the UB algorithm have books in common with our user

In [23]:
## Function that returns all the rated books for a user
def books_from_user(user_id):
    book_ids_user = ratings[ratings.user_id == user_id]
    book_ids_user_list = book_ids_user.book_id.to_list()
    books_user = books_with_category.loc[(books_with_category.book_id.isin(book_ids_user_list))]
    return books_user.drop(columns=['goodreads_book_id', 'average_rating', 'work_ratings_count'])


## Function that compares the similarity of two users
def compare_user_books(user_1, user_2):
    user_1_books = set(books_from_user(user_1).book_title.to_list())
    user_2_books = set(books_from_user(user_2).book_title.to_list())
    common_books = user_1_books.intersection(user_2_books)
    return user_1_books, user_2_books, common_books

In [24]:
##Compare similar users
for i in similar_users:
    user_1_books, user_2_books, common_books = compare_user_books(my_user_id, i)
    print('User', my_user_id, 'rated', len(user_1_books), 'books, and user', i, 'rated', len(user_2_books), '. They have', len(common_books), 'books in common.')

User 25901 rated 106 books, and user 20506 rated 137 . They have 37 books in common.
User 25901 rated 106 books, and user 12067 rated 134 . They have 30 books in common.
User 25901 rated 106 books, and user 13236 rated 140 . They have 30 books in common.
User 25901 rated 106 books, and user 6682 rated 153 . They have 30 books in common.
User 25901 rated 106 books, and user 1296 rated 174 . They have 23 books in common.
