In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
import time
import atexit
import random


class BookRecommendation(object):
    """Book recommender built on a user/book ratings table.

    Construction reads ``ratings.csv``, ``books.csv``, ``book_tags.csv`` and
    ``tags.csv`` from the working directory and computes a book-to-book
    correlation matrix from the ratings.

    Row ``i`` of ``self.corr`` is treated as book id ``i + 1`` everywhere in
    this class.
    NOTE(review): that only holds if the book ids in ratings.csv are exactly
    1..N with no gaps -- confirm against the data set.

    Attributes:
        df_r, df_b, df_bt, df_t: the four loaded DataFrames.
        corr: correlation matrix returned by ``np.corrcoef``.
        title_series: the ``title`` column of ``df_b``.
        rating_mean: threshold (2.5) from which a rating counts as "liked".
        base_update: True when ``df_r`` holds changes not yet written to disk.
        verbose: progress/debug printing is enabled when > 0.
    """

    def __init__(self, verbose=0):
        """
        params:
            verbose -> int
                if verbose > 0:
                    print progress
        """
        init_timer = time.time()
        self.verbose = verbose
        self.df_r, self.df_b, self.df_bt, self.df_t = self._load_df(verbose=verbose)
        self.corr = self._init_corr(verbose=verbose)
        self.title_series = self.df_b["title"]
        self.base_update = False
        # Fixed threshold instead of the data mean (self.df_r["rating"].mean()).
        self.rating_mean = 2.5
        # To persist changes automatically at interpreter exit, register
        # self.exit with atexit.register.
        if verbose > 0:
            print("Time to init:", time.time() - init_timer, "sec")

    def update_database(self):
        """Write the ratings back to ``ratings.csv`` if they changed since the last save."""
        if self.base_update:
            self.df_r.to_csv("ratings.csv", index=False)
            self.base_update = False

    def exit(self):
        """Shutdown hook: flush pending rating changes to disk."""
        self.update_database()

    def _load_df(self, verbose=0):
        """Read the four CSV files and return ``(df_r, df_b, df_bt, df_t)``.

        The ratings frame is sorted by ``user_id`` and re-indexed from 0.
        """
        frames = []
        for filename in ("ratings.csv", "books.csv", "book_tags.csv", "tags.csv"):
            if verbose > 0:
                print("Loading", filename, "...")
            frames.append(pd.read_csv(filename))
            if verbose > 0:
                print("Done")
        df_r, df_b, df_bt, df_t = frames
        df_r = df_r.sort_values(by="user_id", ignore_index=True)
        return df_r, df_b, df_bt, df_t

    def _init_corr(self, verbose=0):
        """Return the book-to-book correlation matrix of the ratings.

        The ratings are pivoted into a book x user matrix, missing ratings are
        filled with 0, and ``np.corrcoef`` is applied to the rows. The matrix
        is computed once; later changes to ``df_r`` are not reflected in it.
        """
        if verbose > 0:
            print("Loading matrix ...")
        ratings_matrix = self.df_r.pivot(index="book_id", columns="user_id", values="rating")
        X = ratings_matrix.fillna(0).values
        if verbose > 0:
            print("Done")
            print("Loading corr ...")
        corr = np.corrcoef(X)
        if verbose > 0:
            print("Done")
        return corr

    def _book_title(self, book_id):
        """Return the title of the first ``df_b`` row whose ``book_id`` matches.

        Raises IndexError when no row matches.
        """
        return self.df_b.loc[self.df_b["book_id"] == book_id, "title"].values[0]

    def show_book_title_from_id(self, book_id):
        """Print ``book_id`` followed by its title."""
        print(book_id, self._book_title(book_id))

    def show_books(self, start, end):
        """Print the titles at positions ``start`` (inclusive) to ``end`` (exclusive).

        Positions are 0-based rows of ``title_series``; the printed book id is
        the position plus one. ``end`` is clipped to the number of titles.
        """
        last = min(end, len(self.title_series))
        for position in range(max(start, 0), last):
            print("book_id", position + 1, "title", self.title_series.iloc[position])

    def related_books(self, book, n_books=10, unwanted_id=None):
        """Return up to ``n_books`` book ids ordered by decreasing correlation with ``book``.

        params:
            book -> str (exact title) or int / numpy integer (book id)
            n_books -> int; maximum length of the result
            unwanted_id -> iterable of book ids to leave out (default: none)
        returns:
            list of int. Empty when ``book`` has an unsupported type, the title
            is unknown, or the id falls outside the correlation matrix.
            The queried book itself is part of the ranking unless it is listed
            in ``unwanted_id``.
        """
        if isinstance(book, str):
            matches = self.df_b.loc[self.df_b["title"] == book, "book_id"].values
            if len(matches) == 0:
                return []
            book_id = int(matches[0])
        elif isinstance(book, (int, np.integer)) and not isinstance(book, bool):
            # numpy integers must be accepted too: ids read from a DataFrame
            # are numpy scalars, not Python ints.
            book_id = int(book)
        else:
            return []

        # Guard the row lookup; id 0 would otherwise wrap around to the last row.
        if not 1 <= book_id <= len(self.corr):
            return []

        excluded = set(unwanted_id) if unwanted_id is not None else set()
        # argsort on the negated row gives positions by decreasing correlation;
        # +1 turns row positions back into book ids.
        ranked_ids = ((-self.corr[book_id - 1]).argsort() + 1).tolist()

        related = []
        for candidate in ranked_ids:
            if len(related) >= n_books:
                break
            if candidate not in excluded:
                related.append(candidate)
        return related

    def show_books_from_user_id(self, user_id):
        """Print title and rating of every book rated by ``user_id``, best rated first."""
        df_user = self.df_r[self.df_r["user_id"] == user_id]
        df_user = df_user.sort_values(by="rating", ascending=False, ignore_index=True)
        for book_id, rating in zip(df_user["book_id"], df_user["rating"]):
            print(self._book_title(book_id), rating)

    def recommend_books_from_user_id(self, user_id, n_books=100):
        """
        Recommend books for a user.

        A candidate pool of about ``3 * n_books`` ids is built, shuffled, and
        its first ``n_books`` entries are returned. For a user without ratings
        the pool is the popularity ranking. Otherwise each liked book
        (rating >= self.rating_mean) contributes related books the user has not
        rated yet, a book rated one star higher asking for twice as many; the
        pool is then topped up from the popularity ranking if it is too small.

        params:
            user_id -> int
            n_books -> int; n_books >= 1
        returns:
            list of at most n_books book ids, in random order
        """
        # size of the candidate pool the final answer is sampled from
        book_list_size = n_books * 3

        df_user = self.df_r[self.df_r["user_id"] == user_id]

        # unknown user: fall back to the overall best rated books
        if len(df_user) == 0:
            book_list = self.popularity_recommender(book_list_size)
            random.shuffle(book_list)
            return book_list[:n_books]

        # best rated first, so the loop below can stop at the first disliked book
        df_user = df_user.sort_values(by="rating", ascending=False, ignore_index=True)
        user_read_books = df_user["book_id"].tolist()

        # how many books the user gave each star value from 1 to 5
        stars_count = {stars: int((df_user["rating"] == stars).sum()) for stars in range(1, 6)}

        min_accepted = int(np.ceil(self.rating_mean))
        accepted_stars = range(min_accepted, 6)
        present_stars = [stars for stars in accepted_stars if stars_count.get(stars, 0) > 0]

        # NOTE(review): the denominator is normalised against min_accepted while
        # the per-book weight below is normalised against the lowest accepted
        # star value actually present; when those differ the weights do not add
        # up to book_list_size. Kept as in the original -- confirm the intent.
        denominator = sum(stars_count.get(stars, 0) * 2 ** (stars - min_accepted)
                          for stars in accepted_stars)

        if self.verbose > 0:
            print("self.rating_mean", self.rating_mean)

        book_list = []
        excluded = set(user_read_books)
        if present_stars:
            min_rated_in_accepted_ratings = present_stars[0]
            for book_id, rating in zip(df_user["book_id"], df_user["rating"]):
                if rating < self.rating_mean:
                    break

                nominator = 2 ** (rating - min_rated_in_accepted_ratings)
                share = nominator * (book_list_size / denominator)
                wanted_n_books = int(np.ceil(share))

                if self.verbose > 0:
                    print("actual stars", rating)
                    print("nominator", nominator)
                    print("denominator", denominator)
                    print("wanted_n_books non rounded", share)
                    print("wanted_n_books rounded", wanted_n_books)

                found = self.related_books(book=book_id, n_books=wanted_n_books,
                                           unwanted_id=excluded)
                book_list += found
                excluded.update(found)

        # not enough related books: complete the pool with popular ones
        if len(book_list) < book_list_size:
            book_list += self.popularity_recommender(n_books=book_list_size - len(book_list),
                                                     unwanted_id=excluded)

        random.shuffle(book_list)
        return book_list[:n_books]

    def popularity_recommender(self, n_books=50, unwanted_id=None):
        """Return up to ``n_books`` book ids ranked by weighted rating.

        Weighted rating: (v*R + m*C) / (v+m)
            v: number of ratings of the book
            R: average rating of the book
            m: 0.90 quantile of v over all books
            C: mean of all ratings

        params:
            n_books -> int; maximum length of the result
            unwanted_id -> iterable of book ids to leave out (default: none)
        returns:
            list of book ids, best weighted rating first
        """
        stats = (self.df_r.groupby("book_id")["rating"]
                 .agg(["count", "mean"])
                 .rename(columns={"count": "v", "mean": "R"}))
        m = stats["v"].quantile(0.90)
        C = self.df_r["rating"].mean()
        stats["wr"] = (stats["v"] * stats["R"] + m * C) / (stats["v"] + m)
        ranked_ids = stats.sort_values(by="wr", ascending=False).index.tolist()

        excluded = set(unwanted_id) if unwanted_id is not None else set()
        book_list = []
        for candidate in ranked_ids:
            if len(book_list) >= n_books:
                break
            if candidate not in excluded:
                book_list.append(candidate)
        return book_list

    def add_ratings(self, user_id, book_id, rating):
        """
        Append ratings for ``user_id``; an unknown user is simply created.

        ``book_id`` and ``rating`` are parallel sequences. Nothing happens when
        they are empty or differ in length. Books the user has already rated
        are skipped. The sequences passed in are left untouched.
        """
        if len(book_id) == 0 or len(book_id) != len(rating):
            return

        already_rated = set(self.df_r.loc[self.df_r["user_id"] == user_id, "book_id"].tolist())
        new_pairs = [(b, r) for b, r in zip(book_id, rating) if b not in already_rated]
        if not new_pairs:
            return

        new_rows = pd.DataFrame(data={
            "user_id": [user_id] * len(new_pairs),
            "book_id": [b for b, _ in new_pairs],
            "rating": [r for _, r in new_pairs],
        })
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        self.df_r = pd.concat([self.df_r, new_rows], ignore_index=True)
        self.base_update = True

    def del_user(self, user_id):
        """Remove every rating of ``user_id`` and re-index the ratings from 0."""
        is_user = self.df_r["user_id"] == user_id
        if is_user.any():
            self.df_r = self.df_r[~is_user].reset_index(drop=True)
            # the removal has to reach disk on the next update_database()
            self.base_update = True

In [2]:
# Build the recommender once: this reads the four CSV files and computes the
# correlation matrix; verbose=1 prints the loading progress and the init time.
book_recommendation = BookRecommendation(verbose=1)

Loading ratings.csv ...
Done
Loading books.csv ...
Done
Loading book_tags.csv ...
Done
Loading tags.csv ...
Done
Loading matrix ...
Done
Loading corr ...
Done
Time to init: 77.41700720787048 sec


In [97]:
def show_books(start=0, end=10):
    """Print the titles between positions `start` and `end`, framed by two banner lines."""
    opening = f" {start} ".center(50, "-")
    closing = f" {end} ".center(50, "-")
    print("\n", opening)
    book_recommendation.show_books(start, end)
    print(closing, "\n")

    
def show_related_books(book_id, n_books=10):
    """Print the given book, then the titles of the `n_books` books related to it."""
    book_recommendation.show_book_title_from_id(book_id)
    related_ids = book_recommendation.related_books(book=book_id, n_books=n_books)

    print("\n", " 0 ".center(50, "-"))
    for related_id in related_ids:
        book_recommendation.show_book_title_from_id(related_id)
    closing = f" {n_books} ".center(50, "-")
    print("\n", closing)
    
    
def show_user(user_id):
    """Print every book rated by `user_id` together with the rating given."""
    recommender = book_recommendation
    recommender.show_books_from_user_id(user_id)

    
def recommend_user(user_id, n_books=10):
    """Print what `user_id` already rated, then `n_books` recommendations for that user."""

    def banner(label):
        # Every section delimiter is a blank line plus the label centred in dashes.
        print("\n", label.center(50, "-"))

    banner(" already read ")
    book_recommendation.show_books_from_user_id(user_id)
    banner(" end ")

    banner(" recommendations ")
    recommended = book_recommendation.recommend_books_from_user_id(user_id, n_books=n_books)
    print("nb_books", len(recommended))
    for recommended_id in recommended:
        book_recommendation.show_book_title_from_id(recommended_id)
    banner(" end ")

    
def add_ratings(user_id=1000000, book_id=[1], rating=[5]):
    """Register ratings for `user_id` in the global recommender.

    `book_id` and `rating` are parallel lists: `rating[i]` is the rating given
    to `book_id[i]`.
    """
    # Hand over copies: the callee is free to edit the lists it receives, and
    # editing the originals would corrupt the shared default lists (and the
    # caller's own lists) for every later call.
    book_recommendation.add_ratings(user_id, list(book_id), list(rating))
    
    
def del_user(user_id=1000000):
    """Drop every rating stored for `user_id` from the global recommender."""
    recommender = book_recommendation
    recommender.del_user(user_id)

In [None]:
#show_books(0, 10)
#show_related_books(1, n_books=10)
# Demo run: register five ratings for user 1000000, list what that user has
# rated, then print ten recommendations for them.
add_ratings(user_id=1000000, book_id=[1, 2, 3, 4, 5], rating=[3, 5, 5, 4, 5])
show_user(user_id=1000000)
#del_user(user_id=1000000)
recommend_user(1000000, n_books=10)

# K-means clusters to recommend similar books according to their tags

- create a matrix with tags as the columns and books as the rows
- fill the matrix with 1 if the book is associated with the tag, 0 if not
- train a k-means model on the books
- find best number of clusters with elbow method

every book in the same cluster can be recommended together

In [4]:
# Inspect the book/tag table that was loaded from book_tags.csv.
book_recommendation.df_bt

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716
...,...,...,...
999907,33288638,21303,7
999908,33288638,17271,7
999909,33288638,1126,7
999910,33288638,11478,7


In [85]:
# NOTE(review): scratch cell -- a bare string that only echoes itself; candidate for removal.
"izi pizi"

'izi pizi'