In [2]:
import pandas as pd
import numpy as np
import re

# Pandas Settings
import warnings
warnings.filterwarnings("ignore")
pd.set_option('precision', 2)

## Import

In [3]:
df = pd.read_csv('data/cleaned_books.csv')

In [40]:
df.columns

Index(['Unnamed: 0', 'author', 'country', 'genre', 'isbn', 'language', 'pages',
       'publisher', 'summary', 'title', 'product_url', 'clean_rating',
       'clean_rating_count', 'clean_genre', 'soup'],
      dtype='object')

## Recommendation Engine with Count Vectorizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Linear Kernel is faster than cosine_similarities
from sklearn.metrics.pairwise import linear_kernel

# Parse the stringified features into their corresponding python objects
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [6]:
def create_soup(x):
    return x['author'] + ' ' +x['summary'] + ' ' + x['publisher'] + ' ' + x['country'] + x['genre']

In [7]:
features = ['author','summary','publisher','country']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [8]:
df['soup'] = df.apply(create_soup,axis=1)

In [9]:
count = CountVectorizer(stop_words='english',ngram_range=(1,3),analyzer='word')
count_matrix = count.fit_transform(df.soup)

In [10]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [26]:
def get_recommendations(title,cosine_sim=cosine_sim):
    # Get the index of the book that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    book_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[book_indices]

In [12]:
#df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [27]:
get_recommendations('American Gods')

545                              Orlando: A Biography
434                   Something Wicked This Way Comes
9                                                Worm
53                                             Uglies
740                              Flowers for Algernon
98                            The Wolves in the Walls
743                                         Fledgling
114    The Water-Babies, A Fairy Tale for a Land Baby
863                 Charlie and the Chocolate Factory
232                                     The Magicians
Name: title, dtype: object

## TF-IDF

In [28]:
from sklearn.metrics.pairwise import linear_kernel

In [29]:
tfidf = TfidfVectorizer(stop_words='english',ngram_range=(1,3),analyzer='word')
tfidf_matrix = tfidf.fit_transform(df['summary'])
tfidf_matrix.shape

(965, 469409)

In [30]:
cosine_sim2 = linear_kernel(tfidf_matrix,tfidf_matrix)

In [31]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

## Comparing TF-IDF vs Count Vectorizer

In [32]:
# TF-IDF
get_recommendations('American Gods', cosine_sim2)

434                   Something Wicked This Way Comes
545                              Orlando: A Biography
570                                        Neverwhere
272                                        The Hobbit
930              Aleriel, or A Voyage to Other Worlds
114    The Water-Babies, A Fairy Tale for a Land Baby
650               Kandide and the Secret of the Mists
632                                            Lilith
9                                                Worm
301                         The First Men in the Moon
Name: title, dtype: object

In [33]:
# CV
get_recommendations('American Gods')

545                              Orlando: A Biography
434                   Something Wicked This Way Comes
9                                                Worm
53                                             Uglies
740                              Flowers for Algernon
98                            The Wolves in the Walls
743                                         Fledgling
114    The Water-Babies, A Fairy Tale for a Land Baby
863                 Charlie and the Chocolate Factory
232                                     The Magicians
Name: title, dtype: object

In [34]:
# TF-IDF
get_recommendations("Ender's Game",cosine_sim2)

770               Ender's Shadow
772               Ender in Exile
426         Speaker for the Dead
460               Shadow Puppets
455            Shadows in Flight
408            Starship Troopers
7                       Xenocide
244    The Left Hand of Darkness
878                     Brisingr
294               The Glass Bees
Name: title, dtype: object

In [35]:
# CV
get_recommendations("Ender's Game")

770                   Ender's Shadow
772                   Ender in Exile
426             Speaker for the Dead
408                Starship Troopers
244        The Left Hand of Darkness
294                   The Glass Bees
878                         Brisingr
460                   Shadow Puppets
659    Jonathan Strange & Mr Norrell
455                Shadows in Flight
Name: title, dtype: object

In [36]:
# TF-IDF
get_recommendations("The Hobbit",cosine_sim2)

408                                  Starship Troopers
207    The Narrative of Arthur Gordon Pym of Nantucket
754                                     Fahrenheit 451
244                          The Left Hand of Darkness
908                                     Atlas Shrugged
659                      Jonathan Strange & Mr Norrell
165                                   The Silmarillion
499                        Red Moon and Black Mountain
788                                               Dune
563                               Nineteen Eighty-Four
Name: title, dtype: object

In [37]:
# CV
get_recommendations("The Hobbit")

408                                  Starship Troopers
244                          The Left Hand of Darkness
207    The Narrative of Arthur Gordon Pym of Nantucket
165                                   The Silmarillion
245                              The Lays of Beleriand
475                                         Roverandom
659                      Jonathan Strange & Mr Norrell
754                                     Fahrenheit 451
908                                     Atlas Shrugged
359                             The Book of Lost Tales
Name: title, dtype: object

In [38]:
get_recommendations("The Name of the Wind",cosine_sim2)

591     Millroy the Magician
844        Close to Critical
271            The Hollowing
475               Roverandom
138      The Three Impostors
737            Forever Peace
938              Accelerando
286        The Great God Pan
394    Synthetic Men of Mars
696     Hereafter, and After
Name: title, dtype: object

In [39]:
get_recommendations("The Name of the Wind")

776                         Emergence
391                           Tam Lin
576    Mr. Magorium's Wonder Emporium
271                     The Hollowing
3                   Ylana of Callisto
620                     Lost on Venus
844                 Close to Critical
100                 The Wizard of Zao
765                   Escape on Venus
570                        Neverwhere
Name: title, dtype: object

I will probably go with count vectorizer for the recommendation engine since readers are probably likely to enjoy similar works from the same author and genre so I do not want to down weight those. Also based soley on the recommendations I am getting from the two different cosine similarities it loos like Count Vectorizer is doing well.

Overall though they are very similar.