In [1]:
import pandas as pd
import numpy as np
import re

# Pandas Settings
import warnings
warnings.filterwarnings("ignore")
pd.set_option('precision', 2)

## Import

In [2]:
df = pd.read_csv('data/cleaned_books.csv')

## Recommendation Engine with Count Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Linear Kernel is faster than cosine_similarities
from sklearn.metrics.pairwise import linear_kernel

# Parse the stringified features into their corresponding python objects
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [5]:
def create_soup(x):
    return x['author'] + ' ' +x['summary'] + ' ' + x['publisher'] + ' ' + x['country']

In [6]:
features = ['author','summary','publisher','country']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [7]:
df['soup'] = df.apply(create_soup,axis=1)

In [8]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df.soup)

In [9]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [28]:
def get_recommendations(title,cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    book_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[book_indices]

In [29]:
#df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [30]:
get_recommendations('American Gods')

545                              Orlando: A Biography
434                   Something Wicked This Way Comes
9                                                Worm
98                            The Wolves in the Walls
740                              Flowers for Algernon
53                                             Uglies
114    The Water-Babies, A Fairy Tale for a Land Baby
863                 Charlie and the Chocolate Factory
470                                 Saturn's Children
301                         The First Men in the Moon
Name: title, dtype: object

## TF-IDF

In [31]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['soup'])
tfidf_matrix.shape

(965, 143940)

In [32]:
cosine_sim2 = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [33]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

## Comparing TF-IDF vs Count Vectorizer

In [34]:
# TF-IDF
get_recommendations('American Gods', cosine_sim2)

388                                          Template
434                   Something Wicked This Way Comes
545                              Orlando: A Biography
570                                        Neverwhere
272                                        The Hobbit
98                            The Wolves in the Walls
740                              Flowers for Algernon
930              Aleriel, or A Voyage to Other Worlds
114    The Water-Babies, A Fairy Tale for a Land Baby
711                                        Good Omens
Name: title, dtype: object

In [35]:
# CV
get_recommendations('American Gods')

545                              Orlando: A Biography
434                   Something Wicked This Way Comes
9                                                Worm
98                            The Wolves in the Walls
740                              Flowers for Algernon
53                                             Uglies
114    The Water-Babies, A Fairy Tale for a Land Baby
863                 Charlie and the Chocolate Factory
470                                 Saturn's Children
301                         The First Men in the Moon
Name: title, dtype: object

In [36]:
# TF-IDF
get_recommendations("Ender's Game",cosine_sim2)

770               Ender's Shadow
772               Ender in Exile
426         Speaker for the Dead
460               Shadow Puppets
455            Shadows in Flight
408            Starship Troopers
7                       Xenocide
244    The Left Hand of Darkness
294               The Glass Bees
878                     Brisingr
Name: title, dtype: object

In [37]:
# CV
get_recommendations("Ender's Game")

770                   Ender's Shadow
772                   Ender in Exile
426             Speaker for the Dead
408                Starship Troopers
294                   The Glass Bees
244        The Left Hand of Darkness
878                         Brisingr
659    Jonathan Strange & Mr Norrell
119            The War of the Worlds
460                   Shadow Puppets
Name: title, dtype: object

In [38]:
# TF-IDF
get_recommendations("The Hobbit",cosine_sim2)

408                                  Starship Troopers
475                                         Roverandom
245                              The Lays of Beleriand
207    The Narrative of Arthur Gordon Pym of Nantucket
754                                     Fahrenheit 451
499                        Red Moon and Black Mountain
165                                   The Silmarillion
244                          The Left Hand of Darkness
908                                     Atlas Shrugged
359                             The Book of Lost Tales
Name: title, dtype: object

In [39]:
# CV
get_recommendations("The Hobbit")

408                                  Starship Troopers
475                                         Roverandom
207    The Narrative of Arthur Gordon Pym of Nantucket
244                          The Left Hand of Darkness
245                              The Lays of Beleriand
754                                     Fahrenheit 451
659                      Jonathan Strange & Mr Norrell
165                                   The Silmarillion
908                                     Atlas Shrugged
499                        Red Moon and Black Mountain
Name: title, dtype: object

In [41]:
get_recommendations("The Name of the Wind",cosine_sim2)

591     Millroy the Magician
271            The Hollowing
475               Roverandom
737            Forever Peace
138      The Three Impostors
286        The Great God Pan
394    Synthetic Men of Mars
938              Accelerando
696     Hereafter, and After
844        Close to Critical
Name: title, dtype: object

In [42]:
get_recommendations("The Name of the Wind")

798                   Dorothea Dreams
858              Children of Tomorrow
776                         Emergence
591              Millroy the Magician
578           Mountain of Black Glass
612                            Lurulu
468               Sea of Silver Light
600                   Masters of Time
271                     The Hollowing
576    Mr. Magorium's Wonder Emporium
Name: title, dtype: object