In [99]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("books.csv")
df

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,0002261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,0006163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,0006280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6805,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0
6806,9788185944609,8185944601,Secrets Of The Heart,,Khalil Gibran,Mysticism,http://books.google.com/books/content?id=XcrVp...,,1993.0,4.08,74.0,324.0
6807,9788445074879,8445074873,Fahrenheit 451,,Ray Bradbury,Book burning,,,2004.0,3.98,186.0,5733.0
6808,9789027712059,9027712050,The Berlin Phenomenology,,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0


In [100]:
print(df.isnull().sum())

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64


In [101]:
df = df[["title", "authors", "categories", "description"]]
df

Unnamed: 0,title,authors,categories,description
0,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...
1,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...
2,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...
3,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b..."
4,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...
...,...,...,...,...
6805,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,This collection of the timeless teachings of o...
6806,Secrets Of The Heart,Khalil Gibran,Mysticism,
6807,Fahrenheit 451,Ray Bradbury,Book burning,
6808,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,Since the three volume edition ofHegel's Philo...


In [102]:
df = df.dropna()

In [103]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
df["description"] = df["description"].apply(clean)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tonizeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["description"] = df["description"].apply(clean)


In [104]:
df

Unnamed: 0,title,authors,categories,description
0,Gilead,Marilynne Robinson,Fiction,novel reader critic eager anticip decad gilead...
1,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,new christi christma fulllength novel adapt a...
2,The One Tree,Stephen R. Donaldson,American fiction,volum two stephen donaldson acclaim second tri...
3,Rage of angels,Sidney Sheldon,Fiction,memor mesmer heroin jennif brilliant beauti a...
4,The Four Loves,Clive Staples Lewis,Christian life,lewi work natur love divid love four categori ...
...,...,...,...,...
6803,Journey to the East,Hermann Hesse,Adventure stories,book tell tale man goe wonder amaz journey asi...
6804,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,wisdom creat life passion purpos peac inspir t...
6805,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,collect timeless teach one greatest sage india...
6808,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,sinc three volum edit ofhegel philosophi subje...


In [105]:
print(df.title.sample(10))

4870    If I Have a Wicked Stepmother, Where's My Prince?
770                                      Bound for Oregon
2474                             The history of sexuality
1923                       Tsubasa 12 Reservoir Chronicle
6741    There's No Toilet Paper... on the Road Less Tr...
5127                  The Collected Poems of Wilfred Owen
6301                                          The Tempest
5995                             Healing with Whole Foods
5912                                         Go Ask Alice
1856                               Boogaloo on 2nd Avenue
Name: title, dtype: object


In [122]:
from scipy.sparse import hstack

feature_categories = df["categories"].tolist()
tfidf_categories = text.TfidfVectorizer(input=feature_categories, stop_words="english")
tfidf_matrix_categories = tfidf_categories.fit_transform(feature_categories)

feature_description = df["description"].tolist()
tfidf_description = text.TfidfVectorizer(input=feature_description, stop_words="english")
tfidf_matrix_description = tfidf_description.fit_transform(feature_description)

weight_categories = 2.0  # Higher weight for categories
weight_description = 1.0  # Lower weight for description

weighted_tfidf_matrix_categories = tfidf_matrix_categories * weight_categories
weighted_tfidf_matrix_description = tfidf_matrix_description * weight_description

combined_tfidf_matrix = hstack([weighted_tfidf_matrix_categories, weighted_tfidf_matrix_description])

similarity = cosine_similarity(combined_tfidf_matrix)


In [123]:
indices = pd.Series(df.index,index=df['title']).drop_duplicates()

In [124]:
indices_author = pd.Series(df.index, index=df['authors']).drop_duplicates()

In [125]:
print(indices)

title
Gilead                                                                                            0
Spider's Web                                                                                      1
The One Tree                                                                                      2
Rage of angels                                                                                    3
The Four Loves                                                                                    4
                                                                                               ... 
Journey to the East                                                                            6803
The Monk Who Sold His Ferrari: A Fable About Fulfilling Your Dreams & Reaching Your Destiny    6804
I Am that                                                                                      6805
The Berlin Phenomenology                                                                      

In [126]:
print(indices_author)

authors
Marilynne Robinson                                 0
Charles Osborne;Agatha Christie                    1
Stephen R. Donaldson                               2
Sidney Sheldon                                     3
Clive Staples Lewis                                4
                                                ... 
Hermann Hesse                                   6803
Robin Sharma                                    6804
Sri Nisargadatta Maharaj;Sudhakar S. Dikshit    6805
Georg Wilhelm Friedrich Hegel                   6808
Helena Grice;Tim Woods                          6809
Length: 6446, dtype: int64


In [127]:
def book_recommendation(title, similarity=similarity, df=df):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    book_indices = [i[0] for i in similarity_scores[:10]]
    
    recommendations = df.iloc[book_indices][['title', 'authors']]
    
    return recommendations


In [128]:
print(book_recommendation("Rage of angels"))

                                  title               authors
3                        Rage of angels        Sidney Sheldon
5245                              Moods     Louisa May Alcott
2285                         The Client          John Grisham
2607                     10 Lb. Penalty          Dick Francis
283                           Odalisque       Neal Stephenson
1662  Harry Bosch Novels, The: Volume 2      Michael Connelly
3498       A Dark and Hungry God Arises  Stephen R. Donaldson
5469           Mockingbird Wish Me Luck      Charles Bukowski
1571          The Demon Princes, Vol. 1            Jack Vance
5788                 The Eiger Sanction             Trevanian
