In [16]:
import pandas as pd

# ISBNs are identifiers, not numbers: force them to be read as strings so that
# leading zeros and a trailing "X" check digit are never lost to dtype inference.
books = pd.read_csv(r'../data/processed/books_cleaned.csv', dtype={'ISBN': str})

# 'Unnamed: 0' is presumably the index column saved by an earlier export; it
# carries no book information. Reassign instead of using inplace=True so the
# cell is one self-contained, re-runnable step.
books = books.drop(columns=['Unnamed: 0'])

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...
1,440234743,The Testament,John Grisham,1999,Dell,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...
2,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...
3,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion,1999,Three Rivers Press,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...
4,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...


In [17]:
# Build one text "document" per book from title, author and publisher.
# Missing values are replaced with '' first: string + NaN evaluates to NaN, and
# a NaN document makes TfidfVectorizer.fit_transform raise.
books['Content'] = (
    books['Book-Title'].fillna('').astype(str)
    + ' ' + books['Book-Author'].fillna('').astype(str)
    + ' ' + books['Publisher'].fillna('').astype(str)
)

# Keep only the columns used from here on (any other columns are dropped).
books = books.reindex(columns=['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Content'])

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that ignores scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary from the combined text and vectorize it in one pass:
# one matrix row per book, in the same row order as `books`.
tfidf_matrix = vectorizer.fit_transform(books.loc[:, "Content"])

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of books. With a single
# argument, cosine_similarity compares the rows of the matrix with themselves.
# NOTE: the result is a dense n_books x n_books array, so memory grows
# quadratically with the size of the catalogue.
cosine_sim = cosine_similarity(tfidf_matrix)

In [20]:
def get_recommendations(isbn, k=5):
    """Return the ISBNs of up to ``k`` books most similar to the given book.

    Similarity scores are read from the module-level ``cosine_sim`` matrix,
    which is addressed by row *position* in the module-level ``books`` frame.

    Args:
        isbn: ISBN of the query book; compared against ``books['ISBN']``.
        k: Maximum number of recommendations to return (default 5).

    Returns:
        A list of ISBNs ordered from most to least similar; equal scores keep
        their ``books`` row order. The list is empty when ``isbn`` is not in
        ``books`` or when ``k <= 0``. Rows carrying the query ISBN itself are
        never returned.
    """
    import heapq

    if k <= 0:
        return []

    # Positional row numbers of the query ISBN. Positions (not index labels)
    # are required because cosine_sim rows follow the row order of `books`.
    hits = (books['ISBN'] == isbn).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return []

    query_pos = int(hits[0])  # use the first row if the ISBN is duplicated
    excluded = set(hits.tolist())

    # Exclude the query book explicitly instead of assuming it sorts first:
    # another row with an equal score (e.g. identical text) can tie with the
    # self-similarity, and a stable sort would then place it ahead of the query.
    candidates = (
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos not in excluded
    )

    # nlargest is equivalent to sorted(..., reverse=True)[:k] but does not
    # need to order every score in the catalogue.
    top = heapq.nlargest(k, candidates, key=lambda pair: pair[1])

    top_positions = [pos for pos, _ in top]
    return books['ISBN'].iloc[top_positions].tolist()


In [37]:
# Find books with "harry potter" in the title (case-insensitive).
# na=False counts missing titles as non-matches so the mask stays purely
# boolean; regex=False matches the phrase literally.
harry_potter_mask = (
    books.loc[:, "Book-Title"].str.lower().str.contains("harry potter", na=False, regex=False)
)

# Get the ISBNs of those books
harry_potter_isbns = books.loc[harry_potter_mask, "ISBN"].tolist()

# Get recommendations based on the first Harry Potter book
# (no recommendations if no title matched, instead of an IndexError)
recommended_isbns = get_recommendations(harry_potter_isbns[0]) if harry_potter_isbns else []

# Show recommended book titles and their ISBNs.
# Rows appear in the order they occur in `books`, not in similarity order.
recommended_books_df = books.loc[books['ISBN'].isin(recommended_isbns), ["Book-Title", "ISBN"]]
recommended_books_df

Unnamed: 0,Book-Title,ISBN
573,Harry Potter and the Sorcerer's Stone (Harry P...,059035342X
682,Harry Potter and the Sorcerer's Stone (Book 1),0590353403
795,Harry Potter and the Chamber of Secrets (Book 2),0439064872
1120,Harry Potter and the Goblet of Fire (Book 4),0439139597
2088,It's A Magical World: A Calvin and Hobbes Coll...,0836221362
