---
---
# Recommendation
Purpose: given a book name, find the top `n` most similar books based on cosine similarity score. In real use cases, the input book could be one the user has read, has rated highly, or has added to their read-later list.
Books are recommended utilising the following information through keywords:

In [2]:
import pandas as pd
import numpy as np

import re
import string
# BERT-Embeddings
from keybert import KeyBERT
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
#import altair as alt
#alt.renderers.enable('mimetype')
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
2024-12-06 18:02:14.461607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load both precomputed similarity matrices, keeping only the top-left
# 500 x 500 corner of each (TF-IDF first, then word2vec).
book_cosine_sim_tf, book_cosine_sim_w2v = (
    np.load(npy_path)[:500, :500]
    for npy_path in ('book_cosine_sim_tf.npy', 'book_cosine_sim_w2v.npy')
)

# Blend the two signals with an element-wise average.
book_cosine_sim = (book_cosine_sim_tf + book_cosine_sim_w2v) / 2

# Book metadata; an explicit 0..len-1 'Index' column is prepended as the first column.
book_data2 = pd.read_csv('keywords.csv')
book_data2.insert(0, 'Index', range(len(book_data2)))
print(book_data2)

# Titles as a standalone Series, used by the recommenders for name lookup.
books = pd.Series(book_data2['Name'])

def recommend_books_similar_to(book_name, n=5, cosine_sim_mat=book_cosine_sim):
    """Return the titles of the `n` books most similar to `book_name`.

    Parameters
    ----------
    book_name : str
        Title to look up in the module-level `books` Series (exact match).
        When the title occurs more than once, the first occurrence is used.
    n : int, default 5
        Maximum number of recommendations; values <= 0 yield an empty list.
    cosine_sim_mat : 2-D array-like, default `book_cosine_sim`
        Similarity scores; row ``i`` is read as the scores of book ``i``
        against every book. Assumed to be ordered like `books` (confirm when
        passing a custom matrix). The default is bound when this function is
        defined, so re-run the defining cell after rebuilding `book_cosine_sim`.

    Returns
    -------
    list of str
        Titles ordered by decreasing similarity, never including the query
        book's own row.

    Raises
    ------
    IndexError
        If `book_name` is not present in `books`.
    """
    # Index of the input book (first match wins).
    matches = books.index[books == book_name]
    if len(matches) == 0:
        raise IndexError(f"book {book_name!r} not found in `books`")
    input_idx = matches[0]

    scores = pd.Series(cosine_sim_mat[input_idx])

    # Remove the query book explicitly instead of assuming it sorts to rank 0:
    # with tied scores (e.g. another edition scoring 1.0) the query is not
    # guaranteed to come first, so slicing off the first element could drop a
    # real neighbour and keep the query itself. A stable sort keeps ties in
    # their original order, making the output deterministic.
    ranked = scores.drop(index=input_idx).sort_values(ascending=False, kind="mergesort")
    top_n_books_idx = list(ranked.iloc[:max(n, 0)].index)

    return books.iloc[top_n_books_idx].tolist()


def recommend_bookIDs_similar_to(book_name, n=5, cosine_sim_mat=book_cosine_sim):
    """Return the ID of `book_name` and the IDs of its `n` most similar books.

    Parameters
    ----------
    book_name : str
        Title to look up in the module-level `books` Series (exact match).
        When the title occurs more than once, the first occurrence is used.
    n : int, default 5
        Maximum number of recommendations; values <= 0 yield an empty list.
    cosine_sim_mat : 2-D array-like, default `book_cosine_sim`
        Similarity scores; row ``i`` is read as the scores of book ``i``
        against every book. Assumed to be ordered like `book_data2` (confirm
        when passing a custom matrix). The default is bound when this function
        is defined, so re-run the defining cell after rebuilding
        `book_cosine_sim`.

    Returns
    -------
    tuple
        ``(input_book_id, recommended_ids)`` where `recommended_ids` is a list
        of values from ``book_data2['Id']`` ordered by decreasing similarity,
        never including the query book's own row.

    Raises
    ------
    IndexError
        If `book_name` is not present in `books`.
    """
    # Index of the input book (first match wins).
    matches = books.index[books == book_name]
    if len(matches) == 0:
        raise IndexError(f"book {book_name!r} not found in `books`")
    input_idx = matches[0]

    scores = pd.Series(cosine_sim_mat[input_idx])

    # Remove the query book explicitly instead of assuming it sorts to rank 0:
    # with tied scores the query is not guaranteed to come first, so slicing
    # off the first element could return the query's own ID as a
    # recommendation. A stable sort keeps ties in their original order.
    ranked = scores.drop(index=input_idx).sort_values(ascending=False, kind="mergesort")
    top_n_books_idx = list(ranked.iloc[:max(n, 0)].index)

    book_ids = book_data2['Id']
    input_book_id = book_ids.iloc[input_idx]
    recommended_ids = book_ids.iloc[top_n_books_idx].tolist()

    # Input book ID plus the list of recommended IDs.
    return input_book_id, recommended_ids

     Index       Id                                               Name  \
0        0  1100003                                         the prince   
1        1  1100004  sermons from duke chapel: voices from "a great...   
2        2  1100009                           the idea of a university   
3        3  1100010  caring and curing: health and medicine in the ...   
4        4  1100013  the alamo remembered: tejano accounts and pers...   
..     ...      ...                                                ...   
495    495  1101338                  on the eve of uncertain tomorrows   
496    496  1101339               the new complete guide to beekeeping   
497    497  1101341  the story of paul j. meyer: the million dollar...   
498    498  1101343             psychology of learning for instruction   
499    499  1101346                      attitudes and attitude change   

                 Authors  PublishYear                       Publisher  \
0    niccolò_machiavelli         1998 

In [4]:
# Show the title Series that the recommender functions match `book_name` against.
books

0                                             the prince
1      sermons from duke chapel: voices from "a great...
2                               the idea of a university
3      caring and curing: health and medicine in the ...
4      the alamo remembered: tejano accounts and pers...
                             ...                        
495                    on the eve of uncertain tomorrows
496                 the new complete guide to beekeeping
497    the story of paul j. meyer: the million dollar...
498               psychology of learning for instruction
499                        attitudes and attitude change
Name: Name, Length: 500, dtype: object

In [5]:
# Example: the five titles most similar to one book.
# The banner is built from the queried title so the two cannot drift apart
# (the previous hard-coded text still named a different book).
query_title = "the prince"
print(f"Recommendation based on the read: {query_title}")
display(recommend_books_similar_to(query_title, 5))

Recommendation based on the read: The Eastland Disaster the prince


['the silver castle',
 'the absolute bourgeois: artists & politics in france, 1848-1851 (cloth)',
 'the absolute bourgeois: artists & politics in france, 1848-1851 (paper): artists and politics in france, 1848-1851',
 'configuring ipcop firewalls: closing borders with open source',
 'the palo muerto: a novel of santeria']

In [6]:
# Example: the ID of one book together with the IDs of its five nearest books.
# The banner (printed in bold via ANSI escapes) is built from the queried
# title so the two cannot drift apart.
query_title = "the prince"
print("\033[1m{}\033[0m".format(f"Recommendation based on the read: {query_title}"))
display(recommend_bookIDs_similar_to(query_title, 5))

[1mRecommendation based on the read: The Eastland Disaster the prince[0m


(1100003, [1100768, 1100892, 1100893, 1100787, 1101292])

In [7]:
# Attach to every book the IDs of its most similar books (default n=5) and
# save the result.
output_csv = "keywords_final.csv"

# Build the whole column in one pass instead of pre-filling with None and
# writing cell by cell; wrapping in a Series keyed by the frame's index keeps
# each list aligned with its row.
# NOTE: lookup is by title, so rows sharing a title all receive the
# recommendations of the first row with that title.
book_data2['recommended_ids'] = pd.Series(
    [recommend_bookIDs_similar_to(book_name)[1] for book_name in book_data2['Name']],
    index=book_data2.index,
    dtype=object,
)

# Each list is written to the CSV as its string representation.
book_data2.to_csv(output_csv, index=False)
print(f"Updated DataFrame with recommended IDs saved to {output_csv}")

Updated DataFrame with recommended IDs saved to data/keywords.csv
