In [86]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity



In [87]:
# Load the GoodReads catalogue from the local Resources folder
books_csv = 'Resources/GoodReads_100k_books.csv'
Df = pd.read_csv(books_csv)

In [125]:
# Show the dtype of every column currently in Df.
# NOTE(review): the saved output below lists only the columns that remain after
# the column-drop cell further down, so this cell was presumably executed out of
# order -- confirm by re-running the notebook top to bottom.
Df.dtypes

author           object
desc             object
genre            object
pages             int64
rating          float64
reviews           int64
title            object
totalratings      int64
dtype: object

In [88]:
# Preview the first rows of the freshly loaded DataFrame
Df.head()


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","History,Military History,Civil War,American History,American Civil War,Nonfiction,North American Hi...,American History,Native Americans",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1387738765l/1001053.jpg,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Between_Two_Fires,0,3.52,5,Between Two Fires: American Indians in the Civil War,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,"Fashion Sourcebook - 1920s is the first book in a brand-new series by Fiell Publishing that documents comprehensively the seasonal fashion styles of the 20th century, decade by decade. Sumptuously...","Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1421011497l/10010552.jpg,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashion-sourcebook-1920s,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","Politics,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1348117708l/1001077.jpg,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungary_56,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participa...","Labor,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1356461214l/1001079.jpg,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_American_Anarchist,324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa table, table surprenante par sa diversitÃ© et son originalitÃ©. Tous initient leurs petits Ã la vie gourmande en puisant dans un panier aux ressources ...",,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1293221069l/10010880.jpg,2761920813,,https://goodreads.com/book/show/10010880-les-oiseaux-gourmands,177,4.0,1,Les oiseaux gourmands,1


In [89]:
# List the column names as a plain Python list
list(Df.columns)

['author',
 'bookformat',
 'desc',
 'genre',
 'img',
 'isbn',
 'isbn13',
 'link',
 'pages',
 'rating',
 'reviews',
 'title',
 'totalratings']

In [90]:
# Drop columns that are not used for recommendations.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
Df = Df.drop(columns=['link', 'bookformat', 'img', 'isbn', 'isbn13'], errors='ignore')

In [91]:
# Count the missing values in each column and print the tally
null_check = Df.isna().sum()
print("Count of null values in each column:", null_check, sep="\n")

Count of null values in each column:
author              0
desc             6772
genre           10467
pages               0
rating              0
reviews             0
title               1
totalratings        0
dtype: int64


In [92]:
# Keep only the books that have both a description and a genre
required_columns = ['desc', 'genre']
Df = Df.dropna(axis=0, how='any', subset=required_columns)

In [93]:
# Collect every distinct genre label found in the comma-separated genre strings
genre_set = {
    label
    for genre_string in Df["genre"]
    for label in genre_string.split(",")
}

# Number of distinct genres
len(genre_set)

1179

In [94]:
# Turn each comma-separated genre string into a list of genre labels.
# Only string values are split, so re-running this cell leaves already-split
# lists untouched instead of replacing them with missing values.
Df["genre"] = Df["genre"].apply(
    lambda value: value.split(",") if isinstance(value, str) else value
)

Df.head()

Unnamed: 0,author,desc,genre,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","[History, Military History, Civil War, American History, American Civil War, Nonfiction, North American Hi..., American History, Native Americans]",0,3.52,5,Between Two Fires: American Indians in the Civil War,33
1,"Charlotte Fiell,Emmanuelle Dirix","Fashion Sourcebook - 1920s is the first book in a brand-new series by Fiell Publishing that documents comprehensively the seasonal fashion styles of the 20th century, decade by decade. Sumptuously...","[Couture, Fashion, Historical, Art, Nonfiction]",576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","[Politics, History]",124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participa...","[Labor, History]",324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
5,Jeffrey Pfeffer,Why is common sense so uncommon when it comes to managing people? How is it that so many seemingly intelligent organizations implement harmful management practices and ideas? In his provocative ne...,"[Business, Leadership, Romance, Historical Romance, Nonfiction, Business, Management, Management, Human Resources]",368,3.73,7,The Human Equation: Building Profits by Putting People First,119


In [95]:
input_genre = "History"

# Boolean mask: True for books whose genre list contains the requested genre
has_genre = Df["genre"].map(lambda book_genres: input_genre in book_genres)
subset_df = Df.loc[has_genre]

subset_df

Unnamed: 0,author,desc,genre,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","[History, Military History, Civil War, American History, American Civil War, Nonfiction, North American Hi..., American History, Native Americans]",0,3.52,5,Between Two Fires: American Indians in the Civil War,33
2,Andy Anderson,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","[Politics, History]",124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participa...","[Labor, History]",324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
7,Edward Joesting,"""Even if you know Hawaiian history you will find much here that is new, besides Knowledge, both love and understanding of th islands "" Jacob Adler","[History, Nonfiction]",353,3.93,2,Hawaii: An Uncommon History,15
15,Philip Mansel,"Throughout history rulers have used clothes as a form of legitimization and propaganda. While palaces, pictures, and jewels might reflect the choice of a monarchâ€™s predecessors or advisers, clot...","[History, Couture, Fashion, Nonfiction, Historical, Cultural, France]",256,3.58,5,Dressed to Rule: Royal and Court Costume from Louis XIV to Elizabeth II,24
...,...,...,...,...,...,...,...,...
99976,Alfred Vincent Kidder,"In a new edition of this classic work in the field of New World archaeology, Alfred Vincent Kidder presents the first regional synthesis and an unsurpassed summary of Pueblo archaeology. He provid...","[History, Archaeology, Anthropology, History]",0,3.80,1,An Introduction to the Study of Southwestern Archaeology,10
99981,John Shearman,""",Manierismo ,was the extreme consciousness of elegant style for its own sake, a passion which unites the cold narcissistic nudes of Bronzino, the elaborate chiselling of Benvenuto Cellini's saltc...","[Art, Art, Art History, Cultural, Italy, History, Nonfiction]",216,3.69,4,Mannerism,62
99991,Leonard Verduin,"The Reformers and Their Stepchildren is a brilliant and well-documented book that reveals the tension between the church and Christendom. According to Leonard Verduin, the American formula of a so...","[History, Church, Church History, Religion, Theology, Christian, Nonfiction, Religion, Christianity, Christianity, Ecclesiology]",292,4.06,26,The Reformers and Their Stepchildren,135
99993,Philip Hoare,"A startling new book, his most personal to date, from Philip Hoare, winner of the 2009 ,Samuel Johnson Prize, for ,Leviathan,. The sea surrounds us. It gives us life, provides us with the air we b...","[Nonfiction, Environment, Nature, Travel, Science, Autobiography, Memoir, History, Science, Natural History, Animals, Science Nature, Animals, Birds]",350,3.77,74,The Sea Inside,497


In [97]:
# Re-import SentenceTransformer together with the `util` helper module from sentence_transformers.
from sentence_transformers import SentenceTransformer, util
# Load the `all-MiniLM-L6-v2` model; it is used below to encode descriptions and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Let pandas display up to 200 characters per cell so descriptions stay readable.
pd.set_option('max_colwidth', 200)

In [98]:
# Encode every description in the genre subset into an embedding tensor
descriptions = subset_df["desc"].tolist()
book_embeddings = model.encode(descriptions, convert_to_tensor=True)


In [109]:
def find_similar_books(query, subset_df, book_embeddings, model, top_n=10):
    """Return the books whose descriptions are most similar to a free-text query.

    Parameters
    ----------
    query : str
        Free-text description of the kind of book wanted.
    subset_df : pandas.DataFrame
        Candidate books; must contain "title" and "desc" columns.
    book_embeddings : tensor or array-like
        One embedding per row of ``subset_df``, in the same row order.
        Objects exposing ``.cpu()`` are converted with ``.cpu().numpy()``;
        anything else is passed through ``np.asarray``.
    model : object
        Encoder exposing ``encode(list_of_str, convert_to_tensor=True)``.
    top_n : int, default 10
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with "title", "desc" and "similarity" columns,
        ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``book_embeddings`` and ``subset_df`` have different row counts.
    """
    # Rows are matched purely by position, so stale embeddings (computed for a
    # different subset) would silently return the wrong books. Fail loudly instead.
    if len(book_embeddings) != len(subset_df):
        raise ValueError(
            f"book_embeddings has {len(book_embeddings)} rows but subset_df has "
            f"{len(subset_df)}; re-encode the descriptions for this subset."
        )

    if len(subset_df) == 0:
        # Nothing to compare against; skip encoding and return an empty result.
        similarities = np.empty(0, dtype=float)
    else:
        # Convert query description into a vector
        query_embedding = model.encode([query], convert_to_tensor=True)

        # Accept both tensors and plain arrays for the precomputed embeddings
        if hasattr(book_embeddings, "cpu"):
            book_matrix = book_embeddings.cpu().numpy()
        else:
            book_matrix = np.asarray(book_embeddings)

        # Compute cosine similarity between the query and all book descriptions
        similarities = cosine_similarity(query_embedding.cpu().numpy(), book_matrix)[0]

    # Positions of the top N most similar books, best first
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Return top similar books with their similarity scores
    return subset_df.iloc[top_indices][["title", "desc"]].assign(similarity=similarities[top_indices])

# Example query supplied by the user
query_description = "Roman history generals."
similar_books = find_similar_books(
    query=query_description,
    subset_df=subset_df,
    book_embeddings=book_embeddings,
    model=model,
)


In [110]:
# Print title, similarity score and description for each match
for title, score, description in zip(
    similar_books["title"], similar_books["similarity"], similar_books["desc"]
):
    print(f"Title: {title}\nSimilarity: {score:.2f}\nDescription: {description}\n")


Title: Great Commanders of the Ancient World, 1479BC - 453AD
Similarity: 0.66
Description: Which was the most brilliant of Hannibal's three crushing defeats of Roman armies? What tactics did Julius Caesar employ to defeat Pompey at Pharsalus? How was Alexander the Great able to command sufficient loyalty from his troops to lead them across half of the Asian landmass in search of new territories to conquer? ,The answers to these and a myriad other fascinating questions can be found in Great Commanders of the Ancient World, a sumptuous chronological survey of the 50 greatest commanders of the ancient world. Compiled by an distinguished team of historians (including such names as Robin Lane Fox, Tom Holland and John Julius Norwich) working under the general editorship of Andrew Roberts, Great Commanders of the Ancient World is an authoritative and beautifully illustrated account of the lives and careers of the 25 greatest military commanders of the period, from Julius Caesar to Judas Macc

In [135]:
def rank_books(similar_books, weight_similarity=0.5, weight_rating=0.3, weight_totalratings=0.2):
    """Re-rank candidate books by a weighted blend of similarity, rating and popularity.

    Parameters
    ----------
    similar_books : pandas.DataFrame
        Candidates with "similarity", "rating" and "totalratings" columns.
        The frame is not modified.
    weight_similarity : float, default 0.5
        Weight of the (clamped) similarity score.
    weight_rating : float, default 0.3
        Weight of the rating divided by 5.
    weight_totalratings : float, default 0.2
        Weight of the log-scaled number of ratings.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with "normalized_similarity", "normalized_rating",
        "normalized_totalratings" and "final_score" columns added, sorted by
        "final_score" in descending order.
    """
    # Work on a copy so the caller's frame is left untouched
    # (also avoids writing into a slice of another DataFrame).
    ranked = similar_books.copy()

    # Cosine similarity can be negative, so clamp it into [0, 1] to keep every
    # component of the weighted score on the same scale.
    ranked["normalized_similarity"] = ranked["similarity"].clip(lower=0.0, upper=1.0)

    # Normalize rating (assuming ratings are on a 1-5 scale)
    ranked["normalized_rating"] = ranked["rating"] / 5.0

    # Normalize total ratings on a log scale to reduce the effect of extreme values.
    log_counts = np.log1p(ranked["totalratings"])
    max_log_count = log_counts.max()
    if max_log_count > 0:
        ranked["normalized_totalratings"] = log_counts / max_log_count
    else:
        # Every candidate has 0 ratings (or there are no candidates): dividing
        # would give 0/0 = NaN and poison final_score, so use 0 instead.
        ranked["normalized_totalratings"] = 0.0

    # Compute final weighted score
    ranked["final_score"] = (
        (ranked["normalized_similarity"] * weight_similarity) +
        (ranked["normalized_rating"] * weight_rating) +
        (ranked["normalized_totalratings"] * weight_totalratings)
    )

    # Sort books by final score in descending order
    return ranked.sort_values(by="final_score", ascending=False)


In [138]:
def find_and_rank_books(query, subset_df, book_embeddings, model, top_n=5):
    """Find the books most similar to a query and re-rank them with ``rank_books``.

    Parameters
    ----------
    query : str
        Free-text description of the kind of book wanted.
    subset_df : pandas.DataFrame
        Candidate books; must contain "title", "desc", "rating" and
        "totalratings" columns.
    book_embeddings : tensor or array-like
        One embedding per row of ``subset_df``, in the same row order.
        Objects exposing ``.cpu()`` are converted with ``.cpu().numpy()``;
        anything else is passed through ``np.asarray``.
    model : object
        Encoder exposing ``encode(list_of_str, convert_to_tensor=True)``.
    top_n : int, default 5
        Number of most-similar books handed to ``rank_books``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``rank_books`` returns for the ``top_n`` most similar books.

    Raises
    ------
    ValueError
        If ``book_embeddings`` and ``subset_df`` have different row counts.
    """
    # Rows are matched purely by position, so stale embeddings (computed for a
    # different subset) would silently return the wrong books. Fail loudly instead.
    if len(book_embeddings) != len(subset_df):
        raise ValueError(
            f"book_embeddings has {len(book_embeddings)} rows but subset_df has "
            f"{len(subset_df)}; re-encode the descriptions for this subset."
        )

    if len(subset_df) == 0:
        # Nothing to compare against; skip encoding and rank an empty frame.
        similarities = np.empty(0, dtype=float)
    else:
        # Encode query into a vector
        query_embedding = model.encode([query], convert_to_tensor=True)

        # Accept both tensors and plain arrays for the precomputed embeddings
        if hasattr(book_embeddings, "cpu"):
            book_matrix = book_embeddings.cpu().numpy()
        else:
            book_matrix = np.asarray(book_embeddings)

        # Compute cosine similarity between the query and every description
        similarities = cosine_similarity(query_embedding.cpu().numpy(), book_matrix)[0]

    # Positions of the top N most similar books, best first
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Extract relevant book details (copy so new columns do not touch subset_df)
    similar_books = subset_df.iloc[top_indices][["title", "desc", "rating", "totalratings"]].copy()
    similar_books["similarity"] = similarities[top_indices]

    # Rank books using the weighted scoring function
    return rank_books(similar_books)


In [145]:
# Build the ranking inside this cell: `ranked_books` is otherwise only assigned
# in a later cell, so on a fresh kernel run top to bottom this line would raise
# a NameError.
query_description = "A historical account of World War II and its impact on Europe."
ranked_books = find_and_rank_books(query_description, subset_df, book_embeddings, model)

ranked_books.head(5)  # Shows top 5 ranked books in a nice table format


Unnamed: 0,title,desc,rating,totalratings,similarity,normalized_similarity,normalized_rating,normalized_totalratings,final_score
46413,Savage Continent: Europe in the Aftermath of World War II,"The Second World War might have officially ended in May 1945, but in reality it rumbled on for another ten years...,The end of the Second World War in Europe is one of the twentieth centuryâ€™s mo...",4.2,3391,0.613129,0.613129,0.84,1.0,0.758564
69316,History of the Second World War,"History of the Second World War,, B. H. Liddell Hart's last work as well as his magnum opus, embodies the fruits of twenty years of research and a lifetime of thinking on war. It abounds with cont...",4.21,985,0.592612,0.592612,0.842,0.848014,0.718509
91770,The American Heritage Picture History of World War II,A Pictoral history of World War II; More than 720 great photographs from World War II,4.18,175,0.607667,0.607667,0.836,0.63604,0.681842
96477,After Hitler: The Last Days of World War Two in Europe,"On 30 April 1945, Adolf Hitler committed suicide. The following day, his propaganda minister Joseph Goebbels also killed himself and the crumbling Third Reich passed to Admiral Karl DÃ¶nitz. The N...",3.93,328,0.588635,0.588635,0.786,0.712995,0.672717
58768,"Roosevelt and Churchill, 1939-1941: The Partnership That Saved the West","An account of the personalities and official practices of the two wartime leaders, of the relationship between them, and of the early events of World War II with which they contended and which sha...",3.69,32,0.607894,0.607894,0.738,0.430118,0.611371


In [146]:
# End-to-end example: rank books for a query and print a readable summary

query_description = "A historical account of World War II and its impact on Europe."
ranked_books = find_and_rank_books(query_description, subset_df, book_embeddings, model)

# Show the five best-scoring books, one block per book
print("\nTop 5 Recommended Books:\n")
for book in ranked_books.head(5).itertuples(index=False):
    rating_count = int(book.totalratings)
    short_description = book.desc[:200]  # Truncate long descriptions
    print(f"📖 Title: {book.title}")
    print(f"⭐ Rating: {book.rating} ({rating_count} ratings)")
    print(f"🔍 Similarity Score: {book.similarity:.2f}")
    print(f"🏆 Final Score: {book.final_score:.4f}")
    print(f"📖 Description: {short_description}...")
    print("-" * 80)



Top 5 Recommended Books:

📖 Title: Savage Continent: Europe in the Aftermath of World War II
⭐ Rating: 4.2 (3391 ratings)
🔍 Similarity Score: 0.61
🏆 Final Score: 0.7586
📖 Description: The Second World War might have officially ended in May 1945, but in reality it rumbled on for another ten years...,The end of the Second World War in Europe is one of the twentieth centuryâ€™s most i...
--------------------------------------------------------------------------------
📖 Title: History of the Second World War
⭐ Rating: 4.21 (985 ratings)
🔍 Similarity Score: 0.59
🏆 Final Score: 0.7185
📖 Description: History of the Second World War,, B. H. Liddell Hart's last work as well as his magnum opus, embodies the fruits of twenty years of research and a lifetime of thinking on war. It abounds with controve...
--------------------------------------------------------------------------------
📖 Title: The American Heritage Picture History of World War II
⭐ Rating: 4.18 (175 ratings)
🔍 Similarity Score: 0