In [1]:
# Load required modules
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from sentence_transformers import CrossEncoder

import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

books = pd.read_csv('books_cleaned.csv')

In [3]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title and subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [4]:
books["tagged_description"].to_csv("tagged_description.txt",
                                   sep="\n",
                                   index=False,
                                   header=False)

In [5]:
# Load the file (force UTF-8 to avoid UnicodeDecodeError)
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# Use a large chunk size instead of 0 (0 is not valid now)
text_splitter = CharacterTextSplitter(
    chunk_size=10_000,  # large enough so text isn't split
    chunk_overlap=0,
    separator="\n"
)

documents = text_splitter.split_documents(raw_documents)

In [6]:
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gi

In [7]:
# 1. Better embedding model (optimized for QA/search)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

# 2. Reranker for better sorting (optional but recommended)
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# 3. Build/Load vector store
db_books = Chroma.from_documents(
    documents, 
    embedding_model
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


In [8]:
# Function to query similar books
def search_books(query: str, k: int = 5):
    results = db_books.similarity_search(query, k=k)
    for idx, res in enumerate(results, start=1):
        print(f"{idx}. {res.page_content}\n")

# Example query
search_books("magic school with wizards like Harry Potter", k=5)

1. 9780439314558 From her birth in Chipping Sodbury near Bristol, England, to the stories about her favorite teachers, to the funny misunderstanding in her first fan letter, the life of the author of the Harry Potter books is revealed. Original. 250,000 first printing.
9780439321624 A history of the sport Quidditch, answering such questions such as where the Golden Snitch came from, or why the Wigtown Wanderers have meat cleavers on their robes.
9780439341202 Abby Hayes is starting fifth grade. Her supersibs, the perfect older twins and her genius little brother, have already proved their Hayes worthiness. This is going to be Abby's year ... and she's going to record every moment of it.
"9780439358071 In Harry Potter and the Order of the Phoenix, Lord Voldemort has returned to the Wizarding world, presenting a threat that neither the magical government nor the authorities at Hogwarts can stop. In response to his reappearance, Dumbledore reactivates the Order of the Phoenix, a secret so

In [9]:
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs

[Document(id='57ff7147-1139-492c-8acd-70a62c00a552', metadata={'source': 'tagged_description.txt'}, page_content='"9780064405850 The land was theirs, but so were its hardships Strawberries -- big, ripe, and juicy. Ten-year-old Birdie Boyer can hardly wait to start picking them. But her family has just moved to the Florida backwoods, and they haven′t even begun their planting. "";Don′t count your biddies ′fore they′re hatched, gal young un!""; her father tells her. Making the new farm prosper is not easy. There is heat to suffer through, and droughts, and cold snaps. And, perhaps most worrisome of all for the Boyers, there are rowdy neighbors, just itching to start a feud."\n9780064405959 When seven Siberian snow spiders, frozen during the Ice Age, defrost and escape en route to Harvard for analysis, they wreak havoc on a small New England town, disrupting the school Halloween pageant with hilarious results. An ALA Notable Children\'s Book. Reissue.\n9780064406307 \'Like Cushman\'s 1995

In [10]:
isbn = docs[0].page_content.split()[0].strip().strip('"')
result = books[books["isbn13"] == isbn]

In [11]:
result

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title and subtitle,tagged_description


In [12]:
import pandas as pd
from typing import List

def retrieve_semantic_recommendation(
    query: str,
    top_k: int = 10,
    rerank: bool = True
) -> pd.DataFrame:
    """
    Retrieve semantic book recommendations from the database, optionally reranked.

    Args:
        query (str): The search query.
        top_k (int): Number of top recommendations to return.
        rerank (bool): Whether to rerank results using a reranker model.

    Returns:
        pd.DataFrame: Top recommended books from the books DataFrame, sorted by relevance.
    """
    # Perform initial similarity search
    docs = db_books.similarity_search(query, k=50 if rerank else top_k)

    isbn_scores: List[tuple[int, float]] = []

    if rerank:
        # Compute reranker scores
        scores = reranker.predict([(query, doc.page_content) for doc in docs])
        for doc, score in zip(docs, scores):
            try:
                isbn = int(doc.page_content.strip('"').split()[0])
                isbn_scores.append((isbn, score))
            except (ValueError, IndexError):
                continue
        # Sort by score descending
        isbn_scores.sort(key=lambda x: x[1], reverse=True)
    else:
        # Just take the first k docs without reranking
        for doc in docs[:top_k]:
            try:
                isbn = int(doc.page_content.strip('"').split()[0])
                isbn_scores.append((isbn, 0))  # score = 0 for no rerank
            except (ValueError, IndexError):
                continue

    # Extract ISBNs in order
    ordered_isbns = [isbn for isbn, _ in isbn_scores[:top_k]]

    # Filter books DataFrame and preserve order
    result = books[books["isbn13"].isin(ordered_isbns)]
    result["score"] = result["isbn13"].apply(lambda x: next((s for i, s in isbn_scores if i == x), 0))
    result = result.sort_values(by="score", ascending=False).drop(columns=["score"])

    return result


In [13]:
retrieve_semantic_recommendation("A book to teach children about nature")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["score"] = result["isbn13"].apply(lambda x: next((s for i, s in isbn_scores if i == x), 0))


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title and subtitle,tagged_description
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder,9780067575208 First published more than three ...
4284,9780971500747,0971500746,Transcending the Levels of Consciousness,David R. Hawkins,Religion,http://books.google.com/books/content?id=kXImm...,Explores the ego's expressions and inherent li...,2005.0,4.53,407.0,423.0,Transcending the Levels of Consciousness: The ...,9780971500747 Explores the ego's expressions a...
4455,9781405201711,1405201711,The Faraway Tree Stories,Enid Blyton,"Children's stories, English",http://books.google.com/books/content?id=2d93O...,"Jo, Bessie and Fanny move to the country and f...",1987.0,4.29,583.0,12286.0,The Faraway Tree Stories,"9781405201711 Jo, Bessie and Fanny move to the..."
2855,9780586071397,0586071393,Faerie Tale,Raymond E. Feist,Fairy tales,http://books.google.com/books/content?id=Msdac...,A contemporary fantasy novel about the Hasting...,1989.0,3.87,490.0,7700.0,Faerie Tale,9780586071397 A contemporary fantasy novel abo...
5145,9781931882200,1931882207,Invisible Residents,Ivan T. Sanderson,"Body, Mind & Spirit",http://books.google.com/books/content?id=_BKYT...,This book is a groundbreaking contribution to ...,2005.0,3.68,100.0,59.0,Invisible Residents: The Reality of Underwater...,9781931882200 This book is a groundbreaking co...
89,9780060005696,0060005696,The Paradox of Choice,Barry Schwartz,Business & Economics,,The author of The Battle for Human Nature expl...,2005.0,3.84,265.0,23734.0,The Paradox of Choice: Why More Is Less,9780060005696 The author of The Battle for Hum...
4842,9781590302088,1590302087,Making a Change for Good,Cheri Huber,Philosophy,http://books.google.com/books/content?id=rnt1n...,"In her new book, Huber takes on the topic of c...",2007.0,3.92,128.0,291.0,Making a Change for Good: A Guide to Compassio...,"9781590302088 In her new book, Huber takes on ..."
4194,9780881507195,0881507199,King Arthur Flour Whole Grain Baking,,Cooking,http://books.google.com/books/content?id=nXLcP...,A guide to whole grain baking explains how to ...,2006.0,4.13,544.0,4704.0,King Arthur Flour Whole Grain Baking: Deliciou...,9780881507195 A guide to whole grain baking ex...
4441,9781402714597,1402714599,The Secret Garden,Frances Hodgson Burnett,Juvenile Fiction,http://books.google.com/books/content?id=f_9CD...,A ten-year-old orphan comes to live in a lonel...,2004.0,4.13,248.0,553.0,The Secret Garden,9781402714597 A ten-year-old orphan comes to l...
291,9780060924980,0060924985,The Infinite Plan,Isabel Allende,Fiction,http://books.google.com/books/content?id=80pDR...,"Selling more than 65,000 copies and topping be...",1994.0,3.71,384.0,7102.0,The Infinite Plan: A Novel,"9780060924980 Selling more than 65,000 copies ..."
