In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.vectorstores import Chroma
import pandas as pd
from langchain.schema import Document


In [2]:
books = pd.read_csv('books_cleaned_new.csv')

In [3]:
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_&_subtitle,tagged_description
0,9780002000000.0,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883.0 A NOVEL THAT READERS and criti...
1,9780002000000.0,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web:A Novel,9780002261982.0 A new 'Christie for Christmas'...
2,9780006000000.0,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736.0 A memorable, mesmerizing heroi..."
3,9780006000000.0,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897.0 Lewis' work on the nature of l...
4,9780006000000.0,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934.0 ""In The Problem of Pain, C.S. ..."


In [4]:
books['tagged_description'].head()

0    9780002005883.0 A NOVEL THAT READERS and criti...
1    9780002261982.0 A new 'Christie for Christmas'...
2    9780006178736.0 A memorable, mesmerizing heroi...
3    9780006280897.0 Lewis' work on the nature of l...
4    9780006280934.0 "In The Problem of Pain, C.S. ...
Name: tagged_description, dtype: object

In [5]:
books['tagged_description'].to_csv('tagged_description.txt',
                                   sep='\n',
                                   index= False,
                                   header= False)

In [5]:
raw_documents = TextLoader('tagged_description.txt').load()

In [6]:
text_splitter = CharacterTextSplitter(chunk_size = 0, chunk_overlap=0,separator='\n' )

In [7]:
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 1170, which is longer than the specified 0
Created a chunk of size 1216, which is longer than the specified 0
Created a chunk of size 375, which is longer than the specified 0
Created a chunk of size 311, which is longer than the specified 0
Created a chunk of size 485, which is longer than the specified 0
Created a chunk of size 484, which is longer than the specified 0
Created a chunk of size 962, which is longer than the specified 0
Created a chunk of size 190, which is longer than the specified 0
Created a chunk of size 845, which is longer than the specified 0
Created a chunk of size 298, which is longer than the specified 0
Created a chunk of size 199, which is longer than the specified 0
Created a chunk of size 883, which is longer than the specified 0
Created a chunk of size 1090, which is longer than the specified 0
Created a chunk of size 1191, which is longer than the specified 0
Created a chunk of size 306, which is longer than the specified 0
Create

In [8]:
documents

[Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883.0 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details,

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings


hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

db_books = Chroma.from_documents(
    documents,
    embedding=hf_embeddings
)


In [10]:
query = "A book to teach chidren about nature"
docs = db_books.similarity_search(query, k=10)
docs

[Document(metadata={'source': 'tagged_description.txt'}, page_content="9780374522599.0 The Control of Nature is John McPhee's bestselling account of places where people are locked in combat with nature. Taking us deep into these contested territories, McPhee details the strageties and tactics through which people attempt to control nature. Most striking is his depiction of the main contestants: nature in complex and awesome guises, and those attempting to wrest control from her - stubborn, sometimes foolhardy, more often ingenious, and always arresting characters."),
 Document(metadata={'source': 'tagged_description.txt'}, page_content='9780786808069.0 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(metadata={'source': 'tagged_description.txt'}, page_content="9

In [15]:
print(repr(docs[0].page_content.split()[0].strip()))



'9780374522599.0'


In [16]:
print(books.dtypes)
print(books['isbn13'].head())


isbn13                 object
isbn10                 object
title                  object
authors                object
categories             object
thumbnail              object
description            object
published_year        float64
average_rating        float64
num_pages             float64
ratings_count         float64
title_&_subtitle       object
tagged_description     object
dtype: object
0    9780002005883.0
1    9780002261982.0
2    9780006178736.0
3    9780006280897.0
4    9780006280934.0
Name: isbn13, dtype: object


In [None]:

books['isbn13'] = books['isbn13'].str.replace('.0', '', regex=False)

isbn_str = docs[0].page_content.split()[0].strip()


if isbn_str.endswith(".0"):
    isbn_str = isbn_str[:-2]


result = books[books['isbn13'] == isbn_str]


print(result)


             isbn13      isbn10                  title      authors  \
1642  9780374522599  0374522596  The Control of Nature  John McPhee   

     categories                                          thumbnail  \
1642     Nature  http://books.google.com/books/content?id=p1qKQ...   

                                            description  published_year  \
1642  The Control of Nature is John McPhee's bestsel...          1990.0   

      average_rating  num_pages  ratings_count       title_&_subtitle  \
1642            4.24      288.0         3365.0  The Control of Nature   

                                     tagged_description  
1642  9780374522599.0 The Control of Nature is John ...  


In [19]:

books['isbn13'] = books['isbn13'].astype(str).str.replace('.0', '', regex=False)


In [23]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
    recs = db_books.similarity_search(query, k=10)
    
    books_list = []
    
    for i in range(len(recs)):
        isbn_str = recs[i].page_content.strip('"').split()[0]
        

        if isbn_str.endswith(".0"):
            isbn_str = isbn_str[:-2]
        
        books_list.append(isbn_str)  

    return books[books["isbn13"].isin(books_list)]


In [26]:
retrieve_semantic_recommendations('A book about war')

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_&_subtitle,tagged_description
66,9780007162994,7162995,If I Die in a Combat Zone,Tim O'Brien,"Vietnam War, 1961-1975",http://books.google.com/books/content?id=0qUtS...,Perhaps the best book to emerge from the Vietn...,2003.0,3.95,208.0,11.0,If I Die in a Combat Zone,9780007162994.0 Perhaps the best book to emerg...
524,9780099483472,99483475,All Quiet on the Western Front,Erich Maria Remarque,"World War, 1914-1918",,All Quiet on the Western Front is probably the...,2005.0,3.95,216.0,1018.0,All Quiet on the Western Front,9780099483472.0 All Quiet on the Western Front...
1199,9780312265052,312265050,The Naked and the Dead,Norman Mailer,Fiction,http://books.google.com/books/content?id=c66GL...,Portrays the contrasting personalities and nos...,2000.0,3.94,721.0,20541.0,The Naked and the Dead:50th Anniversary Editio...,9780312265052.0 Portrays the contrasting perso...
2828,9780571207992,571207995,The Wars,Timothy Findley,Fiction,http://books.google.com/books/content?id=AqnDQ...,"Robert Ross, a sensitive nineteen-year-old Can...",2001.0,3.87,218.0,6229.0,The Wars,"9780571207992.0 Robert Ross, a sensitive ninet..."
3139,9780684813219,684813211,Achilles in Vietnam,Jonathan Shay,Psychology,http://books.google.com/books/content?id=6EEnD...,An original and groundbreaking book that exami...,1994.0,4.24,272.0,1057.0,Achilles in Vietnam:Combat Trauma and the Undo...,9780684813219.0 An original and groundbreaking...
3154,9780684844077,684844079,Soul of the Sword,Robert L. O'Connell,Technology & Engineering,http://books.google.com/books/content?id=eoEag...,A sweeping illustrated history of war and the ...,2002.0,4.08,400.0,36.0,Soul of the Sword:An Illustrated History of We...,9780684844077.0 A sweeping illustrated history...
3180,9780688085872,688085873,A Short History of World War II,James L. Stokesbury,History,http://books.google.com/books/content?id=uDBhl...,"Despite the numerous books on World War II, un...",1980.0,3.93,416.0,454.0,A Short History of World War II,9780688085872.0 Despite the numerous books on ...
4414,9781401204105,1401204104,The Sgt. Rock Archives,Robert Kanigher;Bob Haney,Comics & Graphic Novels,http://books.google.com/books/content?id=MnWYK...,"An intense portrait of combat and conflict, th...",2005.0,4.56,224.0,18.0,The Sgt. Rock Archives,9781401204105.0 An intense portrait of combat ...
4644,9781565842212,1565842219,The War,Marguerite Duras,Biography & Autobiography,http://books.google.com/books/content?id=1h1uP...,"The extraordinary pages of The War, written in...",1994.0,3.85,192.0,918.0,The War:A Memoir,9781565842212.0 The extraordinary pages of The...
4843,9781590302255,1590302257,The Art of War,Sun-Tzu;Sunzi,Philosophy,http://books.google.com/books/content?id=Kjw1j...,"Written around the 6th century BC, The Art of ...",2005.0,3.97,273.0,214954.0,The Art of War,9781590302255.0 Written around the 6th century...
