In [None]:

# Install the notebook's dependencies with the %pip magic, one package per pip run.
# NOTE(review): only numpy is version-pinned, and the installs that follow are
# separate pip invocations — confirm that none of them replaces numpy 1.24.2.
%pip install numpy==1.24.2
%pip install spacy
%pip install faiss-cpu
%pip install -U sentence-transformers

# Download the small English spaCy pipeline through a shell call.
# NOTE(review): `!python` resolves via the shell PATH — confirm it is the same
# interpreter/environment the kernel is running in.
!python -m spacy download en_core_web_sm

In [None]:
import json
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from tqdm import tqdm

In [None]:

# Download the small English spaCy pipeline that spacy.load() uses in the next cell.
# NOTE(review): the install cell at the top of the notebook already issues this
# exact command, so this cell looks redundant — confirm and drop one of them.
!python -m spacy download en_core_web_sm

In [None]:
# Load Data
# Reads the chapter corpus from JSON and loads the two models used by the
# cells below. The cells below iterate `data` and call `.get(...)` on each
# entry, so it is expected to be a list of dict-like records.
DATA_PATH = './data-collection/data/chapter-data.json'

with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# spaCy pipeline; the indexing cell uses it for sentence segmentation (doc.sents).
nlp = spacy.load('en_core_web_sm')
# Sentence-embedding model shared by the indexing cell and the query cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Print a compact summary instead of dumping the whole corpus into the cell
# output, which bloats the notebook and hides everything that follows.
print(f"Loaded {len(data)} entries from {DATA_PATH}")
first_entry = next(iter(data), None)
if isinstance(first_entry, dict):
    print(f"First entry keys: {sorted(first_entry)}")

In [None]:
# Build one embedding per paragraph and index them with FAISS.
#
# Produces:
#   embeddedData - list of dicts (book_title, chapter_name, paragraph,
#                  embeddedParagraph), in the same order as the index rows.
#   embeddings   - float32 matrix stacking every embeddedParagraph.
#   index        - faiss.IndexFlatL2 holding `embeddings`.
#
# MAX_ENTRIES caps how many corpus entries are embedded. The previous version
# of this cell stopped after the first entry through a bare `break`; that limit
# is kept as the default but is now explicit. Set it to None to index the
# whole corpus.
MAX_ENTRIES = 1

embeddedData = []

for entry in tqdm(data[:MAX_ENTRIES]):
    book_title = entry.get("book_title", "")
    chapter_name = entry.get("chapter_name", "")

    for paragraph in entry.get("paragraphs", []):
        # spaCy splits the paragraph into sentences.
        sentences = [sent.text for sent in nlp(paragraph).sents]
        if not sentences:
            # Nothing to encode: averaging an empty batch does not give a
            # usable vector and would break the matrix built below.
            continue

        # Paragraph vector = mean of its sentence embeddings.
        paragraphEmbed = np.mean(model.encode(sentences), axis=0)

        embeddedData.append({
            "book_title": book_title,
            "chapter_name": chapter_name,
            "paragraph": paragraph,
            "embeddedParagraph": paragraphEmbed,
        })

if not embeddedData:
    # Fail with a clear message instead of an IndexError on embeddings.shape[1].
    raise ValueError("No non-empty paragraphs were found to embed; cannot build the index.")

# Row i of `embeddings` corresponds to embeddedData[i]; the query cell relies
# on this alignment to map FAISS labels back to metadata.
embeddings = np.array(
    [info['embeddedParagraph'] for info in embeddedData], dtype=np.float32
)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [None]:
# Retrieve and display the indexed paragraphs closest to a free-text query.
query = "What is commosis?"
# The query is embedded with the same model that embedded the paragraphs.
query_embedding = model.encode([query])

k = 5  # Number of nearest neighbors to retrieve
D, I = index.search(np.asarray(query_embedding, dtype=np.float32), k)

# Map the returned labels back to paragraph metadata. When the index holds
# fewer than k vectors FAISS pads the result with label -1; those must be
# dropped, otherwise embeddedData[-1] would silently return the last paragraph
# as if it were a real hit.
retrieved_paragraphs = [embeddedData[idx] for idx in I[0] if idx >= 0]

# Display retrieved paragraphs and their metadata
for result in retrieved_paragraphs:
    print(f"Book: {result['book_title']}")
    print(f"Chapter: {result['chapter_name']}")
    print(f"Paragraph: {result['paragraph']}")
    print("\n")