Vector Database FAISS/ChromaDB/PineCone?

-  Step 1: Load sec1 and sec2 PDFs
-  Step 2: Split into Chunks + Tag with Metadata ( Page Number + sec 2/sec1 label)
-  Step 3: Combine into VectorDB

Each Query Should Return:
- Page Number + Sec 2/Sec1 source
- UI should be able to filter between sec 1 and sec 2 content.

langchain==0.3.9
langchain-community==0.3.1
pypdf==4.2.0
python-dotenv
faiss-cpu==1.7.4


In [9]:
import os
import pickle
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda
from langchain.chat_models import ChatOpenAI

In [None]:


# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Save the full text from each PDF
with open('full_text_from_pdfs.txt', 'w', encoding='utf-8') as full_text_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        full_text_file.write(f"--- {label} ---\n")
        for page in pages:
            # Write the full text from each page into the file
            full_text_file.write(page.page_content + "\n")



In [None]:
# run this part only to reload documents

In [20]:
from langchain.document_loaders import PyPDFLoader

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []


# Verify the PDF loading process
for label, path in pdf_paths.items():
    try:
        loader = PyPDFLoader(path)
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages from {label}")
        
        for page in pages:
            documents.append({
                "text": page.page_content,
                "metadata": {"page": page.metadata["page"], "source": label}
            })
    except Exception as e:
        print(f"Error loading {path}: {e}")

# Check if documents are populated
if not documents:
    print("Documents list is empty after loading PDFs!")
else:
    print(f"Documents loaded: {len(documents)}")
#delete this

Successfully loaded 220 pages from Sec1
Successfully loaded 216 pages from Sec2
Documents loaded: 436


In [22]:
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Ensure documents are stored as Document objects
for label, path in pdf_paths.items():
    try:
        loader = PyPDFLoader(path)
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages from {label}")
        
        for page in pages:
            # Create a Document object with the correct structure
            doc = Document(
                page_content=page.page_content,
                metadata={"page": page.metadata["page"], "source": label}
            )
            documents.append(doc)
    except Exception as e:
        print(f"Error loading {path}: {e}")

# Check if documents are populated
if not documents:
    print("Documents list is empty after loading PDFs!")
else:
    print(f"Documents loaded: {len(documents)}")



Successfully loaded 220 pages from Sec1
Successfully loaded 216 pages from Sec2
Documents loaded: 436


## Trying different Text Splitters

1. Recursive Character
2. Paragraph
3. Semantic

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
chunks = []

# Save the chunks into a file for observation
with open('text_chunks_recursive.txt', 'w', encoding='utf-8') as chunk_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        for page in pages:
            text = page.page_content
            splits = text_splitter.split_text(text)  # Split the text into chunks
            for split in splits:
                chunks.append({
                    "text": split,
                    "metadata": {"page": page.metadata["page"], "source": label}
                })
                # Write the chunk text into the file
                chunk_file.write(f"--- Chunk from {label} (Page {page.metadata['page']}) ---\n")
                chunk_file.write(split + "\n")


In [15]:
import re

# Chunking text based on paragraph (assuming each paragraph is separated by a new line)
paragraph_chunks = []

with open('paragraph_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        for page in pages:
            text = page.page_content
            # Split based on new lines (assumed paragraph boundary)
            splits = re.split(r'\n\n+', text)  # Split by double newlines
            for split in splits:
                paragraph_chunks.append({
                    "text": split,
                    "metadata": {"page": page.metadata["page"], "source": label}
                })
                # Write each paragrapbh into the file
                chunk_file.write(f"--- Paragraph Chunk from {label} (Page {page.metadata['page']}) ---\n")
                chunk_file.write(split + "\n")


document check

In [35]:
# Ensure documents list is populated
if not documents:
    print("Documents list is empty!")
else:
    print(f"Documents loaded: {len(documents)}")

# Generate embeddings for the documents to check if embeddings are created correctly
embedding_texts = [doc["text"] for doc in documents]

# Generate embeddings using OpenAI embeddings model
embeddings_vectors = embeddings.embed_documents(embedding_texts)

# Check if embeddings were generated correctly
if len(embeddings_vectors) == len(embedding_texts):
    print("Embeddings generated correctly!")
else:
    print(f"Error: Expected {len(embedding_texts)} embeddings, but got {len(embeddings_vectors)}.")


Documents loaded: 436


TypeError: 'Document' object is not subscriptable

In [36]:
import pickle

# Save the combined documents (articles + PDF pages) to a file
with open('textbooks.pkl', 'wb') as f:
    pickle.dump(documents, f)


In [30]:
#rerun this one for results.
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import os

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Load the PDFs and create documents list
for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        documents.append(Document(page_content=page.page_content, metadata={"page": page.metadata["page"], "source": label}))

# Initialize the OpenAI Embeddings model
embed_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Initialize the SemanticChunker with the model
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Chunk the documents semantically
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])

# Save semantic chunks to a file for observation
with open('semantic_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for semantic_chunk in semantic_chunks:
        # Write the chunk's content and metadata to the file
        chunk_file.write(f"--- Semantic Chunk from {semantic_chunk.metadata['source']} (Page {semantic_chunk.metadata['page']}) ---\n")
        chunk_file.write(semantic_chunk.page_content + "\n")
        chunk_file.write("\n")  # Add a blank line between chunks

# Optional: Print a chunk if it contains a specific phrase, e.g., "Effect of Pre-training Tasks"
for semantic_chunk in semantic_chunks:
    if "Effect of Pre-training Tasks" in semantic_chunk.page_content:
        print(f"--- Found Chunk ---\n{semantic_chunk.page_content}")
        print(f"Length: {len(semantic_chunk.page_content)}")


KeyError: 'source'

In [34]:

# Initialize the SemanticChunker with the model
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Chunk the documents semantically
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])

# Save semantic chunks to a file for observation
with open('semantic_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    chunk_index = 0  # Initialize chunk index
    for doc in documents:
        # Get the number of chunks for the current document
        doc_chunks = [chunk for chunk in semantic_chunks if chunk.page_content in doc.page_content]
        
        # Add each chunk with the corresponding metadata
        for semantic_chunk in doc_chunks:
            source = doc.metadata.get('source', 'Unknown')
            page = doc.metadata.get('page', 'Unknown')
            
            # Write the chunk's content and metadata to the file
            chunk_file.write(f"--- Semantic Chunk from {source} (Page {page}) ---\n")
            chunk_file.write(semantic_chunk.page_content + "\n")
            chunk_file.write("\n")  # Add a blank line between chunks

            chunk_index += 1  # Increment the chunk index

# Optional: Print a chunk if it contains a specific phrase, e.g., "Effect of Pre-training Tasks"
for semantic_chunk in semantic_chunks:
    if "Effect of Pre-training Tasks" in semantic_chunk.page_content:
        print(f"--- Found Chunk ---\n{semantic_chunk.page_content}")
        print(f"Length: {len(semantic_chunk.page_content)}")

## Vector Database Creation Attempts (FAISS)

In [10]:
# this code is for only creation from the PDF textbooks
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda
from langchain.chat_models import ChatOpenAI
#from langchain.output_parsers import StrOutputParser
import os

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        documents.append({
            "text": page.page_content,
            "metadata": {"page": page.metadata["page"], "source": label}
        })

# Chunk the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
chunks = []

for doc in documents:
    splits = text_splitter.split_text(doc["text"])
    for split in splits:
        chunks.append({
            "text": split,
            "metadata": doc["metadata"]
        })

# Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",openai_api_key=OPENAI_API_KEY)

faiss_index = FAISS.from_texts(
    [chunk["text"] for chunk in chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in chunks]
)

# Save VectorDB
faiss_index.save_local("faiss_index")

print("Vector database created successfully!")

Vector database created successfully!


In [49]:
# second attempt for combined vectorDB creation
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda
from langchain.chat_models import ChatOpenAI
import os
import pickle
import pandas as pd

# Step 1: Load Articles from CSV (assuming columns: url, text, source, title)
csv_file = "roots_sg_articles_cleaned.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file)

# Fill NaN values in 'text' column with an empty string or a default string
df['text'] = df['text'].fillna('missing content')

# Ensure 'text' column is a string
df['text'] = df['text'].astype(str)

# Create documents from CSV data
articles = []
for index, row in df.iterrows():
    articles.append(Document(
        page_content=row['text'],
        metadata={
            'title': row['title'],
            'source': row['source'],
            'url': row['url']
        }
    ))

# Load the processed documents from the pickle file
with open('textbooks.pkl', 'rb') as f:
    pdf_documents = pickle.load(f)

# # Load PDFs
# pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
# documents = []

# for label, path in pdf_paths.items():
#     loader = PyPDFLoader(path)
#     pages = loader.load()
#     for page in pages:
#         documents.append({
#             "text": page.page_content,
#             "metadata": {"page": page.metadata["page"], "source": label}
#         })

# Chunk the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=100
)
chunks = []

# Combine both article and PDF documents into one list
all_documents = articles + pdf_documents

# Process the documents (both articles and PDFs)
for doc in all_documents:
    # Access the content using dot notation, not dict-style indexing
    splits = text_splitter.split_text(doc.page_content)  # Access the content using .page_content
    for split in splits:
        chunks.append({
            "text": split,
            "metadata": doc.metadata  # Access the metadata using .metadata
        })

# Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

faiss_index = FAISS.from_texts(
    [chunk["text"] for chunk in chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in chunks]
)

# Save VectorDB
faiss_index.save_local("faiss_index_full")

print("Vector database created successfully!")


Vector database created successfully!


In [4]:
# include infopedia
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import os
import pickle
import pandas as pd

# Function to sanitize metadata (ensures valid data types for FAISS)
def sanitize_metadata(metadata):
    return {k: (v if isinstance(v, (str, int, float, bool)) else "Unknown") for k, v in metadata.items()}

# Step 1: Load Articles from CSV
csv_file = "data/roots_sg_articles_cleaned.csv"
df = pd.read_csv(csv_file)

# Fill NaN values and ensure text is string type
df['text'] = df['text'].fillna('missing content').astype(str)

# Convert CSV data into LangChain Document objects
articles = []
for _, row in df.iterrows():
    articles.append(Document(
        page_content=row['text'],
        metadata=sanitize_metadata({
            'title': row['title'],
            'source': row['source'],
            'url': row['url']
        })
    ))

# Step 2: Load Processed PDF Documents from Pickle File
with open('data/textbooks.pkl', 'rb') as f:
    pdf_documents = pickle.load(f)

# Ensure PDF documents have sanitized metadata
pdf_documents = [
    Document(page_content=doc.page_content, metadata=sanitize_metadata(doc.metadata))
    for doc in pdf_documents
]

# Step 3: Load Infopedia Articles from Pickle File
with open("data/infopedia.pickle", "rb") as f:
    infopedia_data = pickle.load(f)

# Convert Infopedia data into LangChain Document objects
infopedia_articles = []
for title, details in infopedia_data.items():
    infopedia_articles.append(Document(
        page_content=details["content"],
        metadata=sanitize_metadata({
            'title': title,
            'source': details.get('source', 'Unknown'),
            'url': details.get('url', 'No URL'),
            'last_update_date': details.get('last_update_date', 'Unknown')
        })
    ))

# Combine all document sources (CSV, PDF, Infopedia)
all_documents = articles + pdf_documents + infopedia_articles

# Step 4: Chunk the Documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=100
)
chunks = []

for doc in all_documents:
    splits = text_splitter.split_text(doc.page_content)
    for split in splits:
        chunks.append({
            "text": split,
            "metadata": doc.metadata
        })

# Step 5: Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Step 6: Create FAISS Vector Store
faiss_index = FAISS.from_texts(
    [chunk["text"] for chunk in chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in chunks]
)

# Step 7: Save the FAISS Vector Store
faiss_index.save_local("faiss_index_infopedia")

print(f"✅ Vector database created successfully! Total documents stored: {len(chunks)}")


  embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))


✅ Vector database created successfully! Total documents stored: 39513


## Testing Response from Vector DB

In [19]:
def answer_question_from_vector_store(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Generate 3-5 different perspectives on the question, each with a brief summary (2-3 sentences) explaining the reasoning behind that perspective. For each perspective, include a source citation in one of the following formats:

Page Number (if the source is a book or document with specific page references),
Website Link (if the source is a digital resource or website),
Or both if applicable (e.g., a book citation with a page number and a link to the digital source).
Format the answer as follows:

Perspective #: [Answer summary]
Page: [Page Number], Book Title: Sec1 or Sec2
OR
Website Link: [Link to the source]
OR
Page: [Page Number] | Website Link: [Link to the source]
Ensure that the language and content complexity is appropriate for the specified student age group (if provided).

If a specific historical timeframe or theme is specified, tailor your responses to fit within those parameters.

After presenting the perspectives, suggest 2-3 discussion questions that could encourage critical thinking among students about these different viewpoints.

Remember, your goal is to provide educators with balanced, well-sourced information that they can use to create engaging and thought-provoking lessons about Singapore's history and culture. Each citation should be appropriately linked to the perspective it corresponds to, whether it is a page number, website link, or both.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    retrieved_docs = retriever.invoke(input_question)
    
    formatted_context = format_docs(retrieved_docs)
    
    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs} 

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Load FAISS index
vectorstore = FAISS.load_local("faiss_index_infopedia", embeddings,allow_dangerous_deserialization=True)


In [22]:
# Test query
question = "How would Raffles’ actions have been viewed by the different parties??"
response = answer_question_from_vector_store(vectorstore, question)

print(response['answer'])
#print(f"Referenced sources: {[doc.metadata['source'] for doc in response['context']]}")
# print("Referenced sources:")
# for doc in response['context']:
#     print(f"Page {doc.metadata['page']} (Source: {doc.metadata['source']}):\n{doc.page_content}\n")


Perspective 1: Raffles as a Visionary Leader
Summary: Raffles had grand ambitions for Singapore, including education, urban planning, and law enforcement. He aimed to address issues like slavery and piracy while promoting harmony among immigrant groups.
Page: 290, Book Title: Memoir of the Life and Public Services of Sir Thomas Stamford Raffles

Perspective 2: Farquhar's Pragmatic Approach
Summary: Farquhar, who took over leadership in Raffles' absence, focused on ensuring the port's survival with the available resources, even if it meant deviating from Raffles' specific plans.
Page: 63-64, Book Title: An Anecdotal History of Old Times in Singapore

Perspective 3: Raffles' Discontent with Farquhar's Leadership
Summary: Upon his return to Singapore, Raffles was displeased with the neglect of his plans and the tolerance of local vices like opium and slave trading. This led to Farquhar's dismissal and Raffles taking steps to clean up the colony.
Page: 68, Book Title: Memoir of the Life an

In [23]:
print("Referenced sources:")
for doc in response['context']:
    # Print all metadata keys and values
    #print("Metadata:")
    for key, value in doc.metadata.items():
        # Check if the key is 'title', 'source', or 'url' and handle them accordingly
        if key in ['title', 'source', 'url']:
            print(f"{key.capitalize()}: {value}")
        else:
            # Print other metadata normally
            print(f"{key.capitalize()}: {value}")
    
    # Print the page content as well
    print(f"\nPage Content:\n{doc.page_content}\n")


Referenced sources:
Page: 66
Source: Sec1

Page Content:
How would Raffles’ 
actions have been 
viewed by the  
different parties?Think!
 Pages of the 1819 Treaty between Sir Stamford Raffles, 
Sultan Hussein and Temenggong Abdul Rahman

Title: The First Resident
Source: Roots Website
Url: https://www.roots.gov.sg/stories-landing/stories/the-first-resident/the-first-resident

Page Content:
Raffles held on to lofty goals for Singapore such as education, urban planning, and the enforcement of law and order, while aiming to resolve issues such as slavery, piracy and friction among immigrant groups. However, the reality was far from ideal as Farquhar took a more permissive approach in his leadership. He made do with the resources he had to ensure the port's survival â€“ even if the plans deviated from Raffles' specific directions.
Farquhar's dismissal from Singapore
When Raffles returned to Singapore in 1823, he was furious to find out that his plans had been neglected, and that local vice

In [24]:

# Function to query from the vector store
def answer_question_from_vectorstore(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Given a user's question and any provided filters (student age group, historical timeframe, theme), please:

1. Generate 3-5 different perspectives on the question, each with a brief summary (2-3 sentences) explaining the reasoning behind that perspective.
For each perspective, provide citations in page number

2. Ensure that the language and content complexity is appropriate for the specified student age group (if provided).
3. If a specific historical timeframe or theme is specified, tailor your responses to fit within those parameters.
4. After presenting the perspectives, suggest 2-3 discussion questions that could encourage critical thinking among students about these different viewpoints.

Remember, your goal is to provide educators with balanced, well-sourced information that they can use to create engaging and thought-provoking lessons about Singapore's history and culture.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = vector_store.as_retriever(search_kwargs={"k": 10,"score_threshold": 0.5})
    retrieved_docs = retriever.invoke(input_question)

    formatted_context = format_docs(retrieved_docs)

    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs}

In [60]:
## testing with similarity search with relevance score.

In [25]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableLambda
import os

import warnings

# Suppress the specific user warning
warnings.filterwarnings("ignore", message="Relevance scores must be between 0 and 1")


def answer_question_from_vector_store(vector_store, input_question, k=5):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions.

Generate 3-5 different perspectives on the question, each with a brief summary (2-3 sentences) explaining the reasoning behind that perspective. 

Please format your response using the following structure:

Perspective 1 : (Brief summary of perspective)

Perspective 2 : (Brief summary of perspective)

[Additional Perspectives if supported by context...]

[Discussion Questions]
(Only include questions that can be answered using the provided context)
1. (question that encourages critical thinking)
2. (question that encourages critical thinking)
3. (question that encourages critical thinking)

Context: {context}

Question: {question}

If the query is not relevant to Singapore History, Please reply that you dont know the answer.Only answer questions based on the context, do not hallucinate.
        """
    )

    # Function to format documents for input into the prompt
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Retrieve documents using similarity search with relevance scores
    ranked_results = vector_store.similarity_search_with_relevance_scores(input_question, k=k)

    # # Collect relevant documents based on the similarity score
    # relevant_docs = []
    # for doc, score in ranked_results:
    #     doc.metadata["similarity_score"] = score  # Add the similarity score to metadata
    #     relevant_docs.append(doc)

    # formatted_context = format_docs(relevant_docs)

        # Collect relevant documents based on the similarity score (filter out negative scores)
    relevant_docs = []
    for doc, score in ranked_results:
      #  if 0.2 <= score <= 1:  # Only accept valid scores between 0 and 1
            doc.metadata["similarity_score"] = score  # Add the similarity score to metadata
            relevant_docs.append(doc)

    formatted_context = format_docs(relevant_docs)
    
    # Chain the context and question into a format suitable for the prompt
    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    # Get the result from the RAG chain
    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})

    return {"answer": result.content, "context": relevant_docs}

# Test query
question = "Who is the founder of Singapore ?"
response = answer_question_from_vector_store(vectorstore, question)

print(response['answer'])

Perspective 1: Raffles was the founder of Singapore as he signed the 1819 Treaty that allowed the British to set up a trading post in the southern part of Singapore. His contributions in establishing Singapore as a thriving settlement are significant.

Perspective 2: Farquhar was the founder of Singapore as he did the work of building Singapore from scratch. His efforts in developing the early infrastructure and administration of Singapore were crucial in its foundation.

Perspective 3: Crawfurd could be considered the founder of Singapore as he signed the 1824 Treaty of Friendship and Alliance that gave the British control over the whole island. His role in solidifying British control over Singapore was essential in its development.

Discussion Questions:
1. How do the different perspectives on who founded Singapore reflect the complexities of historical narratives and the importance of multiple viewpoints?
2. In what ways do the contributions of Raffles, Farquhar, and Crawfurd highli

In [26]:
# Test query
question = "What caused Singapore to become Independent?"
response = answer_question_from_vector_store(vectorstore, question)

print(response['answer'])

Perspective 1: Singapore became an independent nation due to its separation from Malaysia on 9 August 1965. This perspective emphasizes the historical event that led to Singapore's independence.

Perspective 2: Singapore's independence was a result of the challenges it faced as a nation, including the need to survive in a volatile regional climate. This perspective highlights the external factors that influenced Singapore's path to independence.

Perspective 3: The decision for Singapore to become an independent nation was met with mixed responses, with concerns raised about its ability to thrive on its own. This perspective focuses on the reactions and doubts surrounding Singapore's independence.

Discussion Questions:
1. How did the volatile regional climate, particularly the Konfrontasi conflict, impact Singapore's journey to independence?
2. In what ways did the concerns raised about Singapore's ability to survive as an independent nation shape its early policies and strategies?
3.

In [None]:
#- pydantic/json output?
#load as markdown and chunk from Markdown
# docling - issue from quality of output from PDF file?
# reranking - pinecone and 
# langchain reranker - flash rank reranker - 
#chunk size/markdown
#.structured output extraction langchain - output in json
# additional comments/ json one-shot
# key 1 perspective 1 
# async -

In [27]:
print("Referenced sources:")
for doc in response['context']:
    # Print all metadata keys and values
    #print("Metadata:")
    for key, value in doc.metadata.items():
        # Check if the key is 'title', 'source', or 'url' and handle them accordingly
        if key in ['title', 'source', 'url']:
            print(f"{key.capitalize()}: {value}")
        else:
            # Print other metadata normally
            print(f"{key.capitalize()}: {value}")
    
    # Print the page content as well
    print(f"\nPage Content:\n{doc.page_content}\n")


Referenced sources:
Page: 122
Source: Sec2
Similarity_score: 0.6950448127989101

Page Content:
Within a few weeks, the Independence of Singapore Agreement was 
signed and Singapore was no longer a part of Malaysia. On 9 August 1965, 
an emotional Prime Minister Lee announced Singapore’s separation from 
the Federation in a live televised press conference:
Everytime we look back to the moment we signed this 
document it is for us a moment of anguish. … All my 
life, my whole adult life, I have believed in merger and 
the unity of the two territories. We are connected 
by geography, economics and ties of kinship. It broke 
everything we stood for.
– Adapted from The Straits Times, 10 August 1965
113
HOW DID SINGAPORE BECOME AN INDEPENDENT NATION?

Page: 112
Source: Sec2
Similarity_score: 0.6340082736210397

Page Content:
declare, on behalf of the people of 
Singapore, that as from today, the 
16th day of September, 1963, Singapore 
shall be forever a part of the sovereign 
democratic and

## Match to each perspective to top source by Cosine Similarity...?
##Johnny Tay
2:15 PM
https://cohere.com/rerank
Meldrick Wee
2:15 PM
https://python.langchain.com/docs/integrations/retrievers/flashrank-reranker/
Johnny Tay
2:16 PM
https://python.langchain.com/docs/integrations/retrievers/cohere-reranker/
Johnny Tay
2:18 PM
https://alain-airom.medium.com/my-first-hands-on-experience-with-docling-ed96eb7f864b
Johnny Tay
2:19 PM
https://huggingface.co/tasks/sentence-similarity
Johnny Tay
2:23 PM
retriever = db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5}
)
https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/