Vector Database FAISS/ChromaDB/PineCone?

-  Step 1: Load sec1 and sec2 PDFs
-  Step 2: Split into Chunks + Tag with Metadata ( Page Number + sec 2/sec1 label)
-  Step 3: Combine into VectorDB

Each Query Should Return:
- Page Number + Sec 2/Sec1 source
- UI should be able to filter between sec 1 and sec 2 content.

langchain==0.3.9
langchain-community==0.3.1
pypdf==4.2.0
python-dotenv
faiss-cpu==1.7.4


In [1]:

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Check if a specific variable is loaded
print(os.getenv("OPENAI_API_KEY"))


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


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda
from langchain.chat_models import ChatOpenAI
import os

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Save the full text from each PDF
with open('full_text_from_pdfs.txt', 'w', encoding='utf-8') as full_text_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        full_text_file.write(f"--- {label} ---\n")
        for page in pages:
            # Write the full text from each page into the file
            full_text_file.write(page.page_content + "\n")



In [None]:
# run this part only to reload documents

In [20]:
from langchain.document_loaders import PyPDFLoader

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Verify the PDF loading process
for label, path in pdf_paths.items():
    try:
        loader = PyPDFLoader(path)
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages from {label}")
        
        for page in pages:
            documents.append({
                "text": page.page_content,
                "metadata": {"page": page.metadata["page"], "source": label}
            })
    except Exception as e:
        print(f"Error loading {path}: {e}")

# Check if documents are populated
if not documents:
    print("Documents list is empty after loading PDFs!")
else:
    print(f"Documents loaded: {len(documents)}")
#delete this

Successfully loaded 220 pages from Sec1
Successfully loaded 216 pages from Sec2
Documents loaded: 436


In [22]:
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Ensure documents are stored as Document objects
for label, path in pdf_paths.items():
    try:
        loader = PyPDFLoader(path)
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages from {label}")
        
        for page in pages:
            # Create a Document object with the correct structure
            doc = Document(
                page_content=page.page_content,
                metadata={"page": page.metadata["page"], "source": label}
            )
            documents.append(doc)
    except Exception as e:
        print(f"Error loading {path}: {e}")

# Check if documents are populated
if not documents:
    print("Documents list is empty after loading PDFs!")
else:
    print(f"Documents loaded: {len(documents)}")



Successfully loaded 220 pages from Sec1
Successfully loaded 216 pages from Sec2
Documents loaded: 436


## Trying different Text Splitters

1. Recursive Character
2. Paragraph
3. Semantic

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
chunks = []

# Save the chunks into a file for observation
with open('text_chunks_recursive.txt', 'w', encoding='utf-8') as chunk_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        for page in pages:
            text = page.page_content
            splits = text_splitter.split_text(text)  # Split the text into chunks
            for split in splits:
                chunks.append({
                    "text": split,
                    "metadata": {"page": page.metadata["page"], "source": label}
                })
                # Write the chunk text into the file
                chunk_file.write(f"--- Chunk from {label} (Page {page.metadata['page']}) ---\n")
                chunk_file.write(split + "\n")


In [15]:
import re

# Chunking text based on paragraph (assuming each paragraph is separated by a new line)
paragraph_chunks = []

with open('paragraph_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for label, path in pdf_paths.items():
        loader = PyPDFLoader(path)
        pages = loader.load()
        for page in pages:
            text = page.page_content
            # Split based on new lines (assumed paragraph boundary)
            splits = re.split(r'\n\n+', text)  # Split by double newlines
            for split in splits:
                paragraph_chunks.append({
                    "text": split,
                    "metadata": {"page": page.metadata["page"], "source": label}
                })
                # Write each paragrapbh into the file
                chunk_file.write(f"--- Paragraph Chunk from {label} (Page {page.metadata['page']}) ---\n")
                chunk_file.write(split + "\n")


document check

In [19]:
# Ensure documents list is populated
if not documents:
    print("Documents list is empty!")
else:
    print(f"Documents loaded: {len(documents)}")

# Generate embeddings for the documents to check if embeddings are created correctly
embedding_texts = [doc["text"] for doc in documents]

# Generate embeddings using OpenAI embeddings model
embeddings_vectors = embeddings.embed_documents(embedding_texts)

# Check if embeddings were generated correctly
if len(embeddings_vectors) == len(embedding_texts):
    print("Embeddings generated correctly!")
else:
    print(f"Error: Expected {len(embedding_texts)} embeddings, but got {len(embeddings_vectors)}.")


Documents list is empty!
Embeddings generated correctly!


## Vector Database Creation (FAISS)

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda
from langchain.chat_models import ChatOpenAI
#from langchain.output_parsers import StrOutputParser
import os

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        documents.append({
            "text": page.page_content,
            "metadata": {"page": page.metadata["page"], "source": label}
        })

# Chunk the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
chunks = []

for doc in documents:
    splits = text_splitter.split_text(doc["text"])
    for split in splits:
        chunks.append({
            "text": split,
            "metadata": doc["metadata"]
        })

# Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",openai_api_key=OPENAI_API_KEY)

faiss_index = FAISS.from_texts(
    [chunk["text"] for chunk in chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in chunks]
)

# Save VectorDB
faiss_index.save_local("faiss_index")

print("Vector database created successfully!")

Vector database created successfully!


In [30]:
#rerun this one for results.
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import os

# Load PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
documents = []

# Load the PDFs and create documents list
for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        documents.append(Document(page_content=page.page_content, metadata={"page": page.metadata["page"], "source": label}))

# Initialize the OpenAI Embeddings model
embed_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Initialize the SemanticChunker with the model
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Chunk the documents semantically
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])

# Save semantic chunks to a file for observation
with open('semantic_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for semantic_chunk in semantic_chunks:
        # Write the chunk's content and metadata to the file
        chunk_file.write(f"--- Semantic Chunk from {semantic_chunk.metadata['source']} (Page {semantic_chunk.metadata['page']}) ---\n")
        chunk_file.write(semantic_chunk.page_content + "\n")
        chunk_file.write("\n")  # Add a blank line between chunks

# Optional: Print a chunk if it contains a specific phrase, e.g., "Effect of Pre-training Tasks"
for semantic_chunk in semantic_chunks:
    if "Effect of Pre-training Tasks" in semantic_chunk.page_content:
        print(f"--- Found Chunk ---\n{semantic_chunk.page_content}")
        print(f"Length: {len(semantic_chunk.page_content)}")


KeyError: 'source'

In [34]:

# Initialize the SemanticChunker with the model
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Chunk the documents semantically
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])

# Save semantic chunks to a file for observation
with open('semantic_text_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    chunk_index = 0  # Initialize chunk index
    for doc in documents:
        # Get the number of chunks for the current document
        doc_chunks = [chunk for chunk in semantic_chunks if chunk.page_content in doc.page_content]
        
        # Add each chunk with the corresponding metadata
        for semantic_chunk in doc_chunks:
            source = doc.metadata.get('source', 'Unknown')
            page = doc.metadata.get('page', 'Unknown')
            
            # Write the chunk's content and metadata to the file
            chunk_file.write(f"--- Semantic Chunk from {source} (Page {page}) ---\n")
            chunk_file.write(semantic_chunk.page_content + "\n")
            chunk_file.write("\n")  # Add a blank line between chunks

            chunk_index += 1  # Increment the chunk index

# Optional: Print a chunk if it contains a specific phrase, e.g., "Effect of Pre-training Tasks"
for semantic_chunk in semantic_chunks:
    if "Effect of Pre-training Tasks" in semantic_chunk.page_content:
        print(f"--- Found Chunk ---\n{semantic_chunk.page_content}")
        print(f"Length: {len(semantic_chunk.page_content)}")

## Testing Response from Vector DB

In [21]:
def answer_question_from_vector_store(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Given a user's question and any provided filters (student age group, historical timeframe, theme), please:

1. Generate 3-5 different perspectives on the question, each with a brief summary (2-3 sentences) explaining the reasoning behind that perspective.
For each perspective, provide citations in page number

2. Ensure that the language and content complexity is appropriate for the specified student age group (if provided).
3. If a specific historical timeframe or theme is specified, tailor your responses to fit within those parameters.
4. After presenting the perspectives, suggest 2-3 discussion questions that could encourage critical thinking among students about these different viewpoints.

Remember, your goal is to provide educators with balanced, well-sourced information that they can use to create engaging and thought-provoking lessons about Singapore's history and culture.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)
    
    formatted_context = format_docs(retrieved_docs)
    
    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs} 

# Load FAISS index
vectorstore = FAISS.load_local("faiss_index", embeddings,allow_dangerous_deserialization=True)


In [22]:
# Test query
question = "Who is the founder of Singapore?"
response = answer_question_from_vector_store(vectorstore, question)

print(response['answer'])
print()
print(f"Referenced sources: {[doc.metadata['source'] for doc in response['context']]}")


Perspective 1: Sir Stamford Raffles
Summary: Some argue that Raffles should be considered the founder of Singapore due to his role in signing the 1819 Treaty that allowed the British to establish a trading post in the region. His contributions to the early development of Singapore are significant and well-documented.

Perspective 2: William Farquhar
Summary: Others believe that Farquhar should be recognized as the founder of Singapore because of his efforts in building the settlement from scratch. Farquhar played a crucial role in the early development of Singapore alongside Raffles.

Perspective 3: John Crawfurd
Summary: Some may consider Crawfurd as the founder of Singapore because he signed the 1824 Treaty of Friendship and Alliance that gave the British control over the entire island. His diplomatic efforts and contributions to the British colonial presence in Singapore are noteworthy.

Discussion Questions:
1. How do the different perspectives on who founded Singapore reflect the 

In [23]:
print(response['answer'])
print()
print("Referenced sources:")
for doc in response['context']:
    print(f"Page {doc.metadata['page']} (Source: {doc.metadata['source']}):\n{doc.page_content}\n")


Perspective 1: Sir Stamford Raffles
Summary: Some argue that Raffles should be considered the founder of Singapore due to his role in signing the 1819 Treaty that allowed the British to establish a trading post in the region. His contributions to the early development of Singapore are significant and well-documented.

Perspective 2: William Farquhar
Summary: Others believe that Farquhar should be recognized as the founder of Singapore because of his efforts in building the settlement from scratch. Farquhar played a crucial role in the early development of Singapore alongside Raffles.

Perspective 3: John Crawfurd
Summary: Some may consider Crawfurd as the founder of Singapore because he signed the 1824 Treaty of Friendship and Alliance that gave the British control over the entire island. His diplomatic efforts and contributions to the British colonial presence in Singapore are noteworthy.

Discussion Questions:
1. How do the different perspectives on who founded Singapore reflect the 

In [None]:
#run this later at home

In [None]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import os

# Step 1: Load Articles from CSV (assuming columns: url, text, source, title)
csv_file = "roots_sg_articles_cleaned.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file)

# Create documents from CSV data
articles = []
for index, row in df.iterrows():
    articles.append(Document(
        page_content=row['text'],
        metadata={
            'title': row['title'],
            'source': row['source'],
            'url': row['url']
        }
    ))

# Step 2: Load and Process PDFs
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
pdf_documents = []

for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        pdf_documents.append({
            "text": page.page_content,
            "metadata": {"page": page.metadata["page"], "source": label}
        })

# Combine both article and PDF documents into one list
all_documents = articles + pdf_documents

# Step 3: Chunk the Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_chunks = []

for doc in all_documents:
    splits = text_splitter.split_text(doc.page_content)
    for split in splits:
        all_chunks.append({
            "text": split,
            "metadata": doc.metadata
        })

# Step 4: Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Step 5: Create the Combined FAISS Vector Store
faiss_index = FAISS.from_texts(
    [chunk["text"] for chunk in all_chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in all_chunks]
)

# Step 6: Save the Combined Vector Store
faiss_index.save_local("combined_faiss_index")

print("Combined vector store created and saved successfully!")


In [None]:

# Function to query from the vector store
def answer_question_from_vectorstore(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Given a user's question and any provided filters (student age group, historical timeframe, theme), please:

1. Generate 3-5 different perspectives on the question, each with a brief summary (2-3 sentences) explaining the reasoning behind that perspective.
For each perspective, provide citations in page number

2. Ensure that the language and content complexity is appropriate for the specified student age group (if provided).
3. If a specific historical timeframe or theme is specified, tailor your responses to fit within those parameters.
4. After presenting the perspectives, suggest 2-3 discussion questions that could encourage critical thinking among students about these different viewpoints.

Remember, your goal is to provide educators with balanced, well-sourced information that they can use to create engaging and thought-provoking lessons about Singapore's history and culture.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)

    formatted_context = format_docs(retrieved_docs)

    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs}