In [1]:

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Check if a specific variable is loaded
#print(os.getenv("OPENAI_API_KEY"))

True

In [12]:
import os

if "OPENAI_API_KEY" in os.environ:
    del os.environ["OPENAI_API_KEY"]

In [3]:
import pickle

# Load the processed documents from the pickle file
with open('textbooks.pkl', 'rb') as f:
    pdf_documents = pickle.load(f)

print("Documents loaded from file.")


Documents loaded from file.


In [5]:
# Load the processed documents from the pickle file
with open('infopedia.pickle', 'rb') as f:
    infopedia = pickle.load(f)

print("Documents loaded from file.")

Documents loaded from file.


In [7]:
import pandas as pd

# Load the pickle file
with open("infopedia.pickle", "rb") as f:
    data = pickle.load(f)

# Print first few rows if it's a DataFrame
if isinstance(data, list):  # If it's a list
    print(data[:5])  # Show first 5 items
elif isinstance(data, dict):  # If it's a dictionary
    print({k: data[k] for k in list(data.keys())[:5]})  # Show first 5 keys and values
else:
    print(data)  # Print if it's another type

{'Blue-throated bee-eater': {'url': 'https://www.nlb.gov.sg/main/article-detail?cmsuuid=fa1e7170-fe30-4323-acc0-25809646deb5', 'content': "The blue-throated bee-eater (Merops viridis) is a bird with blue, green and brown plumage. It is one of two bee-eater species found in Singapore.DescriptionThe blue-throated bee-eater is recognisable by its bright blue throat, chestnut head and back, and black eye stripe. Its upperparts and wing feathers are predominantly green, while its lower back, rump and tail are a brilliant blue. Adults have distinctive long tail feathers. Juveniles lack the long tail feathers and have a bluish green head. Adults measure about 28cm in length, including their tails.1Their flight is jerky and undulating, with a few rapid wingbeats followed by a long glide. Their call sounds like “berek berek”, giving rise to its Malay name, and is frequently uttered in flight.2ReproductionBlue-throated bee-eaters nest communally and excavate nest burrows in sandy banks, quarry f

In [9]:
import pickle
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
import os

# Helper function to sanitize metadata
def sanitize_metadata(metadata):
    """Ensure all metadata values are str, int, float, or bool (replace None with 'Unknown')."""
    return {k: (v if isinstance(v, (str, int, float, bool)) else "Unknown") for k, v in metadata.items()}

# Load Pickle File (Infopedia Articles)
with open("infopedia.pickle", "rb") as f:
    infopedia_data = pickle.load(f)  # This is a dictionary

# Convert Pickle Data into LangChain Document Objects
infopedia_articles = []
for title, details in infopedia_data.items():
    metadata = sanitize_metadata({
        'title': title,
        'source': details.get('source', 'Unknown'),
        'url': details.get('url', 'No URL'),
        'last_update_date': details.get('last_update_date', 'Unknown')
    })
    infopedia_articles.append(Document(page_content=details["content"], metadata=metadata))

# Load Articles from CSV
csv_file = "roots_sg_articles_cleaned.csv"
df = pd.read_csv(csv_file)

# Fill NaN values in 'text' column with an empty string or a default string
df['text'] = df['text'].fillna('missing content')

# Ensure 'text' column is a string
df['text'] = df['text'].astype(str)

# Convert CSV Data into LangChain Documents
articles = []
for _, row in df.iterrows():
    metadata = sanitize_metadata({
        'title': row['title'],
        'source': row['source'],
        'url': row['url']
    })
    articles.append(Document(page_content=row['text'], metadata=metadata))

# Load Processed PDF Documents from Pickle File
with open("textbooks.pkl", "rb") as f:
    pdf_documents = pickle.load(f)  # This should already be in LangChain Document format

# Ensure metadata for PDF documents is sanitized
pdf_documents = [
    Document(page_content=doc.page_content, metadata=sanitize_metadata(doc.metadata))
    for doc in pdf_documents
]

print("Documents loaded from pickle file.")

# Combine All Document Sources (CSV, Infopedia, and PDF)
all_documents = articles + infopedia_articles + pdf_documents

# Step 3: Chunk the Documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, 
    chunk_overlap=100,
    separators=[".", "!", "?", ",", "\n\n", "\n", " ", ""]
)
all_chunks = []

for doc in all_documents:
    splits = text_splitter.split_text(doc.page_content)
    for split in splits:
        all_chunks.append({
            "text": split,
            "metadata": doc.metadata  # Metadata is already sanitized
        })

# Step 4: Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)

# Step 5: Create the Combined Chroma Vector Store
chroma_vector_store = Chroma.from_texts(
    [chunk["text"] for chunk in all_chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in all_chunks],
    collection_name="combined_documents"
)

# Step 6: Save the Combined Vector Store
chroma_vector_store.persist()
print("Combined Chroma vector store created and saved successfully!")


Documents loaded from pickle file.
Combined Chroma vector store created and saved successfully!


  chroma_vector_store.persist()


In [16]:
import os
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableLambda
from langchain.vectorstores import Chroma

# Load the existing ChromaDB vector store
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings())

# Function to query from the vector store
def answer_question_from_vectorstore(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Context: {context}

Question: {question}

Provide a well-structured response based on the given context.
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Retrieve the top-k most relevant documents
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)

    formatted_context = format_docs(retrieved_docs)

    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    
    return {"answer": result.content, "context": retrieved_docs}

# Test query
question = "Who is the founder of Singapore?"
response = answer_question_from_vectorstore(vector_store, question)

# Display AI Answer
print("AI Answer:")
print(response["answer"])
print("\nReferenced Sources:")

# Print each referenced document with all metadata
for doc in response["context"]:
    print("\nMetadata:")
    for key, value in doc.metadata.items():
        print(f"{key.capitalize()}: {value if value else 'Unknown'}")  # Ensure empty values are replaced
    
    # Print an excerpt of the page content
    print("\nPage Content:\n", doc.page_content[:500], "...\n")  # Print only the first 500 characters


AI Answer:
Response:

The question of who is the founder of Singapore is a complex one that has been debated by historians and scholars over the years. While there is no definitive answer, it is generally accepted that Sir Stamford Raffles played a significant role in the founding of modern Singapore.

Sir Stamford Raffles was a British statesman and colonial administrator who is credited with establishing a British settlement on the island of Singapore in 1819. He signed a treaty with the local Malay rulers that allowed the British East India Company to set up a trading post in the region. This marked the beginning of British colonial rule in Singapore.

However, it is important to note that Singapore has a long history that predates the arrival of Raffles. The island was known to have been inhabited by indigenous Malay and Orang Laut communities for centuries before the arrival of the British. Additionally, Singapore was part of the Malay Archipelago and had been a trading hub for va

In [17]:
import os
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableLambda
from langchain.vectorstores import Chroma

# Load the existing ChromaDB vector store
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings())

# Function to query from the vector store
def answer_question_from_vectorstore(vector_store, input_question):
    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Context: {context}

Question: {question}

Provide a well-structured response based on the given context.
        """
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Retrieve the top-k most relevant documents
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)

    # Debug: Print retrieved metadata to check if it's correctly stored
    print("\nDEBUG: Retrieved Documents Metadata")
    for i, doc in enumerate(retrieved_docs):
        print(f"\nDocument {i+1} Metadata:")
        for key, value in doc.metadata.items():
            print(f"{key}: {value}")

    formatted_context = format_docs(retrieved_docs)

    rag_chain_from_docs = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    
    return {"answer": result.content, "context": retrieved_docs}

# Test query
question = "Who is the founder of Singapore?"
response = answer_question_from_vectorstore(vector_store, question)

# Display AI Answer
print("\nAI Answer:")
print(response["answer"])
print("\nReferenced Sources:")

# Print each referenced document with all metadata
for doc in response["context"]:
    print("\nMetadata:")
    if doc.metadata:
        for key, value in doc.metadata.items():
            print(f"{key.capitalize()}: {value if value else 'Unknown'}")  # Replace None values
    else:
        print("No metadata available for this document.")

    # Print an excerpt of the page content
    print("\nPage Content:\n", doc.page_content[:500], "...\n")  # Print only the first 500 characters



DEBUG: Retrieved Documents Metadata

AI Answer:
Response:

The question of who is the founder of Singapore is a complex one that has been debated by historians and scholars over the years. While there is no definitive answer, it is generally accepted that Sir Stamford Raffles played a significant role in the founding of modern Singapore.

Sir Stamford Raffles was a British statesman and colonial administrator who is credited with establishing a British settlement on the island of Singapore in 1819. He signed a treaty with the local Malay rulers, allowing the British East India Company to set up a trading post in the region. This marked the beginning of British colonial rule in Singapore.

However, it is important to note that Singapore has a long history that predates the arrival of Raffles. The island was known to have been inhabited by indigenous Malay and Orang Laut communities for centuries before the arrival of the British. Additionally, Singapore was part of the Malay Archipelag

In [23]:
chroma_vector_store = Chroma.from_texts(
    [chunk["text"] for chunk in all_chunks], 
    embeddings, 
    metadatas=[chunk["metadata"] for chunk in all_chunks],  # Ensure metadata is stored
    collection_name="combined_documents"
)

# Debug: Print first 5 documents to check if they were saved
for i, chunk in enumerate(all_chunks[:5]):
    print(f"\nDocument {i+1}:")
    print("Text:", chunk["text"][:300], "...")
    print("Metadata:", chunk["metadata"])




Document 1:
Text: TL;DR
A mythical creature from ancient Chinese mythology that found expression through a ritual dance in Hakka culture struggles to find its place in the modern city-state.
MUSESG Volume 16 Issue 2 - July 2023
Text by Angela Sim, Heritage Researcher
Read the full MUSE SG Vol 16, Issue 2
Image above: ...
Metadata: {'title': 'High Leaps And Clashing Cymbals', 'source': 'Roots Website', 'url': 'https://www.roots.gov.sg/stories-landing/stories/high-leaps-and-clashing-cymbals/story'}

Document 2:
Text: .
In his adulthood, Confucius himself chanced upon the same creature when it was captured in a royal hunt. After the encounter, the sage predicted the end of his life; true to his word, he passed away three years later. What is the story behind this sacred chimera that foretold the birth and passing ...
Metadata: {'title': 'High Leaps And Clashing Cymbals', 'source': 'Roots Website', 'url': 'https://www.roots.gov.sg/stories-landing/stories/high-leaps-and-clashing-cymbals/st

## Integrating Websearch Features...???

In [30]:
from langchain_community.tools import DuckDuckGoSearchRun

search = DuckDuckGoSearchRun()

search.invoke("Who is the founder of Singapore?")

  ddgs_gen = ddgs.text(


'The history of the modern state of Singapore dates back to its founding in the early 19th century; however, evidence suggests that a significant trading settlement existed on the island in the 14th century. The last ruler of the Kingdom of Singapura, Parameswara, was expelled by the Majapahit or the Siamese before he founded Malacca. History of Singapore, a survey of the important events and people in the history of Singapore. Located at the southern tip of the Malay Peninsula, Singapore is the largest port city in Southeast Asia and one of the busiest in the world. It owes its growth and prosperity to its position at the Lee Kuan Yew was a politician and lawyer who was the first prime minister of Singapore, serving from 1959 to 1990. During his long rule, Singapore became the most prosperous country in Southeast Asia. Learn more about Lee\'s life and political career in this article. Lee Kuan Yew is widely recognized as the founding father of Singapore. He co-founded the People\'s Ac

In [29]:
from langchain_community.tools import DuckDuckGoSearchResults

search = DuckDuckGoSearchResults(output_format="list")

search.invoke("Who is the founder of Singapore?")

  ddgs_gen = ddgs.text(


"snippet: The history of the modern state of Singapore dates back to its founding in the early 19th century; however, evidence suggests that a significant trading settlement existed on the island in the 14th century. The last ruler of the Kingdom of Singapura, Parameswara, was expelled by the Majapahit or the Siamese before he founded Malacca.Singapore then came under the Malacca Sultanate and ..., title: History of Singapore - Wikipedia, link: https://en.wikipedia.org/wiki/History_of_Singapore, snippet: Lee Kuan Yew (born September 16, 1923, Singapore—died March 23, 2015, Singapore) was a politician and lawyer who was prime minister of Singapore from 1959 to 1990. Widely regarded as the founding father of modern Singapore, Lee transformed the city-state from a small, resource-deficient British colony with high rates of illiteracy into the most prosperous country in Southeast Asia., title: Lee Kuan Yew | Biography, Education, Achievements, & Facts - Britannica, link: https://www.britan

In [33]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

wrapper = DuckDuckGoSearchAPIWrapper(region="sg-en", max_results=2)

search = DuckDuckGoSearchResults(api_wrapper=wrapper, source="news")

search.invoke("who is the founder of Singapore?")

  ddgs_gen = ddgs.text(


"snippet: The history of the modern state of Singapore dates back to its founding in the early 19th century; however, evidence suggests that a significant trading settlement existed on the island in the 14th century. The last ruler of the Kingdom of Singapura, Parameswara, was expelled by the Majapahit or the Siamese before he founded Malacca.Singapore then came under the Malacca Sultanate and ..., title: History of Singapore - Wikipedia, link: https://en.wikipedia.org/wiki/History_of_Singapore, snippet: history of Singapore, a survey of the important events and people in the history of Singapore.Located at the southern tip of the Malay Peninsula, Singapore is the largest port city in Southeast Asia and one of the busiest in the world. It owes its growth and prosperity to its position at the southern extremity of the peninsula, where it dominates the Strait of Malacca, which connects the ..., title: History of Singapore | People, Culture, Language, & Facts | Britannica, link: https://w