In [None]:
pip install chromadb


In [None]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import os

# Step 1: Load Articles from CSV (assuming columns: url, text, source, title)
csv_file = "articles.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file)

# Wrap every CSV row in a Document: the article body becomes the page content
# and the remaining columns travel along as metadata.
articles = [
    Document(
        page_content=record['text'],
        metadata={
            'title': record['title'],
            'source': record['source'],
            'url': record['url'],
        },
    )
    for _, record in df.iterrows()
]

# Step 2: Load and Process PDFs
# Maps a short source label to the PDF file it is read from.
pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
pdf_documents = []

for label, path in pdf_paths.items():
    loader = PyPDFLoader(path)
    pages = loader.load()
    for page in pages:
        # Build Document objects (not plain dicts) so PDF pages expose the same
        # .page_content / .metadata attributes as the CSV articles; the
        # chunking step reads those attributes on every entry.
        pdf_documents.append(Document(
            page_content=page.page_content,
            metadata={"page": page.metadata["page"], "source": label}
        ))

# Combine both article and PDF documents into one homogeneous list of Documents
all_documents = articles + pdf_documents

# Step 3: Chunk the Documents
# Split each document's text into pieces of up to 500 characters with a
# 50-character overlap; every piece keeps a reference to its parent's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_chunks = [
    {"text": piece, "metadata": document.metadata}
    for document in all_documents
    for piece in text_splitter.split_text(document.page_content)
]

# Step 4: Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Step 5: Create the Combined Chroma Vector Store
# A persist directory must be given at creation time: calling persist() on a
# store created without one raises a ValueError in the LangChain Chroma wrapper.
persist_directory = "chroma_db"  # Replace with the folder the store should be written to
chroma_vector_store = Chroma.from_texts(
    [chunk["text"] for chunk in all_chunks],
    embeddings,
    metadatas=[chunk["metadata"] for chunk in all_chunks],
    collection_name="combined_documents",
    persist_directory=persist_directory,
)

# Step 6: Save the Combined Vector Store
chroma_vector_store.persist()
print("Combined Chroma vector store created and saved successfully!")

# Function to query from the vector store (example)
def answer_question_from_vectorstore(vector_store, input_question):
    """Answer a question using documents retrieved from a vector store.

    The 10 most similar documents are fetched through the store's retriever,
    their contents are joined into a single context string, and that context
    is sent to a chat model together with the question.

    Args:
        vector_store: Object exposing ``as_retriever(search_kwargs=...)``
            whose retriever returns documents with a ``page_content``
            attribute.
        input_question: The question text to retrieve for and to answer.

    Returns:
        dict: ``{"answer": <model reply text>, "context": <retrieved documents>}``.
    """
    # Imported locally because the notebook's import cells do not bring
    # ChatOpenAI into scope; without this the call fails with a NameError.
    from langchain.chat_models import ChatOpenAI

    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        """Join the page contents of the documents, separated by blank lines."""
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)

    formatted_context = format_docs(retrieved_docs)

    # The prompt consumes the {"context", "question"} mapping directly, so the
    # former pass-through RunnableLambda (also never imported) is not needed.
    rag_chain_from_docs = (
        prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs}


In [None]:
# Test query
question = "Who is the founder of Singapore?"
# Call the function and the Chroma store under the names they are defined with
# in the notebook (the previous names did not exist and raised NameError).
response = answer_question_from_vectorstore(chroma_vector_store, question)
print(response['answer'])
print()
print("Referenced sources:")
for doc in response['context']:
    meta = doc.metadata
    # Only PDF chunks carry a 'page' entry; CSV articles carry a 'title'
    # instead, so fall back to that rather than raising KeyError.
    location = f"Page {meta['page']}" if 'page' in meta else meta.get('title', 'Article')
    print(f"{location} (Source: {meta.get('source', 'unknown')}):\n{doc.page_content}\n")


In [None]:
import pickle

# Serialize the combined list (articles + PDF pages) to disk so later sessions
# can skip the CSV/PDF loading steps.
with open('processed_documents.pkl', 'wb') as pickle_file:
    pickle.dump(all_documents, pickle_file)


In [None]:
import pickle

# Restore the previously saved document list from the pickle file.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('processed_documents.pkl', 'rb') as pickle_file:
    all_documents = pickle.load(pickle_file)

print("Documents loaded from file.")


In [None]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import os
import pinecone

# Connect the Pinecone client; the API key is read from the PINECONE_API_KEY
# environment variable (use your own key).
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment="us-west1-gcp",  # Use your region
)

# # Step 1: Load Articles from CSV (assuming columns: url, text, source, title)
# csv_file = "articles.csv"  # Replace with your CSV file path
# df = pd.read_csv(csv_file)

# # Create documents from CSV data
# articles = []
# for index, row in df.iterrows():
#     articles.append(Document(
#         page_content=row['text'],
#         metadata={
#             'title': row['title'],
#             'source': row['source'],
#             'url': row['url']
#         }
#     ))

# # Step 2: Load and Process PDFs
# pdf_paths = {"Sec1": "sec1.pdf", "Sec2": "sec2.pdf"}
# pdf_documents = []

# for label, path in pdf_paths.items():
#     loader = PyPDFLoader(path)
#     pages = loader.load()
#     for page in pages:
#         pdf_documents.append({
#             "text": page.page_content,
#             "metadata": {"page": page.metadata["page"], "source": label}
#         })

# # Combine both article and PDF documents into one list
# all_documents = articles + pdf_documents

# Step 3: Chunk the Documents
# Split each document's text into pieces of up to 500 characters with a
# 50-character overlap; every piece keeps a reference to its parent's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_chunks = [
    {"text": piece, "metadata": document.metadata}
    for document in all_documents
    for piece in text_splitter.split_text(document.page_content)
]

# Step 4: Generate Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Step 5: Create the Combined Pinecone Vector Store
# Pinecone index names may only contain lowercase letters, digits and hyphens,
# so a hyphen is used here instead of an underscore.
index_name = "combined-documents"

# Create Pinecone index if it doesn't exist
if index_name not in pinecone.list_indexes():
    # embed_query returns a plain list of floats (no .shape attribute), so the
    # vector dimension is taken from its length.
    embedding_dimension = len(embeddings.embed_query("test"))
    pinecone.create_index(index_name, dimension=embedding_dimension)

# Connect to the index
index = pinecone.GRPCIndex(index_name)

# Step 6: Add Documents to Pinecone
# Embed all chunk texts through a single embed_documents call instead of
# issuing one embedding request per chunk.
chunk_texts = [chunk["text"] for chunk in all_chunks]
chunk_vectors = embeddings.embed_documents(chunk_texts)

pinecone_docs = [
    {
        "id": str(i),
        "values": vector,
        # Keep the chunk text alongside its metadata (in a fresh dict, so the
        # metadata shared between chunks is not mutated); otherwise the text
        # cannot be recovered from a query match. "text" is the default text
        # key LangChain's Pinecone wrapper reads page content from.
        "metadata": {**chunk["metadata"], "text": chunk["text"]},
    }
    for i, (chunk, vector) in enumerate(zip(all_chunks, chunk_vectors))
]

# Upsert documents into Pinecone in small batches; a single request holding
# every vector can exceed Pinecone's request size limit.
upsert_batch_size = 100
for start in range(0, len(pinecone_docs), upsert_batch_size):
    index.upsert(vectors=pinecone_docs[start:start + upsert_batch_size])

print("Combined Pinecone vector store created and saved successfully!")

# Function to query from the vector store (example)
def answer_question_from_vectorstore(vector_store, input_question):
    """Answer a question using documents retrieved from a vector store.

    The 10 most similar documents are fetched through the store's retriever,
    their contents are joined into a single context string, and that context
    is sent to a chat model together with the question.

    Args:
        vector_store: Object exposing ``as_retriever(search_kwargs=...)``
            whose retriever returns documents with a ``page_content``
            attribute.
        input_question: The question text to retrieve for and to answer.

    Returns:
        dict: ``{"answer": <model reply text>, "context": <retrieved documents>}``.
    """
    # Imported locally because the notebook's import cells do not bring
    # ChatOpenAI into scope; without this the call fails with a NameError.
    from langchain.chat_models import ChatOpenAI

    prompt = PromptTemplate.from_template(
        template="""
You are the Heritage Education Research Assistant, an AI-powered tool designed to help educators in Singapore create comprehensive and balanced lesson plans about Singapore's history and culture. Your task is to provide multiple perspectives on historical questions, with a focus on validated sources from the National Heritage Board (NHB) and other reputable institutions.

Context: {context}

Question: {question}
        """
    )

    def format_docs(docs):
        """Join the page contents of the documents, separated by blank lines."""
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    retrieved_docs = retriever.invoke(input_question)

    formatted_context = format_docs(retrieved_docs)

    # The prompt consumes the {"context", "question"} mapping directly, so the
    # former pass-through RunnableLambda (also never imported) is not needed.
    rag_chain_from_docs = (
        prompt
        | ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
    )

    result = rag_chain_from_docs.invoke({"context": formatted_context, "question": input_question})
    return {"answer": result.content, "context": retrieved_docs}
