### Model 1

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# === Configuration ===
PDF_FOLDER = "data"                    # folder scanned for input PDFs
CHUNK_SIZE = 5000                      # chunk_size passed to the text splitter
CHUNK_OVERLAP = 500                    # chunk_overlap passed to the text splitter
VECTOR_STORE_DIR = "faiss_index_open"  # directory the FAISS index is saved to
LOG_FILE = "qa_log_1.csv"              # CSV log appended to by process_answer

# === Load all PDFs ===
# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# chunk IDs assigned below depend on load order, so an unsorted listing makes
# the logged chunk IDs irreproducible between runs.
all_documents = []
for filename in sorted(os.listdir(PDF_FOLDER)):
    # Case-insensitive match so files named "*.PDF" are not silently skipped.
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(PDF_FOLDER, filename)
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()
        for doc in documents:
            doc.metadata["source"] = filename  # track which PDF it came from
        all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# === Split text into chunks ===
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(all_documents)

# Fail early with a clear message; an empty corpus would otherwise surface as a
# less helpful error during index construction.
if not chunks:
    raise ValueError(f"No text chunks were produced from the PDFs in {PDF_FOLDER!r}.")

# Add unique chunk IDs and ensure source is in metadata
for i, doc in enumerate(chunks):
    doc.metadata["chunk_id"] = i
    doc.metadata.setdefault("source", "unknown")

# === Embedding ===
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
# Embeddings are left un-normalized (original note: False -> Euclidean,
# True -> cosine similarity).
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# === Vector store ===
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local(VECTOR_STORE_DIR)

# === Retriever ===
# Return the 3 best-matching chunks for each query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# === LLM ===
# Never hardcode credentials in a notebook: read the key from the environment.
# Any key that was previously committed in this cell should be revoked.
pplx_api_key = os.environ.get("PPLX_API_KEY")
if not pplx_api_key:
    raise RuntimeError(
        "Set the PPLX_API_KEY environment variable before running this notebook."
    )

llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=pplx_api_key,
    temperature=0.2
)

# === QA chain ===
# "stuff" places the retrieved chunks into a single prompt;
# return_source_documents=True exposes them under "source_documents" so that
# process_answer can log which chunks were used.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

Loaded 1182 total documents.


  hf = HuggingFaceEmbeddings(
  llm = ChatPerplexity(


In [3]:
import json
import os
import csv

def process_answer(query):
    """Answer ``query`` with the QA chain and append the outcome to ``LOG_FILE``.

    The chain response is expected to carry the answer under ``"result"`` and
    the retrieved documents under ``"source_documents"``. If the result is a
    JSON object, its ``"answer"`` field is logged; any other result (plain
    text, non-object JSON, non-string) is logged as-is.

    One CSV row is appended with the columns ``question``, ``answer``,
    ``sources`` and ``top_k_chunks``. Sources and chunk IDs are de-duplicated
    and joined with ``"; "`` in the order the chain returned the documents.

    Args:
        query: The question text passed to the chain.

    Returns:
        None. The only effect is the row appended to ``LOG_FILE``.
    """
    # `.invoke()` replaces calling the chain object directly, which is the
    # deprecated path that emits a warning on every question.
    response = qa_chain.invoke({"query": query})
    raw_result = response["result"]

    # The result may be a JSON-formatted string. Only a JSON *object* can hold
    # an "answer" field; valid JSON that is not an object (e.g. "42") must fall
    # back to the raw text rather than fail on a missing .get().
    try:
        parsed = json.loads(raw_result)
    except (json.JSONDecodeError, TypeError):
        parsed = None
    answer_text = parsed.get("answer", raw_result) if isinstance(parsed, dict) else raw_result

    # Collect chunk IDs and source names, skipping documents that lack them.
    top_k_chunks = []
    sources_list = []
    for doc in response["source_documents"]:
        chunk_id = str(doc.metadata.get("chunk_id", "N/A"))
        source = doc.metadata.get("source", "N/A")
        if chunk_id != "N/A":
            top_k_chunks.append(chunk_id)
        if source != "N/A":
            sources_list.append(source)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the log is
    # deterministic (a set would scramble the order between runs).
    unique_sources = list(dict.fromkeys(sources_list))
    unique_chunks = list(dict.fromkeys(top_k_chunks))

    # Write the header when the log is new, or exists but is still empty.
    needs_header = not os.path.isfile(LOG_FILE) or os.path.getsize(LOG_FILE) == 0
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["question", "answer", "sources", "top_k_chunks"])
        if needs_header:
            writer.writeheader()
        writer.writerow({
            "question": query,
            "answer": answer_text,
            "sources": "; ".join(unique_sources),
            "top_k_chunks": "; ".join(unique_chunks)
        })


In [5]:
import pandas as pd
# Read the evaluation questions, then run each one through the QA pipeline.
# process_answer appends one row per question to the CSV log.
questions = pd.read_csv("session_1/questions.csv")

for question in questions["question"]:
    process_answer(question)

  response = qa_chain({"query": query})


In [6]:
# Preview the QA log. Reading through LOG_FILE (the same constant that
# process_answer appends to) keeps this preview from drifting away from the
# file that is actually written.
df = pd.read_csv(LOG_FILE)
df.head(10)

Unnamed: 0,question,answer,sources,top_k_chunks
0,How did changes in marketing personnel expense...,"In the fiscal year 2024, Amazon's sales and ma...",2024-amazon-annual-report-10K.pdf; 2024-google...,50; 477; 51
1,What factors influence the success of Apple In...,Several key factors influence the success of A...,2024-apple-annual-report-10K.pdf,124; 148; 119
2,How does Cisco’s 2024 employee engagement stra...,Cisco’s 2024 employee engagement strategy fost...,2024-cisco-full-annual-report.pdf,281; 260; 282
3,What factors may affect the revenues of Alphab...,Several factors may affect the revenues of Alp...,2024-google-annual-report-10K.pdf,479; 474; 469
4,What are the potential risks and uncertainties...,"Meta Platforms, Inc. faced several potential r...",2024-oracle-annual-report-10K.pdf; 2024-meta-f...,544; 547; 961
5,What were the main components contributing to ...,To address the query about Netflix's net defer...,2024-netflix-annual-report-10K.pdf,613; 617; 615
6,What was the net change in the number of outst...,To determine the net change in the number of o...,2024-nvidia-annual-report-10K.pdf,821; 859; 752
7,What is Oracle Corporation's strategic approac...,Oracle Corporation’s strategic approach to acq...,2024-oracle-annual-report-10K.pdf,1022; 896; 888
8,"How does the conversion of Reddit, Inc.'s Clas...","The conversion of Reddit, Inc.'s Class B commo...",2024-reddit-annual-report-10K.pdf,1172; 1231; 1173
9,"How did Tesla, Inc.'s interest expense for the...","To compare Tesla, Inc.'s interest expense for ...",2024-tsla-annual-report-10K.pdf,1342; 1347; 1343
