### Model 2

In [2]:
import pandas as pd

In [None]:
import os
import csv
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatPerplexity
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# === Settings ===
# Folder that is scanned for input PDF files.
PDF_FOLDER = "data"
# Chunk length and overlap handed to TokenTextSplitter (both counted in tokens).
# NOTE(review): 5000-token chunks are probably far longer than the input limit
# of the BAAI/bge-base-en embedding model used below (believed to be 512
# tokens), in which case only the start of each chunk is embedded — confirm.
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
# Directory passed to Chroma as its persist_directory.
VECTOR_STORE_DIR = "chroma_index_finance"
# CSV file to which process_answer appends one row per question.
LOG_FILE = "qa_log_2.csv"

# === Load all PDFs ===
# Iterate in sorted order: os.listdir() returns entries in arbitrary order, and
# the chunk_id values assigned after splitting depend on document order. Sorting
# keeps the chunk IDs written to the QA log reproducible across runs/machines.
all_documents = []
for filename in sorted(os.listdir(PDF_FOLDER)):
    # Case-insensitive match so files named e.g. "REPORT.PDF" are not skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    file_path = os.path.join(PDF_FOLDER, filename)
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    for doc in documents:
        doc.metadata["source"] = filename  # track which PDF it came from
    all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# === Split text into chunks ===
text_splitter = TokenTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(all_documents)

# Tag every chunk with its position in the list and make sure a "source"
# entry exists (falling back to "unknown" when the loader did not set one).
for chunk_id, chunk in enumerate(chunks):
    chunk.metadata["chunk_id"] = chunk_id
    chunk.metadata.setdefault("source", "unknown")

# === Embedding model ===
# BGE embeddings computed on CPU; vectors are L2-normalised at encode time.
model_name = "BAAI/bge-base-en"
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# === Vector Store ===
# Give every chunk a deterministic ID derived from its chunk_id. Without
# explicit ids, each re-run of this cell adds a complete second copy of all
# chunks to the collection persisted in VECTOR_STORE_DIR, so the retriever
# starts returning duplicates. With stable ids, re-running overwrites the
# existing entries instead.
# NOTE(review): entries written by earlier runs (random ids), or left over from
# a larger corpus, are not removed — delete VECTOR_STORE_DIR once to start clean.
chunk_ids = [f"chunk-{doc.metadata['chunk_id']}" for doc in chunks]
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=hf,
    ids=chunk_ids,
    persist_directory=VECTOR_STORE_DIR
)
# No explicit persist() call: presumably the installed Chroma version writes to
# persist_directory automatically — confirm against the pinned chromadb version.

# === Prompt Template (only answer output) ===
# The template has two placeholders: {context} receives the retrieved text and
# {question} the user's query.
prompt_template = """
You are a professional financial advisor with expertise in corporate finance, investment analysis, and career development in finance-related roles.

Use only the information provided in the context to answer the user's question.
Do not make assumptions or fabricate any details.

Respond clearly and professionally, as if advising a client on their financial career or investment decisions.

{context}

Question: {question}

If the answer is not explicitly stated in the context, respond with: "I don't know based on the provided document".
"""

# from_template derives the input variables ("context", "question") from the
# placeholders in the template text.
PROMPT = PromptTemplate.from_template(prompt_template)

# === Retriever Setup ===
# Similarity search over the vector store, returning the 5 closest chunks.
base_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Perplexity LLM
# The API key is read from the environment so that no credential is stored in
# the notebook or its version history. Set PPLX_API_KEY before starting the
# kernel (for example in the shell or a git-ignored .env file).
pplx_api_key = os.environ.get("PPLX_API_KEY")
if not pplx_api_key:
    raise RuntimeError(
        "PPLX_API_KEY is not set; export your Perplexity API key as an "
        "environment variable before running this cell."
    )

perplexity_llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=pplx_api_key,
    temperature=0.2
)

# Compression retriever
# Wraps the base retriever so each retrieved chunk is passed through an
# LLM-backed extractor before it reaches the QA prompt.
compressor = LLMChainExtractor.from_llm(perplexity_llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

# === QA Chain ===
# "stuff" chain: the retrieved documents are inserted into PROMPT's {context}.
# return_source_documents=True makes the chain output include the documents
# under "source_documents", which process_answer reads for logging.
qa_chain = RetrievalQA.from_chain_type(
    llm=perplexity_llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

# === Process QA + Save to CSV ===



Loaded 1182 total documents.


  hf = HuggingFaceBgeEmbeddings(
  perplexity_llm = ChatPerplexity(


In [4]:
import json
import os
import csv

def _extract_answer(raw_result):
    """Return the answer text from a chain result that may be JSON-encoded.

    If ``raw_result`` parses as a JSON object, its "answer" field is returned
    (falling back to ``raw_result`` when the field is absent). Anything else —
    plain text, non-string values, or JSON that is not an object (e.g. "42") —
    is returned unchanged.
    """
    try:
        parsed = json.loads(raw_result)
    except (json.JSONDecodeError, TypeError):
        # Not JSON at all: use the raw string.
        return raw_result
    if isinstance(parsed, dict):
        return parsed.get("answer", raw_result)
    # Valid JSON but not an object (number, list, ...): there is no "answer"
    # key to read, so keep the raw text instead of failing on .get().
    return raw_result


def _unique_metadata_values(docs, key):
    """Collect ``doc.metadata[key]`` as strings, de-duplicated in first-seen order.

    Documents that lack ``key`` (or hold the placeholder "N/A") are skipped.
    """
    values = (str(doc.metadata.get(key, "N/A")) for doc in docs)
    # dict.fromkeys keeps insertion order, so the logged order follows the
    # order of the returned documents instead of arbitrary set ordering.
    return list(dict.fromkeys(value for value in values if value != "N/A"))


def process_answer(query):
    """Run ``query`` through ``qa_chain`` and append the outcome to ``LOG_FILE``.

    One CSV row is written with the columns "question", "answer", "sources"
    and "top_k_chunks"; the last two are "; "-joined lists of the distinct
    ``source`` and ``chunk_id`` metadata values of the returned source
    documents. The header row is written when the log file is missing or empty.

    Args:
        query: The question text passed to the chain under the "query" key.

    Returns:
        None. The result is only written to the CSV log.
    """
    # .invoke replaces the deprecated direct call ``qa_chain({...})``.
    response = qa_chain.invoke({"query": query})

    answer_text = _extract_answer(response["result"])
    source_docs = response["source_documents"]

    # Prepare data to log
    sources_list = _unique_metadata_values(source_docs, "source")
    top_k_chunks = _unique_metadata_values(source_docs, "chunk_id")

    # Write to CSV. A zero-byte file counts as "new" so it still gets a header.
    needs_header = not os.path.isfile(LOG_FILE) or os.path.getsize(LOG_FILE) == 0
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["question", "answer", "sources", "top_k_chunks"])
        if needs_header:
            writer.writeheader()
        writer.writerow({
            "question": query,
            "answer": answer_text,
            "sources": "; ".join(sources_list),
            "top_k_chunks": "; ".join(top_k_chunks)
        })


In [None]:
import pandas as pd
# Load the evaluation questions and log one answer per question.
questions = pd.read_csv("session_1/questions.csv")

for question in questions["question"]:
    process_answer(question)

  response = qa_chain({"query": query})


In [None]:
# Preview the first rows of the QA log. Read via LOG_FILE (the path that
# process_answer writes to) so this cell cannot drift from the writer if the
# log file name is changed in the settings.
df = pd.read_csv(LOG_FILE)
df.head(10)

Unnamed: 0,question,answer,sources,top_k_chunks
0,How did Amazon's efforts to develop and retain...,I don't know based on the provided document. T...,2024-amazon-annual-report-10K.pdf,1; 14; 86
1,During Apple Inc.’s fiscal year ended Septembe...,"Based on the context you provided, the followi...",2024-apple-annual-report-10K.pdf,125; 113; 200; 201
2,What are the potential risks and challenges th...,Cisco faced several challenges in fiscal year ...,2024-cisco-full-annual-report.pdf,214; 215; 216; 218; 263
3,Describe the major regulatory and legal challe...,Alphabet Inc. faced several major regulatory a...,2024-google-annual-report-10K.pdf,347; 438; 399; 396; 345
4,What are the limitations and key consideration...,The search results do not explicitly detail th...,2024-meta-full-annual-report.pdf; 2024-reddit-...,449; 967; 442; 448; 447
5,"What is the auditor's opinion on Netflix, Inc....","Based on the context you provided, the auditor...",2024-netflix-annual-report-10K.pdf,488; 495; 483; 490; 492
6,How does NVIDIA Corporation's Board of Directo...,I don't know based on the provided document. T...,2024-nvidia-annual-report-10K.pdf,675; 585; 558; 583; 620
7,What types of server products does Oracle Corp...,Oracle Corporation offers two main types of se...,2024-oracle-annual-report-10K.pdf,728; 738; 726; 740; 724
8,What were the key consolidated financial resul...,Based on the provided context and the addition...,2024-reddit-annual-report-10K.pdf,983; 984; 982; 981; 978
9,"Question: \nAccording to Tesla, Inc.’s annual...",I don't know based on the provided document. T...,2024-tsla-annual-report-10K.pdf,1091; 1067; 1016
