### Model 1

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

PDF_FOLDER = "data"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
VECTOR_STORE_DIR = "chroma_index_finance"
LOG_FILE = "qa_log_1.csv"

# === Load all PDFs ===
all_documents = []
for filename in os.listdir(PDF_FOLDER):
    if filename.endswith(".pdf"):
        file_path = os.path.join(PDF_FOLDER, filename)
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()
        for doc in documents:
            doc.metadata["source"] = filename  # track which PDF it came from
        all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# === Split text into chunks ===
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(all_documents)

# Add unique chunk IDs and ensure source is in metadata
for i, doc in enumerate(chunks):
    doc.metadata["chunk_id"] = i
    doc.metadata["source"] = doc.metadata.get("source", "unknown")

#Embbedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False} #False Euclidean, True cosine similarity
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

#Vector Store
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")

#Retriever
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

#LLM
llm = ChatPerplexity(
    model="sonar",
    pplx_api_key = "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",
    temperature=0.2
)

# QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

Loaded 1182 total documents.


  hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
  llm = ChatPerplexity(


In [2]:
import json
import os
import csv

def process_answer(query):
    response = qa_chain({"query": query})
    
    # Handle case where result is a JSON-formatted string
    try:
        result = json.loads(response["result"])
        answer_text = result.get("answer", response["result"])
    except (json.JSONDecodeError, TypeError):
        # Fallback: use the raw string if not JSON
        answer_text = response["result"]

    source_docs = response['source_documents']

    sources_info = []
    for doc in source_docs:
        chunk_id = doc.metadata.get("chunk_id", "N/A")
        source = doc.metadata.get("source", "N/A")
        sources_info.append({"chunk_id": str(chunk_id), "source": source})

    # Prepare data to log
    top_k_chunks = [src["chunk_id"] for src in sources_info if src["chunk_id"] != "N/A"]
    sources_list = [src["source"] for src in sources_info if src["source"] != "N/A"]

    # Write to CSV
    file_exists = os.path.isfile(LOG_FILE)
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["question", "answer", "sources", "top_k_chunks"])
        if not file_exists:
            writer.writeheader()
        writer.writerow({
            "question": query,
            "answer": answer_text,
            "sources": "; ".join(list(set(sources_list))),
            "top_k_chunks": "; ".join(list(set(top_k_chunks)))
        })


In [3]:
import pandas as pd
questions= pd.read_csv("session_1/questions.csv")

for index, row in questions.iterrows():
    question = row['question']
    process_answer(question)

  response = qa_chain({"query": query})


In [4]:
df = pd.read_csv("qa_log_1.csv")
df.head(10)

Unnamed: 0,question,answer,sources,top_k_chunks
0,"How could challenges in hiring, retaining, and...","Challenges in hiring, retaining, and managing ...",2024-amazon-annual-report-10K.pdf,27; 1; 39
1,What was the total lease liability for Apple I...,"As of September 28, 2024, Apple Inc. had fixed...",2024-cisco-full-annual-report.pdf; 2024-apple-...,154; 386; 157
2,What are the key competitive challenges Cisco ...,"In 2024, Cisco faced several key competitive c...",2024-cisco-full-annual-report.pdf,321; 293; 248
3,Based on the Alphabet Inc. fiscal year ended D...,For Alphabet Inc. in the fiscal year ended Dec...,2024-google-annual-report-10K.pdf,528; 527; 497
4,What were the key financial results reported b...,Here are the key financial results reported by...,2024-meta-full-annual-report.pdf,544; 552; 549
5,What operational and strategic challenges did ...,"Based on the available information, Netflix fa...",2024-netflix-annual-report-10K.pdf,610; 609; 617
6,"Question: \nWhere, as required by law, does N...",NVIDIA Corporation files lobbying disclosure r...,2024-nvidia-annual-report-10K.pdf,768; 660; 723
7,What highlights or key financial metrics are i...,Oracle Corporation's fiscal year 2024 Form 10-...,2024-oracle-annual-report-10K.pdf,1022; 886; 1018
8,"How does Reddit, Inc. plan to manage the poten...","Reddit, Inc. acknowledges the risk of user dec...",2024-reddit-annual-report-10K.pdf,1121; 1133; 1120
9,"Which Tesla, Inc. executives signed the compan...","The following Tesla, Inc. executives and direc...",2024-tsla-annual-report-10K.pdf,1337; 1425; 1250


In [5]:
from openai import OpenAI
import json, random, re

API_KEY = "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"    
#INITIALIZATION
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")

In [6]:
prompt = """
    You are a financial data Q&A evaluator.

    You are given:
    - A **question** generated from a document chunk.
    - The **document chunk** (ground truth source).
    - A **model-generated answer** to the question.

    Your job is to score the model’s answer by carefully comparing it to the document chunk.

    Use the following rubric for each category:

    ---
    **Factual Correctness**
    - 5 = All facts are fully correct and consistent with the chunk.
    - 4 = Minor factual inaccuracies but mostly correct.
    - 3 = Some factual inaccuracies, partly correct.
    - 2 = Major factual mistakes, mostly incorrect.
    - 1 = Completely factually wrong.

    ---
    **Completeness**
    - 5 = Fully answers the question with all key details.
    - 4 = Mostly complete, missing minor details.
    - 3 = Partially complete, missing important parts.
    - 2 = Mostly incomplete, only touches on part of the question.
    - 1 = Completely incomplete.

    ---
    3**Clarity**
    - 5 = Clear, precise, and easy to understand.
    - 4 = Mostly clear, with minor awkwardness.
    - 3 = Understandable but somewhat confusing or vague.
    - 2 = Hard to understand or poorly phrased.
    - 1 = Completely unclear or nonsensical.

    ---
    **Response Format**
    Return ONLY this JSON (no extra explanation):
    {
        "factual_correctness_score": [1-5],
        "completeness_score": [1-5],
        "clarity_score": [1-5],
        "comments": "A brief explanation (1-2 sentences) why you assigned these scores."
    }
"""


def evaluate_answer(question, chunk, answer):
    response = client.chat.completions.create(
        model="sonar",
        messages=[
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"""
                Please evaluate the following answer based on the provided question and document chunk. 
                Return ONLY a valid JSON object.

                Question: {question}

                Document Chunk: {chunk}

                Model Answer: {answer}
                """
            }
        ],
    )

    response_content = response.choices[0].message.content.strip()
    print("LLM Raw Output:", response_content)

    # Remove duplicate keys by keeping only the last occurrence
    cleaned_content = re.sub(
        r'(,\s*")(\w+_score)":\s*\d,\s*"\2":\s*\d',
        lambda m: f',{m.group(2)}": {m.group(0).split(":")[-1]}',
        response_content
    )

    result = json.loads(cleaned_content)
    return result

In [7]:
import pandas as pd

# Prepare a list to collect all processed rows
final_rows = []

for _, row in df.iterrows():
    question = row['question']
    top_k_chunk = row['top_k_chunks']
    answer = row['answer']

    success = False
    while not success:
        try:
            evaluation = evaluate_answer(question, top_k_chunk, answer)
            success = True  # Break loop if successful
        except Exception as e:
            print(f"Retrying for question: {question} due to error: {e}")

    # Build a combined result dictionary
    result_row = {
        'question': question,
        'top_k_chunk': top_k_chunk,
        'answer': answer
    }
    # Add evaluation results
    for key, value in evaluation.items():
        result_row[f'evaluation_{key}'] = value

    final_rows.append(result_row)

# Convert list of results to DataFrame
final_df = pd.DataFrame(final_rows)

# Save to CSV
final_df.to_csv('final_1.csv', index=False)
print("Saved final results to final_1.csv")

LLM Raw Output: {
    "factual_correctness_score": 3,
    "completeness_score": 3,
    "clarity_score": 4,
    "comments": "The answer is mostly clear and contains some relevant points, but several elements (like references to Item 7) are not supported by the provided document chunk. It raises valid issues related to employee dissatisfaction and DEI rollback, but misses direct evidence about supplier and commercial agreement risks in the chunk."
}
LLM Raw Output: ```json
{
    "factual_correctness_score": 4,
    "completeness_score": 3,
    "clarity_score": 5,
    "comments": "The model's answer is mostly clear and well-phrased, but it contains a minor factual inaccuracy regarding the specific lease liability figures for 2023. Additionally, it does not fully compare the figures for 2023 and 2024 since the 2023 data is not explicitly provided in the search results."
}
```
Retrying for question: What was the total lease liability for Apple Inc. as of September 28, 2024, and how did it co

In [8]:
final_df = pd.read_csv("final_1.csv")
final_df

Unnamed: 0,question,top_k_chunk,answer,evaluation_factual_correctness_score,evaluation_completeness_score,evaluation_clarity_score,evaluation_comments
0,"How could challenges in hiring, retaining, and...",27; 1; 39,"Challenges in hiring, retaining, and managing ...",3,3,4,The answer is mostly clear and contains some r...
1,What was the total lease liability for Apple I...,154; 386; 157,"As of September 28, 2024, Apple Inc. had fixed...",4,3,5,The answer correctly states the total lease li...
2,What are the key competitive challenges Cisco ...,321; 293; 248,"In 2024, Cisco faced several key competitive c...",3,3,5,"The answer provides a clear, well-structured o..."
3,Based on the Alphabet Inc. fiscal year ended D...,528; 527; 497,For Alphabet Inc. in the fiscal year ended Dec...,2,1,4,The model answer provides specific basic and d...
4,What were the key financial results reported b...,544; 552; 549,Here are the key financial results reported by...,5,4,5,"The answer is factually consistent and clear, ..."
5,What operational and strategic challenges did ...,610; 609; 617,"Based on the available information, Netflix fa...",5,5,5,The model accurately and clearly addresses all...
6,"Question: \nWhere, as required by law, does N...",768; 660; 723,NVIDIA Corporation files lobbying disclosure r...,5,4,5,The answer fully matches the factual content a...
7,What highlights or key financial metrics are i...,1022; 886; 1018,Oracle Corporation's fiscal year 2024 Form 10-...,5,5,5,The model answer accurately reports the key fi...
8,"How does Reddit, Inc. plan to manage the poten...",1121; 1133; 1120,"Reddit, Inc. acknowledges the risk of user dec...",5,5,5,The answer accurately reflects Reddit's strate...
9,"Which Tesla, Inc. executives signed the compan...",1337; 1425; 1250,"The following Tesla, Inc. executives and direc...",3,3,5,The answer lists several executives and direct...


In [9]:
import plotly.express as px
import pandas as pd

# Make sure relevant columns are numeric
numeric_cols = [
    'evaluation_factual_correctness_score', 
    'evaluation_completeness_score', 
    'evaluation_clarity_score'
]

df = final_df.copy()
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid to NaN

# Calculate overall average score per question
df['overall_score'] = df[numeric_cols].mean(axis=1)

# Melt DataFrame to long format (including overall score)
melted_df = df.melt(
    id_vars=['question'],
    value_vars=numeric_cols + ['overall_score'],
    var_name='Metric',
    value_name='Score'
)

# Clean up metric names for display
melted_df['Metric'] = (
    melted_df['Metric']
    .str.replace('evaluation_', '', regex=False)
    .str.replace('_score', '', regex=False)
    .str.replace('_', ' ')
    .str.title()
)

# Drop rows with missing scores (optional, if needed)
melted_df = melted_df.dropna(subset=['Score'])

# Plot boxplot
fig = px.box(
    melted_df,
    x='Metric',
    y='Score',
    points='all',  # show individual points
    hover_data=['question'],
    title='Score Distributions per Metric (with Overall Score)',
    height=500
)

fig.update_layout(
    yaxis=dict(range=[0, 6], dtick=1),
    xaxis_title='Metric',
    yaxis_title='Score (1-5)'
)

fig.show()

In [10]:
import plotly.figure_factory as ff

# Select relevant columns
metrics = [
    'evaluation_factual_correctness_score',
    'evaluation_completeness_score',
    'evaluation_clarity_score',
    'overall_score'
]

# Calculate correlation matrix
corr_matrix = df[metrics].corr().round(2)

# Create heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    annotation_text=corr_matrix.values,
    colorscale='Blues',
    showscale=True
)

fig.update_layout(
    title='Correlation Heatmap of Evaluation Metrics',
    width=600,
    height=600
)

fig.show()


In [11]:
#Worst answer by overall
worst_answer = df.loc[df['overall_score'].idxmin()]
print(f"Worst Answer by Overall Score: {df['overall_score'].min()}")
print(f"Question: {worst_answer['question']}")
# print(f"Answer: {worst_answer['answer']}")
print(f"Reasons: {worst_answer['evaluation_comments']}")

Worst Answer by Overall Score: 2.3333333333333335
Question: Based on the Alphabet Inc. fiscal year ended December 31, 2024, what were the basic and diluted net income per share amounts for Class A, Class B, and Class C shares, and how did these compare to the previous year ended December 31, 2023?
Reasons: The model answer provides specific basic and diluted net income per share amounts for Class A, B, and C shares for 2024 and 2023, but these figures are not supported or found in the provided document chunk or search results. The document chunk references are numerical and do not contain explicit data on net income per share by class or year, and the search results discuss overall revenues and net income but not the per share breakdowns requested. Thus, the answer is mostly factually incorrect and incomplete. The answer is clearly presented but based on unsupported data.
