In [1]:
import pandas as pd 
import os, warnings
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import display, Markdown

#Local
from config import Config
from data_processing import data_processing
from utils import remove_dirs, check_and_create_dirs
from question_generator import question_generator

# Settings: suppress every warning so library noise does not clutter cell output.
warnings.filterwarnings("ignore")
# Optional: uncomment to stop pandas from truncating long cell contents.
# pd.set_option('display.max_colwidth', None)

# Configuration: load environment variables via python-dotenv, then read the
# Perplexity key from the environment.
load_dotenv()
api_key = os.environ.get("PERPLEXITY_API_KEY")  # None when the variable is not set



# 1. Initialization

In [2]:
print("INITIALIZING SESSION")

# Session-wide settings shared by the later cells.
cfg = Config(
    project_name="session_1",  # name of the project
    input_dir="data",          # input directory containing the data files
    api_key=api_key,           # API key for Perplexity
    n_questions_per_file=2,    # questions per input file (the recorded run: 10 files -> 20 questions)
    n_page_summary=3,          # number of pages to summarize
    chunk_size=5000,
    chunk_overlap=500,
)

# remove_dirs(cfg)  # uncomment to delete the output directories if they exist
check_and_create_dirs(cfg)


INITIALIZING SESSION
Folder 'session_1' deleted.
Folder 'session_1' and 'session_1/chunks' has been created.


In [3]:
print("USER'S RAG MODEL EXAMPLE")

# Describe the system under evaluation as rendered Markdown.
display(
    Markdown(
        "The user having a FAISS-based retrieval-augmented system that chunks PDFs, embeds them with all-mpnet-base-v2, and queries them using the Sonar LLM for question-answering."
    )
)

# Kept local to this cell, as in the original notebook, so the import still
# runs after load_dotenv() has been called.
from user_models.qa import create_unified_chain

# The chain is queried later with qa_chain.invoke(question)['result'].
qa_chain = create_unified_chain()

USER'S RAG MODEL EXAMPLE


The user having a FAISS-based retrieval-augmented system that chunks PDFs, embeds them with all-mpnet-base-v2, and queries them using the Sonar LLM for question-answering.

Loading existing vector store for model_1...
Vector store loaded from: faiss_index_open
Using standard retriever
QA chain created successfully for model_1


# QUESTION GENERATION

In [4]:
# Question generation pipeline

# Step 1: process and chunk the input files described by cfg.
print('DATA PROCESSING AND CHUNKING ...')
data_processing(api_key, cfg, verbose=False)

# Step 2: generate the questions and persist them as CSV at cfg.question_file.
print('QUESTION GENERATION ...')
df = question_generator(cfg, verbose=False)
df.to_csv(cfg.question_file, index=False)

# Step 3: preview the questions at positions 1 and 2 (the same ones shown
# before). Slicing instead of direct indexing means a run that produced fewer
# than three questions shows what is available rather than raising IndexError.
print('QUESTION EXAMPLE:')
questions = df['question'].to_list()
for example in questions[1:3]:
    display(Markdown(example))

DATA PROCESSING AND CHUNKING ...
Processing file: 2024-oracle-annual-report-10K.pdf 1/10
Processing file: 2024-cisco-full-annual-report.pdf 2/10
Processing file: 2024-netflix-annual-report-10K.pdf 3/10
Processing file: 2024-amazon-annual-report-10K.pdf 4/10
Processing file: 2024-meta-full-annual-report.pdf 5/10
Processing file: 2024-nvidia-annual-report-10K.pdf 6/10
Processing file: 2024-google-annual-report-10K.pdf 7/10
Processing file: 2024-reddit-annual-report-10K.pdf 8/10
Processing file: 2024-tsla-annual-report-10K.pdf 9/10
Processing file: 2024-apple-annual-report-10K.pdf 10/10
QUESTION GENERATION ...
QUESTION EXAMPLE:


Based on the provided chunk text and context, here is one question that meets your requirements:

How did Oracle Corporation’s proportion of cloud services revenues relative to total revenues change over fiscal year 2024 compared to the previous two years, according to the 10-K report?[3]

What risks and potential consequences related to acquisitions did Cisco Systems highlight in its fiscal year 2024 report, and how might these affect the company's financial condition and operations?

# GENERATE ANSWER FROM USER MODEL

(Should take about 7 seconds per question.)

In [5]:
# Sanity check: how many questions will be sent to the QA chain.
print(f"Number of questions generated: {len(questions)}")

Number of questions generated: 20


In [6]:
# Ask the user's QA chain each generated question, in order, keeping only the
# 'result' field of every response.
answers = [qa_chain.invoke(query)['result'] for query in df['question']]

# Store each answer next to the question it belongs to.
df['answer'] = answers

# EVALUATION

In [7]:
from evaluation import (
    evaluate_answers,
    overall_histogram,
    radar_plot,
    statistics,
)

print("EVALUATION ...")
# Evaluate the question/answer pairs held in df. The recorded output of the
# next cell shows factual-correctness, completeness, clarity and overall
# scores per question.
final_df = evaluate_answers(api_key, df, verbose=False)

EVALUATION ...


In [8]:
# Display the per-question evaluation scores.
final_df

Unnamed: 0,question,factual_correctness_score,completeness_score,clarity_score,overall_score
0,How did Oracle Corporation’s net cash provided...,5,5,5,5.0
1,"Based on the provided chunk text and context, ...",5,4,5,4.7
2,What risks and potential consequences related ...,4,4,5,4.2
3,What are the key customer priorities and strat...,3,3,4,3.2
4,What was the total stock-based compensation ex...,5,5,5,5.0
5,Based on the 2024 Annual Report on Form 10-K f...,5,5,5,5.0
6,What are the key changes to Amazon’s commercia...,5,5,5,5.0
7,What factors contributed to the variation in A...,2,1,3,1.9
8,"What changes occurred in Meta Platforms, Inc.'...",5,5,5,5.0
9,What was the year-over-year change in revenue ...,5,5,5,5.0


In [9]:
# Summarize each score column; the recorded output lists count, mean, std,
# min, median and max per score.
statistics(final_df)

Unnamed: 0,count,mean,std,min,median,max
factual_correctness_score,20.0,4.4,0.94,2.0,5.0,5.0
completeness_score,20.0,4.35,1.04,1.0,5.0,5.0
clarity_score,20.0,4.8,0.52,3.0,5.0,5.0
overall_score,20.0,4.46,0.86,1.9,5.0,5.0


In [10]:
# Visualize the evaluation results with the project's radar_plot helper
# (presumably one axis per score dimension; confirm in evaluation.py).
radar_plot(final_df)

In [11]:
# Visualize the evaluation results with the project's overall_histogram helper
# (presumably the distribution of overall_score; confirm in evaluation.py).
overall_histogram(final_df)