# Set Up 

In [1]:
import sys
import csv 
# Raise the csv module's per-field size limit to sys.maxsize before any CSV is parsed.
# NOTE(review): presumably required because the benchmark CSVs carry very long text fields — confirm.
csv.field_size_limit(sys.maxsize)

# Add the parent directory to the import path (only once).
# NOTE(review): presumably so the project-local RAG and utils packages imported below resolve — confirm.
if '..' not in sys.path:
    sys.path.append('..')

import os
import logging
import pandas as pd 
from dotenv import load_dotenv
from RAG.rag_agent import RAGAgent 
from RAG.rag_searcher import RAGSearcher
from utils.clients import create_chat_client, create_embed_client 

# Load environment variables from a .env file (MODEL_NAME is read from the environment later on).
# NOTE(review): override=True presumably lets .env values replace variables already set — confirm against python-dotenv docs.
load_dotenv(override=True)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

True

In [2]:
# Set up logging: INFO-and-above records go to a log file only. No
# StreamHandler is attached, which keeps log lines out of the cell output.
LOG_FILE = '../logs/rag_workflow_experiment.log'

# logging.FileHandler opens its file on construction and raises
# FileNotFoundError when the parent directory is missing (e.g. on a fresh
# checkout), so make sure the directory exists first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE)
    ]
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Mute openai_messages_token_helper logger (only ERROR and above get through)
openai_token_logger = logging.getLogger("openai_messages_token_helper")
openai_token_logger.setLevel(logging.ERROR)

# Load Data

In [3]:
# All LongBench CSVs used here are "§"-delimited and are read with pandas'
# python parsing engine, so the shared reader options live in one place.
read_options = {"delimiter": "§", "engine": "python"}

df_benchmark = pd.read_csv("../data/longbench_filtered.csv", **read_options)
df_context_simple = pd.read_csv("../data/longbench_context_chunked_simple.csv", **read_options)

# Alternative chunked-context files, currently not loaded:
#   ../data/longbench_context_chunked_asc.csv       -> df_context_asc
#   ../data/longbench_context_chunked_sentence.csv  -> df_context_sentence

# Initialise RAG Agent

In [4]:
# Create the chat and embedding clients first; the agent is handed both.
chat_client = create_chat_client()
embed_client = create_embed_client()

# Gather the agent's constructor arguments in one mapping, then instantiate.
agent_options = dict(
    chat_client=chat_client,
    embed_client=embed_client,
    chat_model=os.getenv("MODEL_NAME"),  # None when MODEL_NAME is not set
    searcher=RAGSearcher(),
    max_tokens=130000,
    temperature=0.0,
)
rag_agent = RAGAgent(**agent_options)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

# Demo Workflow

In [5]:
# Inspect the first benchmark row; as the cell's last expression it is rendered as the cell output.
df_benchmark.iloc[0]

_id                                               66f36490821e116aacb2cc22
domain                                                  Single-Document QA
sub_domain                                                       Financial
difficulty                                                            easy
length                                                               short
question                 According to the report, how to promote the co...
choice_A                 Through technology empowerment, change the way...
choice_B                 Establish new types of courts, such as intelle...
choice_C                 Improve the work ability of office staff and s...
choice_D                 Use advanced information systems to improve th...
answer                                                                   D
context                  Contents\nPreface.\n.............................
context_tokens                                                       38133
within_context_window    

In [6]:
import random

# Pick a random row position. random.randint(0, len(df)) includes the upper
# bound and could yield len(df), which is out of range for iloc;
# random.randrange(len(df)) only returns 0 .. len(df) - 1.
n = random.randrange(len(df_benchmark))

# Fetch the row once and read every field from it.
sample = df_benchmark.iloc[n]

# NOTE: `id` shadows the builtin id(); the name is kept because later cells
# reference it.
id = sample["_id"]
question = sample["question"]
choice_a = sample["choice_A"]
choice_b = sample["choice_B"]
choice_c = sample["choice_C"]
choice_d = sample["choice_D"]
context = sample["context"]
correct_answer = sample["answer"]

In [7]:
# Display every row of df_context_simple whose _id matches the selected benchmark id.
belongs_to_sample = df_context_simple["_id"] == id
df_context_simple.loc[belongs_to_sample]

Unnamed: 0,_id,chunk_id,chunk_text,embeddings
24369,671b2e2dbb02136c067d515b,0,"{ "" question _ id "" : "" e25c3b8d "", "" question...","[[-0.0157012939453125, -0.0247039794921875, -0..."
24370,671b2e2dbb02136c067d515b,1,"58 "", "" 2023 / 05 / 24 ( wed ) 18 : 36 "", "" 20...","[[-0.0134735107421875, -0.030792236328125, -0...."
24371,671b2e2dbb02136c067d515b,2,"1 "", "" f3a57240 "", "" c51a7155 "", "" ultrachat _...","[[-0.0298919677734375, 0.0020389556884765625, ..."
24372,671b2e2dbb02136c067d515b,3,""", "" 1ccb08a0 "", "" ultrachat _ 216372 "", "" 927...","[[-0.0195465087890625, 0.00836944580078125, -0..."
24373,671b2e2dbb02136c067d515b,4,but not urgent ( schedule ) \ n \ t * urgent b...,"[[-0.00926971435546875, -0.00829315185546875, ..."
...,...,...,...,...
24655,671b2e2dbb02136c067d515b,286,"for, and i can give you some ideas. "" }, { "" r...","[[-0.01071929931640625, 0.0159759521484375, -0..."
24656,671b2e2dbb02136c067d515b,287,family can spend a fun and educational day at ...,"[[-0.043365478515625, -0.0163116455078125, -0...."
24657,671b2e2dbb02136c067d515b,288,"your recommend. "" }, { "" role "" : "" assistant ...","[[-0.033172607421875, 0.0033111572265625, -0.0..."
24658,671b2e2dbb02136c067d515b,289,buildings and streets are illuminated with col...,"[[-0.038055419921875, -0.0012712478637695312, ..."


In [8]:
def format_question(question: str, choice_a: str, choice_b: str, choice_c: str, choice_d: str) -> str:
    """Render a multiple-choice question as a five-line text block.

    The first line is ``Question: <question>``; the following four lines list
    the options labelled ``A:`` through ``D:`` in that order. Lines are joined
    with newlines and no trailing newline is added.
    """
    option_lines = [
        f"{label}: {text}"
        for label, text in zip("ABCD", (choice_a, choice_b, choice_c, choice_d))
    ]
    return "\n".join([f"Question: {question}", *option_lines])

# Build the prompt text for the selected question and echo it for inspection.
formatted_question = format_question(
    question=question,
    choice_a=choice_a,
    choice_b=choice_b,
    choice_c=choice_c,
    choice_d=choice_d,
)
print(formatted_question)

Question: Which of the following happened first?
A: the user started playing the Fender CD-60S
B: the user went to that outdoor music festival
C: the user felt overwhelmed with work projects
D: the user took yoga classes at the local studio


In [9]:
# Ask the agent via its RAG path; the call yields the answer and a token count.
# NOTE(review): top=5 presumably caps retrieval at the five best-matching chunks — confirm in RAGAgent.
llm_answer_rag, token_count = rag_agent.generate_response_rag(
    formatted_question,
    df_context_simple,
    id,
    top=5,
)
print(llm_answer_rag, f"Input tokens: {token_count}", sep="\n")

A
Input tokens: 2891


In [10]:
# Ask the same question with the full context passed directly instead of retrieved chunks.
# Note: token_count is overwritten here with this call's count.
llm_answer_lc, token_count = rag_agent.generate_response_lc(
    formatted_question,
    context,
)
print(llm_answer_lc, f"Input tokens: {token_count}", sep="\n")

C
Input tokens: 139541


In [11]:
# Print the benchmark's "answer" value for the selected row, for comparison with the two model answers.
print(f"Correct answer: {correct_answer}")

Correct answer: C
