# Imports and Setup

In [1]:
import sys
import csv 
# Raise the csv module's per-field size limit to sys.maxsize so very large fields can be parsed.
csv.field_size_limit(sys.maxsize)

# Add the parent directory to the import path so the project-local packages
# imported below (RAG, utils) can be resolved.
# NOTE(review): presumably the notebook lives one level below the repository root — confirm.
if '..' not in sys.path:
    sys.path.append('..')

import os
import logging
import pandas as pd 
from tqdm.auto import tqdm
from dotenv import load_dotenv
from RAG.rag_agent import RAGAgent 
from RAG.rag_searcher import RAGSearcher
from utils.clients import create_chat_client, create_embed_client 

# Register tqdm's pandas integration; this is what provides DataFrame.progress_apply.
tqdm.pandas()
# Load variables from a .env file; override=True lets them replace values already set in the environment.
load_dotenv(override=True)
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [2]:
# Set up logging
# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory does not exist, so make sure the
# log directory is there first (e.g. on a fresh checkout).
LOG_FILE = '../logs/rag_agent.log'
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE)  # File only: no StreamHandler, to prevent console output
    ]
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Mute openai_messages_token_helper logger (only ERROR and above get through)
openai_token_logger = logging.getLogger("openai_messages_token_helper")
openai_token_logger.setLevel(logging.ERROR)

# Load Data

In [3]:
# Every LongBench CSV in ../data is read with the same options: a "§" delimiter
# and the Python parsing engine.
_CSV_READ_OPTIONS = {"delimiter": "§", "engine": "python"}

# Benchmark questions, then one chunked-context table per chunking strategy.
df_benchmark = pd.read_csv("../data/longbench_filtered.csv", **_CSV_READ_OPTIONS)
df_context_asc = pd.read_csv("../data/longbench_context_chunked_asc.csv", **_CSV_READ_OPTIONS)
df_context_simple = pd.read_csv("../data/longbench_context_chunked_simple.csv", **_CSV_READ_OPTIONS)
df_context_sentence = pd.read_csv("../data/longbench_context_chunked_sentence.csv", **_CSV_READ_OPTIONS)

# Initialise RAG Agent

In [4]:
# Create the chat and embedding clients the agent is built from.
chat_client = create_chat_client()
embed_client = create_embed_client()

# Gather the agent settings in one place, then construct the agent from them.
agent_settings = {
    "chat_client": chat_client,
    "embed_client": embed_client,
    "chat_model": os.getenv("MODEL_NAME"),  # model name is read from the environment
    "searcher": RAGSearcher(),
    "max_tokens": 130000,
    "temperature": 0.0,
}
rag_agent = RAGAgent(**agent_settings)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

# Benchmarking Workflow

In [5]:
def format_question(question: str, choice_a: str, choice_b: str, choice_c: str, choice_d: str) -> str:
    """Render a multiple-choice question as a five-line prompt string.

    The first line is ``Question: <question>``; the next four lines are the
    options labelled ``A:`` to ``D:``. Lines are separated by newlines and
    there is no trailing newline.
    """
    template = "Question: {}\nA: {}\nB: {}\nC: {}\nD: {}"
    choices = (choice_a, choice_b, choice_c, choice_d)
    return template.format(question, *choices)


def benchmark_question(row, df_context=None, top=5, mode="rag"):
    """Answer one benchmark question with the RAG agent.

    Args:
        row: Mapping-like benchmark record (e.g. a DataFrame row) providing the
            keys "_id", "question", "choice_A", "choice_B", "choice_C" and
            "choice_D"; in "lc" mode it must also provide "context".
        df_context: Context data handed to ``rag_agent.generate_response_rag``.
            Required in "rag" mode, ignored in "lc" mode.
        top: Forwarded as ``top`` to ``rag_agent.generate_response_rag``
            ("rag" mode only).
        mode: "rag" answers via ``rag_agent.generate_response_rag``; "lc"
            answers via ``rag_agent.generate_response_lc`` using the row's own
            "context" value.

    Returns:
        tuple: ``(llm_answer, token_count)`` as produced by the agent.

    Raises:
        ValueError: If ``mode`` is neither "rag" nor "lc", or if ``mode`` is
            "rag" and ``df_context`` is None.
    """
    if mode not in ("rag", "lc"):
        raise ValueError("Invalid mode. Must be either 'rag' or 'lc'.")
    # Fail fast with a clear message instead of handing None to the agent.
    if mode == "rag" and df_context is None:
        raise ValueError("df_context is required when mode is 'rag'.")

    # Named doc_id (not id) so the builtin id() is not shadowed.
    doc_id = row["_id"]

    # Format question into the format required by the RAG agent
    formatted_question = format_question(
        row["question"],
        row["choice_A"],
        row["choice_B"],
        row["choice_C"],
        row["choice_D"],
    )

    # Generate response from RAG agent
    if mode == "rag":
        llm_answer, token_count = rag_agent.generate_response_rag(formatted_question, df_context, doc_id, top=top)
    else:
        # Long-context mode: the full document stored on the row is the context.
        llm_answer, token_count = rag_agent.generate_response_lc(formatted_question, row["context"])

    return llm_answer, token_count


def generate_benchmarking_results(df_benchmark, suffix, df_context=None, long_context=False, top=5):
    """Generate benchmarking results by running RAG agent predictions on benchmark questions.

    Args:
        df_benchmark (pd.DataFrame): Benchmark questions and answer choices, one
            question per row; each row is passed to benchmark_question.
        suffix (str): Suffix appended to the result column names. The notebook
            uses "simple", "asc", "sentence" and "lc".
        df_context (pd.DataFrame): Context documents handed to benchmark_question
            in RAG mode. Not used when long_context is True.
        long_context (bool): False (default) runs benchmark_question with
            mode="rag"; True runs it with mode="lc".
        top (int): Forwarded as ``top`` to benchmark_question in RAG mode.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        None. Modifies df_benchmark in place by adding the columns
        llm_answer_{suffix} and input_tokens_{suffix}.
    """
    answer_col = f"llm_answer_{suffix}"
    tokens_col = f"input_tokens_{suffix}"

    # An empty frame yields no (answer, token_count) pairs, so zip(*results)
    # could not be unpacked into two sequences; just add the empty columns.
    if len(df_benchmark) == 0:
        df_benchmark[answer_col] = []
        df_benchmark[tokens_col] = []
        return

    if not long_context:
        results = df_benchmark.progress_apply(
            lambda row: benchmark_question(row, df_context, top=top, mode="rag"), axis=1
        )
    else:
        results = df_benchmark.progress_apply(lambda row: benchmark_question(row, mode="lc"), axis=1)

    # results holds one (llm_answer, token_count) pair per row; split it into two columns.
    answers, token_counts = zip(*results)
    df_benchmark[answer_col] = list(answers)
    df_benchmark[tokens_col] = list(token_counts)

### Test workflow

In [6]:
# Smoke-test the workflow on the first five questions.
# .copy() makes df_test an independent frame, so the result columns are added
# to df_test itself rather than to a slice of df_benchmark.
df_test = df_benchmark.iloc[:5].copy()
generate_benchmarking_results(df_test, "simple", df_context_simple, long_context=False)
generate_benchmarking_results(df_test, "lc", long_context=True)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Display the smoke-test rows, including the result columns added by the "simple" and "lc" runs.
df_test.head()

Unnamed: 0,_id,domain,sub_domain,difficulty,length,question,choice_A,choice_B,choice_C,choice_D,answer,context,context_tokens,within_context_window,llm_answer_simple,input_tokens_simple,llm_answer_lc,input_tokens_lc
0,66f36490821e116aacb2cc22,Single-Document QA,Financial,easy,short,"According to the report, how to promote the co...","Through technology empowerment, change the way...","Establish new types of courts, such as intelle...",Improve the work ability of office staff and s...,Use advanced information systems to improve th...,D,Contents\nPreface.\n.............................,38133,True,A,3262,A,35327
1,66ebed525a08c7b9b35e1cb4,Single-Document QA,Academic,hard,short,"When Miller tried to answer the question ""shou...",Each must read for himself or herself and test...,Readers must reach a high standrad to some deg...,"It is the readers' obligation to get the ""trut...",The performative interpretation of language tr...,B,Chapter Five\nJOSEPH CONRAD:\nSHOULD WE READ\n...,24007,True,B,2991,B,25610
2,671b3cabbb02136c067d5252,Long-dialogue History Understanding,Agent history QA,hard,short,Which player got the least utility in the game?,player_1,player_3,player_5,player_7,B,"{\n ""meta"": {\n ""name_exp"": ""gemini-1.0-pr...",43168,True,A,2549,B,47177
3,66ec0c4c821e116aacb1994a,Multi-Document QA,Academic,easy,medium,Which of the following statements is correct?,Both contractor data and data crawled from the...,All machine learning methods involved in the t...,Both voyager and VPT control Minecraft agents ...,VPT's modeling of action space is approximate ...,D,Video PreTraining (VPT): Learning to Act by\nW...,67185,True,A,2881,B,66821
4,66f920d8bb02136c067c4b81,Single-Document QA,Literary,hard,medium,What is mainly symbolized by the frequent chol...,Confusion of The Times,The impermanence of the character's fate,Love is dangerous and uncontrollable,Social indifference,C,Chapter 1\nIT WAS INEVITABLE: the scent of bit...,85218,True,C,2855,D,95095


### Run Benchmarking

In [8]:
# Simple chunking: RAG run over the simple-chunked context, then checkpoint 1.
generate_benchmarking_results(
    df_benchmark,
    suffix="simple",
    df_context=df_context_simple,
    long_context=False,
)
df_benchmark.to_csv("../data/longbench_results_checkpoint_1.csv", sep="§", index=False)

  0%|          | 0/303 [00:00<?, ?it/s]

In [9]:
# Sentence chunking: RAG run over the sentence-chunked context, then checkpoint 2.
generate_benchmarking_results(
    df_benchmark,
    suffix="sentence",
    df_context=df_context_sentence,
    long_context=False,
)
df_benchmark.to_csv("../data/longbench_results_checkpoint_2.csv", sep="§", index=False)

  0%|          | 0/303 [00:00<?, ?it/s]

In [13]:
# Adaptive-semantic chunking
# df_benchmark is broken into 2 halves to avoid overloading the model; this cell runs the first half.
# .copy() makes the half an independent frame, so the "asc" result columns are
# added to it directly rather than to a slice of df_benchmark.
df_benchmark_top = df_benchmark.iloc[:len(df_benchmark)//2].copy()
generate_benchmarking_results(df_benchmark_top, "asc", df_context_asc, long_context=False)

  0%|          | 0/151 [00:00<?, ?it/s]

In [14]:
# Second half of the adaptive-semantic chunking run.
# .copy() makes the half an independent frame, so the "asc" result columns are
# added to it directly rather than to a slice of df_benchmark.
df_benchmark_bottom = df_benchmark.iloc[len(df_benchmark)//2:].copy()
generate_benchmarking_results(df_benchmark_bottom, "asc", df_context_asc, long_context=False)

  0%|          | 0/152 [00:00<?, ?it/s]

In [19]:
# Concatenate the 2 halves back together (top half first) and write checkpoint 3.
asc_halves = [df_benchmark_top, df_benchmark_bottom]
df_benchmark = pd.concat(asc_halves)
df_benchmark.to_csv("../data/longbench_results_checkpoint_3.csv", sep="§", index=False)

In [20]:
# Long context: each question is answered from its own full "context" value; then write the final results file.
generate_benchmarking_results(df_benchmark, suffix="lc", long_context=True)
df_benchmark.to_csv("../data/longbench_results.csv", sep="§", index=False)

  0%|          | 0/303 [00:00<?, ?it/s]