# Set Up 

In [22]:
import sys
import csv 
# Raise the csv module's per-field size cap to the largest value Python allows;
# presumably needed so the python-engine pd.read_csv calls later in this
# notebook can parse very long text fields — TODO confirm.
csv.field_size_limit(sys.maxsize)

# Make the parent directory importable (the membership check keeps re-runs of
# this cell from appending duplicates) so the `RAG` and `utils` imports resolve.
if '..' not in sys.path:
    sys.path.append('..')

import os
import logging
import pandas as pd 
from dotenv import load_dotenv
from RAG.rag_agent import RAGAgent 
from RAG.rag_searcher import RAGSearcher
from utils.clients import create_chat_client, create_embed_client 


In [23]:
# Load environment variables from the .env file one directory up; override=True
# lets values from that file replace variables already set in the environment.
load_dotenv(dotenv_path='../.env', override=True)

True

In [24]:
# Set up logging
# logging.FileHandler raises FileNotFoundError when the target directory is
# missing, so create it first (no-op when it already exists).
os.makedirs('../logs', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # File handler only: no StreamHandler, so records are not echoed to the console.
        logging.FileHandler('../logs/rag_agent.log')
    ]
)

# Notebook-level logger; its level is set explicitly to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load Data

In [25]:
# All three CSVs use "§" as the field delimiter and are parsed with pandas'
# Python engine.
# Benchmark questions: one row per multiple-choice question.
df_benchmark = pd.read_csv("../data/longbench_filtered.csv", delimiter="§", engine="python")
# Chunked contexts; judging by the file name and the "adaptive-semantic" key used
# for this frame later on, these are adaptive-semantic chunks — TODO confirm.
df_context_asc = pd.read_csv("../data/longbench_context_chunked_asc.csv", delimiter="§", engine="python")
# Chunked contexts produced by the "simple" chunking method (per the file name).
df_context_simple = pd.read_csv("../data/longbench_context_chunked_simple.csv", delimiter="§", engine="python")

# Initialise RAG Agent

In [26]:
# Build the chat and embedding clients via the project's factory helpers.
chat_client = create_chat_client()
embed_client = create_embed_client()

# Assemble the RAG agent used by the demo and the benchmark below.
rag_agent = RAGAgent(
    chat_client=chat_client,
    embed_client=embed_client,
    # NOTE(review): os.getenv returns None when MODEL_NAME is unset — confirm
    # the .env file defines it.
    chat_model=os.getenv("MODEL_NAME"),
    searcher=RAGSearcher(),
    max_tokens=130000,
    temperature=0.0
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

# Demo

In [27]:
# Preview the first benchmark row to show the available fields.
df_benchmark.iloc[0]

_id                                               66f36490821e116aacb2cc22
domain                                                  Single-Document QA
sub_domain                                                       Financial
difficulty                                                            easy
length                                                               short
question                 According to the report, how to promote the co...
choice_A                 Through technology empowerment, change the way...
choice_B                 Establish new types of courts, such as intelle...
choice_C                 Improve the work ability of office staff and s...
choice_D                 Use advanced information systems to improve th...
answer                                                                   D
context                  Contents\nPreface.\n.............................
context_tokens                                                       38133
within_context_window    

In [28]:
import random

# random.randint(a, b) includes b, so len(df_benchmark) as the upper bound could
# produce a position one past the last row and make .iloc raise IndexError.
# randrange excludes the stop value, so n is always a valid row position.
n = random.randrange(len(df_benchmark))

# Fetch the sampled row once instead of re-indexing the frame for every field.
sample_row = df_benchmark.iloc[n]

# NOTE: `id` shadows the builtin, but later cells read this name, so it is kept.
id = sample_row["_id"]
question = sample_row["question"]
choice_a = sample_row["choice_A"]
choice_b = sample_row["choice_B"]
choice_c = sample_row["choice_C"]
choice_d = sample_row["choice_D"]
correct_answer = sample_row["answer"]

In [29]:
# Show the "simple" context chunks whose _id matches the sampled question.
df_context_simple[df_context_simple["_id"] == id]

Unnamed: 0,_id,chunk_id,chunk_text,embeddings
24792,671b13f5bb02136c067d4ee3,0,"{ "" meta "" : { "" name _ exp "" : "" qwen2 - 72b ...","[[-0.045745849609375, 0.0269775390625, -0.0158..."
24793,671b13f5bb02136c067d4ee3,1,"8, 7 ], "" mean "" : 8. 7, "" mean _ ratio "" : 5....","[[-0.01136016845703125, 0.03326416015625, -0.0..."
24794,671b13f5bb02136c067d4ee3,2,"responses "" : [ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ]...","[[-0.0289459228515625, 0.0205230712890625, -0...."
24795,671b13f5bb02136c067d4ee3,3,"2, "" winner _ num "" : 10 }, { "" responses "" : ...","[[-0.0155487060546875, 0.0155792236328125, -0...."
24796,671b13f5bb02136c067d4ee3,4,""" }, { "" role "" : "" user "", "" content "" : "" un...","[[0.0008091926574707031, -0.005794525146484375..."
...,...,...,...,...
24848,671b13f5bb02136c067d4ee3,56,round 4 : \ n \ naverage number chosen : 15. 5...,"[[-0.0005040168762207031, 0.0042724609375, -0...."
24849,671b13f5bb02136c067d4ee3,57,": "" congratulation you won. "" }, { "" role "" : ...","[[-0.0101318359375, -0.01042938232421875, -0.0..."
24850,671b13f5bb02136c067d4ee3,58,"_ number \ "" : \ "" 2 \ "" } "" }, { "" role "" : ""...","[[-0.0128936767578125, -0.00884246826171875, -..."
24851,671b13f5bb02136c067d4ee3,59,"\ nyou chose : "" }, { "" role "" : "" assistant ""...","[[0.0120849609375, 0.00799560546875, -0.076354..."


In [30]:
def format_question(question: str, choice_a: str, choice_b: str, choice_c: str, choice_d: str) -> str:
    """Render a multiple-choice question as a newline-separated prompt.

    Args:
        question: The question text.
        choice_a: Text of option A.
        choice_b: Text of option B.
        choice_c: Text of option C.
        choice_d: Text of option D.

    Returns:
        Five lines joined by newlines: ``Question: ...`` followed by one
        ``<letter>: <choice>`` line for each of A-D. No trailing newline.
    """
    prompt_lines = [
        f"Question: {question}",
        f"A: {choice_a}",
        f"B: {choice_b}",
        f"C: {choice_c}",
        f"D: {choice_d}",
    ]
    return "\n".join(prompt_lines)

# Build the prompt for the sampled question and print it for inspection.
formatted_question = format_question(question, choice_a, choice_b, choice_c, choice_d)
print(formatted_question)

Question: Which following player won the least times in the game?
A: player_1
B: player_3
C: player_5
D: player_8


In [32]:
# Ask the agent to answer the sampled question using the "simple" context chunks
# for this question's id, with top=3.
llm_answer = rag_agent.generate_response(formatted_question, df_context_simple, id, top=3)
print(llm_answer)
# NOTE(review): this is an exact equality check — it assumes generate_response
# returns just the answer letter in the same form as the `answer` column; confirm.
print(f"LLM answer == Correct answer: {llm_answer == correct_answer}")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/spark/opt/anaconda3/envs/st311/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3667, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/lp/6tqfnwxx7ks38f2yd_pqft0c0000gn/T/ipykernel_72060/2235900583.py", line 1, in <module>
    llm_answer = rag_agent.generate_response(formatted_question, df_context_simple, id, top=3)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/spark/Documents/GitHub/ST311_group_project/notebooks/../RAG/rag_agent.py", line 51, in generate_response
    chat_completion_response = self.chat_client.chat.completions.create(
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/spark/opt/anaconda3/envs/st311/lib/python3.11/site-packages/openai/_utils/_utils.py", line 279, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/spark/opt/

# Benchmarking Workflow

In [12]:
def benchmark_question(row, context_df, rag_agent):
    """Answer one benchmark question with the RAG agent and grade the result.

    Args:
        row: A benchmark record indexable by "question", "answer", "choice_A",
            "choice_B", "choice_C", "choice_D" and "_id".
        context_df: Context DataFrame passed through to the agent.
        rag_agent: Object exposing ``generate_response(prompt, context_df, id, top=...)``.

    Returns:
        The result of comparing the agent's answer to ``row["answer"]`` with ``==``.
    """
    # Pull the question text and the expected answer from the row.
    stem, expected = row["question"], row["answer"]

    # Render the question and its four options as a single prompt.
    prompt = format_question(
        stem,
        row["choice_A"],
        row["choice_B"],
        row["choice_C"],
        row["choice_D"],
    )

    # Ask the agent, passing this question's id and top=3.
    predicted = rag_agent.generate_response(prompt, context_df, row["_id"], top=3)

    # Grade by exact equality with the expected answer.
    return predicted == expected

In [18]:
from tqdm import tqdm

In [20]:
def run_benchmark(df_benchmark, context_dfs, rag_agent):
    """Run every benchmark question through the RAG agent and report accuracy.

    Args:
        df_benchmark: DataFrame with one question per row; each row is passed to
            ``benchmark_question``.
        context_dfs: Mapping from chunking-method name to its context DataFrame.
            Only the ``"simple"`` entry is used at present.
        rag_agent: Agent forwarded unchanged to ``benchmark_question``.

    Returns:
        The fraction of questions answered correctly, or ``None`` when
        ``df_benchmark`` has no rows. The accuracy is also printed.
    """
    # The context set does not change per question, so look it up once.
    # Change the key here to benchmark a different chunking method.
    context_df = context_dfs["simple"]

    results = []
    for _, row in tqdm(df_benchmark.iterrows(), total=df_benchmark.shape[0], desc="Benchmarking Progress"):
        # Grade a single question and record whether the agent was correct.
        results.append(benchmark_question(row, context_df, rag_agent))

    # Guard against an empty benchmark, which would otherwise divide by zero.
    if not results:
        print("Benchmarking accuracy: n/a (no questions to evaluate)")
        return None

    # Accuracy is the share of correct answers among all graded questions.
    accuracy = sum(results) / len(results)
    print(f"Benchmarking accuracy: {accuracy:.2%}")
    return accuracy

In [21]:
# Load the data
# NOTE(review): df_benchmark is re-read from the same path used in the
# "Load Data" cell above.
df_benchmark = pd.read_csv("../data/longbench_filtered.csv", delimiter="§", engine="python")
# Map each chunking-method name to its context DataFrame.
context_dfs = {
    "simple": df_context_simple,
    "adaptive-semantic": df_context_asc,  
}

# Run the benchmark
run_benchmark(df_benchmark, context_dfs, rag_agent)

Benchmarking Progress:   0%|          | 0/303 [00:03<?, ?it/s]

Unexpected exception formatting exception. Falling back to standard exception



Traceback (most recent call last):
  File "/Users/spark/opt/anaconda3/envs/st311/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3667, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/lp/6tqfnwxx7ks38f2yd_pqft0c0000gn/T/ipykernel_72060/2978949160.py", line 9, in <module>
    run_benchmark(df_benchmark, context_dfs, rag_agent)
  File "/var/folders/lp/6tqfnwxx7ks38f2yd_pqft0c0000gn/T/ipykernel_72060/3692385663.py", line 8, in run_benchmark
    is_correct = benchmark_question(row, context_df, rag_agent)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/lp/6tqfnwxx7ks38f2yd_pqft0c0000gn/T/ipykernel_72060/2891473983.py", line 12, in benchmark_question
    llm_answer = rag_agent.generate_response(formatted_question, context_df, row["_id"], top=3)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/spark/Documents/GitHub/ST311_group_project/