### 사전 준비 사항 

#### (1) uv add (터미널)

```bash
uv add rank_bm25
```

#### (2) .env 파일 세팅
```bash
OPENAI_API_KEY = ""
HF_TOKEN = ""
```

#### (3) pdf 파일 세팅
pdf 파일 100개를 `data/raw/files` 에 둡니다.  
eval 파일(csv 2개, jsonl 1개)을 `data/raw/eval` 에 둡니다.(*30개 파일 합친 버전)  

#### 실행 방법

1. 처음 1회(커널 새로 시작): 처음부터 끝까지 순서대로 실행
2. exp1 결과 저장 확인
3. <실험 ID 변경> 셀의 `RUN_EXP_ID` 를 변경 후 새로운 실험 진행 (실험 결과 저장 확인 후 커널 재시작)  
\*커널을 재시작하지 않는 경우, <실험 ID 변경> 및 <실험 수행 및 결과 저장> 섹션의 코드만 실행해도 됨.(OOM 발생 가능성이 있어 권장하지 않음.)
4. 원하는 실험으로 변경 및 반복

In [1]:
# 공통 준비(커널 시작마다 1회)

import json, re, unicodedata
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from preprocess.pp_basic import docs, BASE_DIR, EVAL_DIR, GOLD_EVIDENCE_CSV, GOLD_FIELDS_JSONL
from preprocess.rag_experiment import (
    CONFIG, ExperimentSpec, load_questions_df, make_components, RAGExperiment
)

# Load variables from the nearest .env file found by find_dotenv();
# override=False leaves variables that are already set in the environment untouched.
load_dotenv(find_dotenv(), override=False)
# Created after load_dotenv — presumably so the client can pick up
# OPENAI_API_KEY from the environment (TODO confirm).
client = OpenAI()

# Load the embedding model only once per kernel (important)
embed_model = SentenceTransformer("nlpai-lab/KoE5")

# Load the gold data (same way as when the baseline functions were reused)
gold_evidence_df = pd.read_csv(GOLD_EVIDENCE_CSV)

def load_gold_fields_jsonl(path):
    """Load gold field annotations from a JSONL file as a long-format table.

    Every non-blank line is parsed as one JSON object. Each key/value pair of
    the object's ``fields`` mapping becomes one output row; a missing or
    ``null`` ``fields`` entry contributes no rows.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A DataFrame with the columns ``instance_id``, ``doc_id``, ``field``
        and ``gold``. The columns are present even when the file yields no
        rows, so downstream column lookups do not fail on empty input.

    Raises:
        KeyError: If a record has no ``instance_id`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    columns = ["instance_id", "doc_id", "field", "gold"]
    out = []
    # errors="replace" keeps the original best-effort decoding: undecodable
    # bytes are substituted instead of aborting the whole load.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.strip():
                continue
            r = json.loads(line)
            iid = r["instance_id"]
            doc_id = r.get("doc_id", "")
            # ``fields`` may be absent or null; treat both as "no fields".
            fields = r.get("fields", {}) or {}
            for k, v in fields.items():
                out.append({"instance_id": iid, "doc_id": doc_id, "field": k, "gold": v})
    # Passing the columns explicitly fixes the schema for the empty case too.
    return pd.DataFrame(out, columns=columns)

# Flatten the gold field annotations into a table.
gold_fields_df = load_gold_fields_jsonl(GOLD_FIELDS_JSONL)

questions_df = load_questions_df()

# Print the size of everything that was loaded as a quick sanity check.
for label, frame in (
    ("gold_evidence_df", gold_evidence_df),
    ("gold_fields_df", gold_fields_df),
    ("questions_df", questions_df),
):
    print(f"{label}:", frame.shape)
print("n_docs:", len(docs))

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 391/391 [00:00<00:00, 886.27it/s, Materializing param=pooler.dense.weight]                               


gold_evidence_df: (630, 5)
gold_fields_df: (630, 4)
questions_df: (311, 5)
n_docs: 100


In [2]:
# 평가 문서 필터(커널당 1회)
def name_key(s: str) -> str:
    """Return a normalised comparison key for a document name.

    The text is composed to NFC and trimmed, then folded with NFKC, and
    finally every run of whitespace is collapsed to a single space.
    """
    trimmed = unicodedata.normalize("NFC", s).strip()
    folded = unicodedata.normalize("NFKC", trimmed)
    return re.sub(r"\s+", " ", folded)

# Normalised keys of every doc_id that appears in the gold field table.
gold_doc_key_set = {
    name_key(doc_id)
    for doc_id in gold_fields_df["doc_id"].astype(str).unique()
}
# Keep only the documents whose normalised file name is one of those keys.
EVAL_DOCS = [p for p in docs if name_key(p.name) in gold_doc_key_set]

print("Eval docs:", len(EVAL_DOCS))

Eval docs: 30


### 실험 ID 변경

In [3]:
# Select the single experiment for this run (edit only these values between runs)

RUN_EXP_ID = 1  # 1~18
N_DOCS = 5

# Run on the first N documents; switch to the commented line to use all of them
RUN_DOCS = EVAL_DOCS[:N_DOCS]
# RUN_DOCS = EVAL_DOCS

# Experiment table (18 entries), keyed 1..18.
# IDs are numbered with the generator varying fastest, then the retriever,
# then the chunker: 1 -> (C1, R1, G1), 2 -> (C1, R1, G2), ..., 18 -> (C3, R3, G2).
SPECS = dict(
    enumerate(
        (
            (chunk_id, retr_id, gen_id)
            for chunk_id in ("C1", "C2", "C3")
            for retr_id in ("R1", "R2", "R3")
            for gen_id in ("G1", "G2")
        ),
        start=1,
    )
)

c, r, g = SPECS[RUN_EXP_ID]
spec = ExperimentSpec(exp_id=RUN_EXP_ID, chunker=c, retriever=r, generator=g)
print("Running spec:", spec)
print("RUN_DOCS:", len(RUN_DOCS))

Running spec: ExperimentSpec(exp_id=1, chunker='C1', retriever='R1', generator='G1')
RUN_DOCS: 5


### 실험 수행 및 결과 저장

In [4]:
# Run and save (execute once per exp_id)

chunker, retriever, generator = make_components(spec, embed_model=embed_model, client=client)
rag = RAGExperiment(
    chunker=chunker,
    retriever=retriever,
    generator=generator,
    questions_df=questions_df,
)

# Collect one metrics record per document, tagged with the experiment settings.
rows = []
for doc_path in tqdm(RUN_DOCS, desc=f"Exp {spec.exp_id} docs"):
    m = rag.run_single_doc_metrics(
        doc_path,
        gold_fields_df=gold_fields_df,
        gold_evidence_df=gold_evidence_df,
        top_k=CONFIG["top_k"],
        sim_threshold=80,
    )
    for tag in ("exp_id", "chunker", "retriever", "generator"):
        m[tag] = getattr(spec, tag)
    rows.append(m)

doc_df = pd.DataFrame(rows)

# Experiment-level average of the metric columns
metric_cols = ["ret_recall", "ret_mrr", "gen_fill", "gen_match", "gen_sim"]
avg = doc_df[metric_cols].mean(numeric_only=True)
exp_row = {
    "exp_id": spec.exp_id,
    "chunk": spec.chunker,
    "retriever": spec.retriever,
    "model": spec.generator,
    "n_docs": len(doc_df),
}
# Only the metrics that survived numeric_only averaging are added.
exp_row.update((metric, float(value)) for metric, value in avg.items())
exp_df = pd.DataFrame([exp_row])

display(exp_df.round(4))

out_dir = BASE_DIR / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)

doc_out = out_dir / f"exp{spec.exp_id:02d}_doclevel.csv"
exp_out = out_dir / f"exp{spec.exp_id:02d}_explevel.csv"

# Write the document-level and experiment-level tables with identical CSV options.
for table, target in ((doc_df, doc_out), (exp_df, exp_out)):
    table.to_csv(target, index=False, encoding="utf-8-sig")

print("Saved:", doc_out)
print("Saved:", exp_out)

  "ret_recall": float(np.nanmean([x["recall"] for x in r_list])),
  "ret_mrr": float(np.nanmean([x["mrr"] for x in r_list])),
  "gen_match": float(np.nanmean([x["match"] for x in g_list])),
  "gen_sim": float(np.nanmean([x["sim"] for x in g_list])),
Exp 1 docs: 100%|██████████| 5/5 [04:00<00:00, 48.08s/it]


Unnamed: 0,exp_id,chunk,retriever,model,n_docs,ret_recall,ret_mrr,gen_fill,gen_match,gen_sim
0,1,C1,R1,G1,5,0.7738,0.3604,1.0,0.0369,3.9452


Saved: d:\dev\github\codeit-part3-team4\outputs\exp01_doclevel.csv
Saved: d:\dev\github\codeit-part3-team4\outputs\exp01_explevel.csv
