### 사전 준비 사항 

#### (1) uv add (터미널)

```bash
uv add rank_bm25
```

#### (2) .env 파일 세팅
```bash
OPENAI_API_KEY = ""
HF_TOKEN = ""
```

#### (3) pdf 파일 세팅
pdf 파일 100개를 `data/raw/files` 에 둡니다.  
eval 파일(csv 2개, jsonl 1개)을 `data/raw/eval` 에 둡니다. (*30개 파일 합친 버전)  

#### 실행 방법

1. 처음 1회(커널 새로 시작): 처음부터 끝까지 순서대로 실행
2. exp1 결과 저장 확인
3. <실험 ID 변경> 셀의 exp_id 를 변경 후 새로운 실험 진행 (실험 결과 저장 확인 후 커널 재시작)  
\*커널 재시작하지 않는 경우, 실험 ID 변경 + 실험 진행 섹션 코드만 실행해도 됨.(OOM 발생 가능성이 있어 권장하지 않음.)
4. 원하는 실험으로 변경 및 반복

In [1]:
import preprocess.pp_v6 as pp
from preprocess.pp_basic import docs

In [3]:
import json, re, unicodedata
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from preprocess.pp_basic import docs, BASE_DIR, EVAL_DIR, GOLD_EVIDENCE_CSV, GOLD_FIELDS_JSONL
from preprocess.rag_experiment import (
    CONFIG, ExperimentSpec, load_questions_df, make_components, RAGExperiment
)

# Load variables from the nearest .env file; override=False leaves values
# that are already set in the process environment untouched.
load_dotenv(find_dotenv(), override=False)
# Built after load_dotenv so the .env values are visible to the client
# (presumably it reads OPENAI_API_KEY from the environment — confirm).
client = OpenAI()

# Load the embedding model only once per kernel (important).
embed_model = SentenceTransformer("nlpai-lab/KoE5")

# Load the gold evidence table.
gold_evidence_df = pd.read_csv(GOLD_EVIDENCE_CSV)

def load_gold_fields_jsonl(path):
    """Load gold field annotations from a JSONL file into a long-format frame.

    Each non-blank line must be a JSON object with an ``instance_id`` key and,
    optionally, ``doc_id`` (defaults to ``""``) and ``fields`` (a mapping of
    field name to gold value; a missing or null ``fields`` is treated as empty).

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A DataFrame with the columns ``instance_id``, ``doc_id``, ``field`` and
        ``gold``, one row per (record, field) pair. The columns are always
        present, even when the file yields no rows, so callers can index
        ``df["doc_id"]`` safely.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``instance_id``.
    """
    columns = ["instance_id", "doc_id", "field", "gold"]
    out = []
    # errors="replace" keeps loading even if the file has undecodable bytes.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            iid = record["instance_id"]
            doc_id = record.get("doc_id", "")
            for field, gold in (record.get("fields") or {}).items():
                out.append(
                    {"instance_id": iid, "doc_id": doc_id, "field": field, "gold": gold}
                )
    # Explicit columns: an empty result still has the expected schema.
    return pd.DataFrame(out, columns=columns)

# Flatten the gold field annotations and load the question table.
gold_fields_df = load_gold_fields_jsonl(GOLD_FIELDS_JSONL)
questions_df = load_questions_df()

# Sanity check: print the size of every loaded input.
for label, frame in (
    ("gold_evidence_df:", gold_evidence_df),
    ("gold_fields_df:", gold_fields_df),
    ("questions_df:", questions_df),
):
    print(label, frame.shape)
print("n_docs:", len(docs))

  from .autonotebook import tqdm as notebook_tqdm


gold_evidence_df: (630, 5)
gold_fields_df: (630, 4)
questions_df: (311, 5)
n_docs: 100


In [4]:
# Evaluation document filter (run once per kernel).
def name_key(s: str) -> str:
    """Return a normalized comparison key for a document name.

    The value is converted to ``str``, NFC-normalized, stripped of leading and
    trailing whitespace, NFKC-normalized, and finally every run of whitespace
    is collapsed into a single space.
    """
    composed = unicodedata.normalize("NFC", str(s))
    folded = unicodedata.normalize("NFKC", composed.strip())
    return re.sub(r"\s+", " ", folded)

# Normalized doc_id keys of every document that has gold field annotations.
gold_doc_key_set = {
    name_key(doc_id) for doc_id in gold_fields_df["doc_id"].astype(str).unique()
}
# Keep only the documents whose normalized file name is in the gold set.
EVAL_DOCS = [
    doc_path for doc_path in docs if name_key(doc_path.name) in gold_doc_key_set
]

print("Eval docs:", len(EVAL_DOCS))

Eval docs: 30


### 실험 ID 변경

In [5]:
RUN_EXP_ID = 19   # valid IDs: 1-24 (the keys of SPECS below)
N_DOCS = 5        # debug subset size (1-5 recommended); only used by the commented-out line below

# Run on the first N_DOCS documents (uncomment the next line) or on every eval document.
# RUN_DOCS = EVAL_DOCS[:N_DOCS]
RUN_DOCS = EVAL_DOCS

# Experiment table (24 entries): exp_id -> (chunker, retriever, generator).
SPECS = {
    1:  ("C1","R1","G1"),  2:  ("C1","R1","G2"),
    3:  ("C1","R2","G1"),  4:  ("C1","R2","G2"),
    5:  ("C1","R3","G1"),  6:  ("C1","R3","G2"),
    7:  ("C2","R1","G1"),  8:  ("C2","R1","G2"),
    9:  ("C2","R2","G1"),  10: ("C2","R2","G2"),
    11: ("C2","R3","G1"),  12: ("C2","R3","G2"),
    13: ("C3","R1","G1"),  14: ("C3","R1","G2"),
    15: ("C3","R2","G1"),  16: ("C3","R2","G2"),
    17: ("C3","R3","G1"),  18: ("C3","R3","G2"),
    19: ("C4","R1","G1"),  20: ("C4","R1","G2"),
    21: ("C4","R2","G1"),  22: ("C4","R2","G2"),
    23: ("C4","R3","G1"),  24: ("C4","R3","G2"),
}

# Fail with an actionable message instead of a bare KeyError on a mistyped ID.
if RUN_EXP_ID not in SPECS:
    raise KeyError(
        f"Unknown RUN_EXP_ID={RUN_EXP_ID!r}; valid IDs are {min(SPECS)}-{max(SPECS)}"
    )

c, r, g = SPECS[RUN_EXP_ID]
spec = ExperimentSpec(exp_id=RUN_EXP_ID, chunker=c, retriever=r, generator=g)
print("Running spec:", spec)
print("RUN_DOCS:", len(RUN_DOCS))

# (Optional) uncomment and edit to change the context cap for this experiment.
# CONFIG["max_context_chars"] = 4000

Running spec: ExperimentSpec(exp_id=19, chunker='C4', retriever='R1', generator='G1')
RUN_DOCS: 30


### 디버깅 사전 설정(선택)

In [6]:
# (Optional) constants for sentinel monitoring.
SENT_NOT_FOUND = "NOT_FOUND"
SENT_GEN_FAIL = "GEN_FAIL"

def count_sentinels(pred_map: dict) -> dict:
    """Count sentinel answers in a prediction map.

    Values are compared after ``str()`` conversion, whitespace stripping and
    lower-casing. ``not_found``/``notfound`` count as "not found" and
    ``gen_fail`` counts as a generation failure.

    Args:
        pred_map: Mapping of field name to predicted value. Anything that is
            not a dict yields all-zero counts.

    Returns:
        A dict with the keys ``n_keys``, ``n_not_found`` and ``n_gen_fail``.
    """
    counts = {"n_keys": 0, "n_not_found": 0, "n_gen_fail": 0}
    if not isinstance(pred_map, dict):
        return counts

    for value in pred_map.values():
        token = str(value).strip().lower()
        counts["n_keys"] += 1
        if token in ("not_found", "notfound"):
            counts["n_not_found"] += 1
        elif token == "gen_fail":
            counts["n_gen_fail"] += 1
    return counts

### 실험 수행 및 결과 저장

In [7]:
# Build the components for the selected spec and wire them into the pipeline.
chunker, retriever, generator = make_components(spec, embed_model=embed_model, client=client)
rag = RAGExperiment(chunker=chunker, retriever=retriever, generator=generator, questions_df=questions_df)

# Identification columns stamped onto every per-document metrics row.
spec_tags = {
    "exp_id": spec.exp_id,
    "chunker": spec.chunker,
    "retriever": spec.retriever,
    "generator": spec.generator,
}

rows = []
for doc_path in tqdm(RUN_DOCS, desc=f"Exp {spec.exp_id} docs"):
    doc_metrics = rag.run_single_doc_metrics(
        doc_path,
        gold_fields_df=gold_fields_df,
        gold_evidence_df=gold_evidence_df,
        top_k=CONFIG["top_k"],
        sim_threshold=80,  # NOTE(review): scale/meaning is defined in rag_experiment — confirm
    )
    for tag, value in spec_tags.items():
        doc_metrics[tag] = value
    rows.append(doc_metrics)

doc_df = pd.DataFrame(rows)

# Experiment-level average of the per-document metrics.
metric_cols = ["ret_recall","ret_mrr","gen_fill","gen_match","gen_sim"]
avg = doc_df[metric_cols].mean(numeric_only=True)

exp_row = {
    "exp_id": spec.exp_id,
    "chunk": spec.chunker,
    "retriever": spec.retriever,
    "model": spec.generator,
    "n_docs": len(doc_df),
}
exp_row.update((metric, float(avg[metric])) for metric in avg.index)
exp_df = pd.DataFrame([exp_row])

display(exp_df.round(4))

# Persist the one-row experiment summary under <BASE_DIR>/outputs.
out_dir = BASE_DIR / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)

exp_out = out_dir / f"exp{spec.exp_id:02d}_explevel.csv"
exp_df.to_csv(exp_out, index=False, encoding="utf-8-sig")
print("Saved:", exp_out)

Exp 19 docs:   0%|          | 0/30 [00:00<?, ?it/s]

  "ret_recall": float(np.nanmean([x["recall"] for x in r_list])),
  "ret_mrr": float(np.nanmean([x["mrr"] for x in r_list])),
  "gen_match": float(np.nanmean([x["match"] for x in g_list])),
  "gen_sim": float(np.nanmean([x["sim"] for x in g_list])),
  "ret_recall": float(np.nanmean([x["recall"] for x in r_list])),
  "ret_mrr": float(np.nanmean([x["mrr"] for x in r_list])),
  "gen_match": float(np.nanmean([x["match"] for x in g_list])),
  "gen_sim": float(np.nanmean([x["sim"] for x in g_list])),
  "ret_recall": float(np.nanmean([x["recall"] for x in r_list])),
  "ret_mrr": float(np.nanmean([x["mrr"] for x in r_list])),
  "gen_match": float(np.nanmean([x["match"] for x in g_list])),
  "gen_sim": float(np.nanmean([x["sim"] for x in g_list])),
  "ret_recall": float(np.nanmean([x["recall"] for x in r_list])),
  "ret_mrr": float(np.nanmean([x["mrr"] for x in r_list])),
  "gen_match": float(np.nanmean([x["match"] for x in g_list])),
  "gen_sim": float(np.nanmean([x["sim"] for x in g_list])),


Unnamed: 0,exp_id,chunk,retriever,model,n_docs,ret_recall,ret_mrr,gen_fill,gen_match,gen_sim
0,19,C4,R1,G1,30,,,1.0,,


Saved: /Users/won/dev/00_codeit/0_mission/200_DL_RAG/outputs/exp19_explevel.csv


In [9]:
# Save each document's pred_map as a JSON file (optional).
pred_dir = out_dir / f"exp{spec.exp_id:02d}_pred_maps"
pred_dir.mkdir(parents=True, exist_ok=True)

saved = 0
for _, row in doc_df.iterrows():
    doc_id = str(row["doc_id"])
    pred_map = row.get("pred_map", None)
    # Rows without a dict-shaped pred_map are skipped and not counted.
    if not isinstance(pred_map, dict):
        continue

    # Replace characters that are unsafe in file names.
    safe = re.sub(r"[\\/:*?\"<>|]", "_", doc_id)
    path = pred_dir / f"{safe}.json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(pred_map, f, ensure_ascii=False, indent=2)
    saved += 1

print("Saved pred_maps:", saved, "->", pred_dir)

Saved pred_maps: 30 -> /Users/won/dev/00_codeit/0_mission/200_DL_RAG/outputs/exp19_pred_maps


### 디버깅(선택)

In [10]:
DEBUG = True  # set to True only when needed

if DEBUG:
    # Inspect the diagnostics the generator kept from its most recent call.
    gen = rag.generator
    print("last_debug:", getattr(gen, "last_debug", None))

    raw = getattr(gen, "last_raw_text", "") or ""
    print("raw_text_len:", len(raw.strip()))
    print("raw_text_preview:\n", raw[:600])

    d = getattr(gen, "last_resp_dump", None)
    print("dump is None?", d is None)
    if isinstance(d, dict):
        # The Responses API may expose output_text as a separate field
        # (it can be missing from the dump).
        print("top keys:", list(d.keys())[:30])
        print("status:", d.get("status"))
        usage = d.get("usage") or {}
        print("usage.output_tokens:", usage.get("output_tokens"))
        out = d.get("output")
        if isinstance(out, list):
            item_types = [item.get("type") for item in out if isinstance(item, dict)]
            print("output item types:", item_types)

last_debug: {'model': 'gpt-5-mini', 'n_questions': 11, 'context_len': 4000, 'max_context_chars': 4000, 'prompt_len': 5237, 'response_status': 'completed', 'output_tokens': 235, 'output_text_repr': '\'{"project_name": "NOT_FOUND", "agency": "NOT_FOUND", "purpose": "NOT_FOUND", "budget": "NOT_FOUND", "contract_type": "제한경쟁입찰 후 협상에 의한 계약", "deadline": "NOT_FOUND", "duration": "NOT_FOUND", "requiremen\'', 'exception': None, 'parse_error': None}
raw_text_len: 472
raw_text_preview:
 {"project_name": "NOT_FOUND", "agency": "NOT_FOUND", "purpose": "NOT_FOUND", "budget": "NOT_FOUND", "contract_type": "제한경쟁입찰 후 협상에 의한 계약", "deadline": "NOT_FOUND", "duration": "NOT_FOUND", "requirements_must": "사업 계약단계부터 완료단계까지 준수해야할 보안대책; 대표자용 및 참여인력용 보안서약서/보안확약서 제출(입찰 및 제안서 관련 서식)", "eval_items": "기술평가 (90%) / 가격평가 (10%); 기술성 평가기준 및 기술제안서 평가항목은 '기술성평가기준'에 의함", "price_eval": "가격평가 (10%)", "eligibility": "대기업 참여 제한 사업; 제출서류로 자본금 및 매출액(최근 3년), 주요 사업실적, 하도급 관련 서류 등 요구"}
dump is None? False
top keys: ['id', 'created

In [11]:
# (Optional) Quick look at the GEN_FAIL/NOT_FOUND counts, based on the last document only.
# The try block intentionally covers the count_sentinels call as well: that helper
# lives in an optional cell, so any failure here is reported instead of raised.
try:
    last_row = doc_df.iloc[-1].to_dict()
    pm = last_row.get("pred_map")
    print("pred_map sentinel counts:", count_sentinels(pm))
except Exception as e:
    print("pred_map sentinel counts: skipped:", repr(e))

pred_map sentinel counts: {'n_keys': 11, 'n_not_found': 6, 'n_gen_fail': 0}
