In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.text_cell_render.rendered_html{font-size:12pt;}
div.output {font-size:12pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:12px;}
</style>
"""))

# 1. Í≥µÌÜµÏÑ§Ï†ï

In [1]:
import os
import re
import json
import glob
import gc
import traceback

import pandas as pd
from typing import List, Dict, Any
from datasets import Dataset

from dotenv import load_dotenv
load_dotenv()

from ragas import evaluate

# ---- AnswerRelevancy / Faithfulness import (ragas Î≤ÑÏ†Ñ Ï∞®Ïù¥ ÎåÄÏùë) ----
try:
    from ragas.metrics._answer_relevancy import AnswerRelevancy
except Exception:
    from ragas.metrics import answer_relevancy as AnswerRelevancy

try:
    from ragas.metrics._faithfulness import Faithfulness
except Exception:
    from ragas.metrics import faithfulness as Faithfulness

from langchain_upstage import ChatUpstage, UpstageEmbeddings
from chatbot_app.modules.rag_module import create_pipeline


from ragas.run_config import RunConfig
rc = RunConfig(max_workers=1, timeout=180)


# -----------------------------
# LLM / Embeddings (ÌèâÍ∞ÄÏö©)
# -----------------------------
# ragasÍ∞Ä LangChain LLM wrapperÎ•º ÏöîÍµ¨ÌïòÎäî Î≤ÑÏ†ÑÏù¥ ÏûàÏñ¥ÏÑú ÏïàÏ†ÑÌïòÍ≤å ÎûòÌïë ÏãúÎèÑ
try:
    from ragas.llms import LangchainLLMWrapper
    ragas_llm = LangchainLLMWrapper(ChatUpstage(model="solar-pro2", temperature=0))
except Exception:
    ragas_llm = ChatUpstage(model="solar-pro2", temperature=0)

ragas_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")
upstage_embedding = UpstageEmbeddings(model="solar-embedding-1-large")

print(f"ragas_llm={type(ragas_llm)}")
print(f"ragas_embeddings={type(ragas_embeddings)}")

# -----------------------------
# Metrics
# -----------------------------
def _metric_instance(m):
    # mÏù¥ ÌÅ¥ÎûòÏä§Î©¥ m()Î°ú Ïù∏Ïä§ÌÑ¥Ïä§ ÎßåÎì§Í≥†,
    # Ïù¥ÎØ∏ Ïù∏Ïä§ÌÑ¥Ïä§/Í∞ùÏ≤¥Î©¥ Í∑∏ÎåÄÎ°ú Î∞òÌôò
    return m() if callable(m) else m


AR_METRIC = _metric_instance(AnswerRelevancy)
F_METRIC = _metric_instance(Faithfulness)

OUT_DIR = "./data/RAGAS"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Î™ΩÌÇ§Ìå®Ïπò: AnswerRelevancy Ïª§ÎÑê ÌÅ¨ÎûòÏãú(ÏÑ∏Í∑∏Ìè¥Ìä∏) ÌöåÌîºÏö©
# - AnswerRelevancy ÎÇ¥Î∂Ä cosine similarity Í≥ÑÏÇ∞Ïù¥ numpy/BLAS Í≤ΩÎ°úÎ•º ÌÉÄÎ©¥
#   Windows ÌôòÍ≤ΩÏóêÏÑú ÎìúÎ¨ºÍ≤å ÌïòÎìú ÌÅ¨ÎûòÏãúÍ∞Ä Î∞úÏÉùÌï† Ïàò ÏûàÏñ¥,
#   similarity Í≥ÑÏÇ∞Îßå "ÏôÑÏ†Ñ ÌååÏù¥Ïç¨" Íµ¨ÌòÑÏúºÎ°ú Ïö∞ÌöåÌï©ÎãàÎã§.
# - ÎòêÌïú providers Ï†úÏïΩ(Upstage: n must be 1) ÎïåÎ¨∏Ïóê strictness=1ÏùÑ Í∞ïÏ†úÌï©ÎãàÎã§.
# -----------------------------
import math
import numpy as np

import ragas.metrics._answer_relevance as ar

def _cos_sim(u, v, eps=1e-8):
    # numpy dot/BLAS Ïïà Ïì∞Îäî ÏàúÏàò ÌååÏù¥Ïç¨ ÏΩîÏÇ¨Ïù∏
    su = 0.0
    sv = 0.0
    s  = 0.0
    for a, b in zip(u, v):
        af = float(a); bf = float(b)
        s  += af * bf
        su += af * af
        sv += bf * bf
    return s / (math.sqrt(su) * math.sqrt(sv) + eps)

def safe_calculate_similarity(self, question: str, generated_questions: list[str]):
    """
    ragas AnswerRelevancy/ResponseRelevancyÍ∞Ä Í∏∞ÎåÄÌïòÎäî ÏãúÍ∑∏ÎãàÏ≤ò:
      (self, question: str, generated_questions: list[str]) -> array-like with .mean()
    """
    # 1) metricÏù¥ Îì§Í≥† ÏûàÎäî embeddings Ïû°Í∏∞
    emb = None
    for name in ["embeddings", "_embeddings", "embedding", "_embedding"]:
        if hasattr(self, name):
            emb = getattr(self, name)
            if emb is not None:
                break

    if emb is None:
        raise RuntimeError("AnswerRelevancy metricÏóê embeddingsÍ∞Ä ÏÑ∏ÌåÖÎêòÏßÄ ÏïäÏïòÏäµÎãàÎã§.")

    # 2) ÏûÑÎ≤†Îî© Ìï®Ïàò ÏÑ†ÌÉù (syncÎßå)
    def embed_one(text: str):
        if hasattr(emb, "embed_query"):
            return emb.embed_query(text)
        if hasattr(emb, "embed_documents"):
            return emb.embed_documents([text])[0]
        raise RuntimeError("embeddings Í∞ùÏ≤¥Ïóê embed_query/embed_documentsÍ∞Ä ÏóÜÏäµÎãàÎã§(ÎèôÍ∏∞ Î©îÏÑúÎìú ÌïÑÏöî).")

    def embed_many(texts: list[str]):
        if len(texts) == 0:
            return []
        if hasattr(emb, "embed_documents"):
            return emb.embed_documents(texts)
        # embed_queryÎßå ÏûàÏúºÎ©¥ Î£®ÌîÑ
        if hasattr(emb, "embed_query"):
            return [emb.embed_query(t) for t in texts]
        raise RuntimeError("embeddings Í∞ùÏ≤¥Ïóê embed_query/embed_documentsÍ∞Ä ÏóÜÏäµÎãàÎã§(ÎèôÍ∏∞ Î©îÏÑúÎìú ÌïÑÏöî).")

    # 3) Î¨∏ÏûêÏó¥ -> Î≤°ÌÑ∞
    qv = embed_one(question)
    gvs = embed_many(generated_questions)

    if not gvs:
        return np.array([0.0], dtype="float32")

    # 4) ÏΩîÏÇ¨Ïù∏ Ïú†ÏÇ¨ÎèÑ (BLAS Ïïà ÌÉê)
    sims = [_cos_sim(qv, gv) for gv in gvs]

    # ragasÎäî Îí§ÏóêÏÑú cosine_sim.mean()ÏùÑ Ìò∏Ï∂úÌïòÎØÄÎ°ú numpy arrayÎ°ú Î∞òÌôò
    return np.array(sims, dtype="float32")

# ‚úÖ ÌÅ¥ÎûòÏä§ Î©îÏÑúÎìúÎ•º ÎçÆÏñ¥Ïç®Ïïº Ìï® (Î™®Îìà Ìï®Ïàò Ìå®ÏπòÍ∞Ä ÏïÑÎãò)
ar.ResponseRelevancy.calculate_similarity = safe_calculate_similarity
ar.AnswerRelevancy.calculate_similarity = safe_calculate_similarity


# 1) Î™®Îìà Î†àÎ≤® Ìï®ÏàòÍ∞Ä ÏûàÎã§Î©¥ __code__ ÍµêÏ≤¥
if ar is not None and hasattr(ar, "calculate_similarity"):
    ar.calculate_similarity.__code__ = _cos_sim.__code__
    ar.calculate_similarity.__defaults__ = _cos_sim.__defaults__
    print("‚úÖ patched ar.calculate_similarity (__code__)")
elif ar is not None:
    # 2) Î™®ÎìàÏóê ÏóÜÏúºÎ©¥ Ïù¥Î¶Ñ Ï£ºÏûÖ(lookup Í≤ΩÎ°úÏóê Îî∞Îùº Î®πÏùÑ Ïàò ÏûàÏùå)
    ar.__dict__["calculate_similarity"] = _cos_sim
    print("‚úÖ injected ar.calculate_similarity")

    # 3) ÌÅ¥ÎûòÏä§ Î©îÏÑúÎìúÎ°ú Í≥ÑÏÇ∞ÌïòÎäî Î≤ÑÏ†Ñ ÎåÄÎπÑ: ResponseRelevancy.calculate_similarityÎ•º ÎçÆÏñ¥Ïì∞Í∏∞
    if hasattr(ar, "ResponseRelevancy"):
        try:
            ar.ResponseRelevancy.calculate_similarity = _cos_sim
            print("‚úÖ patched ResponseRelevancy.calculate_similarity")
        except Exception:
            pass

# Upstage Ï†úÏïΩ: n=1 Í∞ïÏ†ú
if hasattr(AR_METRIC, "strictness"):
    AR_METRIC.strictness = 1
    print("‚úÖ AR_METRIC.strictness fixed to 1")

# -----------------------------
# Í≥µÌÜµ Ïú†Ìã∏
# -----------------------------
def clip_rows(rows, *, max_ctx=4, max_chars=1200, clip_answer_chars=700):
    """ÏöîÍµ¨ÏÇ¨Ìï≠ Î∞òÏòÅ: context clip 1200 chars"""
    clipped = []
    for r in rows:
        rr = dict(r)
        ctx = rr.get("contexts", []) or []
        ctx = [c for c in ctx if c is not None]
        rr["contexts"] = [str(c)[:max_chars] for c in ctx[:max_ctx]]
        if rr.get("answer"):
            rr["answer"] = str(rr["answer"])[:clip_answer_chars]
        clipped.append(rr)
    return clipped


def cleanup_memory():
    gc.collect()
    print("üßπ Memory cleaned")


def save_df(df: pd.DataFrame, out_path: str):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False, encoding="utf-8")
    print(f"‚úÖ saved: {out_path}")
    return out_path


# -----------------------------
# Ï†ïÏÑ± Ï≤¥ÌÅ¨(ÌèâÍ∞ÄÌëú Ïª¨Îüº): Í∑ºÍ±∞ÌëúÌòÑ/Î¨∏Ïû•Íµ¨Ï°∞/Ï∂îÎ°†ÌôïÏû•/Î∂àÌôïÏã§ÏÑ±
# - ÏóÑÎ∞ÄÌïú metricÏù¥ ÏïÑÎãàÎùº "Ìå®ÌÑ¥ ÌÉêÏßÄÏö©" Î≥¥Ï°∞ Ïª¨Îüº
# -----------------------------
_KR_EVIDENCE_PHRASES = ["Îî∞Î•¥Î©¥", "Í∑ºÍ±∞", "Ï∞∏Í≥†", "Ïóê Îî∞Î•¥Î©¥", "Ï°∞Ìï≠", "Í∑úÏ†ï", "ÌåêÎ°Ä", "Î≤ïÏõê", "ÏÇ¨Í±¥Î≤àÌò∏", "Ï†ú"]
_KR_UNCERTAINTY_PHRASES = ["ÌôïÏù∏ ÌïÑÏöî", "Ï∂îÍ∞Ä ÌôïÏù∏", "Í∞ÄÎä•", "Ïùº Ïàò", "ÏûêÎ£å Î≤îÏúÑ", "Ï†úÍ≥µÎêú ÏûêÎ£å", "Îã®Ï†ï", "Î∂àÎ™ÖÌôï"]
_KR_INFERENCE_PHRASES = ["ÏùºÎ∞òÏ†ÅÏúºÎ°ú", "ÌÜµÏÉÅ", "Î≥¥ÌÜµ", "ÎåÄÍ∞ú", "Ï∂îÏ†ï", "ÏõêÏπôÏ†ÅÏúºÎ°ú", "Ï∂îÎ°†", "ÏÉÅÏãùÏ†ÅÏúºÎ°ú"]


def _has_any(text: str, phrases: List[str]) -> int:
    t = text or ""
    return int(any(p in t for p in phrases))


def _first_sentence(text: str) -> str:
    if not text:
        return ""
    for sep in ["\n", ".", "?", "!"]:
        if sep in text:
            return text.split(sep, 1)[0].strip()
    return text.strip()


def _looks_conclusion_first(answer: str) -> int:
    s1 = _first_sentence(answer)
    if not s1:
        return 0
    return int(any(x in s1 for x in ["ÏûÖÎãàÎã§", "Îê©ÎãàÎã§", "Í∞ÄÎä•Ìï©ÎãàÎã§", "Î∂àÍ∞ÄÎä•Ìï©ÎãàÎã§", "ÌïòÏÑ∏Ïöî", "Í∂åÌï©ÎãàÎã§", "ÏúÑÌóò", "Ïú†Î¶¨", "Î∂àÎ¶¨"]))


def _hallucinated_citation_flag(answer: str, contexts: List[str]) -> int:
    """
    ÎãµÎ≥ÄÏóê 'OOO Ï†úNNÏ°∞' Í∞ôÏùÄ Ìå®ÌÑ¥Ïù¥ ÏûàÎäîÎç∞
    Ïª®ÌÖçÏä§Ìä∏ Ï†ÑÏ≤¥ Î¨∏ÏûêÏó¥Ïóê ÎèôÏùºÌïú Íµ¨Ï†àÏù¥ ÌïòÎÇòÎèÑ ÏóÜÏúºÎ©¥ 1 (ÏùòÏã¨)
    """
    if not answer:
        return 0
    ctx_all = "\n".join(contexts or [])
    pats = re.findall(r"([Í∞Ä-Ìû£A-Za-z0-9¬∑\s]{2,40})\s(Ï†ú?\d+Ï°∞(?:Ïùò\d+)?)", answer)
    if not pats:
        return 0
    for left, art in pats:
        needle = (left.strip() + " " + art.strip()).strip()
        if needle and needle not in ctx_all:
            return 1
    return 0


# -----------------------------
# Ï†ïÍ∑úÌôî Í≤∞Í≥º ‚Äú1Ï§Ñ Í∞ïÏ†ú‚Äù Ïú†Ìã∏
# (Ï£ºÏùò: ÏßÄÍ∏à Ìå®ÏπòÎäî trace Ï†ÄÏû•Í∞íÎßå 1Ï§ÑÎ°ú ÎßåÎì§Í≥†, pipeline ÎÇ¥Î∂Ä retrieval queryÏóêÎäî ÏòÅÌñ•ÏùÑ Ï£ºÏßÄ ÏïäÏäµÎãàÎã§.
#        pipeline ÎÇ¥Î∂ÄÏóê Ï†ÅÏö©ÌïòÎ†§Î©¥ rag_module.py ÏøºÎ¶¨ ÏÉùÏÑ± ÏßÅÏ†ÑÏóê ÎèôÏùº Î°úÏßÅÏùÑ ÎÑ£Ïñ¥Ïïº Ìï©ÎãàÎã§.)
# -----------------------------
def sanitize_normalized_query(text: str, max_chars: int = 400) -> str:
    if not text:
        return ""
    first = text.strip().splitlines()[0].strip()
    return first[:max_chars]


# ‚úÖ Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù Ìï®Ïàò Ï∂îÍ∞Ä
def validate_data(rows, required_fields):
    """Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù Î∞è Î¨∏Ï†ú Î¶¨Ìè¨Ìä∏"""
    issues = []
    for i, r in enumerate(rows):
        for field in required_fields:
            if field not in r:
                issues.append(f"Row {i}: missing field '{field}'")
            elif r[field] is None:
                issues.append(f"Row {i}: field '{field}' is None")
            elif field in ["question", "answer"] and not str(r[field]).strip():
                issues.append(f"Row {i}: field '{field}' is empty")
    
    if issues:
        print("‚ö†Ô∏è Data validation issues:")
        for issue in issues[:10]:  # Ï≤òÏùå 10Í∞úÎßå Ï∂úÎ†•
            print(f"  - {issue}")
        if len(issues) > 10:
            print(f"  ... and {len(issues) - 10} more issues")
    else:
        print("‚úÖ Data validation passed")
    
    return len(issues) == 0

# -----------------------------
# Ïã§Ìñâ ÌÉúÍ∑∏ / ÎîîÎ†âÌÜ†Î¶¨
# -----------------------------
def make_run_tag_for_answer(model_name: str) -> str:
    return f"llmcmp_{model_name}"


RUNS_ROOT = os.path.join(OUT_DIR, "runs_answer")
os.makedirs(RUNS_ROOT, exist_ok=True)


# -----------------------------
# run_once_answer: AnswerRelevancy / Faithfulness 1Ìöå
# -----------------------------
def run_once_answer(
    pipeline,
    items: List[Dict[str, Any]],
    run_idx: int,
    llm_name: str = "solar-pro2",
) -> Dict[str, Any]:
    run_tag = make_run_tag_for_answer(llm_name)
    run_dir = os.path.join(RUNS_ROOT, run_tag, f"run_{run_idx:02d}")
    os.makedirs(run_dir, exist_ok=True)

    cache_jsonl = os.path.join(run_dir, "rag_run_cache.jsonl")
    print(f"--- run_once_answer START | run_idx={run_idx} | llm_name={llm_name} ---")
    # (1) ÌååÏù¥ÌîÑÎùºÏù∏ Ïã§Ìñâ Í≤∞Í≥º Ï∫êÏãú Ï†ÄÏû•
    with open(cache_jsonl, "w", encoding="utf-8") as f:
        print(f"items={len(items)}")
        for i, ex in enumerate(items, start=1):
            q = ex["question"]
            gt = ex.get("ground_truth", "")

            MAX_QUERY_CHARS = 2000
            q_for_retrieval = q if len(q) <= MAX_QUERY_CHARS else q[:MAX_QUERY_CHARS]

            try:
                trace = pipeline.answer_with_trace(q_for_retrieval, skip_normalization=False)
                if "normalized_query" in trace:
                    trace["normalized_query"] = sanitize_normalized_query(trace.get("normalized_query", ""))
            except Exception as e:
                trace = {"answer": "", "docs": [], "normalized_query": ""}

            answer = trace.get("answer", "") or ""
            docs = trace.get("docs", []) or []

            contexts = []
            for d in docs:
                try:
                    contexts.append(d.page_content)
                except Exception:
                    pass

            row = {
                "id": i,
                "question": q,
                "answer": answer,
                "contexts": contexts,
                "reference": gt,
                "ground_truths": [gt],
                "normalized_query": trace.get("normalized_query", ""),
            }
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    # (2) rows Î°úÎìú + ÏµúÏÜå Ï†ïÎ¶¨
    raw_rows = []
    with open(cache_jsonl, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                raw_rows.append(json.loads(line))

    rows_local = []
    for r in raw_rows:
        rr = dict(r)
        if not rr.get("question"):
            continue
        if not rr.get("answer"):
            rr["answer"] = "ÎãµÎ≥ÄÏùÑ ÏÉùÏÑ±Ìï† Ïàò ÏóÜÏäµÎãàÎã§."
        if "contexts" not in rr or rr["contexts"] is None:
            rr["contexts"] = []
        if not isinstance(rr["contexts"], list):
            rr["contexts"] = [str(rr["contexts"])]
        rows_local.append(rr)

    validate_data(rows_local, required_fields=["question", "answer", "contexts"])

    # (3) clip (ÏöîÍµ¨ÏÇ¨Ìï≠: max_chars=1200)
    rows_eval = clip_rows(rows_local, max_ctx=4, max_chars=1200, clip_answer_chars=700)

    ds = Dataset.from_list(
        [
            {
                "question": r["question"],
                "answer": r["answer"],
                "contexts": r.get("contexts", []),
                "reference": r.get("reference", ""),
            }
            for r in rows_eval
        ]
    )

    # (4) RAGAS ÌèâÍ∞Ä
    print(f"evaluate START | run_idx={run_idx}")
    import sys, ragas.metrics._answer_relevance as ar

    print("ar module id:", id(ar))
    print("calc filename:", ar.calculate_similarity.__code__.co_filename)
    print("in sys.modules:", sys.modules.get("ragas.metrics._answer_relevance") is ar)

    AR_METRIC.strictness = 1
    res = evaluate(
        dataset=ds,
        metrics=[AR_METRIC, F_METRIC],
        llm=ragas_llm,
        embeddings=ragas_embeddings,
        run_config=rc,
        show_progress=True,
        raise_exceptions=True,
        batch_size=1,
    )


    print("evaluate RETURNED")   # ‚úÖ Ïù¥ Ï§ÑÏù¥ Ï∞çÌûàÎäîÏßÄ Ïó¨Î∂ÄÍ∞Ä ÌïµÏã¨
    df = res.to_pandas()

    print("to_pandas DONE")      # ‚úÖ Ïù¥ Ï§ÑÏù¥ Ï∞çÌûàÎäîÏßÄ Ïó¨Î∂ÄÎèÑ ÌïµÏã¨
    # id Ïª¨Îüº Î≥¥Ï†ï
    if "id" not in df.columns:
        df = df.copy()
        df["id"] = range(1, len(df) + 1)

    # (5) Ï†ïÏÑ± Ï≤¥ÌÅ¨ Ïª¨Îüº Ï∂îÍ∞Ä
    id_to_ctx = {i + 1: rows_eval[i].get("contexts", []) for i in range(len(rows_eval))}
    id_to_ans = {i + 1: rows_eval[i].get("answer", "") for i in range(len(rows_eval))}

    df["evidence_expression"] = df["id"].map(lambda _id: _has_any(id_to_ans.get(_id, ""), _KR_EVIDENCE_PHRASES))
    df["structure_conclusion_first"] = df["id"].map(lambda _id: _looks_conclusion_first(id_to_ans.get(_id, "")))
    df["inference_expansion_signal"] = df["id"].map(lambda _id: _has_any(id_to_ans.get(_id, ""), _KR_INFERENCE_PHRASES))
    df["uncertainty_signal"] = df["id"].map(lambda _id: _has_any(id_to_ans.get(_id, ""), _KR_UNCERTAINTY_PHRASES))
    df["hallucinated_citation_suspect"] = df["id"].map(
        lambda _id: _hallucinated_citation_flag(id_to_ans.get(_id, ""), id_to_ctx.get(_id, []))
    )

    # (Ï†ÄÏû•1) ragas raw
    out_raw = os.path.join(run_dir, "ragas_result_answer_faith.csv")
    save_df(df, out_raw)

    # (Ï†ÄÏû•2) per_question (Î≥¥Í∏∞ Ï¢ãÏùÄ ÌèâÍ∞ÄÌëú)
    perq_cols = [
        "id",
        "answer_relevancy",
        "faithfulness",
        "evidence_expression",
        "structure_conclusion_first",
        "inference_expansion_signal",
        "uncertainty_signal",
        "hallucinated_citation_suspect",
    ]
    # ‚úÖ Ïã§Ï†ú dfÏóê Ï°¥Ïû¨ÌïòÎäî Ïª¨ÎüºÎßå ÏÑ†ÌÉù (metricsÎ•º Î∞îÍøîÎèÑ KeyError Î∞©ÏßÄ)
    _cols_exist = [c for c in perq_cols if c in df.columns]
    perq = df[_cols_exist].copy()
    for c in ["answer_relevancy", "faithfulness"]:
        if c in perq.columns:
            perq[c] = perq[c].astype(float).round(3)

    perq_csv = os.path.join(run_dir, "per_question.csv")
    save_df(perq, perq_csv)

    print(f"saved per_question_csv={perq_csv}")
    print(f"--- run_once_answer DONE | run_idx={run_idx} ---")
    cleanup_memory()

    return {
        "run_tag": run_tag,
        "run_idx": run_idx,
        "run_dir": run_dir,
        "cache_jsonl": cache_jsonl,
        "raw_csv": out_raw,
        "per_question_csv": perq_csv,
        "AR_mean": float(perq["answer_relevancy"].mean()),
        "AR_std": float(perq["answer_relevancy"].std(ddof=1)) if len(perq) > 1 else 0.0,
        "F_mean": float(perq["faithfulness"].mean()),
        "F_std": float(perq["faithfulness"].std(ddof=1)) if len(perq) > 1 else 0.0,
    }


# -----------------------------
# Î∞òÎ≥µ Ïã§Ìñâ(run_repeat_answer) + wide CSV
# -----------------------------
def run_repeat_answer(pipeline, items: List[Dict[str, Any]], n: int = 3, llm_name: str = "solar-pro2") -> List[Dict[str, Any]]:
    results = []
    for run_idx in range(1, n + 1):
        print("\n==============================")
        print(f"RUN {run_idx:02d}/{n} | {make_run_tag_for_answer(llm_name)}")
        print("==============================")
        results.append(run_once_answer(pipeline, items, run_idx, llm_name=llm_name))
    return results


def build_wide_csv_answer(run_tag: str, results: List[Dict[str, Any]]) -> str:
    """
    Î©îÎ™®Î¶¨ ÏïàÏ†ïÏÑ±ÏùÑ ÏúÑÌï¥ Í≤∞Í≥º dictÏóê df_perqÎ•º Ï†ÄÏû•ÌïòÏßÄ ÏïäÍ≥†,
    per_question_csvÎ•º Îã§Ïãú ÏùΩÏñ¥ÏÑú wideÎ•º ÎßåÎì≠ÎãàÎã§.
    """
    base_dir = os.path.join(RUNS_ROOT, run_tag)
    os.makedirs(base_dir, exist_ok=True)

    ids = pd.DataFrame({"id": range(1, 11)})
    wide = ids.copy()

    for res in results:
        ridx = res["run_idx"]
        dfm = pd.read_csv(res["per_question_csv"])[[
            "id",
            "answer_relevancy",
            "faithfulness",
            "hallucinated_citation_suspect",
        ]].copy()

        dfm = dfm.rename(columns={
            "answer_relevancy": f"r{ridx}-answer_relevancy",
            "faithfulness": f"r{ridx}-faithfulness",
            "hallucinated_citation_suspect": f"r{ridx}-hallucination_suspect",
        })
        wide = pd.merge(wide, dfm, on="id", how="left")

    wide = wide.set_index("id").sort_index()

    ar_cols = [c for c in wide.columns if c.endswith("-answer_relevancy")]
    f_cols = [c for c in wide.columns if c.endswith("-faithfulness")]

    wide["answer_relevancy-mean"] = wide[ar_cols].mean(axis=1, skipna=True)
    wide["faithfulness-mean"] = wide[f_cols].mean(axis=1, skipna=True)

    mean_row = wide.mean(axis=0, skipna=True).to_frame().T
    mean_row.index = ["mean"]
    wide2 = pd.concat([wide, mean_row], axis=0)

    out_csv = os.path.join(base_dir, "runs_wide_3repeats.csv")
    wide2.to_csv(out_csv, encoding="utf-8")
    print("‚úÖ saved:", out_csv)
    return out_csv


# -----------------------------
# DOCX ÏßàÎ¨∏ Î°úÎî© (ÏóÖÎ°úÎìú ÌååÏùº Í≤ΩÎ°ú Í∏∞Ï§Ä)
# -----------------------------
from docx import Document

DOCX_PATH = "data/RAGAS/RAGAS_ÏßàÎ¨∏_25Í∞ú_Ï†ïÎ¶¨Î≥∏.docx"

def load_items_from_docx(docx_path: str, max_q: int = 25):
    doc = Document(docx_path)
    paras = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]

    items = []
    i = 0
    while i < len(paras) and len(items) < max_q:
        line = paras[i]
        m = re.match(r"^(\d+)\.\s*(.+)$", line)
        if not m:
            i += 1
            continue

        question = m.group(2).strip()

        i += 1
        answer_lines = []
        while i < len(paras):
            if re.match(r"^\d+\.\s*.+$", paras[i]):
                break
            answer_lines.append(paras[i])
            i += 1

        gt = "\n".join([x for x in answer_lines if x not in ["‚úîÔ∏è Î™®Î≤îÎãµÏïà"]]).strip()

        items.append({"question": question, "ground_truth": gt})

    return items


items = load_items_from_docx(DOCX_PATH, max_q=25)
print("‚úÖ items loaded:", len(items))
print("sample Q1:", items[0]["question"])

  from ragas.metrics import answer_relevancy as AnswerRelevancy
  ragas_llm = LangchainLLMWrapper(ChatUpstage(model="solar-pro2", temperature=0))


ragas_llm=<class 'ragas.llms.base.LangchainLLMWrapper'>
ragas_embeddings=<class 'langchain_upstage.embeddings.UpstageEmbeddings'>
‚úÖ injected ar.calculate_similarity
‚úÖ patched ResponseRelevancy.calculate_similarity
‚úÖ AR_METRIC.strictness fixed to 1
‚úÖ items loaded: 25
sample Q1: Ï†ÑÏûÖÏã†Í≥†¬∑ÌôïÏ†ïÏùºÏûê ÌñàÎäîÎç∞, ÌôïÏ†ïÏùºÏûêÎ∂Ä ÎÇ¥Ïö©ÍπåÏßÄ Ï§ëÏöîÌïúÍ∞ÄÏöî?


# 2. Ïã§Ìñâ

In [2]:
%%time

from langchain_openai import ChatOpenAI

gpt_llm = ChatOpenAI(
    model="gpt-4o-mini",
    # model="gpt-5-mini",
    temperature=0
)

upstage_llm = ChatUpstage(
    model="solar-pro2",
    temperature=0
)

pipeline = create_pipeline(
    generation_llm=gpt_llm,
    # generation_llm=upstage_llm,
    embedding=upstage_embedding,   
)

# -----------------------------
# 1Ìöå Ïã§Ìñâ (Î∞òÎ≥µ Ïã§Ìóò Ï†úÍ±∞)
# -----------------------------
result = run_once_answer(pipeline, items, run_idx=1, llm_name="gpt-4o-mini")

# Ï†ÄÏû• ÌååÏùºÎì§ Í≤ΩÎ°ú Ï∂úÎ†•
print("‚úÖ DONE. run_tag =", result.get("run_tag"))
print(" - raw_csv:", result.get("raw_csv"))
print(" - perq_csv:", result.get("perq_csv"))
print(" - summary_csv:", result.get("summary_csv"))


2026-02-09 16:12:54,767 - chatbot_app.modules.rag_module - INFO - üîó Pinecone 3Ï§ë Ïù∏Îç±Ïä§ Ïó∞Í≤∞ Ï§ë...
2026-02-09 16:12:56,472 - chatbot_app.modules.rag_module - INFO - ‚úÖ [Law / Rule / Case] 3Í∞ú Ïù∏Îç±Ïä§ Î°úÎìú ÏôÑÎ£å!
2026-02-09 16:12:56,497 - chatbot_app.modules.rag_module - INFO - ‚ÑπÔ∏è SimpleTokenizer ÏÇ¨Ïö© (BM25)


--- run_once_answer START | run_idx=1 | llm_name=gpt-4o-mini ---
items=25


2026-02-09 16:12:57,582 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:12:57,594 - chatbot_app.modules.rag_module - INFO - üîÑ ÌëúÏ§ÄÌôîÎêú ÏßàÎ¨∏: Ï£ºÎØºÎì±Î°ù¬∑ÌôïÏ†ïÏùºÏûê ÌñàÎäîÎç∞, ÌôïÏ†ïÏùºÏûêÎ∂Ä ÎÇ¥Ïö©ÍπåÏßÄ Ï§ëÏöîÌïúÍ∞ÄÏöî? (Ï£ºÎØºÎì±Î°ù¬∑ÌôïÏ†ïÏùºÏûê ÌñàÎäîÎç∞, ÌôïÏ†ïÏùºÏûêÎ∂Ä ÎÇ¥Ïö©ÍπåÏßÄ Ï§ëÏöîÌïúÍ∞ÄÏöî?)  

‚Äª [Ïö©Ïñ¥ ÏÇ¨Ï†Ñ]Ïóê 'Ï†ÑÏûÖÏã†Í≥†'Îäî 'Ï£ºÎØºÎì±Î°ù'ÏúºÎ°úÎßå Îß§ÌïëÎêòÏñ¥ ÏûàÏúºÎ©∞, 'ÌôïÏ†ïÏùºÏûêÎ∂Ä'Îäî Î≥ÑÎèÑ Îß§Ìïë ÎåÄÏÉÅÏù¥ ÏïÑÎãàÎØÄÎ°ú ÏõêÎ¨∏ÏùÑ Ïú†ÏßÄÌïòÏòÄÏäµÎãàÎã§.
2026-02-09 16:12:57,595 - chatbot_app.modules.rag_module - INFO - üîç [Hybrid Retrieval] query='Ï£ºÎØºÎì±Î°ù¬∑ÌôïÏ†ïÏùºÏûê ÌñàÎäîÎç∞, ÌôïÏ†ïÏùºÏûêÎ∂Ä ÎÇ¥Ïö©ÍπåÏßÄ Ï§ëÏöîÌïúÍ∞ÄÏöî? (Ï£ºÎØºÎì±Î°ù¬∑ÌôïÏ†ïÏùºÏûê ÌñàÎäîÎç∞, ÌôïÏ†ïÏùºÏûêÎ∂Ä ÎÇ¥Ïö©ÍπåÏßÄ Ï§ëÏöîÌïúÍ∞ÄÏöî?)  

‚Äª [Ïö©Ïñ¥ ÏÇ¨Ï†Ñ]Ïóê 'Ï†ÑÏûÖÏã†Í≥†'Îäî 'Ï£ºÎØºÎì±Î°ù'ÏúºÎ°úÎßå Îß§ÌïëÎêòÏñ¥ ÏûàÏúºÎ©∞, 'ÌôïÏ†ïÏùºÏûêÎ∂Ä'Îäî Î≥ÑÎèÑ Îß§Ìïë ÎåÄÏÉÅÏù¥ ÏïÑÎãàÎØÄÎ°ú ÏõêÎ¨∏ÏùÑ Ï

2026-02-09 16:13:26,873 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:27,593 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:28,482 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:28,717 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:13:28,717 - chatbot_app.modules.rag_module - INFO - ü§ñ ÎãµÎ≥Ä ÏÉùÏÑ± Ï§ë...
2026-02-09 16:13:32,211 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:13:33,400 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:13:33,404 - chatbot_app.modules.rag_module - INFO - üîÑ ÌëúÏ§ÄÌôîÎêú ÏßàÎ¨∏: Î¨µÏãúÏ†ÅÍ∞±Ïã†(Î¨µÏãúÏ†ÅÍ∞±Ïã†)ÏúºÎ°ú Ïó∞Ïû•Îêú Ï§Ñ Î™®Î•¥Í≥† ÏÇ¥ÏïòÎäîÎç∞, Ï£ºÌÉùÏùòÏù∏ÎèÑ(Ï£ºÌÉùÏùòÏù∏Îè

2026-02-09 16:13:57,153 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:57,905 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:58,658 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:13:59,192 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:13:59,670 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:00,732 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:01,395 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:01,627 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:14:01,627 - chatbot_app.modules.rag_

2026-02-09 16:14:27,662 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:28,591 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:29,260 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:29,995 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:14:30,461 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:31,147 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:32,035 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:32,282 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:14:32,284 - chatbot_app.modules.rag_

2026-02-09 16:14:57,408 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:58,353 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:59,060 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:14:59,556 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:14:59,978 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:00,701 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:01,601 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:01,833 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:15:01,834 - chatbot_app.modules.rag_

2026-02-09 16:15:26,215 - chatbot_app.modules.rag_module - INFO - üîç [Hybrid Retrieval] query='Í≥ÑÏïΩÏ¶ùÏÑú(ÏûÑÎåÄÏ∞®Í≥ÑÏïΩÏ¶ùÏÑú)Ïóê ÏúÑÏïΩÍ∏à Ï°∞Ìï≠Ïù¥ ÏûàÎäîÎç∞, Ï†úÍ∞Ä Í≥ÑÏïΩÌï¥ÏßÄ(Í≥ÑÏïΩÌï¥ÏßÄ)ÌïòÎ©¥ Î¨¥Ï°∞Í±¥ ÎØºÏÇ¨ÏÜåÏÜ°(ÎØºÏÇ¨ÏÜåÏÜ°)ÏÉÅ ÏÜêÌï¥Î∞∞ÏÉÅ(ÏÜêÌï¥Î∞∞ÏÉÅ)ÏùÑ Ìï¥Ïïº ÌïòÎÇòÏöî?'
2026-02-09 16:15:26,700 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:27,379 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:28,085 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:28,639 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:15:29,053 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:15:29,966 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/em

2026-02-09 16:15:56,701 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:15:56,701 - chatbot_app.modules.rag_module - INFO - üîÑ ÌëúÏ§ÄÌôîÎêú ÏßàÎ¨∏: Ï†ÑÏÑ∏(Ï†ÑÏÑ∏) Í≥ÑÏïΩÌïòÎ†§Îäî ÏûÑÏ∞®Ï£ºÌÉù(ÏûÑÏ∞®Ï£ºÌÉù)Ïóê ÏÑ†ÏàúÏúÑ Í∑ºÏ†ÄÎãπÍ∂å(Í∑ºÏ†ÄÎãπÍ∂å)ÏùÄ ÏóÜÎäîÎç∞, ÎÇòÏ§ëÏóê Î≥¥Îãà ÏûÑÎåÄÏù∏(ÏûÑÎåÄÏù∏)Ïù¥ Íµ≠ÏÑ∏Î•º Ï≤¥ÎÇ©Ìïú ÏÉÅÌÉúÏòÄÏäµÎãàÎã§. ÏïÑÏßÅ ÏïïÎ•òÎì±Í∏∞(ÏïïÎ•òÎì±Í∏∞)Îäî ÏóÜÏóàÎäîÎç∞, Ïù¥ Í≤ΩÏö∞ÏóêÎèÑ Ï†ú ÏûÑÎåÄÏ∞®Î≥¥Ï¶ùÍ∏à(ÏûÑÎåÄÏ∞®Î≥¥Ï¶ùÍ∏à)Ïù¥ ÏúÑÌóòÌïúÍ∞ÄÏöî?
2026-02-09 16:15:56,701 - chatbot_app.modules.rag_module - INFO - üîç [Hybrid Retrieval] query='Ï†ÑÏÑ∏(Ï†ÑÏÑ∏) Í≥ÑÏïΩÌïòÎ†§Îäî ÏûÑÏ∞®Ï£ºÌÉù(ÏûÑÏ∞®Ï£ºÌÉù)Ïóê ÏÑ†ÏàúÏúÑ Í∑ºÏ†ÄÎãπÍ∂å(Í∑ºÏ†ÄÎãπÍ∂å)ÏùÄ ÏóÜÎäîÎç∞, ÎÇòÏ§ëÏóê Î≥¥Îãà ÏûÑÎåÄÏù∏(ÏûÑÎåÄÏù∏)Ïù¥ Íµ≠ÏÑ∏Î•º Ï≤¥ÎÇ©Ìïú ÏÉÅÌÉúÏòÄÏäµÎãàÎã§. ÏïÑÏßÅ ÏïïÎ•òÎì±Í∏∞(ÏïïÎ•òÎì±Í∏∞)Îäî ÏóÜÏóàÎäîÎç∞, Ïù¥ Í≤ΩÏö∞ÏóêÎèÑ Ï†ú ÏûÑÎåÄÏ∞®Î≥¥Ï¶ùÍ∏à(ÏûÑÎåÄÏ∞®Î≥¥Ï¶ùÍ∏à)Ïù¥ ÏúÑÌóòÌïúÍ∞ÄÏöî?'
2026-02-09 16:15:57,203 - httpx - I

2026-02-09 16:16:26,961 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:27,715 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:29,018 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:29,560 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:16:30,034 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:30,951 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:32,006 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:32,252 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:16:32,254 - chatbot_app.modules.rag_

2026-02-09 16:16:57,577 - chatbot_app.modules.rag_module - INFO - üîç [Hybrid Retrieval] query='Í≥ÑÏïΩÌïú ÏûÑÏ∞®Ï£ºÌÉùÏù¥ ÎÇòÏ§ëÏóê Î≥¥Îãà Î∂àÎ≤ïÏúºÎ°ú Ï™ºÍ∞úÏßÑ Ìò∏Ïàò(A-1Ìò∏)ÏòÄÏäµÎãàÎã§. Ï†ÄÎäî Ï£ºÎØºÎì±Î°ùÍ≥º ÌôïÏ†ïÏùºÏûê(ÌôïÏ†ïÏùºÏûê)Î•º Îã§ Î∞õÏïòÎäîÎç∞, Í≤ΩÎß§Ï†àÏ∞®(Í≤ΩÎß§)Í∞Ä ÎêòÎ©¥ Ïö∞ÏÑ†Î≥ÄÏ†úÍ∂å(Ïö∞ÏÑ†Î≥ÄÏ†ú)ÏúºÎ°ú Î≥¥Ìò∏Î∞õÏùÑ Ïàò ÏûàÎÇòÏöî?'
2026-02-09 16:16:58,325 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:59,050 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:16:59,714 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:00,217 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 429 Too Many Requests"
2026-02-09 16:17:00,715 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:01,412 - httpx 

2026-02-09 16:17:30,437 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:30,667 - chatbot_app.modules.rag_module - INFO - üìù Using prompt mode: GENERAL
2026-02-09 16:17:30,667 - chatbot_app.modules.rag_module - INFO - ü§ñ ÎãµÎ≥Ä ÏÉùÏÑ± Ï§ë...
2026-02-09 16:17:34,747 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


‚úÖ Data validation passed
evaluate START | run_idx=1
ar module id: 1875909712432
calc filename: C:\Users\Admin\AppData\Local\Temp\ipykernel_15524\906357061.py
in sys.modules: True


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Batch 1/50:   0%|          | 0/1 [00:00<?, ?it/s]

2026-02-09 16:17:36,005 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:17:37,282 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:38,093 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:40,062 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:17:48,079 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:17:49,467 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:17:50,715 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:17:51,603 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-

2026-02-09 16:20:31,346 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:20:32,277 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:20:34,744 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:20:41,663 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:20:43,344 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-09 16:20:44,625 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:20:45,719 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/embeddings "HTTP/1.1 200 OK"
2026-02-09 16:20:47,828 - httpx - INFO - HTTP Request: POST https://api.upstage.ai/v1/solar/chat/completions "HTTP/1.1 200 OK"
2026-02-

evaluate RETURNED
to_pandas DONE
‚úÖ saved: ./data/RAGAS\runs_answer\llmcmp_gpt-4o-mini\run_01\ragas_result_answer_faith.csv
‚úÖ saved: ./data/RAGAS\runs_answer\llmcmp_gpt-4o-mini\run_01\per_question.csv
saved per_question_csv=./data/RAGAS\runs_answer\llmcmp_gpt-4o-mini\run_01\per_question.csv
--- run_once_answer DONE | run_idx=1 ---
üßπ Memory cleaned
‚úÖ DONE. run_tag = llmcmp_gpt-4o-mini
 - raw_csv: ./data/RAGAS\runs_answer\llmcmp_gpt-4o-mini\run_01\ragas_result_answer_faith.csv
 - perq_csv: None
 - summary_csv: None
CPU times: total: 15.5 s
Wall time: 10min 6s


In [3]:
df = pd.read_csv('data/RAGAS/runs_answer/llmcmp_gpt-4o-mini/run_01/per_question.csv')
df

Unnamed: 0,id,answer_relevancy,faithfulness,evidence_expression,structure_conclusion_first,inference_expansion_signal,uncertainty_signal,hallucinated_citation_suspect
0,1,0.417,0.222,1,0,0,1,1
1,2,0.396,0.071,1,0,0,1,1
2,3,0.502,0.125,1,0,0,1,1
3,4,0.55,0.214,1,0,0,1,1
4,5,0.537,0.273,1,0,0,1,1
5,6,0.32,0.0,1,0,0,1,1
6,7,0.5,0.3,1,0,0,1,1
7,8,0.489,0.125,1,0,0,1,1
8,9,0.175,0.182,1,0,0,1,1
9,10,0.255,0.182,1,0,0,1,1


In [5]:
# df = pd.read_csv('data/RAGAS/runs_answer/llmcmp_solar-pro2/run_01/per_question.csv')
# df

Unnamed: 0,id,answer_relevancy,faithfulness,evidence_expression,structure_conclusion_first,inference_expansion_signal,uncertainty_signal,hallucinated_citation_suspect
0,1,0.0,0.6,1,0,0,1,1
1,2,0.407,0.4,1,0,1,1,1
2,3,0.0,0.714,1,0,0,0,1
3,4,0.492,0.875,1,1,0,1,1
4,5,0.625,0.286,1,0,0,0,1
5,6,0.326,0.6,1,1,0,1,1
6,7,0.483,0.5,1,0,0,0,1
7,8,0.604,0.286,1,0,0,1,1
8,9,0.0,0.167,1,0,0,0,1
9,10,0.0,0.556,1,0,0,1,1
