In [4]:
import pandas as pd
import os

# Path to the source CSV (replace with your own path).
csv_path = "data/test_with_answer.csv"

# Directory that receives the generated text files; created if missing.
output_dir = "docs/train_chunks"
os.makedirs(output_dir, exist_ok=True)

# Number of CSV rows per output file (tune as needed, e.g. 100 or 1000).
chunk_size = 500

# Load the whole CSV into memory.
df = pd.read_csv(csv_path)

# Ceiling division: number of files needed to cover every row.
num_chunks = -(-len(df) // chunk_size)

# Write each consecutive slice of rows to its own numbered text file.
for chunk_no, start in enumerate(range(0, len(df), chunk_size), start=1):
    part = df.iloc[start : start + chunk_size]

    entries = []
    for _, record in part.iterrows():
        err = record.get("err_sentence", "")
        cor = record.get("cor_sentence", "")
        # Skip rows where either sentence is missing.
        if pd.isnull(err) or pd.isnull(cor):
            continue
        entries.append(f"입력: {err}\n출력: {cor}\n")

    file_path = os.path.join(output_dir, f"train_chunk_{chunk_no:02}.txt")
    with open(file_path, "w", encoding="utf-8") as out:
        # Each entry already ends with a newline, so joining with "\n"
        # leaves one blank line between consecutive entries.
        out.write("\n".join(entries))

print(f"{num_chunks}개의 파일이 '{output_dir}'에 저장되었습니다.")

22개의 파일이 'docs/train_chunks'에 저장되었습니다.


In [None]:
import asyncio
import pandas as pd
import json
from tqdm import tqdm
from itertools import cycle
from prompts.template_format import format_prompt
from optimizer.evaluator import compute_scores
from openai import AsyncOpenAI

from dotenv import load_dotenv
import os

load_dotenv()  # Load variables from the .env file.

# Read UPSTAGE_API_KEY_1 .. UPSTAGE_API_KEY_10 and keep only the ones that
# are set to a non-empty value.
API_KEYS = list(
    filter(None, (os.getenv(f"UPSTAGE_API_KEY_{n}") for n in range(1, 11)))
)

# One async client per key, all using the same base URL.
clients = [
    AsyncOpenAI(base_url="https://api.upstage.ai/v1", api_key=api_key)
    for api_key in API_KEYS
]

# Endless iterator over the clients; every next() yields the following one
# and wraps around after the last.
client_cycle = cycle(clients)

# Template loading: scan the JSONL prompt graph for the entry whose "id"
# matches `selected_template_id` and keep its "template" string.
graph_path = "data/prompt_graph.jsonl"
selected_template_id = "base_02"

# Bound up front so a missing id is detected here instead of surfacing later
# as a NameError where the template is first used.
template_str = None

with open(graph_path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        obj = json.loads(line)
        if obj.get("id") == selected_template_id:
            template_str = obj["template"]
            break

if template_str is None:
    # Fail fast with an actionable message rather than continuing without
    # a template.
    raise ValueError(
        f"템플릿 '{selected_template_id}'을(를) '{graph_path}'에서 찾을 수 없습니다."
    )

# Data loading: build the list of evaluation pairs from the CSV, skipping
# every row where either sentence is missing.
df = pd.read_csv("data/test_with_answer.csv")

sample = []
for _, record in df.iterrows():
    if pd.isnull(record["err_sentence"]):
        continue
    if pd.isnull(record["cor_sentence"]):
        continue
    sample.append(
        {"input": record["err_sentence"], "output": record["cor_sentence"]}
    )

# LLM call helper
async def call_with_client(client, messages, model="solar-pro"):
    """Request one chat completion and return its stripped text.

    Args:
        client: Object whose ``chat.completions.create(...)`` returns an
            awaitable response (in this file, an ``AsyncOpenAI`` instance).
        messages: Chat messages, forwarded unchanged to the API.
        model: Model name sent with the request.

    Returns:
        The content of the first choice's message with surrounding
        whitespace removed, or the sentinel string ``"[ERROR]"`` when the
        request or the response handling raised an exception.
    """
    import logging  # Local import so the block does not rely on file-level changes.

    try:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
        )
        return response.choices[0].message.content.strip()
    except Exception as exc:
        # Deliberately best-effort: one failed request must not abort the
        # caller. Log a single line so failures are visible instead of
        # being swallowed silently.
        logging.getLogger(__name__).warning("LLM call failed: %r", exc)
        return "[ERROR]"

# Concurrent evaluation
async def evaluate_all_concurrent(max_concurrency=None):
    """Evaluate every entry of ``sample`` concurrently and print the mean recall.

    For each sample the next client from ``client_cycle`` is taken, the
    prompt is built with ``format_prompt(template_str, input)``, and the
    prediction is obtained through ``call_with_client``. Each prediction is
    then scored against its reference with ``compute_scores``.

    Args:
        max_concurrency: Optional upper bound on the number of LLM calls in
            flight at once. ``None`` (the default) or ``0`` applies no
            limit, i.e. all requests are awaited together.

    Returns:
        None. The average recall is printed, rounded to 4 decimals; it is
        0.0 when there are no samples.
    """
    # Created inside the coroutine so it belongs to the running event loop.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def worker(row):
        # Clients are handed out in rotation, one per sample.
        client = next(client_cycle)
        messages = format_prompt(template_str, row["input"])
        if limiter is None:
            pred = await call_with_client(client, messages)
        else:
            async with limiter:
                pred = await call_with_client(client, messages)
        return {"prediction": pred, "output": row["output"]}

    # gather() returns results in the same order as `sample`.
    responses = await asyncio.gather(*(worker(row) for row in sample))

    recalls = []
    for res in responses:
        # compute_scores returns a 3-tuple; only the middle value is used.
        # Presumably (precision, recall, f1) — verify against the evaluator.
        _, recall, _ = compute_scores(res["prediction"], res["output"])
        recalls.append(recall)

    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    print(f"[{selected_template_id}] 평균 Recall (병렬 {len(API_KEYS)}개 키 사용): {round(avg_recall, 4)}")

# ▶ Run: start an event loop and drive the whole evaluation to completion.
# NOTE(review): asyncio.run() raises RuntimeError when the current thread
# already has a running event loop (e.g. inside a Jupyter cell); there,
# `await evaluate_all_concurrent()` would be needed — confirm the
# execution environment.
asyncio.run(evaluate_all_concurrent())

ValueError: 템플릿 base_02 를 찾을 수 없거나 {text} 자리표시자가 없습니다.