In [94]:
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
import asyncio
import nest_asyncio

# Patch asyncio so the already-running notebook event loop can be re-entered.
nest_asyncio.apply()
# Load environment variables from the nearest .env file found by find_dotenv()
# (presumably where the OpenAI API key lives — confirm against your setup).
load_dotenv(find_dotenv())

True

## Prompt 정의

In [151]:
# Prompts for scoring retrieval and generation together in a single LLM call.
# NOTE: literal JSON braces are doubled ({{ }}) because these strings are used
# as f-string style templates; single braces mark template variables.
SYSTEM_PROMPT = """You are an evaluation assistant for a RAG (Retrieval-Augmented Generation) system.
Your task is to score the retrieved context(s) and the final answer according to the criteria provided below.

### Input:
- `question`: string = Field(description="A single user question")
- `ground_truth_context`: list = Field(description="Ground Truth (GT), usually one context")
- `context`: list = Field(description="list of retrieved contexts")
- `ground_truth_answer`: string = Field(description="The correct answer to the question")
- `answer`: string = Field(description="The final answer generated by the system")

### Output:
- `retrieval`: list = Field(description="A list of 5 scores (0 or 1) for the retrieval evaluation criteria.")
- `generation`: list = Field(description="A list of 9 scores (0 or 1) for the generation evaluation criteria.")

You must return a JSON object with two keys: "retrieval" and "generation".
- "retrieval" is a list of 5 scores (1 or 0).
- "generation" is a list of 9 scores (1 or 0).
No additional text or explanation is allowed.

### Retrieval Criteria (1~5):
1) Do any of the retrieved contexts show strong similarity to the Ground Truth? (1 or 0)
2) Do the retrieved contexts collectively capture essential information from the Ground Truth? (1 or 0)
3) Do the retrieved contexts sufficiently address the user’s query? (1 or 0)
4) Are all retrieved contexts relevant to the Ground Truth or the user’s query? (1 or 0)
5) Does the combined length and number of retrieved contexts remain reasonable without overwhelming the user with excessive or irrelevant details? (1 or 0)

### Generation Criteria (6~14):
6) Is the final answer clearly relevant to the question and reflective of the user’s intent? (1 or 0)
7) Is the answer factually correct and free from unsupported or inaccurate information? (1 or 0)
8) Does the answer include all essential points required by the question and the ground_truth_answer? (1 or 0)
9) Is the answer clear and concise? (1 or 0)
10) Is the answer logically structured, consistent with the context, and free of contradictions? (1 or 0)
11) Does the answer provide enough detail without being too excessive? (1 or 0)
12) Does the answer provide any citations when referencing data or claims? (1 or 0)
13) Is the answer presented in a suitable format (list, table, short text, etc.) for the question? (1 or 0)
14) Does the answer offer helpful additional insights without deviating from factual correctness? (1 or 0)

Output format (example):
{{
  "retrieval": [1, 0, 1, 1, 1],
  "generation": [1, 1, 0, 1, 1, 1, 0, 1, 1]
}}
"""

# The ground-truth answer must be part of the human message: the system prompt
# lists it as an input and criterion 8 cannot be judged without it.
HUMAN_PROMPT = """Question: {query}
Ground Truth: {ground_truth_context}
Retrieved Context: {retrieved_context}
Ground Truth Answer: {ground_truth_answer}
Final Answer: {generated_answer}

Evaluate the above according to the 14 criteria outlined in the system instructions.
"""

# Combined (retrieval + generation) chat prompt: system instructions followed
# by the human message carrying the sample to grade.
PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        ("human", HUMAN_PROMPT),
    ]
)

In [153]:
# Separate prompts for the retrieval stage and the generation stage.
# NOTE: literal JSON braces are doubled ({{ }}) because these strings are used
# as f-string style templates; single braces mark template variables.
RETRIEVAL_SYSTEM_PROMPT = """You are an evaluation assistant for a RAG (Retrieval-Augmented Generation) system.
Your task is to score the *retrieved contexts* according to the criteria provided below.

### Input:
- `question`: string = Field(description="A single user question")
- `ground_truth_context`: list = Field(description="Ground Truth (GT), usually one context")
- `context`: list = Field(description="list of retrieved contexts")

### Output:
- `retrieval`: list = Field(description="A list of 5 scores (0 or 1) for the retrieval evaluation criteria.")

You must return a JSON object with the key "retrieval".
- "retrieval" is a list of 5 scores (1 or 0).
No additional text or explanation is allowed.

### Retrieval Criteria (1~5):
1) Do any of the retrieved contexts show strong similarity to the Ground Truth? (1 or 0)
2) Do the retrieved contexts collectively capture essential information from the Ground Truth? (1 or 0)
3) Do the retrieved contexts sufficiently address the user’s query? (1 or 0)
4) Are all retrieved contexts relevant to the Ground Truth or the user’s query? (1 or 0)
5) Does the combined length and number of retrieved contexts remain reasonable without overwhelming the user with excessive or irrelevant details? (1 or 0)

Output format (example):
{{
  "retrieval": [1, 0, 1, 1, 1]
}}"""

GENERATION_SYSTEM_PROMPT = """You are an evaluation assistant for a RAG (Retrieval-Augmented Generation) system.
Your task is to score the *final answer* according to the criteria provided below.

### Input:
- `question`: string = Field(description="A single user question")
- `ground_truth_answer`: string = Field(description="The correct answer to the question")
- `answer`: string = Field(description="The final answer generated by the system")

### Output:
- `generation`: list = Field(description="A list of 9 scores (0 or 1) for the generation evaluation criteria.")

You must return a JSON object with the key "generation".
- "generation" is a list of 9 scores (1 or 0).
No additional text or explanation is allowed.

### Generation Criteria (1~9):
1) Is the final answer clearly relevant to the question and reflective of the user’s intent? (1 or 0)
2) Is the answer factually correct and free from unsupported or inaccurate information? (1 or 0)
3) Does the answer include all essential points required by the question and the ground_truth_answer? (1 or 0)
4) Is the answer clear and concise? (1 or 0)
5) Is the answer logically structured, consistent with any provided context, and free of contradictions? (1 or 0)
6) Does the answer provide enough detail without being too excessive? (1 or 0)
7) Does the answer provide any citations when referencing data or claims? (1 or 0)
8) Is the answer presented in a suitable format for the question? (1 or 0)
9) Does the answer offer helpful additional insights without deviating from factual correctness? (1 or 0)

Output format (example):
{{
  "generation": [1, 1, 0, 1, 1, 1, 0, 1, 1]
}}"""

RETRIEVAL_HUMAN_PROMPT = """Question: {question}
Ground Truth: {ground_truth_context}
Retrieved Context: {retrieved_context}

Evaluate the above according to the 5 criteria (retrieval) outlined in the system instructions."""

GENERATION_HUMAN_PROMPT = """Question: {question}
Ground Truth Answer: {ground_truth_answer}
Final Answer: {answer}

Evaluate the above according to the 9 criteria (generation) outlined in the system instructions."""


# Chat prompt for grading only the retrieved contexts.
RETRIEVAL_PROMPT_TEMPLATE = ChatPromptTemplate(
    [("system", RETRIEVAL_SYSTEM_PROMPT), ("human", RETRIEVAL_HUMAN_PROMPT)]
)
# Chat prompt for grading only the generated answer.
GENERATION_PROMPT_TEMPLATE = ChatPromptTemplate(
    [("system", GENERATION_SYSTEM_PROMPT), ("human", GENERATION_HUMAN_PROMPT)]
)
# Backward-compatible alias for the original misspelled name, which existing
# cells still reference.
GENERETION_PROMPT_TEMPLATE = GENERATION_PROMPT_TEMPLATE

## Output Format 정의

In [162]:
# Structured-output schemas.
# Combined result: retrieval and generation scores from a single LLM call.
# (Kept as comments rather than a class docstring so the schema description
# handed to the model is not altered.)
class EvaluationResult(BaseModel):
    # Scores are typed as integers so non-numeric items fail validation
    # instead of flowing into the weighted sum.
    retrieval: list[int] = Field(
        description="A list of 5 scores (0 or 1) for the retrieval evaluation criteria."
    )
    generation: list[int] = Field(
        description="A list of 9 scores (0 or 1) for the generation evaluation criteria."
    )


# Structured output of the retrieval-only evaluation chain.
# (Comment rather than class docstring so the schema description handed to
# the model is not altered.)
class RetrievalResult(BaseModel):
    # Integer-typed so non-numeric items fail validation up front.
    retrieval: list[int] = Field(
        description="A list of 5 scores (0 or 1) for the retrieval evaluation criteria."
    )


# Structured output of the generation-only evaluation chain.
# (Comment rather than class docstring so the schema description handed to
# the model is not altered.)
class GenerationResult(BaseModel):
    # Integer-typed so non-numeric items fail validation up front.
    generation: list[int] = Field(
        description="A list of 9 scores (0 or 1) for the generation evaluation criteria."
    )

## 모델 정의

In [163]:
# Judge model. temperature=0 keeps grading as repeatable as the API allows,
# so the same sample is not scored differently from run to run.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

## Retrieval, Generation 한번에 평가

In [168]:
# Combined chain: prompt -> LLM constrained to the EvaluationResult schema.
chain = PROMPT_TEMPLATE.pipe(llm.with_structured_output(EvaluationResult))

In [169]:
# Smoke-test the combined chain on a trivial, hand-written sample.
result = chain.invoke(
    dict(
        query="What is the capital of France?",
        ground_truth_context="The capital of France is Paris.",
        retrieved_context="France: Paris",
        ground_truth_answer="Paris",
        generated_answer="The capital of France is Paris. \nsource: (./data/france.md)",
    )
)

In [170]:
# Display the 5 retrieval scores returned by the combined chain.
result.retrieval

[1, 1, 1, 1, 1]

In [171]:
# Display the 9 generation scores returned by the combined chain.
result.generation

[1, 1, 1, 1, 1, 0, 1, 1, 1]

## Retrieval 단계, Generation 단계 각각 평가

In [172]:
# One chain per stage: each pipes its prompt into the LLM constrained to the
# matching result schema.
retrieval_chain = RETRIEVAL_PROMPT_TEMPLATE.pipe(
    llm.with_structured_output(RetrievalResult)
)
generation_chain = GENERETION_PROMPT_TEMPLATE.pipe(
    llm.with_structured_output(GenerationResult)
)

In [190]:
# Grade the retrieval stage for a trivial sample.
# Despite the name, this holds the chain's structured result; the score list
# itself is its `.retrieval` attribute.
retrieval_score_list = retrieval_chain.invoke(
    dict(
        question="What is the capital of France?",
        ground_truth_context="The capital of France is Paris.",
        retrieved_context="France: Paris",
    )
)

In [191]:
# Grade the generation stage for a trivial sample.
# Despite the name, this holds the chain's structured result; the score list
# itself is its `.generation` attribute.
generation_score_list = generation_chain.invoke(
    dict(
        question="What is the capital of France?",
        ground_truth_answer="Paris",
        answer="The capital of France is Paris. \nsource: (./data/france.md)",
    )
)

In [197]:
# Print the per-stage score lists produced by the two single-stage chains.
print(f"retrieval 평가 결과: {retrieval_score_list.retrieval}")
print(f"generation 평가 결과: {generation_score_list.generation}")

retrieval 평가 결과: [1, 1, 1, 1, 1]
generation 평가 결과: [1, 1, 1, 1, 1, 1, 1, 1, 1]


## 평가 함수 정의

In [202]:
async def execute_chain_async(input_data):
    """
    Run the retrieval and generation evaluation chains concurrently.

    Parameters:
        input_data (dict): Must provide the keys "question",
            "ground_truth_context", "retrieved_context",
            "ground_truth_answer" and "answer".

    Returns:
        tuple: ``(retrieval_scores, generation_scores)`` taken from the
            ``retrieval`` attribute of the retrieval chain's result and the
            ``generation`` attribute of the generation chain's result.

    Raises:
        KeyError: If a required key is missing from ``input_data``. The check
            happens before either chain is invoked, so no request is started.
    """
    # Build both payloads up front so a missing key fails fast, before any
    # chain call has been scheduled.
    retrieval_payload = {
        "question": input_data["question"],
        "ground_truth_context": input_data["ground_truth_context"],
        "retrieved_context": input_data["retrieved_context"],
    }
    generation_payload = {
        "question": input_data["question"],
        "ground_truth_answer": input_data["ground_truth_answer"],
        "answer": input_data["answer"],
    }

    # gather() runs both evaluations concurrently and returns their results
    # in argument order.
    retrieval_result, generation_result = await asyncio.gather(
        retrieval_chain.ainvoke(retrieval_payload),
        generation_chain.ainvoke(generation_payload),
    )

    return retrieval_result.retrieval, generation_result.generation


async def calculate_score_async(retrieval_scores, generation_scores):
    """
    Compute the weighted total score from binary retrieval and generation scores.

    The function contains no awaits; it stays a coroutine so existing
    ``await calculate_score_async(...)`` call sites keep working.

    Parameters:
        retrieval_scores (list): 5 scores (each 0 or 1) for retrieval evaluation.
        generation_scores (list): 9 scores (each 0 or 1) for generation evaluation.

    Returns:
        int: Weighted total. The retrieval weights sum to 20 and the
            generation weights to 30, so the maximum is 50.

    Raises:
        ValueError: If either list has the wrong length, or contains a value
            other than 0 or 1.
    """
    retrieval_weights = [5, 5, 4, 3, 3]
    generation_weights = [5, 5, 5, 5, 3, 3, 2, 1, 1]

    if len(retrieval_scores) != len(retrieval_weights):
        raise ValueError("Retrieval scores must have exactly 5 elements.")
    if len(generation_scores) != len(generation_weights):
        raise ValueError("Generation scores must have exactly 9 elements.")

    # Reject anything that is not a binary score: values such as 2 or "1"
    # would otherwise inflate or corrupt the weighted sum.
    for label, scores in (
        ("Retrieval", retrieval_scores),
        ("Generation", generation_scores),
    ):
        if any(score not in (0, 1) for score in scores):
            raise ValueError(
                f"{label} scores must each be 0 or 1, got {list(scores)!r}."
            )

    retrieval_total = sum(
        score * weight for score, weight in zip(retrieval_scores, retrieval_weights)
    )
    print(f"retrieval 결과: {retrieval_total}")

    generation_total = sum(
        score * weight for score, weight in zip(generation_scores, generation_weights)
    )
    print(f"generation 결과: {generation_total}")

    return retrieval_total + generation_total

## 실행

In [209]:
# Sample evaluation record (Korean text) carrying the five keys that
# execute_chain_async reads: question, ground_truth_context, retrieved_context,
# ground_truth_answer and answer.
# NOTE(review): the ground-truth context looks like raw text extracted from a
# PDF report (irregular spacing, truncated table at the end) — confirm source.
input_data = {
    "question": "크래프톤의 글로벌 e-sports 대회 투자로 예상 지급수수료 812억 원은 전년 대비 몇 퍼센트 증가했나요?",
    "ground_truth_context": """파일 제목: 크래프톤_교보증권_20240723.pdf
 내용: 영업비용  중  지급수수료는  글로벌  e-sports  대회  투자로  812억원(YoY  +15.0%, 

매출  대비  14.7%)  예상,  인건비는  인력증가분을  반영하며  1,270억원(YoY 

+23.0%),  마케팅비는 PUBG PC  관련 마케팅 집행으로 YoY +123.0%  증가하나 

신작  마케팅이  본격화되지  않아  매출  대비  3.5%  수준에  머무르는  193억원  추정. 
주식보상비용은 3월말 대비 주가 상승에 따라 213억원 발생 전망. 

투자의견 Buy, 목표주가 370,000원으로 상향 

투자의견  Buy  유지,  목표주가를  370,000원(종전  330,000원)으로  상향.  목표주가 

상향은  PUBG  PC  버전의  글로벌  트래픽이  높게  유지되고  있음을  반영해  2024년 

지배주주순이익  추정치를  기존  대비  +6%  상향함에  따름.  차기  성장  동력  확보를 

위한  공격적인  인력  채용,  e-sports에  대한  적극적인 투자가  이루어지고  있음에도 

탑라인 성장 기반의 이익 성장이 이루어지고 있다는 점이 차별화되는 요소로 판단. 

PUBG IP가 언리얼엔진5 이관 및 콘솔 기반 출시를 준비하는 가운데 4Q24 ‘다크앤

다커  모바일’  글로벌  출시,  쿠키런  인도  출시가  이루어져  2025년  그  수혜가  온기 

반영될 것으로 예상하며, ‘inZOI’(라이프 시뮬레이션, PC/콘솔) 역시 1H25 정식 출

시될 것으로 전망. ‘PUBG’, ‘다크앤다커 모바일’, ‘inZOI’ 3 종의 IP는 24년 8월 게

임스컴에 참가하여 글로벌 관심도가 고양될 것으로 기대. 

Forecast earnings & Valuation 

12 결산(십억원) 
매출액(십억원) 
YoY(%) 
영업이익(십억원) 
OP 마진(%) 
순이익(십억원) 
EPS(원) 
YoY(%) 
PER(배) 
PCR(배) 
PBR(배) 
EV/EBITDA(배) 
ROE(%) 

2022.12 
1,854  
-1.7 
752  
40."
""",
    "retrieved_context": """영업비용  중  지급수수료는  글로벌  e-sports  대회  투자로  812억원(YoY  +15.0%, 

매출  대비  14.7%)  예상,  인건비는  인력증가분을  반영하며  1,270억원(YoY 

+23.0%),  마케팅비는 PUBG PC  관련 마케팅 집행으로 YoY +123.0%  증가하나 

신작  마케팅이  본격화되지  않아  매출  대비  3.5%  수준에  머무르는  193억원  추정.""",
    "ground_truth_answer": "글로벌 e-sports 대회 투자로 812억원의 예상 지급수수료는 YoY +15.0% 증가했습니다.",
    "answer": "글로벌 e-sports 대회 투자로 예상 지급수수료는 +15.0% 증가 할것으로 보입니다",
}

# Run both evaluation chains on the sample; here the two names are bound to
# the plain score lists returned by execute_chain_async.
retrieval_score_list, generation_score_list = await execute_chain_async(
    input_data=input_data
)
# Combine the two score lists into the weighted total (also prints per-stage
# subtotals).
total_score = await calculate_score_async(
    retrieval_score_list, generation_score_list
)
# Last expression of the cell: display the total.
total_score

retrieval 결과: 20
generation 결과: 28


48