# EncouRAGe Tutorial

Steps:
1. Load data with Hugging Face
2. Put context into `Document`s
3. Initialize `BatchInferenceRunner` and select the prompt template
4. Select and initialize RAG method
5. Run two inference examples (plain + structured)
6. Calculate metrics

In [None]:
import os
import uuid
from pathlib import Path

# Keep the Hugging Face cache under the current working directory. This runs
# before `datasets` is imported further down — presumably so the library sees
# the variable when it is loaded (verify against the Hugging Face docs).
HF_HOME = str(Path.cwd().joinpath('.cache', 'huggingface').resolve())
os.environ.update(
    {
        'HF_HOME': HF_HOME,
        # Placeholder token; presumably read by the inference client — verify.
        'VLLM_API_KEY': 'token-abc123',
    }
)


from datasets import load_dataset
from pydantic import BaseModel

from encourage.llm import BatchInferenceRunner, SamplingParams
from encourage.prompts import Context, Document, MetaData
from encourage.rag import BaseRAG, BaseRAGConfig, HydeRAGConfig, HydeRAG
from encourage.metrics import map_pydantic_field_to_response, F1, NumberMatch, MeanReciprocalRank, RecallAtK, AnswerFaithfulness, ExactMatch

## 1) Load Data with Hugging Face

In [None]:
# Load only the `validation[:20]` slice of the dataset and convert it with
# `to_pandas()` so the following cells can iterate over its rows.
dataset_hf = load_dataset(
    path="G4KMU/hotpot_qa",
    split="validation[:20]",
).to_pandas()  # ty:ignore[unresolved-attribute]
dataset_hf  # last expression of the cell, so the notebook displays it

## 2) Put Context into Documents

In [None]:
## Create the user prompts
user_prompts = dataset_hf['question'].tolist()  # ty:ignore[not-subscriptable]


## Create the context collection
# One Document per row. The id is a deterministic UUID derived from the row's
# `context_id`; rows without a `context_id` all hash the empty string and
# therefore share the same id.
context_collection = [
    Document(
        id=uuid.uuid5(uuid.NAMESPACE_DNS, str(row.get('context_id', ''))),
        content=row.get('context', ''),
    )
    for _, row in dataset_hf.iterrows()  # ty:ignore[unresolved-attribute]
]


## Create the meta data collection
# Rows and documents are paired positionally with zip() instead of indexing
# `context_collection` with the DataFrame index label: the label only equals
# the list position for a default 0-based RangeIndex, so a filtered or
# re-indexed frame would otherwise pick the wrong document or raise.
meta_datas = [
    MetaData(
        {
            # Fall back to the DataFrame index label when the row has no `id`.
            'id': str(row.get('id', idx)),
            'reference_answer': row.get('answer'),
            'reference_document': document,
        }
    )
    for (idx, row), document in zip(
        dataset_hf.iterrows(),  # ty:ignore[unresolved-attribute]
        context_collection,
    )
]

# Sanity check: the three collections must have the same length.
print(len(user_prompts))
print(len(context_collection))
print(len(meta_datas))

## 3) Initialize BatchInferenceRunner and Template

In [None]:
# Sampling settings: temperature 0 and at most 3000 generated tokens.
sampling_params = SamplingParams(max_tokens=3000, temperature=0)

# The runner targets the endpoint at `base_url`; a server must already be
# listening on localhost:18124 for the later cells to work.
runner = BatchInferenceRunner(
    sampling_params,
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    base_url="http://localhost:18124/v1/",
)

# Template file name passed to both RAG configurations.
template_name = "hotpotqa_template.j2"

## 4) Select and Initialize the RAG Method


In [None]:
# Baseline RAG: retrieves the top 5 documents from `context_collection` and
# generates with `runner` (retrieval_only=False).
# NOTE(review): both configurations below reuse collection_name
# "hotpotqa_test" — confirm the underlying store tolerates two instances
# sharing one collection.
rag_config_1 = BaseRAGConfig(
    runner=runner,
    template_name=template_name,
    context_collection=context_collection,
    collection_name="hotpotqa_test",
    embedding_function='intfloat/multilingual-e5-large-instruct',
    top_k=5,
    retrieval_only=False,
)
rag_method_instance = BaseRAG(rag_config_1)

# HyDE variant: same settings plus the `additional_prompt` instruction.
# This instance is built for comparison but not used by the cells below.
rag_config_2 = HydeRAGConfig(
    runner=runner,
    template_name=template_name,
    context_collection=context_collection,
    collection_name="hotpotqa_test",
    embedding_function='intfloat/multilingual-e5-large-instruct',
    top_k=5,
    retrieval_only=False,
    additional_prompt="Please write a passage to answer the question:",
)
rag_method_instance_2 = HydeRAG(rag_config_2)

## 5) Two inference examples

- Example A: without structured output
- Example B: with structured output via a class

In [None]:
# Example A: plain output (no response_format)
# The system prompt lives in the demo directory. The default is the original
# author's checkout; set the ENCOURAGE_DEMO_DIR environment variable to point
# at your own copy of `docs/demo` instead of editing this cell.
demo_dir = Path(
    os.environ.get('ENCOURAGE_DEMO_DIR', '/ltstorage/home/strich/encourage/docs/demo')
)
sys_prompt = (demo_dir / 'hotpotqa.txt').read_text(encoding='utf-8')

# The questions serve both as user prompts and as retrieval queries.
responses_plain = rag_method_instance.run(
    runner=runner,
    sys_prompt=sys_prompt,
    user_prompts=user_prompts,
    meta_datas=meta_datas,
    retrieval_queries=user_prompts,
)

In [None]:
# Inspect the Example A (plain output) result before computing metrics.
responses_plain.print_response_summary()

In [None]:
# System prompt handed to AnswerFaithfulness as `nli_sys_prompt`; it asks for
# one 0/1 verdict per statement, in statement order.
NLI_SYS_PROMPT = (
    "Create verdicts for each statement based on the context. "
    "A verdict of 1 means the statement is supported by the context, "
    "while a verdict of 0 means it is not supported. "
    "Just return one output per statement, in the same order as the statements."
)

# Metrics evaluated on the plain-output responses. AnswerFaithfulness is the
# only one that receives the runner.
metrics = [
    AnswerFaithfulness(runner, nli_sys_prompt=NLI_SYS_PROMPT),
    F1(),
    MeanReciprocalRank(),
    RecallAtK(k=3),
    RecallAtK(k=5),
]

# Score the responses with each metric in turn and report name and score.
for scorer in metrics:
    outcome = scorer(responses_plain)
    print(f"{scorer.name}: {outcome.score}")

In [None]:
# Example B: structured output with an explicit class
# The system prompt lives in the demo directory. The default is the original
# author's checkout; set the ENCOURAGE_DEMO_DIR environment variable to point
# at your own copy of `docs/demo` instead of editing this cell.
demo_dir = Path(
    os.environ.get('ENCOURAGE_DEMO_DIR', '/ltstorage/home/strich/encourage/docs/demo')
)
sys_prompt = (demo_dir / 'hotpotqa_structured.txt').read_text(encoding='utf-8')


# Schema passed as `response_format`. Documented with comments rather than a
# docstring on purpose: a docstring on a pydantic model ends up in its JSON
# schema description and could change what is sent to the model.
class HotPotQAResponse(BaseModel):
    reasoning_steps: list[str]
    list_of_supporting_facts: list[str]
    # Field later extracted for metric computation.
    final_answer: str


# Same call as Example A, plus `response_format` to request structured output.
responses = rag_method_instance.run(
    runner=runner,
    sys_prompt=sys_prompt,
    user_prompts=user_prompts,
    meta_datas=meta_datas,
    retrieval_queries=user_prompts,
    response_format=HotPotQAResponse,
)

## 6) Calculate Metrics

In [None]:
# Metrics for the structured run; ExactMatch is added here, and
# AnswerFaithfulness is not used in this cell.
metrics = [
    F1(),
    ExactMatch(),
    MeanReciprocalRank(),
    RecallAtK(k=3),
    RecallAtK(k=5),
]

# Map the `final_answer` field of HotPotQAResponse onto the responses before
# scoring — presumably so the metrics compare only the final answer against
# the reference (verify against map_pydantic_field_to_response).
responses_new = map_pydantic_field_to_response(
    responses,
    HotPotQAResponse,
    "final_answer",
)

# Score the mapped responses with each metric and report name and score.
for scorer in metrics:
    outcome = scorer(responses_new)
    print(f"{scorer.name}: {outcome.score}")