# EncouRAGe Tutorial

Steps:
1. Load data with Hugging Face
2. Put context into `Document`s
3. Init BatchInferenceRunner and Template
4. Select and initialize RAG method
5. Run two inference examples (plain + structured)
6. Calculate metrics

In [None]:
import os
import json
import uuid
from pathlib import Path

# Keep the Hugging Face cache inside the current working directory
# (<cwd>/.cache/huggingface) rather than the user-wide default location.
# NOTE(review): this is set before `datasets` is imported below, presumably
# because the library reads HF_HOME at import time -- confirm before reordering.
HF_HOME = str((Path.cwd() / '.cache' / 'huggingface').resolve())
os.environ['HF_HOME'] = HF_HOME
# Placeholder API token exported for this process; presumably read by the client
# that talks to the local vLLM server -- replace it if your server expects another key.
os.environ['VLLM_API_KEY'] = 'token-abc123'

import pandas as pd
from datasets import load_dataset
from pydantic import BaseModel

from encourage.llm import BatchInferenceRunner, SamplingParams
from encourage.prompts import Context, Document, MetaData
from encourage.rag import BaseRAG, BaseRAGConfig
from encourage.metrics import map_pydantic_field_to_response, F1, NumberMatch, MeanReciprocalRank, RecallAtK, AnswerFaithfulness, ExactMatch

## 1) Load Data with Hugging Face

In [18]:
# Fetch only the first validation example (split "validation[:1]") to keep the
# tutorial fast, then convert it to a pandas DataFrame for row-wise access.
hotpot_split = load_dataset("G4KMU/hotpot_qa", split="validation[:1]")
dataset_hf = hotpot_split.to_pandas()
dataset_hf

Unnamed: 0,id,context_id,split,question,answer,context,type,level
0,5a8b57f25542995d1e6f1371,hotpotqa_validation_ctx_0,validation,Were Scott Derrickson and Ed Wood of the same ...,yes,Adam Collis\nAdam Collis is an American filmma...,comparison,hard


## 2) Put Context into Documents

In [19]:
## Create the user prompts
user_prompts = dataset_hf['question'].tolist()


## Create the context collection and the metadata collection in a single pass.
## Each metadata entry references the Document built from the very same row, so
## the pairing does not depend on the DataFrame index being 0..n-1 (it would
## otherwise break after filtering, shuffling or re-indexing the frame).
context_collection = []
meta_datas = []
for row_label, row in dataset_hf.iterrows():
    # uuid5 is deterministic: the same context_id always maps to the same id.
    document = Document(
        id=uuid.uuid5(uuid.NAMESPACE_DNS, str(row.get('context_id', ''))),
        content=row.get('context', ''),
    )
    context_collection.append(document)
    meta_datas.append(
        MetaData(
            {
                # Fall back to the row's index label when there is no 'id' column.
                'id': str(row.get('id', row_label)),
                'reference_answer': row.get('answer'),
                'reference_document': document,
            }
        )
    )

## The three collections are aligned by position, so their lengths must match.
print(len(user_prompts))
print(len(context_collection))
print(len(meta_datas))

1
1
1


## 3) Init BatchInferenceRunner and Template

In [20]:
# Model identifier and the OpenAI-compatible endpoint the runner sends requests to.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
BASE_URL = "http://localhost:18124/v1/"

# Name of the prompt template handed to the RAG configuration in the next step.
template_name = "hotpotqa_template.j2"

# Sampling settings: temperature 0 and at most 3000 generated tokens.
sampling_params = SamplingParams(temperature=0, max_tokens=3000)
runner = BatchInferenceRunner(sampling_params, MODEL_NAME, base_url=BASE_URL)

## 4) Select and Initialize the RAG Method


In [21]:
# Collect the settings first so they can be inspected or tweaked before the
# configuration object is built.
rag_settings = {
    "context_collection": context_collection,
    "collection_name": "hotpotqa_test",
    "embedding_function": 'intfloat/multilingual-e5-large-instruct',
    "top_k": 5,
    "runner": runner,
    "template_name": template_name,
    "retrieval_only": False,
}
rag_config = BaseRAGConfig(**rag_settings)

# Instantiating the RAG method inserts the documents into the collection
# (see the "Inserting documents" progress bar in the cell output).
rag_method_instance = BaseRAG(rag_config)

Inserting documents: 100%|██████████| 1/1 [00:00<00:00, 18.36it/s]


## 5) Two inference examples

- Example A: without structured output
- Example B: with structured output via a class

In [22]:
# Example A: plain output (no response_format)
# The system prompt is read from the demo directory. The default is the path
# used when this tutorial was written; set the ENCOURAGE_DEMO_DIR environment
# variable to point at your own checkout instead of editing the path here.
demo_dir = Path(os.environ.get('ENCOURAGE_DEMO_DIR', '/ltstorage/home/strich/encourage/docs/demo'))
sys_prompt = (demo_dir / 'hotpotqa.txt').read_text(encoding='utf-8')

# The questions serve both as the user prompts and as the retrieval queries.
responses_plain = rag_method_instance.run(
    runner=runner,
    sys_prompt=sys_prompt,
    user_prompts=user_prompts,
    meta_datas=meta_datas,
    retrieval_queries=user_prompts,
)

Querying documents: 100%|██████████| 1/1 [00:00<00:00, 42.11it/s]
Batch 1/1 (1 prompts): 100%|██████████| 1/1 [00:01<00:00,  1.85s/it]


In [None]:
# Metrics for the plain-text answers of Example A. AnswerFaithfulness is the
# only one that is handed the inference runner; recall is reported at k=3 and k=5.
metrics = [
    AnswerFaithfulness(runner),
    F1(),
    MeanReciprocalRank(),
    *(RecallAtK(k=cutoff) for cutoff in (3, 5)),
]

# Evaluate each metric on the responses and print "<name>: <score>".
for scorer in metrics:
    outcome = scorer(responses_plain)
    print(f"{scorer.name}: {outcome.score}")

Batch 1/1 (6 prompts): 100%|██████████| 6/6 [00:01<00:00,  3.71it/s]
Batch 1/1 (1 prompts): 100%|██████████| 1/1 [00:11<00:00, 11.16s/it]

answer-faithfulness: 0.47058823529411764
f1squad: 0.0
number_match: 0.0
mrr: 1.0
recall3: 1.0
recall5: 1.0





In [25]:
# Example B: structured output with an explicit class
# The system prompt is read from the demo directory. The default is the path
# used when this tutorial was written; set the ENCOURAGE_DEMO_DIR environment
# variable to point at your own checkout instead of editing the path here.
demo_dir = Path(os.environ.get('ENCOURAGE_DEMO_DIR', '/ltstorage/home/strich/encourage/docs/demo'))
sys_prompt = (demo_dir / 'hotpotqa_structured.txt').read_text(encoding='utf-8')


# Schema passed as `response_format` for the structured run.
# Documented with comments rather than a docstring on purpose: pydantic copies a
# class docstring into the generated JSON schema, which would change what is
# passed as response_format.
class HotPotQAResponse(BaseModel):
    # Intermediate reasoning, one entry per step.
    reasoning_steps: list[str]
    # Facts the answer is based on.
    list_of_supporting_facts: list[str]
    # The answer itself; this is the field scored by the metrics in section 6.
    final_answer: str


# Same inputs as Example A, plus the response schema.
responses = rag_method_instance.run(
    runner=runner,
    sys_prompt=sys_prompt,
    user_prompts=user_prompts,
    meta_datas=meta_datas,
    retrieval_queries=user_prompts,
    response_format=HotPotQAResponse,
)

Querying documents: 100%|██████████| 1/1 [00:00<00:00, 40.38it/s]
Batch 1/1 (1 prompts): 100%|██████████| 1/1 [00:02<00:00,  2.39s/it]


## 6) Calculate Metrics

In [None]:
# Metrics for the structured answers of Example B; recall is reported at k=3 and k=5.
metrics = [
    F1(),
    ExactMatch(),
    MeanReciprocalRank(),
    *(RecallAtK(k=cutoff) for cutoff in (3, 5)),
]

# Map the `final_answer` field of each HotPotQAResponse onto the response
# before scoring.
# NOTE(review): presumably the metrics then compare only that field against the
# reference answer -- confirm against map_pydantic_field_to_response.
responses_new = map_pydantic_field_to_response(responses, HotPotQAResponse, "final_answer")

# Evaluate each metric on the mapped responses and print "<name>: <score>".
for scorer in metrics:
    outcome = scorer(responses_new)
    print(f"{scorer.name}: {outcome.score}")

f1squad: 0.18181818181818182
exact_match: 0.0
mrr: 1.0
recall3: 1.0
recall5: 1.0
