# Notebook ③ – RAG Pipeline & Evaluation

Switch tiers by setting the `MODEL_NAME` environment variable, or by editing its default value in the first code cell.

In [None]:
import os, torch, time, pandas as pd, wandb
from llama_index import (
    StorageContext, load_index_from_storage, ServiceContext, set_global_service_context
)
from llama_index.llms import Vllm
from llama_index.evaluation import RagasEvaluator
from llama_index.evaluation import AnswerRelevancyEvaluator, FaithfulnessEvaluator, ContextRecallEvaluator

# Model tier under test; override it with the MODEL_NAME environment variable.
MODEL_NAME = os.getenv("MODEL_NAME", "mistral-7b-instruct-v0.3-gptq")

# NOTE(review): `openai_base_url` is kept exactly as in the original cell —
# confirm that the installed `Vllm` wrapper accepts this keyword for talking
# to a remote server.
llm = Vllm(
    model=MODEL_NAME,
    openai_base_url="http://localhost:8000",
    temperature=0.0,
)

# Register the service context *before* loading the index so the index is
# constructed with this LLM rather than the library's default service context.
service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)

# `load_index_from_storage` expects a StorageContext, not a directory path.
# NOTE(review): assumes "lade_chroma" is a directory written by
# `storage_context.persist(...)` in an earlier notebook — confirm.
storage_context = StorageContext.from_defaults(persist_dir="lade_chroma")
index = load_index_from_storage(storage_context)

# LlamaIndex retrievers are sized with `similarity_top_k`; the LangChain-style
# `search_kwargs={"k": 4}` does not set the number of retrieved chunks.
retriever = index.as_retriever(similarity_top_k=4)


In [None]:
# Score the RAG pipeline on the QA evaluation set read from eval_qa50.csv.
# NOTE(review): the `RagasEvaluator` constructor and `evaluate_dataset`
# signature are used as-is — confirm them against the installed library.
qa_df = pd.read_csv("eval_qa50.csv")
qa_records = qa_df.to_dict("records")

metric_names = ["faithfulness", "answer_relevancy", "context_recall"]
evaluator = RagasEvaluator(metrics=metric_names)

results = evaluator.evaluate_dataset(dataset=qa_records, retriever=retriever, llm=llm)

# Quick look at the first few scored rows.
print(results.head())


In [None]:
# System metrics
# Single source of truth for the generation budget: the same value is sent to
# the LLM and used in the throughput formula, so the two cannot drift apart.
BENCH_MAX_TOKENS = 128

start = time.perf_counter()
_ = llm.complete("Hello", max_tokens=BENCH_MAX_TOKENS)
elapsed = time.perf_counter() - start

# NOTE(review): this assumes the model actually produced BENCH_MAX_TOKENS
# tokens; if generation stops early the figure overstates throughput.
tok_per_sec = BENCH_MAX_TOKENS / elapsed

# Peak memory seen by this process's torch CUDA allocator, in GB (1e9 bytes).
# NOTE(review): if the model is served by a separate process, this number may
# not reflect the model's VRAM usage — confirm.
vram_gb = torch.cuda.max_memory_allocated() / 1e9
print(f"{MODEL_NAME} → {tok_per_sec:.1f} tok/s, {vram_gb:.2f} GB VRAM")


In [None]:
# Record the system metrics together with the mean of every numeric evaluation
# column in an offline Weights & Biases run named after the model.
wandb.init(project="logisticgpt", name=MODEL_NAME, mode="offline")
try:
    # numeric_only=True skips text columns (questions, answers, ...), which
    # would otherwise make DataFrame.mean() raise on recent pandas versions.
    metric_means = results.mean(numeric_only=True).to_dict()
    wandb.log({
        "tok_per_sec": tok_per_sec,
        "vram_gb": vram_gb,
        **metric_means,
    })
finally:
    # Always close the run, even if logging fails, so no run is left open.
    wandb.finish()
print("Logged to wandb (offline).")
