In [1]:
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, os
# Make the directory two levels above the notebook importable; the `src.*`
# imports in the next cell presumably rely on this — TODO confirm repo layout.
sys.path.append(os.path.abspath("../../"))
from dotenv import load_dotenv
# Load variables from a .env file into the environment (a later cell reads
# os.environ["BENCHMARK_SPREADSHEET_ID"]). The return value is echoed as
# this cell's output.
load_dotenv()

True

In [2]:
from src.config import Config, load_spreadsheet
from src.rag import (
    ChromaSlideStore,
    HyperbolicScorer,
    MinScorer,
    PresentationRetriever,
    ScorerTypes,
)
from src.eval.eval_mlflow import EvaluationConfig, RAGEvaluator
from src.rag.storage import LLMPresentationRetriever

import mlflow

In [3]:
# Turn on MLflow's LangChain autologging before any component is built.
mlflow.langchain.autolog()

# --- Models ---------------------------------------------------------------
project_config = Config()
llm = project_config.model_config.load_vsegpt(model="openai/gpt-4o-mini")
embeddings = project_config.embedding_config.load_vsegpt()
# Alternative embedding backend, kept for quick switching:
# embeddings = project_config.embedding_config.load_openai()

# --- Retrieval stack ------------------------------------------------------
storage = ChromaSlideStore(
    collection_name="pres1",
    embedding_model=embeddings,
)
retriever = LLMPresentationRetriever(
    storage=storage,
    llm=llm,
    n_contexts=10,
    n_pages=3,
)

# --- Evaluation paths -----------------------------------------------------
# NOTE(review): these two paths are computed here but not passed to anything
# in this cell; confirm whether a later cell (or MLflow setup) needs them.
db_path = project_config.navigator.processed / "eval" / "runs" / "mlruns.db"
artifacts_path = project_config.navigator.processed / "eval" / "artifacts"

# --- Evaluator ------------------------------------------------------------
eval_config = EvaluationConfig(
    retriever=retriever,
    experiment_name="PresRetrieve_7",
    metrics=["presentationmatch", "llmrelevance"],
    scorers=[MinScorer(), HyperbolicScorer()],
)

evaluator = RAGEvaluator(config=eval_config, llm=llm)

In [11]:
from IPython.display import display

# Load the questions stored under the "NoAnswer" gid of the benchmark
# spreadsheet. The spreadsheet id comes from the environment so it is never
# hard-coded in the notebook.
sheet_id = os.environ["BENCHMARK_SPREADSHEET_ID"]
gids = {
    "NoAnswer": "1219206941"
}
df = evaluator.load_questions_from_sheet(sheet_id, gid=gids["NoAnswer"])

# Evaluate on every row of this sheet. The earlier `df.sample(5)` assignment
# was removed: it was overwritten on the very next line, so its only effects
# were drawing from the global RNG and raising on sheets with fewer than
# 5 rows. The copy keeps later edits to `df_eval` from touching `df`.
df_eval = df.copy()
display(df_eval)

Unnamed: 0,pres_name,question,page,content,comment
0,,Презентация про космонавтов,,,
1,,Презентация про экономику Китая,,,
2,,Зоомагазины,,,
3,,The capital of Great Britain,,,
4,,Обучение LLM на CPU,,,
5,,Фото кабриолета,,,
6,,История Российской Империи,,,


In [12]:
# Reload the questions without an explicit gid (the loader's default sheet).
# NOTE: this rebinds `df` and `df_eval` from the previous cell.
df = evaluator.load_questions_from_sheet(sheet_id)

# Draw the evaluation subset reproducibly: a fixed seed makes Restart & Run
# All evaluate the same questions every time, and the size is capped so a
# sheet with fewer rows than requested does not raise ValueError.
N_EVAL_QUESTIONS = 20
SAMPLE_SEED = 42
df_eval = df.sample(n=min(N_EVAL_QUESTIONS, len(df)), random_state=SAMPLE_SEED)

In [13]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered; presumably required
# because run_evaluation drives async code inside Jupyter's already-running
# loop — TODO confirm.
nest_asyncio.apply()
# Run evaluation
# NOTE(review): the recorded output of this cell shows several questions
# failing with HTTP 429 ("more than 1 request per 1.0 second") while the
# progress bar reports "max 5 concurrent". Those questions appear to be
# logged as failed rather than retried — confirm, and throttle or retry if
# the evaluator supports it.
evaluator.run_evaluation(df_eval)



Processing questions (max 5 concurrent):   0%|          | 0/20 [00:00<?, ?it/s]Failed to process question 12: Error code: 429 - {'error': {'message': 'Rate-limit error: You send more than 1 request per 1.0 second. Try later.', 'code': 429}}
Processing questions (max 5 concurrent):  60%|██████    | 12/20 [01:56<00:59,  7.45s/it]Failed to process question 16: Error code: 429 - {'error': {'message': 'Rate-limit error: You send more than 1 request per 1.0 second. Try later.', 'code': 429}}
Processing questions (max 5 concurrent):  65%|██████▌   | 13/20 [02:11<01:02,  8.87s/it]Failed to process question 2: Error code: 429 - {'error': {'message': 'Rate-limit error: You send more than 1 request per 1.0 second. Try later.', 'code': 429}}
Failed to process question 14: Error code: 429 - {'error': {'message': 'Rate-limit error: You send more than 1 request per 1.0 second. Try later.', 'code': 429}}
Processing questions (max 5 concurrent): 100%|██████████| 20/20 [02:43<00:00,  8.19s/it]
Processin