In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Append the directory two levels above the current working directory to the
# import path (presumably the repository root that holds the `src` package).
sys.path.append(os.path.abspath(os.path.join("..", "..")))

from dotenv import load_dotenv

# Load settings from a .env file into the process environment; later cells read
# os.environ. Kept as the last expression so the cell displays its result.
load_dotenv()

True

In [2]:
# Third-party imports.
import pandas as pd
from langchain.callbacks import tracing_v2_enabled

# Project imports (relative order kept as in the original cell).
from src.config import Config, load_spreadsheet
from src.rag import ChromaSlideStore
from src.eval import EvaluationConfig, RAGEvaluator

# Shared project configuration; later cells load the embedding model from it.
config = Config()

In [3]:
# Enable LangSmith tracing: set the tracing flag, the API endpoint and the
# project name in the process environment in a single update.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "default",
    }
)

In [4]:
from src.rag.score import (
    ExponentialScorer, HyperbolicScorer, MinScorer,
    ExponentialWeightedScorer, HyperbolicWeightedScorer
)
from src.eval.evaluate import (
    page_found, page_match, presentation_found,
    presentation_match, create_llm_relevance_evaluator,
    n_pages, n_pres
)


# Initialize components: the embedding model and the ChromaSlideStore
# (collection "pres0") that is handed to the evaluator later on.
embeddings = config.embedding_config.load_vsegpt()
storage = ChromaSlideStore(collection_name="pres0", embedding_model=embeddings)


# Load the benchmark questions from the sheet whose id is taken from the
# environment (raises KeyError if BENCHMARK_SPREADSHEET_ID is not set).
sheet_id = os.environ["BENCHMARK_SPREADSHEET_ID"]
df_questions = RAGEvaluator.load_questions_from_sheet(sheet_id)

# Preview a few questions. A fixed random_state keeps the preview identical
# across re-runs, and capping n avoids a ValueError when the sheet holds fewer
# than 5 rows.
df_questions.sample(n=min(5, len(df_questions)), random_state=0)

Unnamed: 0,pres_name,question,page,content,comment,MinScorer,Hyperbolic Scorer
8,Kept_Подвижной состав РФ_2024 (20 стр),Про что рассказывал Сергей Казачков?,220.0,text,,PASS,PASS
10,4.Обзор уязвимостей и техник защиты для LLM_Ев...,Презентация с мемом про Трампа,15.0,general,,,
24,2.Kolmogorov Arnold Networks_Павел Плюснин_вер.2,Презентация про нейросети Колмогорова-Арнольда,,general,,,
13,2. Пристягина Матрицы компетенций,Презентация с картинкой единорога,,visual,на многих слайдах,,
2,ЯиП_Энергетический_переход_Вызовы_и_возможност...,В какой презентации была таблица с источниками...,16.0,visual,,PASS,PASS


In [8]:
# Full benchmark run: every question loaded from the sheet, stored under the
# 25-example dataset. For a quick smoke test, evaluate a small sample instead
# (e.g. df_questions.sample(5)) under its own dataset name such as
# "PresRetrieve_5", optionally with a shorter scorer list.
dataset_name = "PresRetrieve_25"
df_eval = df_questions

# Scorers evaluated in this run. Only the weighted hyperbolic scorer is active;
# MinScorer, ExponentialScorer, HyperbolicScorer and ExponentialWeightedScorer
# are imported earlier in the notebook and can be added to this list to compare
# them side by side.
scorers = [HyperbolicWeightedScorer()]

# LLM used by the relevance evaluator and passed to the RAG evaluator itself.
# Loaded from the shared `config` created at the top of the notebook.
llm = config.model_config.load_vsegpt(model="openai/gpt-4o-mini", temperature=0.1)

eval_config = EvaluationConfig(
    dataset_name=dataset_name,
    evaluators=[
        presentation_match,
        page_match,
        presentation_found,
        page_found,
        create_llm_relevance_evaluator(llm),
        n_pages,
        n_pres,
    ],
    scorers=scorers,
    max_concurrency=1,
)

evaluator = RAGEvaluator(storage=storage, config=eval_config, llm=llm)

# Last expression of the cell: the created or reused dataset is displayed as
# the cell output.
evaluator.create_or_load_dataset(df_eval)


Using existing dataset: PresRetrieve_25


Dataset(name='PresRetrieve_25', description=None, data_type=<DataType.kv: 'kv'>, id=UUID('e25f48e1-fd33-4a9d-aca2-d19232ad1a34'), created_at=datetime.datetime(2024, 12, 6, 10, 0, 7, 865953, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 12, 6, 10, 0, 7, 865953, tzinfo=datetime.timezone.utc), example_count=25, session_count=5, last_session_start_time=datetime.datetime(2024, 12, 6, 10, 25, 52, 357159), inputs_schema=None, outputs_schema=None)

In [9]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered; presumably
# needed because the evaluation drives async code from inside the notebook's
# own loop — TODO confirm.
nest_asyncio.apply()
# Run the evaluation set up above (evaluators and scorers come from eval_config).
evaluator.run_evaluation()

View the evaluation results for experiment: 'hyperbolicweighted_k2.0_p3.0-aa5bda8e' at:
https://smith.langchain.com/o/44cd8e12-235d-52ea-a8de-1063f25bb9a4/datasets/e25f48e1-fd33-4a9d-aca2-d19232ad1a34/compare?selectedSessions=1f5f916b-eb71-4f02-aa77-f61f47d73477




0it [00:00, ?it/s]