Commit f29fda2

Add QA support w/ SimpleQA,GPQA
1 parent 770e9da

13 files changed: +629 additions, -24 deletions


experiments/eval/run.py

Lines changed: 26 additions & 16 deletions

@@ -6,6 +6,7 @@
 from typing import Optional, Dict, Any, Callable
 from magentic_ui.eval.core import run_evaluate_benchmark_func, evaluate_benchmark_func
 from systems.magentic_ui_sim_user_system import MagenticUISimUserSystem
+from systems.llm_system import LLMSystem
 from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
 from magentic_ui.eval.benchmark import Benchmark
 from autogen_core.models import ChatCompletionClient
@@ -157,19 +158,27 @@ def run_system_sim_user(args: argparse.Namespace, system_name: str) -> None:
     """
     config = load_config(args.config)

-    system = MagenticUISimUserSystem(
-        simulated_user_type=args.simulated_user_type,
-        endpoint_config_orch=config.get("orchestrator_client") if config else None,
-        endpoint_config_websurfer=config.get("web_surfer_client") if config else None,
-        endpoint_config_coder=config.get("coder_client") if config else None,
-        endpoint_config_file_surfer=config.get("file_surfer_client")
-        if config
-        else None,
-        endpoint_config_user_proxy=config.get("user_proxy_client") if config else None,
-        web_surfer_only=args.web_surfer_only,
-        how_helpful_user_proxy=args.how_helpful_user_proxy,
-        dataset_name=args.dataset,
-    )
+    if system_name == "LLM":
+        # Use LLMSystem for LLM-based evaluations
+        system = LLMSystem(
+            system_name=system_name,
+            endpoint_config=config.get("model_config") if config else None,
+            dataset_name=args.dataset,
+        )
+    else:
+        system = MagenticUISimUserSystem(
+            simulated_user_type=args.simulated_user_type,
+            endpoint_config_orch=config.get("orchestrator_client") if config else None,
+            endpoint_config_websurfer=config.get("web_surfer_client") if config else None,
+            endpoint_config_coder=config.get("coder_client") if config else None,
+            endpoint_config_file_surfer=config.get("file_surfer_client")
+            if config
+            else None,
+            endpoint_config_user_proxy=config.get("user_proxy_client") if config else None,
+            web_surfer_only=args.web_surfer_only,
+            how_helpful_user_proxy=args.how_helpful_user_proxy,
+            dataset_name=args.dataset,
+        )

     run_system_evaluation(args, system, system_name, config)

@@ -229,8 +238,8 @@ def main() -> None:
     parser.add_argument(
         "--system-type",
         type=str,
-        default="magentic-ui",
-        choices=["magentic-ui", "magentic-ui-sim-user"],
+        default="MagenticUI",
+        choices=["MagenticUI", "magentic-ui-sim-user", "LLM"],
         help="Type of system to run",
     )
     parser.add_argument(
@@ -250,7 +259,8 @@ def main() -> None:

     # Determine system name based on arguments

-    system_name = "MagenticUI"
+    system_name = args.system_type
+
     if args.simulated_user_type != "none":
         system_name += f"_{args.simulated_user_type}_{args.how_helpful_user_proxy}"
     if args.web_surfer_only:
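
With the new "LLM" choice wired into --system-type, the script can run a plain single-model baseline: when system_name is "LLM", the updated run_system_sim_user builds an LLMSystem from the config's model_config entry instead of the full multi-agent MagenticUISimUserSystem. A hypothetical invocation (the flag spellings for the dataset and config file are assumptions, not shown in this diff):

python experiments/eval/run.py --system-type LLM --dataset SimpleQA --config config.yaml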

experiments/eval/systems/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 from .magentic_ui_sim_user_system import MagenticUISimUserSystem
 from .magentic_ui_system import MagenticUIAutonomousSystem
 from .magentic_one_system import MagenticOneSystem
+from .llm_system import LLMSystem

-__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem", "MagententicOneSystem"]
+__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem", "MagenticOneSystem", "LLMSystem"]

experiments/eval/systems/llm_system.py

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+import asyncio
+import json
+import os
+from typing import List, Tuple, Dict, Any, Optional, Union
+from autogen_core import ComponentModel
+from autogen_core.models import ChatCompletionClient, SystemMessage, UserMessage
+from magentic_ui.eval.basesystem import BaseSystem
+from magentic_ui.eval.models import BaseQATask, BaseCandidate
+
+
+class LLMSystem(BaseSystem):
+    default_client_config = {
+        "provider": "OpenAIChatCompletionClient",
+        "config": {
+            "model": "gpt-4o-2024-08-06",
+        },
+        "max_retries": 10,
+    }
+
+    def __init__(self, system_name, endpoint_config=default_client_config, dataset_name: str = "SimpleQA"):
+        super().__init__(system_name)
+
+        self.endpoint_config = endpoint_config
+        self.dataset_name = dataset_name
+        self.candidate_class = BaseCandidate
+
+    def get_answer(
+        self, task_id: str, task: BaseQATask, output_dir: str
+    ) -> BaseCandidate:
+        """
+        Queries a single LLM to answer a given task and returns the answer.
+
+        Args:
+            task_id (str): Unique identifier for the task.
+            task (BaseQATask): The task object containing the question and metadata.
+            output_dir (str): Directory to save logs and answer files.
+
+        Returns:
+            BaseCandidate: An object containing the final answer.
+        """
+
+        async def _runner() -> Tuple[str, Any]:
+            """Asynchronous runner to answer the task and return the answer and token usage."""
+            task_question = task.format_to_user_message() if hasattr(task, 'format_to_user_message') else task.question
+            system_instruction = task.system_instruction if hasattr(task, 'system_instruction') else ""
+
+            def get_model_client(
+                endpoint_config: Optional[Union[ComponentModel, Dict[str, Any]]],
+            ) -> ChatCompletionClient:
+                """
+                Loads a ChatCompletionClient from a given endpoint configuration.
+
+                Args:
+                    endpoint_config (Optional[Union[ComponentModel, Dict[str, Any]]]):
+                        The configuration for the model client.
+
+                Returns:
+                    ChatCompletionClient: The loaded model client.
+                """
+                if endpoint_config is None:
+                    return ChatCompletionClient.load_component(
+                        self.default_client_config
+                    )
+                return ChatCompletionClient.load_component(endpoint_config)
+
+            messages = [
+                SystemMessage(content=system_instruction),
+                UserMessage(content=task_question, source="user"),
+            ]
+            client = get_model_client(self.endpoint_config)
+
+            response = await client.create(
+                messages=messages,
+            )
+
+            await client.close()
+
+            answer = response.content
+            usage = response.usage
+
+            return answer, usage
+
+        answer, usage = asyncio.run(_runner())
+        return BaseCandidate(answer=answer)
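
For orientation, a minimal sketch of calling the new system directly, assuming BaseQATask can be constructed with id, question, and answer keyword arguments (those field names, the demo question, and the output path are illustrative, not taken from this commit):

from systems.llm_system import LLMSystem
from magentic_ui.eval.models import BaseQATask

# No endpoint config passed, so the class-level default_client_config (gpt-4o) is used.
system = LLMSystem(system_name="LLM", dataset_name="SimpleQA")
task = BaseQATask(id="demo-1", question="In what year was the transistor invented?", answer="1947")  # hypothetical fields
candidate = system.get_answer(task_id="demo-1", task=task, output_dir="./logs")
print(candidate.answer)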

src/magentic_ui/eval/baseqa.py

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+from .benchmark import Benchmark
+from typing import Union, Optional, Dict
+from .models import AllTaskTypes
+
+
+class BaseQABenchmark(Benchmark):
+    """Base class for Question-Answering benchmarks."""
+
+    def __init__(
+        self,
+        name: str,
+        data_dir: Union[str, None] = None,
+        tasks: Optional[Dict[str, AllTaskTypes]] = None,
+        num_instances: Optional[int] = None,
+    ):
+        super().__init__(name, data_dir, tasks)
+
+        self.num_instances = num_instances
+
+    def get_formatted_question(self, task: AllTaskTypes) -> str:
+        raise NotImplementedError(
+            "Subclasses must implement get_formatted_question method."
+        )
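
Concrete benchmarks subclass this base and override get_formatted_question (GPQABenchmark below also implements download_dataset, load_dataset, get_split_tasks, and evaluator). A toy sketch of the contract (the class and its formatting are hypothetical, not part of this commit; it assumes tasks expose a question attribute):

from magentic_ui.eval.baseqa import BaseQABenchmark
from magentic_ui.eval.models import AllTaskTypes


class TriviaDemoBenchmark(BaseQABenchmark):  # hypothetical example
    def get_formatted_question(self, task: AllTaskTypes) -> str:
        # Prepend a fixed instruction to the task's raw question text.
        return f"Answer concisely.\nQuestion: {task.question}"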

src/magentic_ui/eval/benchmarks/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -5,11 +5,17 @@
 from .bearcubs.bearcubs import BearcubsBenchmark
 from .webgames.webgames import WebGamesBenchmark

+# QA
+from .simpleqa.simpleqa import SimpleQABenchmark
+from .gpqa.gpqa import GPQABenchmark
+
 __all__ = [
     "AssistantBenchBenchmark",
     "CustomBenchmark",
     "GaiaBenchmark",
     "WebVoyagerBenchmark",
     "BearcubsBenchmark",
     "WebGamesBenchmark",
+    "SimpleQABenchmark",
+    "GPQABenchmark",
 ]
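
With the registry updated, the new QA benchmarks import alongside the existing ones. A hypothetical smoke test (the data_dir path is illustrative, and the Idavidrein/gpqa dataset on Hugging Face may require authenticated access):

from magentic_ui.eval.benchmarks import GPQABenchmark

bench = GPQABenchmark(name="gpqa", data_dir="./data/gpqa")
bench.download_dataset()   # snapshot_download of the gpqa_{split}.csv files
bench.load_dataset()
print(len(bench.get_split_tasks("diamond")))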

src/magentic_ui/eval/benchmarks/gpqa/__init__.py

Whitespace-only changes.

src/magentic_ui/eval/benchmarks/gpqa/gpqa.py

Lines changed: 118 additions & 0 deletions

@@ -0,0 +1,118 @@
+"""GPQA benchmark: multiple-choice science QA over the Idavidrein/gpqa dataset splits."""
+
+import re
+import os
+import logging
+import pandas as pd
+from ...baseqa import BaseQABenchmark
+from ...models import (
+    GPQACandidate,
+    GPQATask,
+    GPQAEvalResult,
+    AllTaskTypes,
+)
+from typing import Dict, List, Union, Optional
+
+from huggingface_hub import snapshot_download  # type: ignore
+
+
+class GPQABenchmark(BaseQABenchmark):
+    DATASET_URL = "hf://datasets/Idavidrein/gpqa/"
+    DATASET_REPO_ID = "Idavidrein/gpqa"
+    SPLITS = ["diamond", "extended", "main"]
+    SYSTEM_INSTRUCTION = """You are a helpful assistant that answers questions."""
+
+    def __init__(
+        self,
+        name: str,
+        data_dir: Union[str, None] = None,
+        tasks: Optional[Dict[str, AllTaskTypes]] = None,
+        num_instances: Optional[int] = None,
+        system_instruction: str = SYSTEM_INSTRUCTION,
+    ):
+        super().__init__(name, data_dir, tasks, num_instances)
+
+        self.system_instruction = system_instruction
+
+    def download_dataset(self) -> None:
+        """
+        Download the dataset into self.data_dir using huggingface_hub.snapshot_download().
+        """
+        assert self.data_dir is not None, "data_dir must be provided for GPQABenchmark"
+        if not os.path.exists(self.data_dir):
+            os.makedirs(self.data_dir, exist_ok=True)
+
+        logging.info(f"[GPQABenchmark] Downloading dataset into '{self.data_dir}'...")
+        snapshot_download(
+            repo_id=self.DATASET_REPO_ID,
+            repo_type="dataset",
+            local_dir=self.data_dir,
+            local_dir_use_symlinks=True,
+        )
+        logging.info("[GPQABenchmark] Dataset downloaded.")
+
+    def load_dataset(self) -> None:
+        """
+        Read all the split CSVs from the dataset.
+        """
+        split_paths = {  # type: ignore
+            split: os.path.join(self.data_dir, f"gpqa_{split}.csv")  # type: ignore
+            for split in self.SPLITS
+        }
+
+        for split_name, split_path in split_paths.items():  # type: ignore
+            if not os.path.exists(split_path):  # type: ignore
+                raise FileNotFoundError(f"Dataset file {split_path} does not exist.")
+
+            df = pd.read_csv(split_path)  # type: ignore
+            for _, row in df.iterrows():
+                self.tasks[row["Record ID"]] = GPQATask(  # type: ignore
+                    id=row["Record ID"],  # type: ignore
+                    question=row["Question"],
+                    answer=row["Correct Answer"],  # type: ignore
+                    options=[  # type: ignore
+                        row["Correct Answer"],
+                        row["Incorrect Answer 1"],
+                        row["Incorrect Answer 2"],
+                        row["Incorrect Answer 3"],
+                    ],
+                    set=split_name,
+                    metadata=row.to_dict(),  # type: ignore
+                    system_instruction=self.system_instruction,  # type: ignore
+                )
+
+        logging.info(
+            f"[GPQABenchmark] Loaded {len(self.tasks)} tasks from {self.SPLITS} splits from the dataset."
+        )
+
+    def get_split_tasks(self, split: str) -> List[str]:
+        assert (
+            split in self.SPLITS
+        ), f"Invalid split: {split}. Must be one of {self.SPLITS}."
+        return [task.id for task in self.tasks.values() if task.set == split]
+
+    def evaluator(self, task: GPQATask, candidate: GPQACandidate) -> GPQAEvalResult:  # type: ignore
+        if isinstance(task, Dict):
+            task = GPQATask(**task)  # type: ignore
+        if isinstance(candidate, Dict):
+            candidate = GPQACandidate(**candidate)  # type: ignore
+
+        # Extract a letter choice of the form "Answer: X" (optionally wrapped in $...$).
+        answer_search_by_format = re.search(
+            r"(?i)Answer[ \t]*:[ \t]*\$?([A-D])\$?", candidate.answer
+        )
+        extracted_answer = (
+            answer_search_by_format.group(1) if answer_search_by_format else None
+        )
+
+        ground_truth_answer = task.answer  # type: ignore
+        score = ground_truth_answer == extracted_answer  # type: ignore
+        return GPQAEvalResult(  # type: ignore
+            score=score,  # type: ignore
+            metadata={
+                "ground_truth_answer": ground_truth_answer,
+                "extracted_answer": extracted_answer,
+                "llm_response": candidate.answer,
+                "task_id": task.id,
+            },
+        )
src/magentic_ui/eval/benchmarks/simpleqa/__init__.py

Whitespace-only changes.
