From da9fa039305911b8bd77632c934854b5d60c98f5 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Mon, 10 Nov 2025 23:12:15 +0000
Subject: [PATCH 1/4] switch to lm_eval entrypoint that uses pre-loaded
 transformers model

Signed-off-by: Brian Dellabetta
---
 tests/lmeval/test_lmeval.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index da99782c68..e78cb5bed7 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -11,6 +11,7 @@
 import yaml
 from loguru import logger
 from pydantic import BaseModel
+from transformers import AutoModelForCausalLM

 from llmcompressor.core import active_session
 from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
@@ -152,11 +153,12 @@ def test_lm_eval(self, test_data_file: str):
     @log_time
     def _eval_base_model(self):
         """Evaluate the base (uncompressed) model."""
-        model_args = {**self.lmeval.model_args, "pretrained": self.model}
-
         results = lm_eval.simple_evaluate(
-            model=self.lmeval.model,
-            model_args=model_args,
+            model=lm_eval.models.huggingface.HFLM(
+                pretrained=AutoModelForCausalLM.from_pretrained(self.model),
+                batch_size=self.lmeval.batch_size,
+                **self.lmeval.model_args,
+            ),
             tasks=[self.lmeval.task],
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
@@ -183,11 +185,12 @@ def _handle_recipe(self):

     @log_time
     def _run_lm_eval(self):
-        model_args = {"pretrained": self.save_dir}
-        model_args.update(self.lmeval.model_args)
         results = lm_eval.simple_evaluate(
-            model=self.lmeval.model,
-            model_args=model_args,
+            model=lm_eval.models.huggingface.HFLM(
+                pretrained=AutoModelForCausalLM.from_pretrained(self.save_dir),
+                batch_size=self.lmeval.batch_size,
+                **self.lmeval.model_args,
+            ),
             tasks=[self.lmeval.task],
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,

From 7348db5a6b324623297cd8e9e4afa9f1c578be4a Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 12 Nov 2025 20:10:43 +0000
Subject: [PATCH 2/4] allow for hf-multimodal and non-AutoModelForCausalLM
 models

Signed-off-by: Brian Dellabetta
---
 tests/e2e/e2e_utils.py      | 10 +++----
 tests/lmeval/test_lmeval.py | 53 +++++++++++++++----------------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index f052a7ca73..f52ec494bd 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -11,14 +11,13 @@


 @log_time
-def _load_model_and_processor(
+def load_model(
     model: str,
     model_class: str,
 ):
     pretrained_model_class = getattr(transformers, model_class)
     loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto")
-    processor = AutoProcessor.from_pretrained(model)
-    return loaded_model, processor
+    return loaded_model


 @log_time
@@ -41,9 +40,8 @@ def run_oneshot_for_e2e_testing(

     # Load model.
     oneshot_kwargs = {}
-    loaded_model, processor = _load_model_and_processor(
-        model=model, model_class=model_class
-    )
+    loaded_model = load_model(model=model, model_class=model_class)
+    processor = AutoProcessor.from_pretrained(model)

     if dataset_id:
         ds = load_dataset(dataset_id, name=dataset_config, split=dataset_split)
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index e78cb5bed7..67d8333089 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -11,10 +11,9 @@
 import yaml
 from loguru import logger
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM

 from llmcompressor.core import active_session
-from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
+from tests.e2e.e2e_utils import load_model, run_oneshot_for_e2e_testing
 from tests.test_timer.timer_utils import get_singleton_manager, log_time
 from tests.testing_utils import requires_gpu

@@ -36,6 +35,10 @@ class LmEvalConfig(BaseModel):

 try:
     import lm_eval
+    import lm_eval.api.registry
+
+    # needed to populate model registry
+    import lm_eval.models  # noqa

     lm_eval_installed = True
 except ImportError:
@@ -121,7 +124,7 @@ def test_lm_eval(self, test_data_file: str):

         # Always evaluate base model for recovery testing
         logger.info("================= Evaluating BASE model ======================")
-        self.base_results = self._eval_base_model()
+        base_results = self._eval_model(self.model)

         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -146,16 +149,25 @@ def test_lm_eval(self, test_data_file: str):
         self._handle_recipe()

         logger.info("================= Running LM Eval on COMPRESSED model ==========")
-        self._run_lm_eval()
+        compressed_results = self._eval_model(self.save_dir)
+
+        # Always use recovery testing
+        self._validate_recovery(base_results, compressed_results)
+
+        # If absolute metrics provided, show warnings (not failures)
+        if self.lmeval.metrics:
+            self._check_absolute_warnings(compressed_results)

         self.tear_down()

     @log_time
-    def _eval_base_model(self):
+    def _eval_model(self, model: str):
         """Evaluate the base (uncompressed) model."""
+        lm_eval_cls = lm_eval.api.registry.get_model(self.lmeval.model)
+
         results = lm_eval.simple_evaluate(
-            model=lm_eval.models.huggingface.HFLM(
-                pretrained=AutoModelForCausalLM.from_pretrained(self.model),
+            model=lm_eval_cls(
+                pretrained=load_model(model, self.model_class),
                 batch_size=self.lmeval.batch_size,
                 **self.lmeval.model_args,
             ),
             tasks=[self.lmeval.task],
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
@@ -183,32 +195,9 @@ def _handle_recipe(self):
             fp.write(recipe_yaml_str)
         session.reset()

-    @log_time
-    def _run_lm_eval(self):
-        results = lm_eval.simple_evaluate(
-            model=lm_eval.models.huggingface.HFLM(
-                pretrained=AutoModelForCausalLM.from_pretrained(self.save_dir),
-                batch_size=self.lmeval.batch_size,
-                **self.lmeval.model_args,
-            ),
-            tasks=[self.lmeval.task],
-            num_fewshot=self.lmeval.num_fewshot,
-            limit=self.lmeval.limit,
-            device="cuda:0",
-            apply_chat_template=self.lmeval.apply_chat_template,
-            batch_size=self.lmeval.batch_size,
-        )
-
-        # Always use recovery testing
-        self._validate_recovery(results)
-
-        # If absolute metrics provided, show warnings (not failures)
-        if self.lmeval.metrics:
-            self._check_absolute_warnings(results)
-
-    def _validate_recovery(self, compressed_results):
+    def _validate_recovery(self, base_results, compressed_results):
         """Validate using recovery testing - compare against base model."""
-        base_metrics = self.base_results["results"][self.lmeval.task]
+        base_metrics = base_results["results"][self.lmeval.task]
         compressed_metrics = compressed_results["results"][self.lmeval.task]
         higher_is_better_map = compressed_results.get("higher_is_better", {}).get(
             self.lmeval.task, {}
         )

From 8587c8eed9f1929da30867d60a1b3da8d0456886 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 12 Nov 2025 20:18:59 +0000
Subject: [PATCH 3/4] typehint

Signed-off-by: Brian Dellabetta
---
 tests/lmeval/test_lmeval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index 67d8333089..5fc61a6ecb 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -161,7 +161,7 @@ def test_lm_eval(self, test_data_file: str):
         self.tear_down()

     @log_time
-    def _eval_model(self, model: str):
+    def _eval_model(self, model: str) -> dict:
         """Evaluate the base (uncompressed) model."""
         lm_eval_cls = lm_eval.api.registry.get_model(self.lmeval.model)

From 277c1cd94ffcfc15f4fb26e5492908016142a08f Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 12 Nov 2025 22:53:59 +0000
Subject: [PATCH 4/4] cleanup

Signed-off-by: Brian Dellabetta
---
 tests/e2e/e2e_utils.py      | 10 ++++------
 tests/lmeval/test_lmeval.py | 19 ++++++++++++++-----
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index f52ec494bd..92a272737f 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -10,13 +10,11 @@
 from tests.testing_utils import process_dataset


-@log_time
-def load_model(
-    model: str,
-    model_class: str,
-):
+def load_model(model: str, model_class: str, device_map: str | None = None):
     pretrained_model_class = getattr(transformers, model_class)
-    loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto")
+    loaded_model = pretrained_model_class.from_pretrained(
+        model, torch_dtype="auto", device_map=device_map
+    )
     return loaded_model


diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index 5fc61a6ecb..4e38d7643b 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -124,7 +124,7 @@ def test_lm_eval(self, test_data_file: str):

         # Always evaluate base model for recovery testing
         logger.info("================= Evaluating BASE model ======================")
-        base_results = self._eval_model(self.model)
+        base_results = self._eval_base_model()

         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -149,7 +149,7 @@ def test_lm_eval(self, test_data_file: str):
         self._handle_recipe()

         logger.info("================= Running LM Eval on COMPRESSED model ==========")
-        compressed_results = self._eval_model(self.save_dir)
+        compressed_results = self._eval_compressed_model()

         # Always use recovery testing
         self._validate_recovery(base_results, compressed_results)
@@ -161,20 +161,29 @@ def test_lm_eval(self, test_data_file: str):
         self.tear_down()

     @log_time
-    def _eval_model(self, model: str) -> dict:
+    def _eval_base_model(self) -> dict:
         """Evaluate the base (uncompressed) model."""
+        return self._eval_model(self.model)
+
+    @log_time
+    def _eval_compressed_model(self) -> dict:
+        """Evaluate the compressed model."""
+        return self._eval_model(self.save_dir)
+
+    def _eval_model(self, model: str) -> dict:
+        # NOTE: pass in PreTrainedModel to avoid lm_eval's model-loading logic
+        # https://github.com/EleutherAI/lm-evaluation-harness/pull/3393
         lm_eval_cls = lm_eval.api.registry.get_model(self.lmeval.model)

         results = lm_eval.simple_evaluate(
             model=lm_eval_cls(
-                pretrained=load_model(model, self.model_class),
+                pretrained=load_model(model, self.model_class, device_map="cuda:0"),
                 batch_size=self.lmeval.batch_size,
                 **self.lmeval.model_args,
             ),
             tasks=[self.lmeval.task],
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
-            device="cuda:0",
             apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
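
Note (illustrative, not part of the patches): the sketch below shows the
evaluation pattern the series converges on, resolving the lm_eval wrapper
class from the model registry, loading the checkpoint once through
transformers, and handing the loaded object to lm_eval.simple_evaluate so the
harness skips its own model-loading logic (see the NOTE comment in PATCH 4/4).
The checkpoint name, task, batch size, and the "hf" registry name are
placeholder values; the tests instead pass whatever registry name the config
supplies (e.g. "hf-multimodal" for multimodal models).

    import lm_eval
    import lm_eval.api.registry

    # needed to populate model registry
    import lm_eval.models  # noqa

    from transformers import AutoModelForCausalLM

    # Resolve the wrapper class by registry name ("hf" resolves to HFLM).
    lm_eval_cls = lm_eval.api.registry.get_model("hf")

    # Load once with transformers and place the model ourselves; any
    # AutoModel* class works here, which is what lets the tests cover
    # non-AutoModelForCausalLM models.
    loaded = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",  # placeholder checkpoint
        torch_dtype="auto",
        device_map="cuda:0",
    )

    results = lm_eval.simple_evaluate(
        model=lm_eval_cls(pretrained=loaded, batch_size=8),
        tasks=["gsm8k"],  # placeholder task
        num_fewshot=5,
        limit=100,
        batch_size=8,
    )
    print(results["results"]["gsm8k"])

Because the model object is already placed on "cuda:0" via device_map, no
device argument is passed to simple_evaluate, mirroring the removal of
device="cuda:0" in PATCH 4/4.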