
Comparing changes

base repository: EleutherAI/lm-evaluation-harness (base: main)
head repository: abacusai/lm-evaluation-harness-hf (compare: gsm8k_fix)

These branches cannot be merged automatically, but a pull request can still be created from this comparison.
  • 12 commits
  • 4 files changed
  • 4 contributors

Commits on May 12, 2023

  1. fix mmlu task, set updated dataset name and make the prompt identical to the original eval code
     ollmer committed May 12, 2023 (48c6bd6)
  2. add comment
     ollmer committed May 12, 2023 (7b8d9b7)

Commits on May 28, 2023

  1. bump eval version
     ollmer committed May 28, 2023 (3a424af)

Commits on Jun 6, 2023

  1. address review comments
     ollmer committed Jun 6, 2023 (f034974)

Commits on Jun 7, 2023

  1. fix unreachable return
     ollmer committed Jun 7, 2023 (c117e78)

Commits on Jun 13, 2023

  1. 5ba0c5f

Commits on Jun 15, 2023

  1. babb83b
  2. Update README.md
     haileyschoelkopf authored Jun 15, 2023 (2b70a7c)
  3. Merge pull request #497 from ollmer/mmlu_fix
     MMLU task fix
     StellaAthena authored Jun 15, 2023 (3e2e6d8)
  4. Update README.md
     StellaAthena authored Jun 15, 2023 (ecba73d)
  5. Update README.md
     StellaAthena authored Jun 15, 2023 (b281b09)

Commits on May 31, 2024

  1. c5269f4
Showing with 40 additions and 17 deletions.
  1. +1 −1 CODEOWNERS
  2. +15 −0 README.md
  3. +1 −1 lm_eval/tasks/gsm8k.py
  4. +23 −15 lm_eval/tasks/hendrycks_test.py
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -1 +1 @@
-* @jon-tow @StellaAthena @haileyschoelkopf @lintangsutawika
+* @haileyschoelkopf @lintangsutawika
15 changes: 15 additions & 0 deletions README.md
@@ -1,5 +1,20 @@
 # Language Model Evaluation Harness

+## Notice to Users
+(as of 6/15/23)
+We have a revamp of the Evaluation Harness library internals staged on the [big-refactor](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) branch! It is far along in progress, but before we move the `master` branch of the repository over to this new design with a new version release, we'd like to ensure that it has been tested by outside users and that there are no glaring bugs.
+
+We'd like your help to test it out! You can help by:
+1. Trying out your current workloads on the `big-refactor` branch and seeing if anything breaks or is counterintuitive,
+2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
+
+If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), you can contribute it by opening a PR with [Refactor] in the title that includes:
+- A shell command to run the task on the `master` branch, and the resulting score
+- A shell command to run the task on your PR branch targeting `big-refactor`, and the resulting score, to show that the two implementations produce matching results.
+
+Lastly, while we carry out this switch to the new version over the next week, we will not accept new feature requests for the `master` branch beyond those already open, though we will continue to accept bugfixes to `master` and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EleutherAI Discord for more information.
+
+
 ## Overview

 This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
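For contributors comparing scores across the two branches as the notice above asks, the numbers can also be produced programmatically. The snippet below is a hedged sketch that assumes the pre-refactor evaluator API (`lm_eval.evaluator.simple_evaluate`); argument names, model backends, and task identifiers may differ between versions, so treat it as a starting point rather than the exact command to report.

```python
# Hedged sketch: produce a task score to cite in a [Refactor] PR.
# Assumes the pre-refactor Python API; adjust model/task names to your setup.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",                        # Hugging Face causal-LM backend
    model_args="pretrained=gpt2",             # any small checkpoint for a smoke test
    tasks=["hendrycksTest-abstract_algebra"],
    num_fewshot=5,
)
print(results["results"])  # per-task metrics to paste into the PR description
```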
2 changes: 1 addition & 1 deletion lm_eval/tasks/gsm8k.py
@@ -79,7 +79,7 @@ def construct_requests(self, doc, ctx):
         """
         # NOTE: The paper implements "verifiers" that assign a score to multiple
         # solutions and output the highest ranked solution.
-        completion = rf.greedy_until(ctx, {"until": [":", "Question:", "Question"]})
+        completion = rf.greedy_until(ctx, {"until": ["Question:", "Question"]})
         return completion

     def _extract_answer(self, completion):
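The only change to gsm8k.py is dropping ":" from the stop strings passed to `rf.greedy_until`. Chain-of-thought solutions routinely contain colons, so stopping at the first one truncates the generation before the final answer appears. The sketch below is purely illustrative: the sample completion is made up in the GSM8K style, and the helper only mimics what a stop-string cutoff does.

```python
# Illustrative sketch: why ":" is a harmful stop string for GSM8K.
# The completion text is invented; real generations end with "#### <answer>".
def cut_at_first_stop(text, stops):
    # Mimic a greedy_until-style cutoff at the earliest stop string.
    positions = [text.find(s) for s in stops if s in text]
    return text[: min(positions)] if positions else text

completion = (
    "Natalia sold 48 clips in April. In May she sold half as many: 24 clips. "
    "48 + 24 = 72.\n#### 72"
)

print(cut_at_first_stop(completion, [":", "Question:", "Question"]))
# cut mid-solution at the first colon, so the final answer is lost
print(cut_at_first_stop(completion, ["Question:", "Question"]))
# full completion kept, so "#### 72" can still be extracted
```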
38 changes: 23 additions & 15 deletions lm_eval/tasks/hendrycks_test.py
@@ -14,7 +14,6 @@
 """
 from lm_eval.base import MultipleChoiceTask

-
 _CITATION = """
 @article{hendryckstest2021,
     title={Measuring Massive Multitask Language Understanding},
@@ -103,16 +102,16 @@ def __init__(self):


 class GeneralHendrycksTest(MultipleChoiceTask):
-    VERSION = 0
-    DATASET_PATH = "hendrycks_test"
+    VERSION = 1
+    DATASET_PATH = "cais/mmlu"
     DATASET_NAME = None

     def __init__(self, subject):
         self.DATASET_NAME = subject
         super().__init__()

     def has_training_docs(self):
-        return False
+        return True

     def has_validation_docs(self):
         return True
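The hunk above points the loader at the `cais/mmlu` dataset on the Hugging Face Hub instead of the deprecated `hendrycks_test` name, bumps the task VERSION, and enables training docs. A hedged sketch of what that path provides, using the standard `datasets` API (the exact split names and the integer `answer` column are assumptions based on this diff and the Hub dataset card):

```python
# Hedged sketch: inspect what DATASET_PATH = "cais/mmlu" resolves to.
# Split names and column types are assumptions; verify against the Hub card.
from datasets import load_dataset

mmlu = load_dataset("cais/mmlu", "abstract_algebra")
print(mmlu)  # expected splits include dev, validation, test (and auxiliary_train)

sample = mmlu["test"][0]
print(sample["question"])  # question text
print(sample["choices"])   # list of four answer options
print(sample["answer"])    # integer index 0-3, used directly as "gold" below
```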
@@ -126,41 +125,50 @@ def validation_docs(self):
     def test_docs(self):
         return map(self._process_doc, self.dataset["test"])

+    def _format_subject(self, subject):
+        words = subject.split("_")
+        return " ".join(words)
+
+    def fewshot_context(self, doc, num_fewshot, **kwargs):
+        subject = self.DATASET_NAME
+        description = f"The following are multiple choice questions (with answers) about {self._format_subject(subject)}."
+        kwargs["description"] = description
+        return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
+
     def _process_doc(self, doc):
         def format_example(doc, keys):
             """
-            Question: <prompt>
-            Choices:
+            <prompt>
             A. <choice1>
             B. <choice2>
             C. <choice3>
             D. <choice4>
             Answer:
             """
-            prompt = "Question: " + doc["question"] + "\nChoices:\n"
-            prompt += "".join(
+            question = doc["question"].strip()
+            choices = "".join(
                 [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
             )
-            prompt += "Answer:"
+            prompt = f"{question}\n{choices}Answer:"
             return prompt

         keys = ["A", "B", "C", "D"]
         return {
             "query": format_example(doc, keys),
-            "choices": doc["choices"],
-            "gold": keys.index(doc["answer"])
-            if isinstance(doc["answer"], str)
-            else doc["answer"],
+            "choices": keys,
+            "gold": doc["answer"],
         }

     def fewshot_examples(self, k, rnd):
         # fewshot_examples is not just sampling from train_docs because dev is
         # in the same distribution as val/test but auxiliary_train isn't

         if self._fewshot_docs is None:
             self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

-        return rnd.sample(list(self._fewshot_docs), k)
+        # use the unchanged order of the dev set without sampling,
+        # just as in the original code https://github.com/hendrycks/test/blob/master/evaluate.py#L28
+        return self._fewshot_docs[:k]

     def doc_to_text(self, doc):
         return doc["query"]
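Taken together, the new `_process_doc` drops the "Question:"/"Choices:" scaffolding, scores against the letter keys rather than the raw choice strings, and `fewshot_examples` now takes the first k dev examples in their original order instead of random sampling, matching the upstream MMLU evaluation code. A small sketch of the resulting prompt, reusing the `format_example` logic from the diff on a made-up document (the field names come from the diff; the content is illustrative only):

```python
# Sketch of the new MMLU prompt layout; the example document is invented.
def format_example(doc, keys):
    question = doc["question"].strip()
    choices = "".join(f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"]))
    return f"{question}\n{choices}Answer:"

doc = {
    "question": "Which structure offers O(1) average-time lookup by key?",
    "choices": ["Linked list", "Hash table", "Binary heap", "Stack"],
    "answer": 1,  # integer index, as provided by cais/mmlu
}
keys = ["A", "B", "C", "D"]
print(format_example(doc, keys))
# Which structure offers O(1) average-time lookup by key?
# A. Linked list
# B. Hash table
# C. Binary heap
# D. Stack
# Answer:
print(keys[doc["answer"]])  # "B": the letter key is now the gold continuation
```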