chore: use raw description #633

Draft · wants to merge 2 commits into main
Changes from 1 commit
use json description
qew21 committed Feb 21, 2025
commit 1808d05027f95ad547bc133e7701624095e7d34b
2 changes: 2 additions & 0 deletions rdagent/components/coder/data_science/workflow/prompts.yaml
@@ -40,6 +40,7 @@ workflow_coder:
 3. The user may provide specific code organization rules and instructions. Ensure that the integration follows the given framework and structure.
 4. After predicting the output, print the shape and other information of the output to stdout to help the evaluator assess the code.
 5. You should avoid using the logging module to output information in your generated code; use the print() function instead.
+6. The code will be evaluated on a sampled dataset, so don't hardcode the length of the input data or output.
 
 ## Output Format
 Please respond with the code in the following JSON format. Here is an example structure for the JSON output:
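For context, here is a minimal Python sketch (illustrative names, not repo code) of what the new rule 6 guards against: on a sampled dataset the input can hold far fewer rows than the full competition data, so every length must be derived from what was actually loaded.

```python
# Hedged sketch: why hardcoding lengths breaks on a sampled dataset.
# "test", "id", and "target" are illustrative stand-ins, not repo code.
import numpy as np
import pandas as pd

# In a sampled evaluation run, this may hold far fewer rows than the
# full competition test set.
test = pd.DataFrame({"id": np.arange(250)})
preds = np.zeros(len(test))  # placeholder predictions

# Brittle: assumes the full-size test set and breaks on a sample.
# submission = pd.DataFrame({"id": np.arange(100_000), "target": preds})

# Robust: derive every length from the data actually loaded.
submission = pd.DataFrame({"id": test["id"], "target": preds})
print(submission.shape)  # print shape info to stdout, per rule 4 above
```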
@@ -105,6 +106,7 @@ workflow_eval:
 Your focus is to check whether the workflow code:
 1. Executes successfully, correctly organizing components and generating a final submission.
 2. Generates predictions in the correct format, ensuring they align with the **sample submission** structure!
+3. The code will be evaluated on a sampled dataset, so don't check the length of the input data or output.
 
 [Note]
 1. The individual components (data loading, feature engineering, model tuning, etc.) have already been evaluated by the user. You should only evaluate and improve the workflow code, unless there are critical issues in the components.
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -18,6 +18,7 @@ DSCoSTEER_eval:
 The user will provide you with the whole code base and some logs generated during the execution of the whole workflow. Your evaluation scope includes whether the workflow code:
 1. Executes successfully, correctly organizing components and generating a final submission.
 2. Generates predictions in the correct format, ensuring they align with the **sample submission** structure!
+3. The code will be evaluated on a sampled dataset, so don't check the length of the input data or output.
 
 
 Please respond with your feedback in the following JSON format and order
15 changes: 11 additions & 4 deletions rdagent/scenarios/data_science/scen/__init__.py
@@ -213,7 +213,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf
                 result.append(" " * (indent + 2) + f"- Content of {file}:")
                 with open(path, "r", encoding="utf-8") as f:
                     for i, line in enumerate(f):
-                        if i < 2:
+                        if i < 4:
                             result.append(
                                 " " * (indent + 4) + line.strip()[:100] + ("..." if len(line.strip()) > 100 else "")
                             )
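As a self-contained sketch of the preview behavior this hunk tunes (preview_file is a hypothetical helper, its name and defaults are assumptions, not repo code): the folder description now includes the first four lines of each text file instead of two, each clipped to 100 characters.

```python
# Standalone sketch of the preview logic above; not a function from the repo.
def preview_file(path: str, max_lines: int = 4, max_chars: int = 100) -> list[str]:
    previews = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_lines:  # was 2 lines before this commit, now 4
                break
            text = line.strip()
            previews.append(text[:max_chars] + ("..." if len(text) > max_chars else ""))
    return previews
```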
@@ -229,6 +229,7 @@ class DataScienceScen(Scenario):
     def __init__(self, competition: str) -> None:
         self.competition = competition
         self.raw_description = self._get_description()
+        self.json_description = self._get_description_json()
         self.processed_data_folder_description = self._get_data_folder_description()
         self._analysis_competition_description()
         self.metric_direction = self._get_direction()
@@ -243,6 +244,9 @@ def _get_description(self):
                 f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file."
             )
 
+    def _get_description_json(self):
+        return self.raw_description
+
     def _get_direction(self):
         return self.metric_direction_guess if hasattr(self, "metric_direction_guess") else True
 
@@ -262,9 +266,9 @@ def _analysis_competition_description(self):
         response_json_analysis = json.loads(response_analysis)
         self.task_type = response_json_analysis.get("Task Type", "No type provided")
         self.data_type = response_json_analysis.get("Data Type", "No data type provided")
-        self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided")
-        self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided")
-        self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided")
+        self.brief_description = self.json_description.get("Description", response_json_analysis.get("Brief Description", "No brief description provided"))
+        self.dataset_description = self.json_description.get("Data Description", response_json_analysis.get("Dataset Description", "No dataset description provided"))
+        self.target_description = self.json_description.get("Evaluation", response_json_analysis.get("Evaluation Description", "No target description provided"))
         self.submission_specifications = response_json_analysis.get(
             "Submission Specifications", "No submission requirements provided"
         )
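A minimal sketch (with toy dicts, not the scenario objects) of the fallback pattern this hunk introduces, assuming the raw description loads as a dict: fields from the competition JSON take precedence, and the LLM analysis is used only when a key is absent.

```python
# Toy data standing in for the real description sources.
json_description = {"Description": "Predict house prices.", "Evaluation": "RMSE"}
response_json_analysis = {
    "Brief Description": "LLM-written summary",
    "Dataset Description": "LLM-written dataset notes",
}

# Prefer the raw JSON field; fall back to the LLM's analysis.
brief_description = json_description.get(
    "Description", response_json_analysis.get("Brief Description", "No brief description provided")
)
dataset_description = json_description.get(
    "Data Description", response_json_analysis.get("Dataset Description", "No dataset description provided")
)

print(brief_description)    # "Predict house prices." (taken from the raw JSON)
print(dataset_description)  # "LLM-written dataset notes" (fallback path)
```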
@@ -336,6 +340,9 @@ class KaggleScen(DataScienceScen):
     def _get_description(self):
         return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path)
 
+    def _get_description_json(self):
+        return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path, force=True)
+
     def _get_direction(self):
         if DS_RD_SETTING.if_using_mle_data:
             return super()._get_direction()
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -41,7 +41,7 @@ def crawl_descriptions(
         logger.info(f"Found {competition}/description.md, loading from it.")
         return fp.read_text()
 
-    if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force:
+    if (fp := Path(f"{local_data_path}/{competition}.json")).exists():
         logger.info(f"Found {competition}.json, loading from local file.")
         with fp.open("r") as f:
             return json.load(f)
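Read together with the KaggleScen change above, the effect is that a cached <competition>.json is now always preferred when present. A simplified sketch of that caching branch (function name, paths, and return contract assumed for illustration, not the repo's API):

```python
import json
from pathlib import Path

def load_cached_description(local_data_path: str, competition: str):
    """Return the locally cached competition description, if any."""
    # With "and not force" removed, the cached JSON is used whenever it
    # exists, even when the caller passes force=True.
    if (fp := Path(f"{local_data_path}/{competition}.json")).exists():
        with fp.open("r") as f:
            return json.load(f)  # dict parsed from the cached JSON
    return None  # caller would fall back to crawling the live page
```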