test: test whether CV is effective #649

Closed
wants to merge 10 commits
Binary file modified docs/WeChat_QR_code.jpg
10 changes: 7 additions & 3 deletions rdagent/app/data_science/loop.py
@@ -142,13 +142,15 @@ def record(self, prev_out: dict[str, Any]):
logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment")


def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms-molecular-translation"):
def main(
path=None, output_path=None, step_n=None, loop_n=None, competition="bms-molecular-translation", do_truncate=True
):
"""

Parameters
----------
path :
path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state that after finish the step 0 in loop1
path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state after finishing step 0 in loop 1
output_path :
path like `$LOG_PATH`. It indicates where we want to save our session and log information.
step_n :
@@ -158,6 +160,8 @@ def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms
- if current loop is incomplete, it will be counted as the first loop for completion.
- if both step_n and loop_n are provided, the process will stop as soon as either condition is met.
competition :
do_truncate :
If set to True, the logger will truncate future log messages by calling `logger.storage.truncate`.


Auto R&D Evolving loop for models in a Kaggle scenario.
@@ -181,7 +185,7 @@ def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms
if path is None:
kaggle_loop = DataScienceRDLoop(DS_RD_SETTING)
else:
kaggle_loop = DataScienceRDLoop.load(path, output_path)
kaggle_loop = DataScienceRDLoop.load(path, output_path, do_truncate)
kaggle_loop.run(step_n=step_n, loop_n=loop_n)
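
For readers skimming the diff, the new `do_truncate` flag controls whether stale log records are dropped when a session is restored from a checkpoint. Below is a minimal, self-contained sketch of that idea; `InMemoryStorage`, `Message`, and the `truncate(time_before=...)` signature are illustrative stand-ins for `logger.storage`, not the real rdagent API.

```python
from dataclasses import dataclass, field


@dataclass
class Message:
    timestamp: float
    content: str


@dataclass
class InMemoryStorage:
    """Toy stand-in for logger.storage, illustrating what truncation means
    when a loop is resumed from an earlier checkpoint."""

    messages: list[Message] = field(default_factory=list)

    def log(self, timestamp: float, content: str) -> None:
        self.messages.append(Message(timestamp, content))

    def truncate(self, time_before: float) -> None:
        # Drop every record written after the restore point so the log matches
        # the state of the reloaded session.
        self.messages = [m for m in self.messages if m.timestamp <= time_before]


if __name__ == "__main__":
    storage = InMemoryStorage()
    storage.log(1.0, "step 0 of loop 1 finished")  # written before the checkpoint
    checkpoint = 1.0
    storage.log(2.0, "stale record from an abandoned continuation")
    storage.truncate(time_before=checkpoint)
    assert [m.content for m in storage.messages] == ["step 0 of loop 1 finished"]
```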


3 changes: 2 additions & 1 deletion rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, Tuple
from typing import Any, Dict, Tuple

import fire
from jinja2 import Environment, StrictUndefined
@@ -49,6 +49,7 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)

response_json = json.loads(response)
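
The recurring change in this PR is passing a `json_target_type` hint next to `json_mode=True` when requesting JSON output from the LLM. The sketch below shows one plausible way such a hint could be used to validate the parsed response against a simple typing annotation; `validate_json_target` is a hypothetical helper written for illustration, not the actual `APIBackend` behavior.

```python
import json
from typing import Dict, List, get_args, get_origin


def validate_json_target(raw: str, target_type) -> object:
    """Parse an LLM response and check it against a simple typing hint such as
    Dict[str, str] or List[int]. Illustrative only; the real backend may differ."""
    parsed = json.loads(raw)
    origin, args = get_origin(target_type), get_args(target_type)
    if origin is dict:
        key_type, value_type = args
        assert isinstance(parsed, dict), "expected a JSON object"
        assert all(isinstance(k, key_type) for k in parsed), "unexpected key type"
        assert all(isinstance(v, value_type) for v in parsed.values()), "unexpected value type"
    elif origin is list:
        (item_type,) = args
        assert isinstance(parsed, list), "expected a JSON array"
        assert all(isinstance(i, item_type) for i in parsed), "unexpected item type"
    return parsed


if __name__ == "__main__":
    validate_json_target('{"hypothesis": "momentum is predictive"}', Dict[str, str])
    validate_json_target('[1, 2, 3]', List[int])
```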
3 changes: 2 additions & 1 deletion rdagent/components/coder/CoSTEER/knowledge_management.py
@@ -6,7 +6,7 @@
import re
from itertools import combinations
from pathlib import Path
from typing import Union
from typing import List, Union

from jinja2 import Environment, StrictUndefined

@@ -339,6 +339,7 @@ def analyze_component(
system_prompt=analyze_component_system_prompt,
user_prompt=analyze_component_user_prompt,
json_mode=True,
json_target_type=List[int],
),
)["component_no_list"]
return [all_component_nodes[index - 1] for index in sorted(list(set(component_no_list)))]
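
Because the prompt asks the model for 1-based component numbers (hence `json_target_type=List[int]`), the returned indices are de-duplicated, sorted, and shifted by one before indexing into the node list, as the last line above does. A tiny illustration with made-up node names:

```python
# Mirrors: [all_component_nodes[index - 1] for index in sorted(set(component_no_list))]
all_component_nodes = ["data_loader", "feature", "model", "ensemble"]
component_no_list = [3, 1, 3, 2]  # 1-based indices, possibly with duplicates

selected = [all_component_nodes[index - 1] for index in sorted(set(component_no_list))]
assert selected == ["data_loader", "feature", "model"]
```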
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/ensemble/__init__.py
@@ -12,6 +12,7 @@
"""

import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -85,7 +86,10 @@ def implement_one_task(
for _ in range(5):
ensemble_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if ensemble_code != workspace.file_dict.get("ensemble.py"):
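
The surrounding `for _ in range(5)` loop (visible in this and the following coder diffs) retries generation until the returned code differs from what is already stored in the workspace, so an unchanged file is never proposed. A stripped-down sketch of that retry pattern; `generate_code` stands in for the JSON-mode chat completion call.

```python
from typing import Callable, Dict, Optional


def regenerate_until_changed(
    generate_code: Callable[[], str],
    existing_files: Dict[str, str],
    filename: str,
    max_attempts: int = 5,
) -> Optional[str]:
    """Retry generation until the produced code differs from the current file."""
    for _ in range(max_attempts):
        code = generate_code()  # in the real coder: json.loads(response)["code"]
        if code != existing_files.get(filename):
            return code
    return None  # every attempt reproduced the existing file


if __name__ == "__main__":
    attempts = iter(["old code", "old code", "new code"])
    result = regenerate_until_changed(
        generate_code=lambda: next(attempts),
        existing_files={"ensemble.py": "old code"},
        filename="ensemble.py",
    )
    assert result == "new code"
```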
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/feature/__init__.py
@@ -1,4 +1,5 @@
import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -70,7 +71,10 @@ def implement_one_task(
for _ in range(5):
feature_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if feature_code != workspace.file_dict.get("feature.py"):
3 changes: 3 additions & 0 deletions rdagent/components/coder/data_science/model/__init__.py
@@ -1,3 +1,5 @@
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
CoSTEERMultiEvaluator,
@@ -83,6 +85,7 @@ def implement_one_task(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=BatchEditOut.json_mode,
json_target_type=Dict[str, str],
)
)

42 changes: 28 additions & 14 deletions rdagent/components/coder/data_science/raw_data_loader/__init__.py
@@ -24,6 +24,7 @@

import json
import re
from typing import Dict

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.coder.CoSTEER import CoSTEER
@@ -108,20 +109,30 @@ def implement_one_task(
spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt)

data_loader_spec = json.loads(
spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True)
spec_session.build_chat_completion(
user_prompt=data_loader_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
feature_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=feature_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
model_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=model_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
ensemble_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=ensemble_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
workflow_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=workflow_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[
"spec"
]
model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))[
"spec"
]
ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[
"spec"
]
workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[
"spec"
]
else:
data_loader_spec = workspace.file_dict["spec/data_loader.md"]
feature_spec = workspace.file_dict["spec/feature.md"]
@@ -146,7 +157,10 @@ def implement_one_task(
for _ in range(5):
data_loader_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if data_loader_code != workspace.file_dict.get("load_data.py"):
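
Since the five spec requests above share one chat session and differ only in their prompt, they could also be written as a single loop. A sketch of that refactoring idea, with `ToySession` standing in for `spec_session` so the snippet runs on its own; the real session object and prompt variables come from the surrounding code.

```python
import json
from typing import Dict


class ToySession:
    """Stand-in for the chat session; returns a canned JSON spec per prompt."""

    def build_chat_completion(self, user_prompt: str, json_mode: bool = True,
                              json_target_type=Dict[str, str]) -> str:
        return json.dumps({"spec": f"specification derived from: {user_prompt}"})


def build_specs(spec_session, prompts: Dict[str, str]) -> Dict[str, str]:
    # One loop instead of five near-identical call sites.
    return {
        name: json.loads(
            spec_session.build_chat_completion(
                user_prompt=prompt, json_mode=True, json_target_type=Dict[str, str]
            )
        )["spec"]
        for name, prompt in prompts.items()
    }


if __name__ == "__main__":
    specs = build_specs(ToySession(), {
        "data_loader": "describe the data loading interface",
        "feature": "describe the feature engineering interface",
    })
    print(specs["data_loader"])
```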
@@ -271,8 +271,29 @@ spec:

3. Dataset Splitting
- The dataset returned by `load_data` is not pre-split. After calling `feat_eng`, split the data into training and test sets.
- If feasible, apply cross-validation on the training set (`X_transformed`, `y_transformed`) to ensure a reliable assessment of model performance.
- [Notice] Apply cross-validation (e.g. KFold) on the training set (`X_transformed`, `y_transformed`) to ensure a reliable assessment of model performance.
- Keep the test set (`X_test_transformed`) unchanged, as it is only used for generating the final predictions.
- Pseudocode logic for reference (a runnable sketch follows this list):
```
Set number of splits and initialize KFold cross-validator.

Create dictionaries for validation and test predictions.

For each model file:
Import the model dynamically.
Initialize arrays for out-of-fold (OOF) and test predictions.

For each fold in KFold:
Split data into training and validation sets.
Run model workflow to get validation and test predictions.
Validate shapes.
Store validation and test predictions.

Compute average test predictions across folds.
Save OOF and averaged test predictions.

Ensemble predictions from all models and print the final shape.
```

4. Submission File:
- Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
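
A compact, runnable rendering of the pseudocode in item 3, using scikit-learn's `KFold` with a `Ridge` regressor standing in for the dynamically imported model workflow; the synthetic data, the two model names, and the final mean ensembling are illustrative assumptions, not part of the spec itself.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X_transformed = rng.normal(size=(200, 5))
y_transformed = X_transformed @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)
X_test_transformed = rng.normal(size=(50, 5))

model_files = ["model_a", "model_b"]  # stand-ins for dynamically imported models
kf = KFold(n_splits=5, shuffle=True, random_state=0)

oof_predictions, test_predictions = {}, {}
for model_name in model_files:
    oof = np.zeros(len(y_transformed))                        # out-of-fold predictions
    fold_test = np.zeros((kf.get_n_splits(), len(X_test_transformed)))

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_transformed)):
        model = Ridge()  # the real workflow runs the imported model's fit/predict
        model.fit(X_transformed[train_idx], y_transformed[train_idx])

        oof[valid_idx] = model.predict(X_transformed[valid_idx])
        fold_test[fold] = model.predict(X_test_transformed)
        assert oof[valid_idx].shape == (len(valid_idx),)       # validate shapes

    oof_predictions[model_name] = oof
    test_predictions[model_name] = fold_test.mean(axis=0)      # average across folds

final_test_pred = np.mean(list(test_predictions.values()), axis=0)  # ensemble models
print("final prediction shape:", final_test_pred.shape)
```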
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/workflow/__init__.py
@@ -1,4 +1,5 @@
import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -73,7 +74,10 @@ def implement_one_task(
for _ in range(5):
workflow_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if workflow_code != workspace.file_dict.get("main.py"):
@@ -1,6 +1,13 @@
from pathlib import Path
import pandas as pd
import hashlib

def calculate_md5(file_path):
with open(file_path, "rb") as f:
file_hash = hashlib.md5(f.read()).hexdigest()
return file_hash

file_md5 = calculate_md5("scores.csv")

"""
find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | grep -v zip_files | grep -v 'sample/'
@@ -66,4 +73,5 @@ def print_first_rows(file_path, file_name, num_rows=5):
print_first_rows(SAMPLE_SUBMISSION_PATH, sample_submission_name)
print_first_rows('submission.csv', 'submission.csv')

assert calculate_md5("scores.csv") == file_md5, "scores.csv should not be rewritten"
print(f"\nPlease Checked the content of the submission file(submission.csv should align with {sample_submission_name}). ")
@@ -109,6 +109,7 @@ workflow_eval:
[Note]
1. The individual components (data loading, feature engineering, model tuning, etc.) have already been evaluated by the user. You should only evaluate and improve the workflow code, unless there are critical issues in the components.
2. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
3. As long as the execution does not exceed the time limit, ensure that the code uses cross-validation to split the training data and train the model. If cross-validation is not used, mention it in the execution section and set `final_decision` to `false`.

## Evaluation Criteria
You will be given the workflow execution output (`stdout`) to determine correctness.
8 changes: 6 additions & 2 deletions rdagent/components/coder/factor_coder/eva_utils.py
@@ -2,7 +2,7 @@
import json
from abc import abstractmethod
from pathlib import Path
from typing import Tuple
from typing import Dict, Tuple

import pandas as pd
from jinja2 import Environment, StrictUndefined
@@ -212,7 +212,10 @@ def evaluate(
try:
api = APIBackend() if attempts == 0 else APIBackend(use_chat_cache=False)
resp = api.build_messages_and_create_chat_completion(
user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
user_prompt=gen_df_info_str,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str | bool | int],
)
resp_dict = json.loads(resp)
resp_dict["output_format_decision"] = str(resp_dict["output_format_decision"]).lower() in ["true", "1"]
@@ -556,6 +559,7 @@ def evaluate(
system_prompt=system_prompt,
json_mode=True,
seed=attempts, # in case of useless retrying when cache enabled.
json_target_type=Dict[str, str | bool | int],
),
)
final_decision = final_evaluation_dict["final_decision"]
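
Because the declared `json_target_type` allows strings, booleans, or integers, the evaluator normalizes the decision field with a string comparison (the `output_format_decision` line above). A small helper capturing that coercion; the name `to_bool` is hypothetical and not part of the repository.

```python
def to_bool(value: object) -> bool:
    """Normalize values an LLM may return for a yes/no field: True, "true",
    "True", 1, and "1" all map to True; anything else maps to False."""
    return str(value).lower() in ("true", "1")


assert to_bool("True") and to_bool(1) and to_bool("1")
assert not to_bool("false") and not to_bool(0) and not to_bool(None)
```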
6 changes: 5 additions & 1 deletion rdagent/components/coder/factor_coder/evolving_strategy.py
@@ -2,6 +2,7 @@

import json
from pathlib import Path
from typing import Dict

from jinja2 import Environment, StrictUndefined

@@ -168,7 +169,10 @@ def implement_one_task(
APIBackend(
use_chat_cache=FACTOR_COSTEER_SETTINGS.coder_use_cache
).build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
return code
3 changes: 2 additions & 1 deletion rdagent/components/coder/model_coder/eva_utils.py
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Tuple
from typing import Dict, Tuple

import numpy as np
from jinja2 import Environment, StrictUndefined
@@ -177,6 +177,7 @@ def evaluate(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str | bool | int],
),
)
if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[
2 changes: 2 additions & 0 deletions rdagent/components/coder/model_coder/evolving_strategy.py
@@ -1,5 +1,6 @@
import json
from pathlib import Path
from typing import Dict

from jinja2 import Environment, StrictUndefined

@@ -96,6 +97,7 @@ def implement_one_task(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
),
)["code"]
return code
4 changes: 2 additions & 2 deletions rdagent/core/experiment.py
@@ -18,7 +18,7 @@
from rdagent.utils.fmt import shrink_text

if typing.TYPE_CHECKING:
from rdagent.core.proposal import ExperimentFeedback, Hypothesis
from rdagent.core.proposal import Hypothesis
from rdagent.utils.env import Env

"""
@@ -225,7 +225,7 @@ def inject_code_from_file_dict(self, workspace: FBWorkspace) -> None:
"""
for name, code in workspace.file_dict.items():
self.inject_files(**{name: code})

def copy(self) -> FBWorkspace:
"""
copy the workspace from the original one