test: test whether CV is effective #649

Closed
wants to merge 10 commits
Binary file modified docs/WeChat_QR_code.jpg
10 changes: 7 additions & 3 deletions rdagent/app/data_science/loop.py
@@ -142,13 +142,15 @@ def record(self, prev_out: dict[str, Any]):
logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment")


def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms-molecular-translation"):
def main(
path=None, output_path=None, step_n=None, loop_n=None, competition="bms-molecular-translation", do_truncate=True
):
"""

Parameters
----------
path :
path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state that after finish the step 0 in loop1
path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state after finishing step 0 in loop 1
output_path :
path like `$LOG_PATH`. It indicates where we want to save our session and log information.
step_n :
@@ -158,6 +160,8 @@ def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms
- if current loop is incomplete, it will be counted as the first loop for completion.
- if both step_n and loop_n are provided, the process will stop as soon as either condition is met.
competition :
do_truncate :
If set to True, the logger will truncate future log messages by calling `logger.storage.truncate`.


Auto R&D Evolving loop for models in a Kaggle scenario.
@@ -181,7 +185,7 @@ def main(path=None, output_path=None, step_n=None, loop_n=None, competition="bms
if path is None:
kaggle_loop = DataScienceRDLoop(DS_RD_SETTING)
else:
kaggle_loop = DataScienceRDLoop.load(path, output_path)
kaggle_loop = DataScienceRDLoop.load(path, output_path, do_truncate)
kaggle_loop.run(step_n=step_n, loop_n=loop_n)
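
For readers skimming the diff, the new `do_truncate` flag controls whether stale log records are dropped when a session is restored from a checkpoint. Below is a minimal, self-contained sketch of that idea; `InMemoryStorage`, `Message`, and the `truncate(time_before=...)` signature are illustrative stand-ins for `logger.storage`, not the real rdagent API.

```python
from dataclasses import dataclass, field


@dataclass
class Message:
    timestamp: float
    content: str


@dataclass
class InMemoryStorage:
    """Toy stand-in for logger.storage, illustrating what truncation means
    when a loop is resumed from an earlier checkpoint."""

    messages: list[Message] = field(default_factory=list)

    def log(self, timestamp: float, content: str) -> None:
        self.messages.append(Message(timestamp, content))

    def truncate(self, time_before: float) -> None:
        # Drop every record written after the restore point so the log matches
        # the state of the reloaded session.
        self.messages = [m for m in self.messages if m.timestamp <= time_before]


if __name__ == "__main__":
    storage = InMemoryStorage()
    storage.log(1.0, "step 0 of loop 1 finished")  # written before the checkpoint
    checkpoint = 1.0
    storage.log(2.0, "stale record from an abandoned continuation")
    storage.truncate(time_before=checkpoint)
    assert [m.content for m in storage.messages] == ["step 0 of loop 1 finished"]
```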


3 changes: 2 additions & 1 deletion rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, Tuple
from typing import Any, Dict, Tuple

import fire
from jinja2 import Environment, StrictUndefined
@@ -49,6 +49,7 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)

response_json = json.loads(response)
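
The recurring change in this PR is passing a `json_target_type` hint next to `json_mode=True` when requesting JSON output from the LLM. The sketch below shows one plausible way such a hint could be used to validate the parsed response against a simple typing annotation; `validate_json_target` is a hypothetical helper written for illustration, not the actual `APIBackend` behavior.

```python
import json
from typing import Dict, List, get_args, get_origin


def validate_json_target(raw: str, target_type) -> object:
    """Parse an LLM response and check it against a simple typing hint such as
    Dict[str, str] or List[int]. Illustrative only; the real backend may differ."""
    parsed = json.loads(raw)
    origin, args = get_origin(target_type), get_args(target_type)
    if origin is dict:
        key_type, value_type = args
        assert isinstance(parsed, dict), "expected a JSON object"
        assert all(isinstance(k, key_type) for k in parsed), "unexpected key type"
        assert all(isinstance(v, value_type) for v in parsed.values()), "unexpected value type"
    elif origin is list:
        (item_type,) = args
        assert isinstance(parsed, list), "expected a JSON array"
        assert all(isinstance(i, item_type) for i in parsed), "unexpected item type"
    return parsed


if __name__ == "__main__":
    validate_json_target('{"hypothesis": "momentum is predictive"}', Dict[str, str])
    validate_json_target('[1, 2, 3]', List[int])
```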
3 changes: 2 additions & 1 deletion rdagent/components/coder/CoSTEER/knowledge_management.py
@@ -6,7 +6,7 @@
import re
from itertools import combinations
from pathlib import Path
from typing import Union
from typing import List, Union

from jinja2 import Environment, StrictUndefined

@@ -339,6 +339,7 @@ def analyze_component(
system_prompt=analyze_component_system_prompt,
user_prompt=analyze_component_user_prompt,
json_mode=True,
json_target_type=List[int],
),
)["component_no_list"]
return [all_component_nodes[index - 1] for index in sorted(list(set(component_no_list)))]
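
Because the prompt asks the model for 1-based component numbers (hence `json_target_type=List[int]`), the returned indices are de-duplicated, sorted, and shifted by one before indexing into the node list, as the last line above does. A tiny illustration with made-up node names:

```python
# Mirrors: [all_component_nodes[index - 1] for index in sorted(set(component_no_list))]
all_component_nodes = ["data_loader", "feature", "model", "ensemble"]
component_no_list = [3, 1, 3, 2]  # 1-based indices, possibly with duplicates

selected = [all_component_nodes[index - 1] for index in sorted(set(component_no_list))]
assert selected == ["data_loader", "feature", "model"]
```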
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/ensemble/__init__.py
@@ -12,6 +12,7 @@
"""

import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -85,7 +86,10 @@ def implement_one_task(
for _ in range(5):
ensemble_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if ensemble_code != workspace.file_dict.get("ensemble.py"):
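
The surrounding `for _ in range(5)` loop (visible in this and the following coder diffs) retries generation until the returned code differs from what is already stored in the workspace, so an unchanged file is never proposed. A stripped-down sketch of that retry pattern; `generate_code` stands in for the JSON-mode chat completion call.

```python
from typing import Callable, Dict, Optional


def regenerate_until_changed(
    generate_code: Callable[[], str],
    existing_files: Dict[str, str],
    filename: str,
    max_attempts: int = 5,
) -> Optional[str]:
    """Retry generation until the produced code differs from the current file."""
    for _ in range(max_attempts):
        code = generate_code()  # in the real coder: json.loads(response)["code"]
        if code != existing_files.get(filename):
            return code
    return None  # every attempt reproduced the existing file


if __name__ == "__main__":
    attempts = iter(["old code", "old code", "new code"])
    result = regenerate_until_changed(
        generate_code=lambda: next(attempts),
        existing_files={"ensemble.py": "old code"},
        filename="ensemble.py",
    )
    assert result == "new code"
```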
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/feature/__init__.py
@@ -1,4 +1,5 @@
import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -70,7 +71,10 @@ def implement_one_task(
for _ in range(5):
feature_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if feature_code != workspace.file_dict.get("feature.py"):
3 changes: 3 additions & 0 deletions rdagent/components/coder/data_science/model/__init__.py
@@ -1,3 +1,5 @@
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
CoSTEERMultiEvaluator,
@@ -83,6 +85,7 @@ def implement_one_task(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=BatchEditOut.json_mode,
json_target_type=Dict[str, str],
)
)

42 changes: 28 additions & 14 deletions rdagent/components/coder/data_science/raw_data_loader/__init__.py
@@ -24,6 +24,7 @@

import json
import re
from typing import Dict

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.coder.CoSTEER import CoSTEER
@@ -108,20 +109,30 @@ def implement_one_task(
spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt)

data_loader_spec = json.loads(
spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True)
spec_session.build_chat_completion(
user_prompt=data_loader_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
feature_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=feature_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
model_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=model_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
ensemble_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=ensemble_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
workflow_spec = json.loads(
spec_session.build_chat_completion(
user_prompt=workflow_prompt, json_mode=True, json_target_type=Dict[str, str]
)
)["spec"]
feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[
"spec"
]
model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))[
"spec"
]
ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[
"spec"
]
workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[
"spec"
]
else:
data_loader_spec = workspace.file_dict["spec/data_loader.md"]
feature_spec = workspace.file_dict["spec/feature.md"]
@@ -146,7 +157,10 @@ def implement_one_task(
for _ in range(5):
data_loader_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if data_loader_code != workspace.file_dict.get("load_data.py"):
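
Since the five spec requests above share one chat session and differ only in their prompt, they could also be written as a single loop. A sketch of that refactoring idea, with `ToySession` standing in for `spec_session` so the snippet runs on its own; the real session object and prompt variables come from the surrounding code.

```python
import json
from typing import Dict


class ToySession:
    """Stand-in for the chat session; returns a canned JSON spec per prompt."""

    def build_chat_completion(self, user_prompt: str, json_mode: bool = True,
                              json_target_type=Dict[str, str]) -> str:
        return json.dumps({"spec": f"specification derived from: {user_prompt}"})


def build_specs(spec_session, prompts: Dict[str, str]) -> Dict[str, str]:
    # One loop instead of five near-identical call sites.
    return {
        name: json.loads(
            spec_session.build_chat_completion(
                user_prompt=prompt, json_mode=True, json_target_type=Dict[str, str]
            )
        )["spec"]
        for name, prompt in prompts.items()
    }


if __name__ == "__main__":
    specs = build_specs(ToySession(), {
        "data_loader": "describe the data loading interface",
        "feature": "describe the feature engineering interface",
    })
    print(specs["data_loader"])
```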
@@ -271,8 +271,29 @@ spec:

3. Dataset Splitting
- The dataset returned by `load_data` is not pre-split. After calling `feat_eng`, split the data into training and test sets.
- If feasible, apply cross-validation on the training set (`X_transformed`, `y_transformed`) to ensure a reliable assessment of model performance.
- [Notice] Apply cross-validation (e.g. KFold) on the training set (`X_transformed`, `y_transformed`) to ensure a reliable assessment of model performance.
- Keep the test set (`X_test_transformed`) unchanged, as it is only used for generating the final predictions.
- Pseudocode logic for reference (a runnable sketch follows this list):
```
Set number of splits and initialize KFold cross-validator.

Create dictionaries for validation and test predictions.

For each model file:
Import the model dynamically.
Initialize arrays for out-of-fold (OOF) and test predictions.

For each fold in KFold:
Split data into training and validation sets.
Run model workflow to get validation and test predictions.
Validate shapes.
Store validation and test predictions.

Compute average test predictions across folds.
Save OOF and averaged test predictions.

Ensemble predictions from all models and print the final shape.
```

4. Submission File:
- Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
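
A compact, runnable rendering of the pseudocode in item 3, using scikit-learn's `KFold` with a `Ridge` regressor standing in for the dynamically imported model workflow; the synthetic data, the two model names, and the final mean ensembling are illustrative assumptions, not part of the spec itself.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X_transformed = rng.normal(size=(200, 5))
y_transformed = X_transformed @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)
X_test_transformed = rng.normal(size=(50, 5))

model_files = ["model_a", "model_b"]  # stand-ins for dynamically imported models
kf = KFold(n_splits=5, shuffle=True, random_state=0)

oof_predictions, test_predictions = {}, {}
for model_name in model_files:
    oof = np.zeros(len(y_transformed))                        # out-of-fold predictions
    fold_test = np.zeros((kf.get_n_splits(), len(X_test_transformed)))

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_transformed)):
        model = Ridge()  # the real workflow runs the imported model's fit/predict
        model.fit(X_transformed[train_idx], y_transformed[train_idx])

        oof[valid_idx] = model.predict(X_transformed[valid_idx])
        fold_test[fold] = model.predict(X_test_transformed)
        assert oof[valid_idx].shape == (len(valid_idx),)       # validate shapes

    oof_predictions[model_name] = oof
    test_predictions[model_name] = fold_test.mean(axis=0)      # average across folds

final_test_pred = np.mean(list(test_predictions.values()), axis=0)  # ensemble models
print("final prediction shape:", final_test_pred.shape)
```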
6 changes: 5 additions & 1 deletion rdagent/components/coder/data_science/workflow/__init__.py
@@ -1,4 +1,5 @@
import json
from typing import Dict

from rdagent.components.coder.CoSTEER import CoSTEER
from rdagent.components.coder.CoSTEER.evaluators import (
@@ -73,7 +74,10 @@ def implement_one_task(
for _ in range(5):
workflow_code = json.loads(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
if workflow_code != workspace.file_dict.get("main.py"):
@@ -1,6 +1,13 @@
from pathlib import Path
import pandas as pd
import hashlib

def calculate_md5(file_path):
with open(file_path, "rb") as f:
file_hash = hashlib.md5(f.read()).hexdigest()
return file_hash

file_md5 = calculate_md5("scores.csv")

"""
find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | grep -v zip_files | grep -v 'sample/'
@@ -66,4 +73,5 @@ def print_first_rows(file_path, file_name, num_rows=5):
print_first_rows(SAMPLE_SUBMISSION_PATH, sample_submission_name)
print_first_rows('submission.csv', 'submission.csv')

assert calculate_md5("scores.csv") == file_md5, "scores.csv should not be rewritten"
print(f"\nPlease Checked the content of the submission file(submission.csv should align with {sample_submission_name}). ")
@@ -109,6 +109,7 @@ workflow_eval:
[Note]
1. The individual components (data loading, feature engineering, model tuning, etc.) have already been evaluated by the user. You should only evaluate and improve the workflow code, unless there are critical issues in the components.
2. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
3. As long as the execution does not exceed the time limit, ensure that the code uses cross-validation to split the training data and train the model. If cross-validation is not used, mention it in the execution section and set `final_decision` to `false`.

## Evaluation Criteria
You will be given the workflow execution output (`stdout`) to determine correctness.
8 changes: 6 additions & 2 deletions rdagent/components/coder/factor_coder/eva_utils.py
@@ -2,7 +2,7 @@
import json
from abc import abstractmethod
from pathlib import Path
from typing import Tuple
from typing import Dict, Tuple

import pandas as pd
from jinja2 import Environment, StrictUndefined
@@ -212,7 +212,10 @@ def evaluate(
try:
api = APIBackend() if attempts == 0 else APIBackend(use_chat_cache=False)
resp = api.build_messages_and_create_chat_completion(
user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
user_prompt=gen_df_info_str,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str | bool | int],
)
resp_dict = json.loads(resp)
resp_dict["output_format_decision"] = str(resp_dict["output_format_decision"]).lower() in ["true", "1"]
@@ -556,6 +559,7 @@ def evaluate(
system_prompt=system_prompt,
json_mode=True,
seed=attempts, # in case of useless retrying when cache enabled.
json_target_type=Dict[str, str | bool | int],
),
)
final_decision = final_evaluation_dict["final_decision"]
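
Because the declared `json_target_type` allows strings, booleans, or integers, the evaluator normalizes the decision field with a string comparison (the `output_format_decision` line above). A small helper capturing that coercion; the name `to_bool` is hypothetical and not part of the repository.

```python
def to_bool(value: object) -> bool:
    """Normalize values an LLM may return for a yes/no field: True, "true",
    "True", 1, and "1" all map to True; anything else maps to False."""
    return str(value).lower() in ("true", "1")


assert to_bool("True") and to_bool(1) and to_bool("1")
assert not to_bool("false") and not to_bool(0) and not to_bool(None)
```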
6 changes: 5 additions & 1 deletion rdagent/components/coder/factor_coder/evolving_strategy.py
@@ -2,6 +2,7 @@

import json
from pathlib import Path
from typing import Dict

from jinja2 import Environment, StrictUndefined

@@ -168,7 +169,10 @@ def implement_one_task(
APIBackend(
use_chat_cache=FACTOR_COSTEER_SETTINGS.coder_use_cache
).build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
)
)["code"]
return code
3 changes: 2 additions & 1 deletion rdagent/components/coder/model_coder/eva_utils.py
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Tuple
from typing import Dict, Tuple

import numpy as np
from jinja2 import Environment, StrictUndefined
@@ -177,6 +177,7 @@ def evaluate(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str | bool | int],
),
)
if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[
2 changes: 2 additions & 0 deletions rdagent/components/coder/model_coder/evolving_strategy.py
@@ -1,5 +1,6 @@
import json
from pathlib import Path
from typing import Dict

from jinja2 import Environment, StrictUndefined

@@ -96,6 +97,7 @@ def implement_one_task(
user_prompt=user_prompt,
system_prompt=system_prompt,
json_mode=True,
json_target_type=Dict[str, str],
),
)["code"]
return code
4 changes: 2 additions & 2 deletions rdagent/core/experiment.py
@@ -18,7 +18,7 @@
from rdagent.utils.fmt import shrink_text

if typing.TYPE_CHECKING:
from rdagent.core.proposal import ExperimentFeedback, Hypothesis
from rdagent.core.proposal import Hypothesis
from rdagent.utils.env import Env

"""
@@ -225,7 +225,7 @@ def inject_code_from_file_dict(self, workspace: FBWorkspace) -> None:
"""
for name, code in workspace.file_dict.items():
self.inject_files(**{name: code})

def copy(self) -> FBWorkspace:
"""
copy the workspace from the original one