Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: use raw description #633

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
use description from markdown
  • Loading branch information
qew21 committed Feb 24, 2025
commit 5ccc9e1be619396ac85edb5ce839b07241c9a31d
Original file line number Diff line number Diff line change
@@ -37,6 +37,9 @@ train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_st

# Print the types of train_y and val_y
print(f"train_y type: {type(train_y)}, val_y type: {type(val_y)}")
# `.shape` and `.head()` are only used when val_y is a pandas container,
# so both preview lines live under the type check.
if isinstance(val_y, (pd.Series, pd.DataFrame)):
    print(f"val_y shape: {val_y.shape} and first few rows of val_y:")
    print(val_y.head())

test_preds_dict = {}
val_preds_dict = {}
Original file line number Diff line number Diff line change
@@ -24,6 +24,8 @@ print(f"train_X.shape: {train_X.shape}")
# Report label/feature sizes; list-typed labels have no `.shape`, so their
# length is printed instead.
print(f"train_y.shape: {train_y.shape}" if not isinstance(train_y, list) else f"train_y(list)'s length: {len(train_y)}")
print(f"val_X.shape: {val_X.shape}")
print(f"val_y.shape: {val_y.shape}" if not isinstance(val_y, list) else f"val_y(list)'s length: {len(val_y)}")
# When train_y is a non-empty list whose items are not plain scalars, print a
# few samples so the label structure is visible in the log. The element that
# is inspected is train_y's own first item, since train_y is what gets printed.
if isinstance(train_y, list) and train_y and not isinstance(train_y[0], (int, str, float)):
    print(f"first 3 train_y: {train_y[:3]}")

# First execution
print("The first execution begins.\n")
Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@ assert X is not None, "Training data (X) is None."
assert y is not None, "Training labels (y) are None."
assert X_test is not None, "Test data (X_test) is None."
assert test_ids is not None, "Test IDs (test_ids) are None."
assert X.shape

assert get_length(X_test) == get_length(
test_ids
17 changes: 4 additions & 13 deletions rdagent/scenarios/data_science/scen/__init__.py
Original file line number Diff line number Diff line change
@@ -229,7 +229,6 @@ class DataScienceScen(Scenario):
def __init__(self, competition: str) -> None:
self.competition = competition
self.raw_description = self._get_description()
self.json_description = self._get_description_json()
self.processed_data_folder_description = self._get_data_folder_description()
self._analysis_competition_description()
self.metric_direction = self._get_direction()
@@ -244,9 +243,6 @@ def _get_description(self):
f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file."
)

def _get_description_json(self):
    """Return the structured description, which here is ``self.raw_description`` unchanged."""
    return self.raw_description

def _get_direction(self):
    """Return the metric direction for this scenario.

    Returns ``self.metric_direction_guess`` when that attribute exists on the
    instance, and ``True`` otherwise.
    """
    # getattr with a default performs the existence check and the read in one step.
    return getattr(self, "metric_direction_guess", True)

@@ -266,9 +262,9 @@ def _analysis_competition_description(self):
response_json_analysis = json.loads(response_analysis)
self.task_type = response_json_analysis.get("Task Type", "No type provided")
self.data_type = response_json_analysis.get("Data Type", "No data type provided")
self.brief_description = self.json_description.get("Description", response_json_analysis.get("Brief Description", "No brief description provided"))
self.dataset_description = self.json_description.get("Data Description", response_json_analysis.get("Dataset Description", "No dataset description provided"))
self.target_description = self.json_description.get("Evaluation", response_json_analysis.get("Evaluation Description", "No target description provided"))
self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided")
self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided")
self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided")
self.submission_specifications = response_json_analysis.get(
"Submission Specifications", "No submission requirements provided"
)
@@ -291,9 +287,7 @@ def background(self) -> str:
background_prompt = background_template.r(
task_type=self.task_type,
data_type=self.data_type,
brief_description=self.brief_description,
dataset_description=self.dataset_description,
target_description=self.target_description,
raw_description=self.raw_description,
)
return background_prompt

@@ -340,9 +334,6 @@ class KaggleScen(DataScienceScen):
def _get_description(self):
    """Return the competition description obtained from ``crawl_descriptions``.

    The call is made with this scenario's competition name and
    ``DS_RD_SETTING.local_data_path``.
    """
    return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path)

def _get_description_json(self):
    """Return the description from ``crawl_descriptions`` called with ``force=True``.

    NOTE(review): ``force=True`` presumably bypasses a locally stored copy of
    the description; confirm against ``crawl_descriptions``.
    """
    return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path, force=True)

def _get_direction(self):
if DS_RD_SETTING.if_using_mle_data:
return super()._get_direction()
4 changes: 1 addition & 3 deletions rdagent/scenarios/data_science/scen/prompts.yaml
Original file line number Diff line number Diff line change
@@ -48,9 +48,7 @@ competition_background: |-

The task type for this competition is {{ task_type }}.
The data type used in this competition is {{ data_type }}.
Briefly, the competition involves: {{ brief_description }}.
The dataset used in this competition is: {{ dataset_description }}.
Your goal in this competition is to: {{target_description }}.
The description of the competition is: {{ raw_description }}

rich_style_description: |-
### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
@@ -41,7 +41,7 @@ def crawl_descriptions(
logger.info(f"Found {competition}/description.md, loading from it.")
return fp.read_text()

if (fp := Path(f"{local_data_path}/{competition}.json")).exists():
if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force:
logger.info(f"Found {competition}.json, loading from local file.")
with fp.open("r") as f:
return json.load(f)