A slightly better feature selection

microsoft · xisen-w · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
commit 78810e1202f01725626868202aa5b35f5a7eda75
diff --git a/rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py
@@ -35,6 +35,7 @@ def implement_one_model(
         model_information_str = target_task.get_task_information()
         model_type = target_task.model_type
 
+        data_desc = None
         # Get the current code from the experiment using build_from_SOTA
         current_code = ""
         if exp is not None:
@@ -47,6 +48,9 @@ def implement_one_model(
             }
             if model_type in model_file_mapping:
                 current_code = exp.experiment_workspace.code_dict.get(model_file_mapping[model_type], "")
+            # Capture the workspace's data description so it can be passed to the prompt
+            data_desc = exp.experiment_workspace.data_description
+
 
         if queried_knowledge is not None and model_information_str in queried_knowledge.success_task_to_knowledge_dict:
             return queried_knowledge.success_task_to_knowledge_dict[model_information_str].implementation
@@ -90,6 +94,7 @@ def implement_one_model(
                         model_type=model_type,  # Add model type to the prompt
                         queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render,
                         queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
+                        data_desc=data_desc,
                     )
                     .strip("\n")
                 )
@@ -122,6 +127,7 @@ def evolve(
         *,
         evo: ModelEvolvingItem,
         queried_knowledge: ModelQueriedKnowledge | None = None,
+        exp: ModelExperiment = None,  # Add this parameter
         **kwargs,
     ) -> ModelEvolvingItem:
         # 1. Find the models that need to be evolved
@@ -140,7 +146,7 @@ def evolve(
 
         result = multiprocessing_wrapper(
             [
-                (self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge))
+                (self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge, exp))  # Pass exp here
                 for target_index in to_be_finished_task_index
             ],
             n=RD_AGENT_SETTINGS.multi_proc_n,

diff --git a/rdagent/components/coder/model_coder/prompts.yaml b/rdagent/components/coder/model_coder/prompts.yaml
@@ -41,8 +41,11 @@ extract_model_formulation_system: |-
 
 evolving_strategy_model_coder:
     system: |-
-        User is trying to implement some pytorch models in the following scenario:
+        User is trying to implement some machine learning models (pytorch or otherwise - see specifications) in the following scenario:
         {{ scenario }}
+
+        Very Important: The actions that you are responsible for are mainly writing a model (tuning a model) or selecting features. Note that they are essentially the same task. If feature selection is involved, only adjust the select() section of the existing models. DO NOT WRITE A SEPARATE FEATURE SELECTION CLASS.
+
         Your code is expected to align the scenario in any form which means The user needs to get the prediction of the model based on the input data.
 
         To help you write the correct code, the user might provide multiple information that helps you write the correct code:
@@ -56,15 +59,13 @@ evolving_strategy_model_coder:
         --------------Current code in the workspace:--------------- You need to tune the model based on this! If it is not None, do not write from scratch. 
         {{ current_code }}
         {% endif %}
-
         {% if queried_former_failed_knowledge|length != 0 %}
         --------------Your former latest attempt:---------------
         =====Code to the former implementation=====
         {{ queried_former_failed_knowledge[-1].implementation.code }}
         =====Feedback to the former implementation=====
         {{ queried_former_failed_knowledge[-1].feedback }}
         {% endif %}
-
         Please response the code in the following json format. Here is an example structure for the JSON output:
         {
             "code": "The Python code as a string."
@@ -84,6 +85,11 @@ evolving_strategy_model_coder:
         {% endfor %}
         {% endif %}
 
+        {% if data_desc is not none %}
+        --------------Data & Feature Descriptions (Use this for feature selection):---------------
+        {{ data_desc }}
+        {% endif %}
+
         {% if queried_former_failed_knowledge|length != 0 %}
         --------------Former failed code:---------------
         {% for former_failed_knowledge in queried_former_failed_knowledge %}
@@ -94,6 +100,7 @@ evolving_strategy_model_coder:
         {% endfor %}
         {% endif %}
 
+
 evaluator_code_feedback:
     system: |-
         User is trying to implement some models in the following scenario:

diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -122,12 +122,13 @@ kg_feature_interface: |-
   7. You are participating in a Kaggle competition and need data engineering ideas that are small, efficient, and quick to execute. Your suggestions should avoid unnecessary complexity or excessive processing time. Focus on delivering concise, impactful transformations or preprocessing steps that improve model performance with minimal resource usage. Please suggest clear, targeted approaches that can be implemented and tested rapidly.
 
 kg_model_interface: |-
+  The action might be model tuning or feature selection. However, for both of them, the code structure is the same.
   Your code should contain several parts:
   1. The import part: import the necessary libraries.
   2. A select() function that handles feature selection for both training and prediction phases.
     The function should take the following arguments:
       - X: The features as a pandas DataFrame.
-    The function should return the selected features as a pandas DataFrame.
+    The function should return the selected features as a pandas DataFrame. (You will usually receive a description of data & existing features)
   3. A function called fit() that trains the model and returns the trained model. If feature selection is applied, it should be done within this function.
     The function should take the following arguments:
       - X_train: The training features as a pandas DataFrame.
@@ -151,7 +152,7 @@ kg_model_interface: |-
   from xgboost import DMatrix
 
 
-  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic for feature selection action!
 
 
   def fit(
@@ -185,7 +186,7 @@ kg_model_interface: |-
   from sklearn.metrics import accuracy_score
 
 
-  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Focus on implementing the feature selection logic for the feature selection process
 
 
   def fit(