Further revisions

microsoft · xisen-w · Sep 26, 2024 · Sep 26, 2024 · Sep 27, 2024 · Sep 27, 2024
commit f2d9eac5e48fe2ea6a646d2df58382f591a93561
diff --git a/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py b/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py
@@ -44,42 +44,40 @@ def preprocess_script():
     train_df = preprocess_data(train_df)
     test_df = preprocess_data(test_df)
 
-    # Encode place_ids first
+    # Encode place_ids
     place_id_encoder = LabelEncoder()
-    train_df['place_id'] = place_id_encoder.fit_transform(train_df['place_id'])
+    place_id_encoder.fit(train_df['place_id'])
+    train_df['place_id'] = place_id_encoder.transform(train_df['place_id'])
 
-    # Split features and target
+    # Split features and target for training data
     X = train_df.drop(['place_id'], axis=1)
     y = train_df['place_id']
 
-    # Count occurrences of each place_id
-    place_id_counts = y.value_counts()
-
-    # Identify place_ids with only one occurrence
-    single_occurrence_place_ids = place_id_counts[place_id_counts == 1].index
+    # Prepare test data
+    test_row_ids = test_df['row_id']
+    X_test = test_df.drop(['row_id'], axis=1)
 
-    # Split the data, ensuring single-occurrence place_ids are in the training set
-    mask = y.isin(single_occurrence_place_ids)
-    X_train_single = X[mask]
-    y_train_single = y[mask]
-    X_remaining = X[~mask]
-    y_remaining = y[~mask]
+    # Ensure X_test has the same columns as X
+    for col in X.columns:
+        if col not in X_test.columns:
+            X_test[col] = 0  # placeholder value for a training column that is absent from the test frame
 
-    # Split the remaining data
-    X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_remaining, y_remaining, test_size=0.2, random_state=42, stratify=y_remaining)
+    X_test = X_test[X.columns]  # Reorder columns to match X
 
-    # Combine the single-occurrence samples with the rest of the training data
-    X_train = pd.concat([X_train_single, X_train_rest])
-    y_train = pd.concat([y_train_single, y_train_rest])
-
-    # Prepare test data
-    X_test = test_df.drop('row_id', axis=1)
-    test_row_ids = test_df['row_id']
+    # Attempt stratified split, fall back to random split if necessary
+    try:
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+    except ValueError:
+        print("Warning: Stratified split not possible. Falling back to random split.")
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
 
     # Handle missing values
     imputer = SimpleImputer(strategy="mean")
     X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
     X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
     X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
 
-    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids
+    # Count the number of unique classes
+    n_classes = len(place_id_encoder.classes_)
+
+    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
diff --git a/...nt/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_nn.py b/...nt/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_nn.py
diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py
@@ -24,7 +24,7 @@ def import_module_from_path(module_name, module_path):
     return module
 
 # 1) Preprocess the data
-X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids = preprocess_script()
+X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes = preprocess_script()
 
 # 2) Auto feature engineering
 X_train_l, X_valid_l = [], []
@@ -60,7 +60,12 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
 model_l = []  # list[tuple[model, predict_func]]
 for f in DIRNAME.glob("model/model*.py"):
     m = import_module_from_path(f.stem, f)
-    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict_proba))
+    # Pass n_classes only when fit's code object lists that name (co_varnames covers parameters and locals)
+    if 'n_classes' in m.fit.__code__.co_varnames:
+        model = m.fit(X_train, y_train, X_valid, y_valid, n_classes)
+    else:
+        model = m.fit(X_train, y_train, X_valid, y_valid)
+    model_l.append((model, m.predict))
 
 # 4) Evaluate the model on the validation set
 y_valid_pred_l = []

diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py
@@ -27,7 +27,7 @@ def test_competition_template(self):
         ws.execute()
         success = (ws.workspace_path / "submission.csv").exists()
         self.assertTrue(success, "submission.csv is not generated")
-        ws.clear()
+        # ws.clear()  # NOTE(review): cleanup call left commented out; confirm this is intentional
 
 
 if __name__ == "__main__":