change the template

microsoft · xisen-w · Sep 26, 2024 · Sep 26, 2024 · Sep 27, 2024 · Sep 27, 2024
commit ded4e4f4336544fb09b9ee1cfdf509e686e9d8e8
diff --git a/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py b/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py
@@ -1,30 +1,33 @@
 import os
+
 import numpy as np
 import pandas as pd
+from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
-from sklearn.impute import SimpleImputer
+
 
 def preprocess_data(df):
     """Preprocess the data with feature engineering."""
     # Convert time to more useful features
-    df['hour'] = df['time'] % 24
-    df['day'] = (df['time'] // 24) % 7
-    df['week'] = df['time'] // (24 * 7)
-    
+    df["hour"] = df["time"] % 24
+    df["day"] = (df["time"] // 24) % 7
+    df["week"] = df["time"] // (24 * 7)
+
     # Create distance from center feature
-    df['dist_from_center'] = np.sqrt(df['x']**2 + df['y']**2)
-    
+    df["dist_from_center"] = np.sqrt(df["x"] ** 2 + df["y"] ** 2)
+
     # Create accuracy bins
-    df['accuracy_bins'] = pd.cut(df['accuracy'], bins=5, labels=False)
-    
+    df["accuracy_bins"] = pd.cut(df["accuracy"], bins=5, labels=False)
+
     # Create interaction features
-    df['xy'] = df['x'] * df['y']
-    df['x_accuracy'] = df['x'] * df['accuracy']
-    df['y_accuracy'] = df['y'] * df['accuracy']
-    
+    df["xy"] = df["x"] * df["y"]
+    df["x_accuracy"] = df["x"] * df["accuracy"]
+    df["y_accuracy"] = df["y"] * df["accuracy"]
+
     return df
 
+
 def preprocess_script():
     """Main preprocessing function."""
     if os.path.exists("/kaggle/input/X_train.pkl"):
@@ -46,16 +49,16 @@ def preprocess_script():
 
     # Encode place_ids
     place_id_encoder = LabelEncoder()
-    place_id_encoder.fit(train_df['place_id'])
-    train_df['place_id'] = place_id_encoder.transform(train_df['place_id'])
+    place_id_encoder.fit(train_df["place_id"])
+    train_df["place_id"] = place_id_encoder.transform(train_df["place_id"])
 
     # Split features and target for training data
-    X = train_df.drop(['place_id'], axis=1)
-    y = train_df['place_id']
+    X = train_df.drop(["place_id"], axis=1)
+    y = train_df["place_id"]
 
     # Prepare test data
-    test_row_ids = test_df['row_id']
-    X_test = test_df.drop(['row_id'], axis=1)
+    test_row_ids = test_df["row_id"]
+    X_test = test_df.drop(["row_id"], axis=1)
 
     # Ensure X_test has the same columns as X
     for col in X.columns:
@@ -80,4 +83,4 @@ def preprocess_script():
     # Count the number of unique classes
     n_classes = len(place_id_encoder.classes_)
 
-    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
+    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
diff --git a/...os/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py b/...os/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py
@@ -10,88 +10,15 @@
 from sklearn.metrics import accuracy_score
 
 
-def select(X: pd.DataFrame) -> pd.DataFrame:
-    """
-    Select relevant features. To be used in fit & predict function.
-    """
-    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
-    return X
-
-
-def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
-    """
-    Define and train the Random Forest model. Merge feature selection into the pipeline.
-    """
-    # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
-
-    # Select features (if any feature selection is needed)
-    X_train_selected = select(X_train)
-    X_valid_selected = select(X_valid)
-
-    # Fit the model
-    model.fit(X_train_selected, y_train)
-
-    # Validate the model
-    y_valid_pred = model.predict(X_valid_selected)
-    accuracy = accuracy_score(y_valid, y_valid_pred)
-    print(f"Validation Accuracy: {accuracy:.4f}")
-
-    return model
-
-
-def predict(model, X):
-    """
-    Keep feature selection's consistency and make predictions.
-    """
-    # Select features (if any feature selection is needed)
-    X_selected = select(X)
-
-    # Predict using the trained model
-    y_pred = model.predict(X_selected)
-
-    # Apply threshold to get boolean predictions
-    return y_pred.reshape
-
-
-"""
-Motivation of the model:
-The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
-It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
-baseline model for many classification tasks.
-"""
-
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score
-
-
-def select(X: pd.DataFrame) -> pd.DataFrame:
-    """
-    Select relevant features. To be used in fit & predict function.
-    """
-    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
-    return X
-
-
 def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
     """
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
     model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
 
-    # Select features (if any feature selection is needed)
-    X_train_selected = select(X_train)
-    X_valid_selected = select(X_valid)
-
     # Fit the model
-    model.fit(X_train_selected, y_train)
-
-    # Validate the model
-    y_valid_pred = model.predict(X_valid_selected)
-    accuracy = accuracy_score(y_valid, y_valid_pred)
-    print(f"Validation Accuracy: {accuracy:.4f}")
+    model.fit(X_train, y_train)
 
     return model
 
@@ -100,10 +27,7 @@ def predict(model, X):
     """
     Keep feature selection's consistency and make predictions.
     """
-    # Select features (if any feature selection is needed)
-    X_selected = select(X)
-
     # Predict using the trained model
-    y_pred = model.predict(X_selected)
+    y_pred = model.predict(X)
 
     return y_pred.reshape(-1, 1)
diff --git a/...enarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py b/...enarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py
@@ -3,14 +3,9 @@
 import xgboost as xgb
 from sklearn.preprocessing import LabelEncoder
 
-def select(X: pd.DataFrame) -> pd.DataFrame:
-    # Ignore feature selection logic
-    return X
 
 def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
     """Define and train the model. Merge feature_select"""
-    X_train = select(X_train)
-    X_valid = select(X_valid)
 
     # Combine train and valid labels to get all unique labels
     all_labels = np.unique(np.concatenate([y_train, y_valid]))
@@ -44,13 +39,13 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
 
     return bst
 
+
 def predict(model, X):
     """
     Keep feature select's consistency.
     """
-    X = select(X)
     dtest = xgb.DMatrix(X)
     y_pred_prob = model.predict(dtest)
     # Convert probabilities back to original labels if needed
     # y_pred_labels = model.le.inverse_transform(y_pred_prob.argmax(axis=1))
-    return y_pred_prob
+    return y_pred_prob
diff --git a/...s/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_randomforest.py b/...s/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_randomforest.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features (currently all of them) and flatten multi-level columns to "a_b" names in place.
+    """
+    # Single-level columns are returned untouched; feature selection logic can be added here later.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
+    return X
diff --git a/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_xgboost.py b/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_xgboost.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features (currently all of them) and flatten multi-level columns to "a_b" names in place.
+    """
+    # Single-level columns are returned untouched; feature selection logic can be added here later.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py
@@ -13,16 +13,19 @@
 np.random.seed(SEED)
 DIRNAME = Path(__file__).absolute().resolve().parent
 
+
 def compute_map3(y_true, y_pred):
     """Compute Mean Average Precision @ 3 for multi-class classification."""
-    return average_precision_score(y_true, y_pred, average='micro')
+    return average_precision_score(y_true, y_pred, average="micro")
+
 
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
 
+
 # 1) Preprocess the data
 X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes = preprocess_script()
 
@@ -47,29 +50,24 @@ def import_module_from_path(module_name, module_path):
 X_test = pd.concat(X_test_l, axis=1)
 
 # 3) Train the model
-def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
-    if df.columns.nlevels == 1:
-        return df
-    df.columns = ["_".join(col).strip() for col in df.columns.values]
-    return df
-
-X_train = flatten_columns(X_train)
-X_valid = flatten_columns(X_valid)
-X_test = flatten_columns(X_test)
-
 model_l = []  # list[tuple[model, predict_func, validation_score]]
 for f in DIRNAME.glob("model/model*.py"):
+    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
+    select_m = import_module_from_path(select_python_path.stem, select_python_path)
+    X_train_selected = select_m.select(X_train.copy())
+    X_valid_selected = select_m.select(X_valid.copy())
+
     m = import_module_from_path(f.stem, f)
     # Check if the fit function accepts n_classes
-    if 'n_classes' in m.fit.__code__.co_varnames:
+    if "n_classes" in m.fit.__code__.co_varnames:
         model = m.fit(X_train, y_train, X_valid, y_valid, n_classes)
     else:
         model = m.fit(X_train, y_train, X_valid, y_valid)
-    
+
     # Evaluate the model on the validation set
     y_valid_pred = m.predict(model, X_valid)
     validation_score = log_loss(y_valid, y_valid_pred)
-    
+
     model_l.append((model, m.predict, validation_score))
 
 # Sort models by validation score (lower is better for log loss)
@@ -96,9 +94,8 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
 top_3_place_ids = place_id_encoder.inverse_transform(top_3_indices)
 
 # Create submission DataFrame
-submission_result = pd.DataFrame({
-    'row_id': test_row_ids,
-    'place_id': [' '.join(map(str, ids)) for ids in top_3_place_ids]
-})
+submission_result = pd.DataFrame(
+    {"row_id": test_row_ids, "place_id": [" ".join(map(str, ids)) for ids in top_3_place_ids]}
+)
 
 submission_result.to_csv("submission.csv", index=False)