feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into base: main
Fixing XGBoost
xisen-w committed Sep 27, 2024
commit 7e21f126c2c22b8e277b59f7a55f4d98d635aef9
@@ -34,8 +34,8 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
-   # accuracy = accuracy_score(y_valid, y_valid_pred)
-   # print(f"Validation Accuracy: {accuracy:.4f}")
+   accuracy = accuracy_score(y_valid, y_valid_pred)
+   print(f"Validation Accuracy: {accuracy:.4f}")

    return model

@@ -48,7 +48,62 @@ def predict(model, X):
    X_selected = select(X)

    # Predict using the trained model
-   y_pred_prob = model.predict_proba(X_selected)
+   y_pred = model.predict(X_selected)

-   # Apply threshold to get boolean predictions
-   return y_pred_prob
+   return y_pred.reshape(-1, 1)


"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection consistent and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict using the trained model
    y_pred = model.predict(X_selected)

    return y_pred.reshape(-1, 1)
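
For context, here is a minimal usage sketch of the Random Forest module above. The synthetic dataset, column names, and split below are illustrative assumptions, not part of this PR:

# Hypothetical usage of the fit/predict interface defined above.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=32)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
y = pd.Series(y)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=32)
model = fit(X_train, y_train, X_valid, y_valid)  # prints "Validation Accuracy: ..."
y_pred = predict(model, X_valid)                 # shape (n_samples, 1), per the reshape above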
@@ -1,38 +1,48 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


-def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Define and train the model. Merge feature_select"""
    X_train = select(X_train)
    X_valid = select(X_valid)
-   dtrain = xgb.DMatrix(X_train, label=y_train)
-   dvalid = xgb.DMatrix(X_valid, label=y_valid)
-   num_classes = len(np.unique(y_train))

+   # TODO: for quick running....
+   # Combine train and valid labels to get all unique labels
+   all_labels = np.unique(np.concatenate([y_train, y_valid]))
+   le = LabelEncoder().fit(all_labels)

+   # Encode labels
+   y_train_encoded = le.transform(y_train)
+   y_valid_encoded = le.transform(y_valid)

+   dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
+   dvalid = xgb.DMatrix(X_valid, label=y_valid_encoded)
+   num_classes = len(le.classes_)

    params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
-       "nthred": -1,
        "max_depth": 6,
        "eta": 0.3,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
+       "nthread": -1,
    }
    num_round = 100

    evallist = [(dtrain, "train"), (dvalid, "eval")]
-   bst = xgb.train(params, dtrain, num_round, evallist)
+   bst = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10)

-   return bst
+   # Store the LabelEncoder in the model for later use in prediction
+   bst.le = le

+   return bst

def predict(model, X):
"""
@@ -41,4 +51,6 @@ def predict(model, X):
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
-   return y_pred_prob
+   # Convert probabilities back to original labels if needed
+   # y_pred_labels = model.le.inverse_transform(y_pred_prob.argmax(axis=1))
+   return y_pred_prob
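
Following the commented hint above, a hedged sketch of how the encoder stored on the booster could map the softprob output back to the original labels. Here `bst` is the Booster returned by fit() and `X_test` is a hypothetical feature DataFrame, neither defined in this diff:

# Assumes bst carries the fitted LabelEncoder as bst.le (set in fit above).
y_pred_prob = predict(bst, X_test)                         # shape (n_samples, num_classes)
encoded_classes = y_pred_prob.argmax(axis=1)               # most probable encoded class per row
y_pred_labels = bst.le.inverse_transform(encoded_classes)  # back to the original label values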