feat: supporting Facebook competition (don't merge now) #364

Open · wants to merge 15 commits into main
Creating the competition for facebook
xisen-w committed Sep 27, 2024
commit 2fbb496a2dc0b49335901d20a72d1f9146e81958
@@ -0,0 +1,69 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def preprocess_data(df):
    """Preprocess the data with feature engineering."""
    # Convert time to more useful features
    # (assumes `time` is measured in hours; if it is in minutes, divide by 60 first)
    df['hour'] = df['time'] % 24
    df['day'] = (df['time'] // 24) % 7
    df['week'] = df['time'] // (24 * 7)

    # Create a distance-from-center feature
    df['dist_from_center'] = np.sqrt(df['x']**2 + df['y']**2)

    # Discretize accuracy into 5 equal-width bins
    df['accuracy_bins'] = pd.cut(df['accuracy'], bins=5, labels=False)

    # Create interaction features
    df['xy'] = df['x'] * df['y']
    df['x_accuracy'] = df['x'] * df['accuracy']
    df['y_accuracy'] = df['y'] * df['accuracy']

    return df
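
# Worked example (illustrative, assuming `time` is in hours): time = 100
# gives hour = 100 % 24 = 4, day = (100 // 24) % 7 = 4, week = 100 // 168 = 0.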

def preprocess_script():
    """Main preprocessing function."""
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        return X_train, X_valid, y_train, y_valid, X_test, *others

    # Load the data (subsampled for quick iteration)
    train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
    test_df = pd.read_csv("/kaggle/input/test.csv").head(1000)

    # Preprocess the data
    train_df = preprocess_data(train_df)
    test_df = preprocess_data(test_df)

    # Split features and target; drop row_id so train and test share the same columns
    X = train_df.drop(['place_id', 'row_id'], axis=1)
    y = train_df['place_id']

    # Encode place_ids on the full target before splitting, so the validation
    # split cannot contain labels the encoder has never seen
    place_id_encoder = LabelEncoder()
    y = place_id_encoder.fit_transform(y)

    # Split the data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Prepare test data
    X_test = test_df.drop('row_id', axis=1)
    test_row_ids = test_df['row_id']

    # Handle missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
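
# Illustrative usage of the fit/transform contract consumed by the main
# script (assumes a pandas DataFrame `train_df` is already loaded):
#
#     feat = feature_engineering_cls()
#     feat.fit(train_df)
#     transformed = feat.transform(train_df)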
@@ -0,0 +1,79 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Modified model for multi-class classification
class FeatureInteractionModel(nn.Module):
    def __init__(self, num_features, num_classes):
        super(FeatureInteractionModel, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, num_classes)  # Output nodes equal to num_classes
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so applying softmax here would silently distort the gradients.
        return self.fc3(x)


# Training function
def fit(X_train, y_train, X_valid, y_valid):
    num_features = X_train.shape[1]
    num_classes = len(np.unique(y_train))  # Determine number of classes
    model = FeatureInteractionModel(num_features, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Convert to TensorDataset and create DataLoader. np.asarray handles both
    # pandas objects and the numpy arrays that LabelEncoder returns upstream.
    train_dataset = TensorDataset(
        torch.tensor(np.asarray(X_train), dtype=torch.float32),
        torch.tensor(np.asarray(y_train), dtype=torch.long),  # Use long for labels
    )
    valid_dataset = TensorDataset(
        torch.tensor(np.asarray(X_valid), dtype=torch.float32),
        torch.tensor(np.asarray(y_valid), dtype=torch.long),
    )

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)  # reserved for validation

    # Train the model
    model.train()
    for epoch in range(10):
        print(f"Epoch {epoch + 1}/10")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}")

    return model


# Prediction function
def predict(model, X):
    model.eval()
    probabilities = []
    with torch.no_grad():
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False):
            batch = X_tensor[i : i + 32]
            # Softmax is applied here (not in forward) to turn logits into probabilities
            pred = F.softmax(model(batch), dim=1)
            probabilities.append(pred.cpu().numpy())  # Collect probabilities
    return np.vstack(probabilities)  # Return as a 2D array
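
# Sanity-check sketch (illustrative): each row of the returned array should
# be a probability distribution over the k encoded classes.
#
#     proba = predict(model, X_valid)          # shape: (len(X_valid), k)
#     assert np.allclose(proba.sum(axis=1), 1.0, atol=1e-5)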
@@ -0,0 +1,54 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
"""
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)

# Select features (if any feature selection is needed)
X_train_selected = select(X_train)
X_valid_selected = select(X_valid)

# Fit the model
model.fit(X_train_selected, y_train)

# Validate the model
y_valid_pred = model.predict(X_valid_selected)
# accuracy = accuracy_score(y_valid, y_valid_pred)
# print(f"Validation Accuracy: {accuracy:.4f}")

return model


def predict(model, X):
"""
Keep feature selection's consistency and make predictions.
"""
# Select features (if any feature selection is needed)
X_selected = select(X)

# Predict using the trained model
y_pred_prob = model.predict_proba(X_selected)

# Apply threshold to get boolean predictions
return y_pred_prob
@@ -0,0 +1,44 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
# Ignore feature selection logic
return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"""Define and train the model. Merge feature_select"""
X_train = select(X_train)
X_valid = select(X_valid)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
num_classes = len(np.unique(y_train))

# TODO: for quick running....
params = {
"objective": "multi:softprob",
"num_class": num_classes,
"nthred": -1,
}
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)

return bst


def predict(model, X):
"""
Keep feature select's consistency.
"""
X = select(X)
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
return y_pred_prob
@@ -0,0 +1,97 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_map3(y_true, y_pred_proba):
    """Compute Mean Average Precision @ 3 for multi-class classification.

    Each sample has exactly one relevant class, so AP@3 is 1/rank if the true
    class appears at rank 1-3 of the predicted probabilities, and 0 otherwise.
    """
    top3 = np.argsort(-y_pred_proba, axis=1)[:, :3]
    y_true = np.asarray(y_true)
    scores = np.zeros(len(y_true), dtype=float)
    for rank in range(3):
        scores += (top3[:, rank] == y_true) / (rank + 1)
    return scores.mean()
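
# Worked example (illustrative): with 3 classes and two samples whose true
# labels are [2, 0]:
#
#     proba = np.array([[0.2, 0.3, 0.5],   # true class 2 ranked 1st -> 1/1
#                       [0.1, 0.6, 0.3]])  # true class 0 ranked 3rd -> 1/3
#     compute_map3([2, 0], proba)          # (1 + 1/3) / 2 = 0.666...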

def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Only keep transformations that yield a consistent feature count
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)

# 3) Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if df.columns.nlevels == 1:
        return df
    df.columns = ["_".join(col).strip() for col in df.columns.values]
    return df


X_train = flatten_columns(X_train)
X_valid = flatten_columns(X_valid)
X_test = flatten_columns(X_test)

model_l = []  # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
    m = import_module_from_path(f.stem, f)
    # The model modules expose `predict`, not `predict_proba`
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the models on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    y_valid_pred_l.append(predict_func(model, X_valid))

# 5) Ensemble: average the predicted probabilities across models
y_valid_pred_proba = np.mean(y_valid_pred_l, axis=0)

# Compute metrics
map3 = compute_map3(y_valid, y_valid_pred_proba)
print(f"MAP@3 on validation set: {map3}")

# 6) Save the validation metric
pd.Series(data=[map3], index=["MAP@3"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))

y_test_pred_proba = np.mean(y_test_pred_l, axis=0)

# Get the top-3 predictions for each test sample.
# LabelEncoder.inverse_transform expects a 1D array, so flatten and reshape.
top_3_indices = np.argsort(-y_test_pred_proba, axis=1)[:, :3]
top_3_place_ids = place_id_encoder.inverse_transform(top_3_indices.ravel()).reshape(top_3_indices.shape)

# Create the submission DataFrame
submission_result = pd.DataFrame({
    'row_id': test_row_ids,
    'place_id': [' '.join(map(str, ids)) for ids in top_3_place_ids],
})

submission_result.to_csv("submission.csv", index=False)
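
# Expected submission.csv layout (values illustrative): one row per test
# sample, with up to three space-separated place_id predictions.
#
#     row_id,place_id
#     0,8523065625 1757726713 1137537235
#     1,1006316884 6567393236 7440663949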