feat: supporting Facebook competition (don't merge now) #364

Open · wants to merge 15 commits into main
Creating the competition for facebook
xisen-w committed Sep 27, 2024
commit 2fbb496a2dc0b49335901d20a72d1f9146e81958
@@ -0,0 +1,69 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def preprocess_data(df):
    """Preprocess the data with feature engineering."""
    # Convert time to more useful features
    # (assumes `time` is measured in hours; if it is in minutes, divide by 60 first)
    df['hour'] = df['time'] % 24
    df['day'] = (df['time'] // 24) % 7
    df['week'] = df['time'] // (24 * 7)

    # Create a distance-from-center feature
    df['dist_from_center'] = np.sqrt(df['x']**2 + df['y']**2)

    # Discretize accuracy into 5 equal-width bins
    df['accuracy_bins'] = pd.cut(df['accuracy'], bins=5, labels=False)

    # Create interaction features
    df['xy'] = df['x'] * df['y']
    df['x_accuracy'] = df['x'] * df['accuracy']
    df['y_accuracy'] = df['y'] * df['accuracy']

    return df
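
# Worked example (illustrative, assuming `time` is in hours): time = 100
# gives hour = 100 % 24 = 4, day = (100 // 24) % 7 = 4, week = 100 // 168 = 0.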

def preprocess_script():
    """Main preprocessing function."""
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        return X_train, X_valid, y_train, y_valid, X_test, *others

    # Load the data (subsampled for quick iteration)
    train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
    test_df = pd.read_csv("/kaggle/input/test.csv").head(1000)

    # Preprocess the data
    train_df = preprocess_data(train_df)
    test_df = preprocess_data(test_df)

    # Split features and target; drop row_id so train and test share the same columns
    X = train_df.drop(['place_id', 'row_id'], axis=1)
    y = train_df['place_id']

    # Encode place_ids on the full target before splitting, so the validation
    # split cannot contain labels the encoder has never seen
    place_id_encoder = LabelEncoder()
    y = place_id_encoder.fit_transform(y)

    # Split the data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Prepare test data
    X_test = test_df.drop('row_id', axis=1)
    test_row_ids = test_df['row_id']

    # Handle missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
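
# Illustrative usage of the fit/transform contract consumed by the main
# script (assumes a pandas DataFrame `train_df` is already loaded):
#
#     feat = feature_engineering_cls()
#     feat.fit(train_df)
#     transformed = feat.transform(train_df)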
@@ -0,0 +1,79 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Modified model for multi-class classification
class FeatureInteractionModel(nn.Module):
    def __init__(self, num_features, num_classes):
        super(FeatureInteractionModel, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, num_classes)  # Output nodes equal to num_classes
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so applying softmax here would silently distort the gradients.
        return self.fc3(x)


# Training function
def fit(X_train, y_train, X_valid, y_valid):
    num_features = X_train.shape[1]
    num_classes = len(np.unique(y_train))  # Determine number of classes
    model = FeatureInteractionModel(num_features, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Convert to TensorDataset and create DataLoader. np.asarray handles both
    # pandas objects and the numpy arrays that LabelEncoder returns upstream.
    train_dataset = TensorDataset(
        torch.tensor(np.asarray(X_train), dtype=torch.float32),
        torch.tensor(np.asarray(y_train), dtype=torch.long),  # Use long for labels
    )
    valid_dataset = TensorDataset(
        torch.tensor(np.asarray(X_valid), dtype=torch.float32),
        torch.tensor(np.asarray(y_valid), dtype=torch.long),
    )

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)  # reserved for validation

    # Train the model
    model.train()
    for epoch in range(10):
        print(f"Epoch {epoch + 1}/10")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}")

    return model


# Prediction function
def predict(model, X):
    model.eval()
    probabilities = []
    with torch.no_grad():
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False):
            batch = X_tensor[i : i + 32]
            # Softmax is applied here (not in forward) to turn logits into probabilities
            pred = F.softmax(model(batch), dim=1)
            probabilities.append(pred.cpu().numpy())  # Collect probabilities
    return np.vstack(probabilities)  # Return as a 2D array
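
# Sanity-check sketch (illustrative): each row of the returned array should
# be a probability distribution over the k encoded classes.
#
#     proba = predict(model, X_valid)          # shape: (len(X_valid), k)
#     assert np.allclose(proba.sum(axis=1), 1.0, atol=1e-5)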
@@ -0,0 +1,54 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
"""
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)

# Select features (if any feature selection is needed)
X_train_selected = select(X_train)
X_valid_selected = select(X_valid)

# Fit the model
model.fit(X_train_selected, y_train)

# Validate the model
y_valid_pred = model.predict(X_valid_selected)
# accuracy = accuracy_score(y_valid, y_valid_pred)
# print(f"Validation Accuracy: {accuracy:.4f}")

return model


def predict(model, X):
"""
Keep feature selection's consistency and make predictions.
"""
# Select features (if any feature selection is needed)
X_selected = select(X)

# Predict using the trained model
y_pred_prob = model.predict_proba(X_selected)

# Apply threshold to get boolean predictions
return y_pred_prob
@@ -0,0 +1,44 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
# Ignore feature selection logic
return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"""Define and train the model. Merge feature_select"""
X_train = select(X_train)
X_valid = select(X_valid)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
num_classes = len(np.unique(y_train))

# TODO: for quick running....
params = {
"objective": "multi:softprob",
"num_class": num_classes,
"nthred": -1,
}
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)

return bst


def predict(model, X):
"""
Keep feature select's consistency.
"""
X = select(X)
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
return y_pred_prob
@@ -0,0 +1,97 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_map3(y_true, y_pred_proba):
    """Compute Mean Average Precision @ 3 for multi-class classification.

    Each sample has exactly one relevant class, so AP@3 is 1/rank if the true
    class appears at rank 1-3 of the predicted probabilities, and 0 otherwise.
    """
    top3 = np.argsort(-y_pred_proba, axis=1)[:, :3]
    y_true = np.asarray(y_true)
    scores = np.zeros(len(y_true), dtype=float)
    for rank in range(3):
        scores += (top3[:, rank] == y_true) / (rank + 1)
    return scores.mean()
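
# Worked example (illustrative): with 3 classes and two samples whose true
# labels are [2, 0]:
#
#     proba = np.array([[0.2, 0.3, 0.5],   # true class 2 ranked 1st -> 1/1
#                       [0.1, 0.6, 0.3]])  # true class 0 ranked 3rd -> 1/3
#     compute_map3([2, 0], proba)          # (1 + 1/3) / 2 = 0.666...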

def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Only keep transformations that yield a consistent feature count
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)

# 3) Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if df.columns.nlevels == 1:
        return df
    df.columns = ["_".join(col).strip() for col in df.columns.values]
    return df


X_train = flatten_columns(X_train)
X_valid = flatten_columns(X_valid)
X_test = flatten_columns(X_test)

model_l = []  # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
    m = import_module_from_path(f.stem, f)
    # The model modules expose `predict`, not `predict_proba`
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the models on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    y_valid_pred_l.append(predict_func(model, X_valid))

# 5) Ensemble: average the predicted probabilities across models
y_valid_pred_proba = np.mean(y_valid_pred_l, axis=0)

# Compute metrics
map3 = compute_map3(y_valid, y_valid_pred_proba)
print(f"MAP@3 on validation set: {map3}")

# 6) Save the validation metric
pd.Series(data=[map3], index=["MAP@3"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))

y_test_pred_proba = np.mean(y_test_pred_l, axis=0)

# Get the top-3 predictions for each test sample.
# LabelEncoder.inverse_transform expects a 1D array, so flatten and reshape.
top_3_indices = np.argsort(-y_test_pred_proba, axis=1)[:, :3]
top_3_place_ids = place_id_encoder.inverse_transform(top_3_indices.ravel()).reshape(top_3_indices.shape)

# Create the submission DataFrame
submission_result = pd.DataFrame({
    'row_id': test_row_ids,
    'place_id': [' '.join(map(str, ids)) for ids in top_3_place_ids],
})

submission_result.to_csv("submission.csv", index=False)
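
# Expected submission.csv layout (values illustrative): one row per test
# sample, with up to three space-separated place_id predictions.
#
#     row_id,place_id
#     0,8523065625 1757726713 1137537235
#     1,1006316884 6567393236 7440663949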