Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Prev Previous commit
change the template
  • Loading branch information
WinstonLiyt committed Sep 27, 2024
commit ded4e4f4336544fb09b9ee1cfdf509e686e9d8e8
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


def preprocess_data(df):
    """Add engineered time, distance, accuracy-bin and interaction columns.

    The frame is modified in place and also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Args:
        df: DataFrame with numeric ``time``, ``x``, ``y`` and ``accuracy``
            columns.

    Returns:
        The same DataFrame object with the columns ``hour``, ``day``,
        ``week``, ``dist_from_center``, ``accuracy_bins``, ``xy``,
        ``x_accuracy`` and ``y_accuracy`` added.
    """
    # Decompose the raw timestamp. The arithmetic treats ``time`` as a count
    # of hours -- NOTE(review): confirm the unit of the source column.
    df["hour"] = df["time"] % 24
    df["day"] = (df["time"] // 24) % 7
    df["week"] = df["time"] // (24 * 7)

    # Euclidean distance of (x, y) from the origin.
    df["dist_from_center"] = np.sqrt(df["x"] ** 2 + df["y"] ** 2)

    # Five equal-width bins over the accuracy range of *this* frame, stored as
    # integer codes. NOTE(review): frames binned separately (e.g. train and
    # test) get different edges -- confirm that is intended.
    df["accuracy_bins"] = pd.cut(df["accuracy"], bins=5, labels=False)

    # Pairwise interaction features.
    df["xy"] = df["x"] * df["y"]
    df["x_accuracy"] = df["x"] * df["accuracy"]
    df["y_accuracy"] = df["y"] * df["accuracy"]

    return df


def preprocess_script():
"""Main preprocessing function."""
if os.path.exists("/kaggle/input/X_train.pkl"):
@@ -46,16 +49,16 @@ def preprocess_script():

# Encode place_ids
place_id_encoder = LabelEncoder()
place_id_encoder.fit(train_df['place_id'])
train_df['place_id'] = place_id_encoder.transform(train_df['place_id'])
place_id_encoder.fit(train_df["place_id"])
train_df["place_id"] = place_id_encoder.transform(train_df["place_id"])

# Split features and target for training data
X = train_df.drop(['place_id'], axis=1)
y = train_df['place_id']
X = train_df.drop(["place_id"], axis=1)
y = train_df["place_id"]

# Prepare test data
test_row_ids = test_df['row_id']
X_test = test_df.drop(['row_id'], axis=1)
test_row_ids = test_df["row_id"]
X_test = test_df.drop(["row_id"], axis=1)

# Ensure X_test has the same columns as X
for col in X.columns:
@@ -80,4 +83,4 @@ def preprocess_script():
# Count the number of unique classes
n_classes = len(place_id_encoder.classes_)

return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
Original file line number Diff line number Diff line change
@@ -10,88 +10,15 @@
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Return the feature frame shared by ``fit`` and ``predict``.

    Every column is currently kept, so the input is handed back untouched
    (same object, no copy). Feature-selection logic can be added here later.
    """
    selected = X
    return selected


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a 10-tree Random Forest and report its validation accuracy.

    Both frames pass through ``select`` first so training and validation see
    the same feature subset. The accuracy on the validation split is printed;
    the fitted classifier is returned.
    """
    train_features = select(X_train)
    valid_features = select(X_valid)

    forest = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
    forest.fit(train_features, y_train)

    # Hold-out check on the validation split.
    accuracy = accuracy_score(y_valid, forest.predict(valid_features))
    print(f"Validation Accuracy: {accuracy:.4f}")

    return forest


def predict(model, X):
    """Predict with a trained model on the same feature subset used in ``fit``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature frame; passed through ``select`` before prediction.

    Returns:
        The model's predictions reshaped to a single column, i.e. shape
        ``(n_samples, 1)``.
    """
    # Keep feature selection consistent with training.
    X_selected = select(X)

    y_pred = model.predict(X_selected)

    # reshape must be *called*: the bare attribute handed callers a bound
    # method instead of the predictions.
    return y_pred.reshape(-1, 1)


"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Return the feature frame shared by ``fit`` and ``predict``.

    Every column is currently kept, so the input is handed back untouched
    (same object, no copy). Feature-selection logic can be added here later.
    """
    selected = X
    return selected


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a 200-tree Random Forest and report its validation accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

    # Route both splits through select() so training matches what predict()
    # feeds the model later.
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit exactly once. A second fit on the unselected frame used to follow
    # the validation step; it doubled training time and bypassed select().
    model.fit(X_train_selected, y_train)

    # Hold-out check on the validation split.
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model

@@ -100,10 +27,7 @@ def predict(model, X):
"""
Keep feature selection's consistency and make predictions.
"""
# Select features (if any feature selection is needed)
X_selected = select(X)

# Predict using the trained model
y_pred = model.predict(X_selected)
y_pred = model.predict(X)

return y_pred.reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -3,14 +3,9 @@
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

def select(X: pd.DataFrame) -> pd.DataFrame:
    """Return the features unchanged; no feature selection is applied."""
    selected = X
    return selected

def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
"""Define and train the model. Merge feature_select"""
X_train = select(X_train)
X_valid = select(X_valid)

# Combine train and valid labels to get all unique labels
all_labels = np.unique(np.concatenate([y_train, y_valid]))
@@ -44,13 +39,13 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali

return bst


def predict(model, X):
    """Run the trained booster on ``X`` using the same features as ``fit``.

    Args:
        model: Trained xgboost model exposing ``predict`` on a ``DMatrix``.
        X: Feature frame; passed through ``select`` before prediction.

    Returns:
        The raw output of ``model.predict``. Presumably per-class
        probabilities -- this depends on the objective configured in ``fit``,
        which is not visible here. Mapping the output back to the original
        labels is left to the caller.
    """
    # Keep feature selection consistent with training.
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
    return y_pred_prob
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Select features and flatten multi-level column labels.

    All features are currently kept. A frame with single-level columns is
    returned untouched. For multi-level columns, each label tuple is joined
    with ``"_"`` (every level converted with ``str``) and assigned back to
    ``X.columns`` in place.

    Args:
        X: Feature frame. Callers that must keep their frame unmodified
            should pass a copy.

    Returns:
        The same DataFrame object, with flat column labels.
    """
    if X.columns.nlevels == 1:
        return X
    # Stringify each level separately. str(col) on the whole tuple followed by
    # join interleaved "_" between every character of its repr.
    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Select features and flatten multi-level column labels.

    All features are currently kept. A frame with single-level columns is
    returned untouched. For multi-level columns, each label tuple is joined
    with ``"_"`` (every level converted with ``str``) and assigned back to
    ``X.columns`` in place.

    Args:
        X: Feature frame. Callers that must keep their frame unmodified
            should pass a copy.

    Returns:
        The same DataFrame object, with flat column labels.
    """
    if X.columns.nlevels == 1:
        return X
    # Stringify each level separately. str(col) on the whole tuple followed by
    # join interleaved "_" between every character of its repr.
    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return X
Original file line number Diff line number Diff line change
@@ -13,16 +13,19 @@
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_map3(y_true, y_pred):
    """Score predictions with micro-averaged average precision.

    NOTE(review): despite the name, this does not compute Mean Average
    Precision @ 3. It delegates to ``average_precision_score`` with
    ``average="micro"``. Confirm which metric is intended before relying on
    the value as a MAP@3 proxy.

    Args:
        y_true: Ground-truth targets, in whatever form
            ``average_precision_score`` accepts.
        y_pred: Predicted scores for the same samples.

    Returns:
        The value returned by ``average_precision_score``.
    """
    return average_precision_score(y_true, y_pred, average="micro")


def import_module_from_path(module_name, module_path):
    """Load and execute a Python source file as a module.

    The module is built from a file-location spec and executed, but it is not
    registered in ``sys.modules``.

    Args:
        module_name: Name given to the resulting module object.
        module_path: Path to the source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be determined for
            ``module_path`` (for example, an unsupported file suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when no loader matches the path.
    # Fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes = preprocess_script()

@@ -47,29 +50,24 @@ def import_module_from_path(module_name, module_path):
X_test = pd.concat(X_test_l, axis=1)

# 3) Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse multi-level column labels into single ``"_"``-joined strings.

    A frame with single-level columns is returned untouched. Otherwise the
    new labels are assigned to ``df.columns`` in place.

    Args:
        df: Frame whose columns may be a MultiIndex.

    Returns:
        The same DataFrame object, with flat column labels.
    """
    if df.columns.nlevels == 1:
        return df
    # Convert every level with str so non-string levels (e.g. ints) do not
    # make str.join raise TypeError.
    df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
    return df

X_train = flatten_columns(X_train)
X_valid = flatten_columns(X_valid)
X_test = flatten_columns(X_test)

# Train every model found under model/ with the features chosen by its
# sibling select module, and record its validation log loss.
model_l = []  # list[tuple[model, predict_func, validation_score]]
for f in DIRNAME.glob("model/model*.py"):
    # The select module sits next to the model file under the same name with
    # "model" replaced by "select".
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    # select() may rewrite columns in place, so hand it copies.
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)

    # Look only at fit's declared positional parameters; co_varnames alone
    # also lists local variables and could match a local named n_classes.
    fit_params = m.fit.__code__.co_varnames[: m.fit.__code__.co_argcount]
    # Train on the selected frames; they used to be computed and then ignored.
    if "n_classes" in fit_params:
        model = m.fit(X_train_selected, y_train, X_valid_selected, y_valid, n_classes)
    else:
        model = m.fit(X_train_selected, y_train, X_valid_selected, y_valid)

    # Evaluate the model on the validation set
    y_valid_pred = m.predict(model, X_valid_selected)
    validation_score = log_loss(y_valid, y_valid_pred)

    # Bind this iteration's select/predict as defaults so later callers get
    # the same feature selection the model was trained with.
    def predict_selected(model, X, _select=select_m.select, _predict=m.predict):
        """Apply this model's feature selection, then predict."""
        return _predict(model, _select(X.copy()))

    model_l.append((model, predict_selected, validation_score))

# Sort models by validation score (lower is better for log loss)
@@ -96,9 +94,8 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
top_3_place_ids = place_id_encoder.inverse_transform(top_3_indices)

# Create submission DataFrame
submission_result = pd.DataFrame({
'row_id': test_row_ids,
'place_id': [' '.join(map(str, ids)) for ids in top_3_place_ids]
})
submission_result = pd.DataFrame(
{"row_id": test_row_ids, "place_id": [" ".join(map(str, ids)) for ids in top_3_place_ids]}
)

submission_result.to_csv("submission.csv", index=False)