Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Previous commit
Next commit
Further revisions
  • Loading branch information
xisen-w committed Sep 27, 2024
commit f2d9eac5e48fe2ea6a646d2df58382f591a93561
Original file line number Diff line number Diff line change
@@ -44,42 +44,40 @@ def preprocess_script():
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Encode place_ids first
# Encode place_ids
place_id_encoder = LabelEncoder()
train_df['place_id'] = place_id_encoder.fit_transform(train_df['place_id'])
place_id_encoder.fit(train_df['place_id'])
train_df['place_id'] = place_id_encoder.transform(train_df['place_id'])

# Split features and target
# Split features and target for training data
X = train_df.drop(['place_id'], axis=1)
y = train_df['place_id']

# Count occurrences of each place_id
place_id_counts = y.value_counts()

# Identify place_ids with only one occurrence
single_occurrence_place_ids = place_id_counts[place_id_counts == 1].index
# Prepare test data
test_row_ids = test_df['row_id']
X_test = test_df.drop(['row_id'], axis=1)

# Split the data, ensuring single-occurrence place_ids are in the training set
mask = y.isin(single_occurrence_place_ids)
X_train_single = X[mask]
y_train_single = y[mask]
X_remaining = X[~mask]
y_remaining = y[~mask]
# Ensure X_test has the same columns as X
for col in X.columns:
if col not in X_test.columns:
X_test[col] = 0 # or some other appropriate default value

# Split the remaining data
X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_remaining, y_remaining, test_size=0.2, random_state=42, stratify=y_remaining)
X_test = X_test[X.columns] # Reorder columns to match X

# Combine the single-occurrence samples with the rest of the training data
X_train = pd.concat([X_train_single, X_train_rest])
y_train = pd.concat([y_train_single, y_train_rest])

# Prepare test data
X_test = test_df.drop('row_id', axis=1)
test_row_ids = test_df['row_id']
# Attempt stratified split, fall back to random split if necessary
try:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
except ValueError:
print("Warning: Stratified split not possible. Falling back to random split.")
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids
# Count the number of unique classes
n_classes = len(place_id_encoder.classes_)

return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes

This file was deleted.

Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ def import_module_from_path(module_name, module_path):
return module

# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids = preprocess_script()
X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
@@ -60,7 +60,12 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
m = import_module_from_path(f.stem, f)
model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict_proba))
# Check if the fit function accepts n_classes
if 'n_classes' in m.fit.__code__.co_varnames:
model = m.fit(X_train, y_train, X_valid, y_valid, n_classes)
else:
model = m.fit(X_train, y_train, X_valid, y_valid)
model_l.append((model, m.predict))

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
2 changes: 1 addition & 1 deletion test/utils/test_kaggle.py
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ def test_competition_template(self):
ws.execute()
success = (ws.workspace_path / "submission.csv").exists()
self.assertTrue(success, "submission.csv is not generated")
ws.clear()
# ws.clear()


if __name__ == "__main__":
Loading
Oops, something went wrong.