Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Previous commit
Next commit
Debugging
  • Loading branch information
xisen-w committed Sep 26, 2024
commit 2bc36f98988511f80599d8e21cc736889e8f0a5a
Original file line number Diff line number Diff line change
@@ -44,26 +44,42 @@ def preprocess_script():
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Encode place_ids first
place_id_encoder = LabelEncoder()
train_df['place_id'] = place_id_encoder.fit_transform(train_df['place_id'])

# Split features and target
X = train_df.drop(['place_id'], axis=1)
y = train_df['place_id']

# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Count occurrences of each place_id
place_id_counts = y.value_counts()

# Identify place_ids with only one occurrence
single_occurrence_place_ids = place_id_counts[place_id_counts == 1].index

# Split the data, ensuring single-occurrence place_ids are in the training set
mask = y.isin(single_occurrence_place_ids)
X_train_single = X[mask]
y_train_single = y[mask]
X_remaining = X[~mask]
y_remaining = y[~mask]

# Split the remaining data
X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_remaining, y_remaining, test_size=0.2, random_state=42, stratify=y_remaining)

# Combine the single-occurrence samples with the rest of the training data
X_train = pd.concat([X_train_single, X_train_rest])
y_train = pd.concat([y_train_single, y_train_rest])

# Prepare test data
X_test = test_df.drop('row_id', axis=1)
test_row_ids = test_df['row_id']

# Encode place_ids
place_id_encoder = LabelEncoder()
y_train = place_id_encoder.fit_transform(y_train)
y_valid = place_id_encoder.transform(y_valid)

# Handle missing values
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids
return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids