feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into base: main
Fixing XGBoost
xisen-w committed Sep 27, 2024
commit 7e21f126c2c22b8e277b59f7a55f4d98d635aef9
@@ -34,8 +34,8 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
-   # accuracy = accuracy_score(y_valid, y_valid_pred)
-   # print(f"Validation Accuracy: {accuracy:.4f}")
+   accuracy = accuracy_score(y_valid, y_valid_pred)
+   print(f"Validation Accuracy: {accuracy:.4f}")

    return model

@@ -48,7 +48,62 @@ def predict(model, X):
    X_selected = select(X)

    # Predict using the trained model
-   y_pred_prob = model.predict_proba(X_selected)
+   y_pred = model.predict(X_selected)

-   # Apply threshold to get boolean predictions
-   return y_pred_prob
+   return y_pred.reshape(-1, 1)


"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection consistent and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict using the trained model
    y_pred = model.predict(X_selected)

    return y_pred.reshape(-1, 1)
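
For context, here is a minimal usage sketch of the Random Forest module above. The synthetic dataset, column names, and split below are illustrative assumptions, not part of this PR:

# Hypothetical usage of the fit/predict interface defined above.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=32)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
y = pd.Series(y)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=32)
model = fit(X_train, y_train, X_valid, y_valid)  # prints "Validation Accuracy: ..."
y_pred = predict(model, X_valid)                 # shape (n_samples, 1), per the reshape above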
@@ -1,38 +1,48 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


-def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Define and train the model. Merge feature_select"""
    X_train = select(X_train)
    X_valid = select(X_valid)
-   dtrain = xgb.DMatrix(X_train, label=y_train)
-   dvalid = xgb.DMatrix(X_valid, label=y_valid)
-   num_classes = len(np.unique(y_train))

+   # TODO: for quick running....
+   # Combine train and valid labels to get all unique labels
+   all_labels = np.unique(np.concatenate([y_train, y_valid]))
+   le = LabelEncoder().fit(all_labels)

+   # Encode labels
+   y_train_encoded = le.transform(y_train)
+   y_valid_encoded = le.transform(y_valid)

+   dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
+   dvalid = xgb.DMatrix(X_valid, label=y_valid_encoded)
+   num_classes = len(le.classes_)

    params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
-       "nthred": -1,
        "max_depth": 6,
        "eta": 0.3,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
+       "nthread": -1,
    }
    num_round = 100

    evallist = [(dtrain, "train"), (dvalid, "eval")]
-   bst = xgb.train(params, dtrain, num_round, evallist)
+   bst = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10)

-   return bst
+   # Store the LabelEncoder in the model for later use in prediction
+   bst.le = le

+   return bst

def predict(model, X):
"""
@@ -41,4 +51,6 @@ def predict(model, X):
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
-   return y_pred_prob
+   # Convert probabilities back to original labels if needed
+   # y_pred_labels = model.le.inverse_transform(y_pred_prob.argmax(axis=1))
+   return y_pred_prob
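
Following the commented hint above, a hedged sketch of how the encoder stored on the booster could map the softprob output back to the original labels. Here `bst` is the Booster returned by fit() and `X_test` is a hypothetical feature DataFrame, neither defined in this diff:

# Assumes bst carries the fitted LabelEncoder as bst.le (set in fit above).
y_pred_prob = predict(bst, X_test)                         # shape (n_samples, num_classes)
encoded_classes = y_pred_prob.argmax(axis=1)               # most probable encoded class per row
y_pred_labels = bst.le.inverse_transform(encoded_classes)  # back to the original label values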