Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
fix: fix some errors in scenario.py, proposal.py and runner.py and several complex competition scenarios (#365)

* fix several bugs in proposal and runner

* fix a bug in feedback-prize-english-language-learning

* fix some bugs and templates

* fix the bug in optiver and nlp problem
  • Loading branch information
WinstonLiyt authored Sep 27, 2024
commit 2e383b175d8448a67cb470f4e3ae8977d8ec6b5b
7 changes: 4 additions & 3 deletions rdagent/log/ui/app.py
Original file line number Diff line number Diff line change
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
st.latex(ft.factor_formulation)

mks = "| Variable | Description |\n| --- | --- |\n"
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)
if isinstance(ft.variables, dict):
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)

elif isinstance(tasks[0], ModelTask):
st.markdown("**Model Tasks🚩**")
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/developer/feedback.py
Original file line number Diff line number Diff line change
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
self.scen.vector_base.save()
elif self.scen.if_using_graph_rag:
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)

return HypothesisFeedback(
observations=observations,
124 changes: 66 additions & 58 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
codes = "\n".join(codes)
return md5_hash(codes)

def extract_model_task_from_code(self, code: str) -> str:
    """Summarize a model implementation file into a textual task description via the LLM.

    Renders the ``extract_model_task_from_code`` system/user prompt pair,
    sends the file content to the chat-completion backend in JSON mode, and
    formats selected fields of the JSON reply (name, description, optional
    formulation/variables, architecture, hyperparameters, model_type) into
    a plain-text description.

    :param code: full source text of the model file to describe.
    :return: formatted task description, or a fixed error string when the
        LLM reply is not valid JSON.
    """
    sys_prompt = (
        Environment(undefined=StrictUndefined)
        .from_string(prompt_dict["extract_model_task_from_code"]["system"])
        .render()
    )

    user_prompt = (
        Environment(undefined=StrictUndefined)
        .from_string(prompt_dict["extract_model_task_from_code"]["user"])
        .render(file_content=code)
    )

    # json_mode=True asks the backend to return a JSON object string.
    model_task_description = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    try:
        response_json_analysis = json.loads(model_task_description)
        # Mandatory keys (name/description/architecture/hyperparameters/
        # model_type) are indexed directly; optional keys (formulation,
        # variables) are appended only when present and truthy.
        task_desc = f"""name: {response_json_analysis['name']}
description: {response_json_analysis['description']}
"""
        task_desc += (
            f"formulation: {response_json_analysis['formulation']}\n"
            if response_json_analysis.get("formulation")
            else ""
        )
        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
        task_desc += (
            f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
            if response_json_analysis.get("variables")
            else ""
        )
        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
    except json.JSONDecodeError:
        # NOTE(review): a missing mandatory key raises KeyError, which is
        # NOT caught here — only malformed JSON falls through to this message.
        task_desc = "Failed to parse LLM's response as JSON"

    return task_desc

def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
"""
For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
feature_shape = org_data.shape[-1]
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))

sub_model_1_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
)
sub_model_2_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
)
model_map = {
"XGBoost": "model_xgboost.py",
"RandomForest": "model_randomforest.py",
"LightGBM": "model_lightgbm.py",
"NN": "model_nn.py",
}

workspace_path = exp.experiment_workspace.workspace_path / "model"

for model_name, model_file in model_map.items():
model_file_path = workspace_path / model_file

exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
if model_file_path.exists():
model_description = (
self.extract_model_task_from_code(model_file_path.read_text())
+ f"""code: {model_file_path.read_text()}"""
)
else:
model_description = ""

exp.experiment_workspace.model_description[model_name] = model_description

if RUNNER_SETTINGS.cache_result:
self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:


class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
def extract_model_task_from_code(self, code: str) -> str:
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
.render()
)

user_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
.render(file_content=code)
)

model_task_description = APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=sys_prompt,
json_mode=True,
)

try:
response_json_analysis = json.loads(model_task_description)
task_desc = f"""name: {response_json_analysis['name']}
description: {response_json_analysis['description']}
"""
task_desc += (
f"formulation: {response_json_analysis['formulation']}\n"
if response_json_analysis.get("formulation")
else ""
)
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
task_desc += (
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
if response_json_analysis.get("variables")
else ""
)
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
except json.JSONDecodeError:
task_desc = "Failed to parse LLM's response as JSON"

return task_desc

def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
implemented_factor_count = 0
for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if implemented_factor_count == 0:
raise FactorEmptyError("No factor is implemented")

# initial template result
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])

if RUNNER_SETTINGS.cache_result:
cache_hit, result = self.get_cache_result(exp)
if cache_hit:
Original file line number Diff line number Diff line change
@@ -36,12 +36,8 @@ def data_cleaner(text):

y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train["full_text"])
X_test = vectorizer.transform(test["full_text"])

X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
X_train = train[["full_text"]]
X_test = test[["full_text"]]

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
@@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
"""
Fit the feature engineering model to the training data.
"""
pass
self.vectorizer = TfidfVectorizer()
self.vectorizer.fit(train_df["full_text"])

def transform(self, X: pd.DataFrame):
    """Vectorize the ``full_text`` column with the already-fitted TF-IDF model.

    Returns a sparse-backed DataFrame wrapping the vectorizer's output matrix.
    """
    tfidf_matrix = self.vectorizer.transform(X["full_text"])
    return pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)


Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
X_train = select(X_train)

xgb_estimator = xgb.XGBRegressor(
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda"
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda"
)

model = MultiOutputRegressor(xgb_estimator, n_jobs=-1)
Original file line number Diff line number Diff line change
@@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
return module


def MCRMSE(y_true, y_pred):
    """Mean columnwise root mean squared error.

    Computes the RMSE of each target column (reducing over axis 0) and
    averages those per-column scores into a single scalar.
    """
    squared_errors = (y_true - y_pred) ** 2
    columnwise_rmse = np.sqrt(np.mean(squared_errors, axis=0))
    return np.mean(columnwise_rmse)


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

@@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path):

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
print(X_train.head())
cls.fit(X_train)
X_train_f = cls.transform(X_train)
X_valid_f = cls.transform(X_valid)
@@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []
for model, predict_func in model_l:
y_valid_pred = predict_func(model, X_valid)
y_valid_pred_l.append(y_valid_pred)
# print(y_valid_pred)
# print(y_valid_pred.shape)

# 5) Ensemble
# Majority vote ensemble
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)

metrics = MCRMSE(y_valid, y_valid_pred)
print(f"MCRMSE on valid set: {metrics}")
metrics_all.append(metrics)

# 6) Save the validation metrics
def MCRMSE(y_true, y_pred):
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))


metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
print(f"MCRMSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
y_test_pred_l.append(predict_func(model, X_test))
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")

# For multiclass classification, use the mode of the predictions
y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)


submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
Original file line number Diff line number Diff line change
@@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
X_te = X_te.loc[:, ~X_te.columns.duplicated()]

# Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse MultiIndex columns into single-level, underscore-joined names.

    e.g. (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b.
    Frames whose columns already have a single level are returned untouched.
    Note: the column index is rewritten on *df* in place.
    """
    if df.columns.nlevels > 1:
        df.columns = ["_".join(level_values).strip() for level_values in df.columns.values]
    return df

X_tr = flatten_columns(X_tr)
X_val = flatten_columns(X_val)
X_te = flatten_columns(X_te)

model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
m = import_module_from_path(f.stem, f)
Original file line number Diff line number Diff line change
@@ -11,24 +11,26 @@

def prepreprocess():
# Load the training data
train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
train_df = pd.read_csv("/kaggle/input/train.csv")

# Load book and trade data
book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")

# Merge book and trade data with train_df
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")

print(merged_df.head())

# Split the data
X = merged_df.drop(["target"], axis=1)
y = merged_df["target"]

print(X.columns.to_list())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.columns.to_list())

return X_train, X_valid, y_train, y_valid


@@ -60,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
    """Apply a fitted preprocessor to ``X`` and rewrap the result as a DataFrame.

    The transformed array is labeled with the concatenated numerical +
    categorical column names and keeps the original row index of ``X``.
    """
    transformed_values = preprocessor.transform(X)
    all_columns = numerical_cols + categorical_cols
    return pd.DataFrame(transformed_values, columns=all_columns, index=X.index)
@@ -79,11 +80,6 @@ def preprocess_script():

X_train, X_valid, y_train, y_valid = prepreprocess()

preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)

X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

submission_df = pd.read_csv("/kaggle/input/test.csv")

ids = submission_df["row_id"]
@@ -94,10 +90,8 @@ def preprocess_script():
if col not in submission_df.columns:
submission_df[col] = 0 # Fill with 0 or another appropriate value

X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

# Handle missing values
for df in [X_train, X_valid, X_test]:
for df in [X_train, X_valid, submission_df]:
df.fillna(df.mean(), inplace=True)

return X_train, X_valid, y_train, y_valid, X_test, ids
return X_train, X_valid, y_train, y_valid, submission_df, ids
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
params = {
"objective": "reg:squarederror", # Use squared error for regression
"nthread": -1,
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
}
num_round = 200
Loading
Oops, something went wrong.