In [None]:
import os
import tempfile

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:

# Read the raw dataset from disk.
# The file should include stock features and a binary 'execute_trade' label.
df = pd.read_csv(filepath_or_buffer="stock_data.csv")


In [None]:
# Name of the label column; every other column in the frame is used as a feature.
target_col = "execute_trade"
features = df.columns[df.columns != target_col].tolist()


In [None]:

# Split the frame into the model inputs (X) and the label to predict (y).
X, y = df[features], df[target_col]

In [None]:
# Model builder
def get_model_pipeline(model_name, trial):
    """Build a ``StandardScaler`` + classifier pipeline from trial-sampled hyperparameters.

    Parameters
    ----------
    model_name : str
        One of ``"xgboost"``, ``"random_forest"``, ``"svm"`` or
        ``"logistic_regression"``.
    trial : object
        Provides ``suggest_int``, ``suggest_float`` and ``suggest_categorical``
        (an Optuna trial in this notebook). Only the suggestions belonging to
        the chosen model are requested from it.

    Returns
    -------
    tuple
        ``(pipeline, params)`` where ``pipeline`` is the unfitted two-step
        pipeline and ``params`` is the dict of sampled values, keyed by the
        estimator's keyword-argument names (not the prefixed trial names).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == "xgboost":
        params = {
            "n_estimators": trial.suggest_int("xgb_n_estimators", 100, 300),
            "max_depth": trial.suggest_int("xgb_max_depth", 3, 10),
            "learning_rate": trial.suggest_float("xgb_learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("xgb_subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("xgb_colsample", 0.5, 1.0),
        }
        # NOTE(review): use_label_encoder is deprecated in recent XGBoost
        # releases -- confirm against the installed version before removing it.
        model = xgb.XGBClassifier(
            **params,
            objective="binary:logistic",
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42,
        )

    elif model_name == "random_forest":
        params = {
            "n_estimators": trial.suggest_int("rf_n_estimators", 50, 200),
            "max_depth": trial.suggest_int("rf_max_depth", 3, 15),
        }
        model = RandomForestClassifier(**params, random_state=42)

    elif model_name == "svm":
        params = {
            "C": trial.suggest_float("svm_C", 0.1, 10.0),
            "kernel": trial.suggest_categorical("svm_kernel", ["linear", "rbf", "poly"]),
        }
        # probability=True makes SVC shuffle data internally to calibrate its
        # probability estimates; seed it like the other randomised models so
        # predict_proba (and the AUC derived from it) is repeatable.
        model = SVC(probability=True, random_state=42, **params)

    elif model_name == "logistic_regression":
        params = {
            "C": trial.suggest_float("lr_C", 0.01, 10.0),
        }
        model = LogisticRegression(**params, max_iter=1000)

    else:
        raise ValueError(f"Unknown model: {model_name}")

    # Every model shares the same preprocessing step so trials stay comparable.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    return pipeline, params

In [None]:
# Cross-validation and evaluation
def _select_rows(data, indices):
    """Return the rows of ``data`` at the integer positions in ``indices``."""
    if hasattr(data, "iloc"):
        # pandas objects: positional selection regardless of the index labels.
        return data.iloc[indices]
    return np.asarray(data)[indices]


def cross_validate_model(pipeline, X, y, cv_folds=5):
    """Score ``pipeline`` with stratified K-fold out-of-fold predictions.

    Each fold fits one fresh clone of ``pipeline`` and takes both the hard
    predictions and the class probabilities from that same fitted model, so
    every fold is trained once and the two outputs are mutually consistent.

    Parameters
    ----------
    pipeline : estimator
        Unfitted classifier exposing ``fit``, ``predict``, ``predict_proba``
        and ``classes_``. It is cloned per fold and never fitted itself.
    X : DataFrame or array-like
        Feature rows.
    y : Series or array-like
        Labels; the second value of the sorted unique labels is treated as
        the positive class for the probability output.
    cv_folds : int, default 5
        Number of stratified folds (shuffled with a fixed seed of 42).

    Returns
    -------
    tuple
        ``(accuracy, f1, auc, y_preds, y_probas)`` where ``y_preds`` and
        ``y_probas`` are out-of-fold arrays aligned with the rows of ``X``.

    Raises
    ------
    ValueError
        If ``y`` contains fewer than two distinct classes.
    """
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    classes = np.unique(y)
    if classes.size < 2:
        raise ValueError("cross_validate_model needs at least two classes in y")
    positive_class = classes[1]

    test_positions, fold_preds, fold_probas = [], [], []
    for train_idx, test_idx in skf.split(X, y):
        fold_model = clone(pipeline)
        fold_model.fit(_select_rows(X, train_idx), _select_rows(y, train_idx))

        X_test = _select_rows(X, test_idx)
        fold_preds.append(np.asarray(fold_model.predict(X_test)))

        proba = np.asarray(fold_model.predict_proba(X_test))
        # Locate the positive-class column by label rather than by position, so a
        # fold whose training split lacks that class yields zeros, not an IndexError.
        column = np.flatnonzero(np.asarray(fold_model.classes_) == positive_class)
        if column.size:
            fold_probas.append(proba[:, column[0]])
        else:
            fold_probas.append(np.zeros(len(test_idx)))

        test_positions.append(np.asarray(test_idx))

    # Folds arrive in split order; restore the original row order of X / y.
    restore_order = np.argsort(np.concatenate(test_positions))
    y_preds = np.concatenate(fold_preds)[restore_order]
    y_probas = np.concatenate(fold_probas)[restore_order]

    acc = accuracy_score(y, y_preds)
    f1 = f1_score(y, y_preds)
    auc = roc_auc_score(y, y_probas)

    return acc, f1, auc, y_preds, y_probas

In [None]:
# Optuna objective
def objective(trial):
    """Optuna objective: evaluate one sampled model and log it to MLflow.

    Opens a nested MLflow run, samples a model family and its hyperparameters
    from ``trial``, cross-validates the resulting pipeline on the notebook's
    ``X`` / ``y``, and logs the parameters, CV metrics, a classification
    report, a diagnostic plot and a model refit on all rows.

    Parameters
    ----------
    trial : optuna trial
        Source of the sampled model name and hyperparameters.

    Returns
    -------
    float
        ``1 - f1`` of the out-of-fold predictions, so the study minimises it.
    """
    with mlflow.start_run(nested=True):
        model_name = trial.suggest_categorical("model_name", ["xgboost", "random_forest", "svm", "logistic_regression"])
        pipeline, params = get_model_pipeline(model_name, trial)

        acc, f1, auc, y_preds, y_probas = cross_validate_model(pipeline, X, y)

        # Log to MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_params(params)
        mlflow.log_metrics({
            "cv_accuracy": acc,
            "cv_f1_score": f1,
            "cv_auc": auc
        })

        # Stage artifacts in a throwaway directory so trials neither overwrite
        # each other's files nor leave them behind in the working directory.
        # The basenames are unchanged, so the logged artifact names are too.
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Save classification report
            report_path = os.path.join(artifact_dir, "cv_classification_report.csv")
            report = classification_report(y, y_preds, output_dict=True)
            pd.DataFrame(report).transpose().to_csv(report_path)
            mlflow.log_artifact(report_path)

            # Save a plot of the first 100 out-of-fold probabilities vs. labels
            plot_path = os.path.join(artifact_dir, "cv_plot.png")
            fig, ax = plt.subplots(figsize=(10, 4))
            try:
                ax.plot(y_probas[:100], label="Predicted Probabilities")
                ax.plot(y.values[:100], label="True Labels")
                ax.set_title(f"{model_name} CV Prediction Sample")
                ax.legend()
                fig.savefig(plot_path)
            finally:
                # Without this, every trial leaves an open figure in memory.
                plt.close(fig)
            mlflow.log_artifact(plot_path)

        # Fit final model on all data and log it
        final_model = clone(pipeline)
        final_model.fit(X, y)
        mlflow.sklearn.log_model(final_model, artifact_path="model")

        return 1 - f1  # Minimize loss


In [None]:

# Set experiment
mlflow.set_experiment("Trade Execution Classifier with CV")

# Run Optuna inside a parent MLflow run; each trial opens its own nested run.
with mlflow.start_run(run_name="optuna_cv_study"):
    # Seed the sampler so the sequence of sampled models and hyperparameters
    # (and therefore the reported best trial) is repeatable across re-runs.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=30)

# Final best
# The objective returns 1 - F1, so convert the best value back to an F1 score.
print("Best model:", study.best_trial.params["model_name"])
print("Best parameters:", study.best_trial.params)
print("Best CV F1 Score:", 1 - study.best_value)