# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    recall_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# === 1. Load dataset ===
# Every column other than the target is used as a model feature.
df = pd.read_csv("stock_data.csv")  # must include 'execute_trade'

target_col = "execute_trade"
features = [col for col in df.columns if col != target_col]
X = df[features]
y = df[target_col]

# === 2. Detect columns ===
# Features are split by dtype so each group gets its own preprocessing.
# "number" matches every numeric width (int32, float32, uint8, ...), not only
# the 64-bit ones, so narrower numeric columns are no longer silently lost.
# Columns that land in neither list (e.g. bool, datetime) are not passed to
# the model, because ColumnTransformer drops unlisted columns by default.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include="number").columns.tolist()

# === 3. Preprocessing ===
# Numeric features are standardized; categorical features are one-hot encoded.
# handle_unknown="ignore" keeps prediction from failing on categories that
# were not present when the encoder was fitted.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# === 4. Models ===
# Candidate classifiers, keyed by the name used in logs and output file names.
models = {
    "LogisticRegression": LogisticRegression(C=1.0, max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42),
    # probability=True enables predict_proba (needed for ROC AUC); the
    # calibration it runs is randomized, so seed it like the other models to
    # keep the comparison reproducible.
    "SVM": SVC(C=1.0, kernel="rbf", probability=True, random_state=42),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
}

# === 5. Cross-validation ===
# Stratified folds keep the class ratio of the target in every split; the
# fixed seed makes the splits identical for every model.
# NOTE(review): if rows are time-ordered market data, shuffled folds can leak
# future information into training — confirm whether TimeSeriesSplit is needed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === 6. Evaluation & selection ===
def _cross_val_predictions(pipeline, X, y, cv):
    """Return out-of-fold (predicted labels, positive-class probabilities).

    A fresh clone of ``pipeline`` is fitted once per fold and used for both
    ``predict`` and ``predict_proba``, so every fold is trained a single time.
    Results are returned in the original row order of ``X``.

    Raises:
        ValueError: if the test folds of ``cv`` do not cover every sample
            exactly once.
    """
    from sklearn.base import clone
    from sklearn.model_selection import check_cv

    def take(data, idx):
        # pandas objects need positional indexing; anything else is treated
        # as an array.
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    # Accepts a splitter object, an int, or None, like cross_val_predict.
    splitter = check_cv(cv, y, classifier=True)
    n_samples = len(y)

    test_parts, pred_parts, proba_parts = [], [], []
    for train_idx, test_idx in splitter.split(X, y):
        fold_model = clone(pipeline)
        fold_model.fit(take(X, train_idx), take(y, train_idx))

        X_test = take(X, test_idx)
        test_parts.append(np.asarray(test_idx))
        pred_parts.append(np.asarray(fold_model.predict(X_test)))
        # Column 1 is the probability of the second (positive) class.
        proba_parts.append(np.asarray(fold_model.predict_proba(X_test))[:, 1])

    test_indices = np.concatenate(test_parts)
    if len(test_indices) != n_samples or len(np.unique(test_indices)) != n_samples:
        raise ValueError("cv must partition the samples: each row in exactly one test fold")

    # Map the fold-ordered predictions back to the original row order.
    inverse = np.empty(n_samples, dtype=int)
    inverse[test_indices] = np.arange(n_samples)
    return np.concatenate(pred_parts)[inverse], np.concatenate(proba_parts)[inverse]


def _save_confusion_matrix(cm, name):
    """Render ``cm`` as a heatmap and save it to ``confusion_<name>.png``."""
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["No Trade", "Trade"], yticklabels=["No Trade", "Trade"], ax=ax)
    ax.set_title(f"{name} - Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(f"confusion_{name}.png")
    # Close explicitly so figures do not accumulate across models.
    plt.close(fig)


def evaluate_and_select_model(models, X, y, preprocessor, cv):
    """Cross-validate every candidate model and return the best one.

    For each entry of ``models`` a pipeline of ``preprocessor`` + classifier
    is evaluated with out-of-fold predictions. Recall, ROC AUC and a
    classification report are printed, and a confusion-matrix image is
    written to ``confusion_<name>.png``.

    Models are ranked by recall first and ROC AUC second; on a full tie the
    model listed first in ``models`` wins.

    Args:
        models: mapping of display name -> unfitted classifier that supports
            ``predict`` and ``predict_proba``.
        X: feature table.
        y: binary target (recall and the plot labels assume two classes).
        preprocessor: transformer placed in front of each classifier.
        cv: cross-validation splitter (or int / None, as scikit-learn accepts).

    Returns:
        ``(name, model)`` of the best candidate. ``model`` is the object from
        ``models``; it is not fitted here because fitting happens on clones.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        raise ValueError("models must contain at least one estimator")

    results = []

    for name, model in models.items():
        print(f"\n🔍 Evaluating: {name}")

        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", model)
        ])

        # One fit per fold yields both the hard labels and the probabilities.
        y_pred, y_proba = _cross_val_predictions(pipeline, X, y, cv)

        recall = recall_score(y, y_pred)
        auc = roc_auc_score(y, y_proba)
        cm = confusion_matrix(y, y_pred)

        print(f"📊 Recall: {recall:.4f}")
        print(f"📈 ROC AUC: {auc:.4f}")
        print("🧾 Classification Report:\n", classification_report(y, y_pred))

        _save_confusion_matrix(cm, name)

        results.append({
            "name": name,
            "model": model,
            "recall": recall,
            "roc_auc": auc
        })

    # Rank models (recall first, then AUC); max keeps the earliest on ties.
    best = max(results, key=lambda r: (r["recall"], r["roc_auc"]))

    print(f"\n🏆 Best Model: {best['name']}")
    print(f"📊 Best Recall: {best['recall']:.4f}, 📈 Best ROC AUC: {best['roc_auc']:.4f}")

    return best['name'], best['model']

# === 7. Run selection ===
# Evaluate every candidate and keep the winner's name and estimator.
best_model_name, best_model = evaluate_and_select_model(
    models=models,
    X=X,
    y=y,
    preprocessor=preprocessor,
    cv=skf,
)

# === 8. Refit on all data & save ===
# The selected estimator is retrained on the full dataset (cross-validation
# only scored it) and the fitted pipeline is persisted with joblib.
output_path = f"best_model_{best_model_name}.pkl"

final_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", best_model),
])
final_pipeline.fit(X, y)

joblib.dump(final_pipeline, output_path)
print(f"\n📦 Final model saved to: {output_path}")