In [5]:
import os
import joblib
import json
import pandas as pd
import mlflow
import mlflow.sklearn
import optuna
import matplotlib.pyplot as plt
import optuna.visualization.matplotlib as opt_viz

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.tracking import MlflowClient
from sklearn.model_selection import cross_val_score
mlflow.set_tracking_uri("http://127.0.0.1:5000") 
mlflow.set_experiment("Fraud_Detection_Comparison_v2") 
# ✅ Load dataset
df = pd.read_parquet("datas/fraud_data.parquet").head(1000)

# ✅ Features and label
target = "Class"
X_full = df.drop(columns=[target])
y = df[target]

# ✅ Drop high-cardinality object columns
for col in X_full.select_dtypes(include="object").columns:
    if X_full[col].nunique() > 100:
        print(f"Dropping column: {col} (unique: {X_full[col].nunique()})")
        X_full = X_full.drop(columns=col)

X_full = pd.get_dummies(X_full)
all_features = X_full.columns.tolist()

os.makedirs("artifacts", exist_ok=True)
joblib.dump(all_features, "artifacts/feature_names.pkl")

# ✅ Define feature subsets
half_features = all_features[:len(all_features)//2]
top_features = all_features[10:20]

feature_sets = {
    "all_features": all_features,
    # "half_features": half_features,
    # "top_10_features": top_features
}

# ✅ Define models and hyperparameters
models = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=500),
        "params": {"C": [0.1, 1, 10]}
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [50, 100], "max_depth": [3, 5]}
    },
    "SVC": {
        "model": SVC(),
        "params": {"C": [0.1, 1], "kernel": ["linear", "rbf"]}
    }
}

# ✅ Train and log models
for feature_set_name, feature_set in feature_sets.items():
    X = X_full[feature_set]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
    os.makedirs("datas", exist_ok=True)
    X_test.to_parquet("datas/X_test.parquet")
    y_test.to_frame(name="Class").to_parquet("datas/y_test.parquet", index=False)

    for model_name in models:
        print(f"\n🔍 Running Optuna study for {model_name} with {feature_set_name}...")

        def objective(trial, model_name, X_train, y_train):
            params = {}
            if model_name == "LogisticRegression":
                params["C"] = trial.suggest_categorical("C", [0.1, 1, 10])
                model = LogisticRegression(**params, max_iter=500)
            elif model_name == "RandomForest":
                params["n_estimators"] = trial.suggest_categorical("n_estimators", [50, 100])
                params["max_depth"] = trial.suggest_categorical("max_depth", [3, 5])
                model = RandomForestClassifier(**params)
            elif model_name == "SVC":
                params["C"] = trial.suggest_categorical("C", [0.1, 1])
                params["kernel"] = trial.suggest_categorical("kernel", ["linear", "rbf"])
                model = SVC(**params)

            score = cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()
            return score

        def optuna_objective(trial):
            return objective(trial, model_name, X_train, y_train)

        study = optuna.create_study(direction="maximize")
        study.optimize(optuna_objective, n_trials=20, timeout=300)

        best_params = study.best_params

        # Train best model
        if model_name == "LogisticRegression":
            best_model = LogisticRegression(**best_params, max_iter=500)
        elif model_name == "RandomForest":
            best_model = RandomForestClassifier(**best_params)
        elif model_name == "SVC":
            best_model = SVC(**best_params)

        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        with mlflow.start_run(run_name=f"{model_name}_{feature_set_name}"):
            mlflow.log_param("model", model_name)
            mlflow.log_param("feature_set", feature_set_name)
            mlflow.log_params(best_params)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)

            # ✅ Log model
            artifact_path = f"{model_name}_{feature_set_name}"
            mlflow.sklearn.log_model(best_model, artifact_path=artifact_path)

            client = MlflowClient()
            run_id = mlflow.active_run().info.run_id
            model_uri = f"runs:/{run_id}/{artifact_path}"
            registered_model_name = f"{model_name}_{feature_set_name}_Model"
            mlflow.register_model(model_uri, registered_model_name)

            # ✅ Log Optuna artifacts
            optuna_dir = f"optuna_artifacts/{model_name}_{feature_set_name}"
            os.makedirs(optuna_dir, exist_ok=True)

            # 1. Save summary
            summary_path = os.path.join(optuna_dir, "summary.json")
            with open(summary_path, "w") as f:
                json.dump({
                    "best_params": best_params,
                    "best_value": study.best_value,
                    "best_trial": study.best_trial.number
                }, f, indent=4)

            # 2. Optimization history plot
            plt.figure()
            opt_viz.plot_optimization_history(study)
            plt.title("Optuna Optimization History")
            opt_hist_path = os.path.join(optuna_dir, "opt_history.png")
            plt.savefig(opt_hist_path)
            plt.close()

            # 3. Param importance plot
            plt.figure()
            opt_viz.plot_param_importances(study)
            plt.title("Optuna Param Importance")
            param_imp_path = os.path.join(optuna_dir, "param_importance.png")
            plt.savefig(param_imp_path)
            plt.close()

            # 4. Log to MLflow
            mlflow.log_artifact(summary_path)
            mlflow.log_artifact(opt_hist_path)
            mlflow.log_artifact(param_imp_path)

            print(f"✅ Run logged for {model_name} with {feature_set_name}")
            print("🚀 Run ID:", run_id)


[I 2025-07-29 15:30:40,069] A new study created in memory with name: no-name-aa36bfa2-abe2-4db9-9838-c53b9421c763
[I 2025-07-29 15:30:40,087] Trial 0 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,125] Trial 1 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,155] Trial 2 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,177] Trial 3 finished with value: 0.9987515605493135 and parameters: {'C': 0.1}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,203] Trial 4 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,226] Trial 5 finished with value: 0.9987515605493135 and parameters: {'C': 1}. Best is trial 0 with value: 

Dropping column: TransactionID (unique: 1000)

🔍 Running Optuna study for LogisticRegression with all_features...


[I 2025-07-29 15:30:40,253] Trial 6 finished with value: 0.9987515605493135 and parameters: {'C': 0.1}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,274] Trial 7 finished with value: 0.9987515605493135 and parameters: {'C': 1}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,307] Trial 8 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,336] Trial 9 finished with value: 0.9987515605493135 and parameters: {'C': 1}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,364] Trial 10 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,390] Trial 11 finished with value: 0.9987515605493135 and parameters: {'C': 10}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:40,417] Trial 12 finished with value: 0.9987515605493135 and parameters: 

✅ Run logged for LogisticRegression with all_features
🚀 Run ID: ba43822afefd46dfa1d14a211968677e
🏃 View run LogisticRegression_all_features at: http://127.0.0.1:5000/#/experiments/757674105811204095/runs/ba43822afefd46dfa1d14a211968677e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/757674105811204095

🔍 Running Optuna study for RandomForest with all_features...


[I 2025-07-29 15:30:45,659] Trial 0 finished with value: 0.9987515605493135 and parameters: {'n_estimators': 100, 'max_depth': 3}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:46,198] Trial 1 finished with value: 0.9975031210986267 and parameters: {'n_estimators': 100, 'max_depth': 3}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:46,657] Trial 2 finished with value: 0.9987515605493135 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:46,980] Trial 3 finished with value: 0.9987515605493135 and parameters: {'n_estimators': 50, 'max_depth': 3}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:47,498] Trial 4 finished with value: 0.9987515605493135 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.9987515605493135.
[I 2025-07-29 15:30:47,968] Trial 5 finished with value: 0.9987515605493135 and parameters: {'n_estimators

✅ Run logged for RandomForest with all_features
🚀 Run ID: f5b3659f07ea4d84910d0b9aa91c3455
🏃 View run RandomForest_all_features at: http://127.0.0.1:5000/#/experiments/757674105811204095/runs/f5b3659f07ea4d84910d0b9aa91c3455
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/757674105811204095

🔍 Running Optuna study for SVC with all_features...


[I 2025-07-29 15:30:59,974] Trial 7 finished with value: 0.9987515605493135 and parameters: {'C': 1, 'kernel': 'linear'}. Best is trial 2 with value: 0.9987515605493135.
[I 2025-07-29 15:30:59,995] Trial 8 finished with value: 0.9987515605493135 and parameters: {'C': 1, 'kernel': 'linear'}. Best is trial 2 with value: 0.9987515605493135.
[I 2025-07-29 15:31:00,030] Trial 9 finished with value: 0.9975031210986267 and parameters: {'C': 0.1, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9987515605493135.
[I 2025-07-29 15:31:00,054] Trial 10 finished with value: 0.9987515605493135 and parameters: {'C': 0.1, 'kernel': 'linear'}. Best is trial 2 with value: 0.9987515605493135.
[I 2025-07-29 15:31:00,078] Trial 11 finished with value: 0.9987515605493135 and parameters: {'C': 0.1, 'kernel': 'linear'}. Best is trial 2 with value: 0.9987515605493135.
[I 2025-07-29 15:31:00,102] Trial 12 finished with value: 0.9987515605493135 and parameters: {'C': 0.1, 'kernel': 'linear'}. Best is trial 2 with

✅ Run logged for SVC with all_features
🚀 Run ID: ce50a301706140ceac7a9fec962e658e
🏃 View run SVC_all_features at: http://127.0.0.1:5000/#/experiments/757674105811204095/runs/ce50a301706140ceac7a9fec962e658e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/757674105811204095


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [8]:
# ---------------------- ✅ Deploy Logistic Regression to Arize ----------------------
import pandas as pd
import mlflow
import arize
from arize.pandas.logger import Client
from arize.utils.types import Schema, ModelTypes, Environments

# ✅ Arize credentials
space_id = "U3BhY2U6MjM3MTI6RThBTQ=="                  
api_key = "ak-8c93aa68-e105-4c23-b977-4ffb437fe7a5-rZPuli0UaGIrRAJ3x-OkK1sg_l5e5mFT"

# ✅ Arize model metadata
MODEL_ID = "logistic_fraud_model"
MODEL_VERSION = "v1"

# ✅ MLflow model details
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.get_experiment_by_name("Fraud_Detection_Comparison_v2")
logistic_model_name = "LogisticRegression_all_features"
# Find run ID for this model
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string=f"tags.mlflow.runName = '{logistic_model_name}'",
    order_by=["start_time DESC"]
)


if runs.empty:
    print("❌ No run found for LogisticRegression_all_features")
else:
    run_id = runs.iloc[0]["run_id"]
    print("✅ Deploying run:", run_id)

    # ✅ Load model from MLflow
    model_uri = f"runs:/{run_id}/{logistic_model_name}"
    model = mlflow.sklearn.load_model(model_uri)

    # ✅ Load test data
    X_test = pd.read_parquet("datas/X_test.parquet")
    y_test = pd.read_parquet("datas/y_test.parquet")
    if isinstance(y_test, pd.DataFrame):
        y_test = y_test.squeeze()  # ensure it's a Series

    # ✅ Make predictions
    y_pred = model.predict(X_test)

    # Create dataframe for Arize
    df = X_test.copy()
    df["prediction_id"] = [f"id_{i}" for i in range(len(df))]
    df["prediction"] = y_pred
    df["actual"] = y_test.values
    df = df.reset_index(drop=True)
    # Log to Arize
    arize_client = Client(space_id=space_id, api_key=api_key)
    schema = Schema(
        prediction_id_column_name="prediction_id",
        prediction_label_column_name="prediction",
        actual_label_column_name="actual"
    )
    
   

    response = arize_client.log(
        dataframe=df,
        model_id=MODEL_ID,
        model_version=MODEL_VERSION,
        model_type=ModelTypes.BINARY_CLASSIFICATION,
        environment=Environments.PRODUCTION,
        schema=schema
    )

    print("🚀 Arize upload response:", response)


✅ Deploying run: ba43822afefd46dfa1d14a211968677e


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjoyMjcwNjoyN0V4/spaces/U3BhY2U6MjM3MTI6RThBTQ==/models/modelName/logistic_fraud_model?selectedTab=performance[0m
🚀 Arize upload response: <Response [200]>
