In [None]:
import os
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from mlflow.tracking import MlflowClient

# ✅ Load dataset and split it into label / raw features
df = pd.read_parquet("data/fraud_data.parquet")
target = "Class"
y = df[target]
X_full = df.drop(columns=[target])

# ✅ Prune high-cardinality text columns
# Object-typed columns with more than 100 distinct values are removed before
# one-hot encoding. The distinct-value counts are taken once, up front.
object_cardinality = X_full.select_dtypes(include="object").nunique()
for col, n_unique in object_cardinality.items():
    if n_unique <= 100:
        continue
    print(f"Dropping column: {col} (unique: {n_unique})")
    X_full = X_full.drop(columns=col)

# ✅ Encode and persist the final feature layout
X_full = pd.get_dummies(X_full)  # one-hot encode the remaining categorical columns
all_features = X_full.columns.tolist()
os.makedirs("artifacts", exist_ok=True)  # no error if the directory already exists
joblib.dump(all_features, "artifacts/feature_names.pkl")

# ✅ Define feature subsets
# Both subsets are positional slices of the encoded column list.
half_features = all_features[: len(all_features) // 2]
top_features = all_features[10:20]  # placeholder slice standing in for a ranked "top" list

# Only the full feature list is active; the other subsets are disabled below.
feature_sets = {
    "all_features": all_features,
    # "half_features": half_features,
    # "top_10_features": top_features,
}

# ✅ Define models and hyperparameters
# Each row is (display name, unfitted estimator, grid-search parameter grid).
# The rows are expanded into {name: {"model": ..., "params": ...}} entries,
# in the order they are listed here.
models = {
    name: {"model": estimator, "params": search_grid}
    for name, estimator, search_grid in (
        ("LogisticRegression", LogisticRegression(max_iter=500), {"C": [0.1, 1, 10]}),
        ("RandomForest", RandomForestClassifier(), {"n_estimators": [50, 100], "max_depth": [3, 5]}),
        ("SVC", SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    )
}


# ✅ Run experiments
# The tracking server and the experiment are the same for every run, so they
# are configured once here rather than on every pass of the seed loop.
experiment_name = "Fraud_Detection_Comparison_v1"
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(experiment_name)  # set and get the experiment object
experiment_id = experiment.experiment_id
client = MlflowClient()  # unused in this block; kept in case later code relies on the name

# Each pass repeats the whole comparison with a different train/test split:
# exp_id is used as the random_state of the split.
for exp_id in range(1, 4):
    for feature_set_name, selected_features in feature_sets.items():
        X = X_full[selected_features]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=exp_id
        )

        for model_name, config in models.items():
            model = config["model"]
            param_grid = config["params"]

            # 3-fold grid search on the training split, selecting by accuracy.
            grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy")
            grid.fit(X_train, y_train)

            # Evaluate the refitted best estimator on the held-out split.
            best_model = grid.best_estimator_
            y_pred = best_model.predict(X_test)

            # zero_division=0 returns 0 instead of warning when a score is undefined.
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            artifact_path = f"{model_name}_{feature_set_name}"
            with mlflow.start_run(run_name=artifact_path) as run:
                print(model_name,"printing model name before logging")
                mlflow.log_param("model", model_name)
                mlflow.log_param("feature_set", feature_set_name)
                # Runs for the same model share a run name; the seed tells them apart.
                mlflow.log_param("split_random_state", exp_id)
                mlflow.log_params(grid.best_params_)
                mlflow.log_metric("accuracy", acc)
                mlflow.log_metric("precision", prec)
                mlflow.log_metric("recall", rec)
                mlflow.log_metric("f1_score", f1)  # was computed but never logged
                mlflow.sklearn.log_model(best_model, artifact_path=artifact_path)

                # Register the logged model under the same name as its artifact path.
                run_id = run.info.run_id
                model_uri = f"runs:/{run_id}/{artifact_path}"
                registered_model_name = artifact_path
                mlflow.register_model(model_uri, registered_model_name)
                print(f"✅ Run logged for {model_name} with {feature_set_name} in {experiment_name}")
                print("🚀 Run ID:", run_id)


Dropping column: TransactionID (unique: 10000)




LogisticRegression printing model name before logging


Registered model 'LogisticRegression_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_all_features, version 13
Created version '13' of model 'LogisticRegression_all_features'.


✅ Run logged for LogisticRegression with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: c2dcdd91a02c409d84a4f3f0652db92b
🏃 View run LogisticRegression_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/c2dcdd91a02c409d84a4f3f0652db92b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




RandomForest printing model name before logging


Registered model 'RandomForest_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest_all_features, version 12
Created version '12' of model 'RandomForest_all_features'.


✅ Run logged for RandomForest with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: 72a12d2a3bca47ef9008483deb720c25
🏃 View run RandomForest_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/72a12d2a3bca47ef9008483deb720c25
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




SVC printing model name before logging


Registered model 'SVC_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVC_all_features, version 12
Created version '12' of model 'SVC_all_features'.


✅ Run logged for SVC with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: 4f674079867145998d09df0dfef29d33
🏃 View run SVC_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/4f674079867145998d09df0dfef29d33
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




LogisticRegression printing model name before logging


Registered model 'LogisticRegression_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_all_features, version 14
Created version '14' of model 'LogisticRegression_all_features'.


✅ Run logged for LogisticRegression with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: c8a27a0e5285418e89983e500750801f
🏃 View run LogisticRegression_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/c8a27a0e5285418e89983e500750801f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




RandomForest printing model name before logging


Registered model 'RandomForest_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest_all_features, version 13
Created version '13' of model 'RandomForest_all_features'.


✅ Run logged for RandomForest with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: f34d66d9553947d3a6f8ce25603aee34
🏃 View run RandomForest_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/f34d66d9553947d3a6f8ce25603aee34
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




SVC printing model name before logging


Registered model 'SVC_all_features' already exists. Creating a new version of this model...
2025/07/28 15:23:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVC_all_features, version 13
Created version '13' of model 'SVC_all_features'.


✅ Run logged for SVC with all_features in Fraud_Detection_Comparison_v1
🚀 Run ID: 75c1be8efb504beca310dad08bc81357
🏃 View run SVC_all_features at: http://127.0.0.1:5000/#/experiments/174702888862868240/runs/75c1be8efb504beca310dad08bc81357
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/174702888862868240




LogisticRegression printing model name before logging
