# Importing Libraries

In [2]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.2 (from mlflow)
  Downloading mlflow_skinny-2.20.2-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.2->mlflow)
  Downloading databricks_sdk-0.44.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [3]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

## Load the data

In [4]:
df_fraud = pd.read_csv('/content/fraud_data_merged.csv')

In [5]:
# drop signup_time, purchase_time
df_fraud.drop(columns=['signup_time', 'purchase_time'], inplace=True)
# drop country
df_fraud.drop(columns=['country'], inplace=True)

In [6]:
# drop na for fraud data
df_fraud.dropna(inplace=True)

In [9]:
# set target feature
X_fraud = df_fraud.drop(columns=['class'])
y_fraud = df_fraud['class']

In [10]:
# Split the data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

In [12]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_fraud_res, y_train_fraud_res = smote.fit_resample(X_train_fraud, y_train_fraud)

# Start MLflow experiment
mlflow.set_experiment("E-Commerce Fraud Detection")


2025/02/13 14:12:58 INFO mlflow.tracking.fluent: Experiment with name 'E-Commerce Fraud Detection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/539113993253351231', creation_time=1739455978769, experiment_id='539113993253351231', last_update_time=1739455978769, lifecycle_stage='active', name='E-Commerce Fraud Detection', tags={}>

# MLflow

In [13]:
# Start MLflow experiment
mlflow.set_experiment("E-Commerce Fraud Detection")

with mlflow.start_run():
    # Log parameters
    mlflow.log_param("dataset", "Fraud_Data")
    mlflow.log_param("model_type", "Random Forest")
    mlflow.log_param("class_imbalance_handling", "SMOTE")

    # Train the model
    rf_model_fraud = RandomForestClassifier(random_state=42)
    rf_model_fraud.fit(X_train_fraud_res, y_train_fraud_res)

    # Evaluate the model
    y_pred_fraud = rf_model_fraud.predict(X_test_fraud)
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    roc_auc_fraud = roc_auc_score(y_test_fraud, y_pred_fraud)


In [15]:
# Log metrics
mlflow.log_metric("precision", report_fraud["1"]["precision"])
mlflow.log_metric("recall", report_fraud["1"]["recall"])
mlflow.log_metric("f1_score", report_fraud["1"]["f1-score"])
mlflow.log_metric("roc_auc", roc_auc_fraud)

# Log the model
mlflow.sklearn.log_model(rf_model_fraud, "random_forest_fraud_data_model")
print("Fraud Data Model Logged to MLflow!")



Fraud Data Model Logged to MLflow!


In [18]:
# download the model as a pkl
import joblib
joblib.dump(rf_model_fraud, 'models/random_forest_fraud_data_model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'models/random_forest_fraud_data_model.pkl'

In [None]:
#### to view mlflow experiment use bash and
# mlflow ui

# Versioning Models

In [16]:
import mlflow.sklearn
# Load the logged model for fraud data
model_fraud = mlflow.sklearn.load_model("runs:/<RUN_ID>/random_forest_fraud_data_model")

MlflowException: Invalid value "<RUN_ID>" for parameter 'run_id' supplied.

# Model Explainability

In [17]:
pip install shap lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=aacfa4fe8daec8e09226c8709053c501c0e53ce8212fe8feedf9a6cc41d763bc
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


## Explaining the Random Forest Model with SHAP

In [None]:
import shap

# Load the trained Random Forest model (replace with your model)
rf_model_fraud = joblib.load("models/random_forest_fraud_data_model.pkl")

# Initialize SHAP explainer for fraud data
explainer_fraud = shap.TreeExplainer(rf_model_fraud)
shap_values_fraud = explainer_fraud.shap_values(X_test_fraud)

In [None]:
# Summary plot for fraud data
shap.summary_plot(shap_values_fraud, X_test_fraud, plot_type="bar")

In [None]:
# The force plot visualizes the contribution of features for a single prediction
# Force plot for fraud data (first instance in the test set)
shap.force_plot(explainer_fraud.expected_value[1], shap_values_fraud[1][0, :], X_test_fraud.iloc[0, :])

In [None]:
# The dependence plot shows the relationship between a feature and the model's output.
# Dependence plot for fraud data (e.g., feature 'purchase_value')
shap.dependence_plot("purchase_value", shap_values_fraud[1], X_test_fraud)

## Using LIME for Explainability

In [None]:
import lime
import lime.lime_tabular

# Initialize LIME explainer for fraud data
explainer_fraud_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_fraud.values,
    feature_names=X_train_fraud.columns,
    class_names=['Non-Fraud', 'Fraud'],
    mode='classification'
)

In [None]:
# Explain a single prediction for fraud data (first instance in the test set)
exp_fraud = explainer_fraud_lime.explain_instance(X_test_fraud.iloc[0, :], rf_model_fraud.predict_proba, num_features=10)
exp_fraud.show_in_notebook()