In [1]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier



# Load datasets

In [2]:
# Read the two raw CSV files into DataFrames. Paths are relative to the
# notebook's working directory.
fraud_data = pd.read_csv(
    filepath_or_buffer='../assets/Data/Fraud_Data.csv',
)
creditcard_data = pd.read_csv(
    filepath_or_buffer='../assets/Data/creditcard.csv',
)


# Data Preparation
# Fraud Data

In [7]:
# Convert the two timestamp columns of `fraud_data` to Unix epoch seconds
# (float64) so they can be fed to numeric preprocessing.
#
# The conversion uses epoch arithmetic (timestamp - 1970-01-01, divided by one
# second) rather than `astype(int) / 10**9`, so the result does not depend on
# the platform's default integer width or on the datetime storage unit.
# NOTE(review): assumes the parsed timestamps are timezone-naive -- confirm.
_UNIX_EPOCH = pd.Timestamp('1970-01-01')
_ONE_SECOND = pd.Timedelta(seconds=1)

for _time_col in ('signup_time', 'purchase_time'):
    # Skip columns that are already numeric: re-running this cell would
    # otherwise re-parse epoch seconds as nanosecond timestamps and silently
    # corrupt the values.
    if not pd.api.types.is_numeric_dtype(fraud_data[_time_col]):
        _parsed = pd.to_datetime(fraud_data[_time_col])
        fraud_data[_time_col] = (_parsed - _UNIX_EPOCH) / _ONE_SECOND

# Features are every column except the label; the label is the 'class' column.
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']


# Credit Card Data

In [8]:
# Separate the label column ('Class') from the remaining feature columns.
y_credit = creditcard_data['Class']
X_credit = creditcard_data.drop('Class', axis=1)


# Train-Test Split

In [9]:
# Hold out 20% of each dataset for testing, with a fixed seed for
# reproducibility. The split is stratified on the label so the train and test
# folds keep the same class proportions; without this, a rare positive class
# can end up over- or under-represented in the test fold.
# NOTE(review): presumes the positive (fraud) class is rare -- confirm.
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)


# Preprocessing pipeline for fraud data


In [15]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder



# Columns of the fraud dataset that are standardized vs. one-hot encoded.
# Columns not listed here are dropped by ColumnTransformer's default
# `remainder='drop'`.
fraud_numeric_features = ['signup_time', 'purchase_time', 'purchase_value', 'age']
fraud_categorical_features = ['source', 'browser', 'sex']

fraud_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), fraud_numeric_features),
        # handle_unknown='ignore': a category that appears only in the test
        # fold is encoded as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), fraud_categorical_features)
    ])



# Preprocessing pipeline for credit card data


In [16]:
# Every feature column of the credit card dataset is treated as numeric.
credit_numeric_features = X_credit.columns

# Single-step transformer: standardize all of the columns selected above.
credit_preprocessor = ColumnTransformer(
    [('num', StandardScaler(), credit_numeric_features)]
)


# Model Selection


In [17]:
# Candidate classifiers, keyed by the display name used for the MLflow runs.
# Every estimator whose training involves randomness gets a fixed
# `random_state` (the same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same metrics. LogisticRegression is left at
# its defaults; its default solver does not use the seed.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "MLP": MLPClassifier(random_state=42)
}


# Model Training and Evaluation


In [18]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit`` and ``predict``
        Fitted in place by this call.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``roc_auc``, each mapped to the corresponding metric value.

    Notes
    -----
    Assumes a binary classification target (the precision/recall/F1 calls use
    their binary defaults, and the positive-class probability is read from
    column 1 of ``predict_proba``).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric and must be computed from continuous scores.
    # Feeding it hard 0/1 predictions collapses the curve to a single
    # threshold and misreports the value.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        # No score output available; fall back to the hard predictions.
        y_score = y_pred

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_score)
    }
    return metrics


# Initialize MLflow


In [None]:
mlflow.set_experiment("Fraud Detection Models")


def _train_and_log(model_name, dataset_name, preprocessor, model,
                   X_train, y_train, X_test, y_test):
    """Train one model on one dataset inside its own MLflow run.

    Builds a preprocessing + model pipeline, fits and scores it with
    ``evaluate_model``, and logs the parameters, metrics and fitted pipeline
    to MLflow.

    ``preprocessor`` and ``model`` are cloned before use, so each run fits
    fresh, independent estimators. Without the clone, the same estimator
    object would sit in both datasets' pipelines and the second fit would
    overwrite the first.

    Returns the fitted pipeline and its metrics dict.
    """
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', clone(model)),
    ])
    with mlflow.start_run(run_name=f"{model_name} - {dataset_name}"):
        metrics = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
        mlflow.log_params({"model": model_name, "dataset": dataset_name})
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(pipeline, "model")
    return pipeline, metrics


for model_name, model in models.items():
    # Fraud Data
    fraud_pipeline, fraud_metrics = _train_and_log(
        model_name, "Fraud Data", fraud_preprocessor, model,
        X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test,
    )

    # Credit Card Data
    credit_pipeline, credit_metrics = _train_and_log(
        model_name, "Credit Card Data", credit_preprocessor, model,
        X_credit_train, y_credit_train, X_credit_test, y_credit_test,
    )

print("Model training and evaluation completed.")

