In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # Faster alternative to Gradient Boosting
from sklearn.neural_network import MLPClassifier  # For MLP
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from joblib import Parallel, delayed

In [2]:
# Read both raw CSV files and keep a reproducible 10% sample of each
# so the rest of the notebook runs quickly while testing.
SAMPLE_OPTIONS = {'frac': 0.1, 'random_state': 42}

fraud_data = pd.read_csv('Fraud_Data.csv').sample(**SAMPLE_OPTIONS)
creditcard_data = pd.read_csv('creditcard.csv').sample(**SAMPLE_OPTIONS)

In [3]:
# Parse the two timestamp columns into pandas datetimes
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])

In [4]:
# Expand each timestamp into year / month / day / hour integer columns
# (signup_* first, then purchase_*), then discard the raw timestamps.
for prefix in ('signup', 'purchase'):
    timestamps = fraud_data[f'{prefix}_time']
    for part in ('year', 'month', 'day', 'hour'):
        fraud_data[f'{prefix}_{part}'] = getattr(timestamps.dt, part)

# The raw datetime columns are no longer needed once the parts are extracted
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

In [6]:
# Preprocessing imports used by this cell
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# NOTE(review): this silences every warning for the rest of the session,
# including any convergence warnings raised while fitting models.
warnings.filterwarnings('ignore')

# Categorical features to one-hot encode ('device_id' is included on purpose)
categorical_cols = ['device_id', 'source', 'browser', 'sex']

# Numerical features to standardise
numerical_cols = [
    'purchase_value', 'age',
    'signup_year', 'signup_month', 'signup_day', 'signup_hour',
    'purchase_year', 'purchase_month', 'purchase_day', 'purchase_hour',
]

# One-hot encode categoricals (categories unseen at fit time are ignored at
# transform time) and scale numericals.
# NOTE(review): remainder='passthrough' forwards every column not listed above
# unchanged — confirm no identifier-like columns reach the models this way.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
numerical_step = ('num', StandardScaler(), numerical_cols)

preprocessor = ColumnTransformer(
    transformers=[categorical_step, numerical_step],
    remainder='passthrough',
)

In [7]:
# Separate the targets from the feature columns
y_fraud = fraud_data['class']
y_creditcard = creditcard_data['Class']

fraud_features = fraud_data.drop(columns=['class'])
creditcard_features = creditcard_data.drop(columns=['Class'])

# Build the model matrices: the column transformer for the fraud data,
# plain standard scaling for the credit-card data.
# NOTE(review): both transformers are fitted on the full data set here, before
# the train/test split — consider fitting them on the training rows only.
X_fraud = preprocessor.fit_transform(fraud_features)
creditcard_scaler = StandardScaler()
X_creditcard = creditcard_scaler.fit_transform(creditcard_features.values)

In [8]:
# Train-Test Split
# Hold out 10% of each data set for testing. The split is stratified on the
# target so the (rare) positive class keeps the same proportion in the train
# and test partitions; without it a small test split can end up with very few
# or no positive rows, which makes precision/recall/F1 meaningless.
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, test_size=0.1, random_state=42, stratify=y_fraud
)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(
    X_creditcard, y_creditcard, test_size=0.1, random_state=42, stratify=y_creditcard
)


In [9]:
# Model Selection
# Candidate classifiers keyed by display name. Every estimator with a random
# component is seeded with 42, matching the sampling and split steps, so that
# a fresh "Restart & Run All" reproduces the same metrics.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),  # Reduce number of trees
    'XGBoost': XGBClassifier(random_state=42),  # Faster alternative to Gradient Boosting
    'MLP': MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42),  # Simpler MLP
}

In [10]:
# Function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Use zero_division=0 for all three metrics so an undefined score (e.g. the
    # model predicts no positives at all) is reported as 0 rather than as a
    # perfect precision of 1 next to a recall/F1 of 0.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    return acc, precision, recall, f1

In [11]:
# Function to train and log models
def train_and_log_model(model_name, model, X_train, X_test, y_train, y_test):
    """Train ``model``, then record its metrics and the fitted model in MLflow.

    Opens an MLflow run with ``nested=True``, fits and scores the model via
    ``train_and_evaluate``, logs the model name as a parameter, logs the four
    metrics, and logs the fitted model together with a one-row input example.

    Parameters
    ----------
    model_name : str
        Logged as the ``model`` parameter and used as the model's artifact name.
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    None
    """
    with mlflow.start_run(nested=True):
        # Train and evaluate on the supplied split
        acc, precision, recall, f1 = train_and_evaluate(model, X_train, X_test, y_train, y_test)

        # Metric keys keep their "_fraud" suffix so existing MLflow runs stay comparable
        mlflow.log_param("model", model_name)
        mlflow.log_metric("accuracy_fraud", acc)
        mlflow.log_metric("precision_fraud", precision)
        mlflow.log_metric("recall_fraud", recall)
        mlflow.log_metric("f1_fraud", f1)

        # Build a dense one-row input example. Only sparse inputs need (and
        # have) .todense(); dense inputs are used as they are, so the function
        # no longer fails when given a plain NumPy array.
        first_row = X_train[:1]
        if hasattr(first_row, "todense"):
            first_row = first_row.todense()
        input_example = np.array(first_row).reshape(1, -1)

        # Log model with input example
        mlflow.sklearn.log_model(model, model_name, input_example=input_example)

In [None]:

# Select the MLflow experiment that the runs below are recorded under
EXPERIMENT_NAME = "Fraud Detection Models"
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Train, evaluate, and log models
# The models are trained one after another inside the parent run. Dispatching
# them through joblib worker processes does not work here: a worker cannot see
# the parent run opened in this process (so nested=True has nothing to nest
# under), and the estimators fitted in a worker never come back to the
# notebook's `models` dict.
with mlflow.start_run():
    for model_name, model in models.items():
        train_and_log_model(model_name, model, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test)

print("Model training and evaluation complete. Check MLflow for detailed metrics and logs.")