In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [3]:

# Read the two raw CSV tables (paths are relative to the notebook's working directory)
fraud_data, creditcard_data = (
    pd.read_csv(csv_path) for csv_path in ('Fraud_Data.csv', 'creditcard.csv')
)


In [5]:
# Parse the signup and purchase timestamp columns into pandas datetimes
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])


In [6]:
# Break each timestamp into numeric year / month / day / hour columns and then
# discard the raw datetime columns.
# Note: this cell consumes 'signup_time' and 'purchase_time', so it can only be
# run once per load of fraud_data.
signup_parts = fraud_data['signup_time'].dt
purchase_parts = fraud_data['purchase_time'].dt

fraud_data = fraud_data.assign(
    signup_year=signup_parts.year,
    signup_month=signup_parts.month,
    signup_day=signup_parts.day,
    signup_hour=signup_parts.hour,
    purchase_year=purchase_parts.year,
    purchase_month=purchase_parts.month,
    purchase_day=purchase_parts.day,
    purchase_hour=purchase_parts.hour,
).drop(columns=['signup_time', 'purchase_time'])



In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Columns that get one-hot encoded
categorical_cols = ['device_id', 'source', 'browser', 'sex']

# Columns that get standardised
numeric_cols = [
    'purchase_value', 'age',
    'signup_year', 'signup_month', 'signup_day', 'signup_hour',
    'purchase_year', 'purchase_month', 'purchase_day', 'purchase_hour',
]

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time), standardise the numeric ones, and pass any
# column not listed above through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

# Targets
y_fraud = fraud_data['class']
y_creditcard = creditcard_data['Class']

# Feature matrices: everything except the label column.
# NOTE(review): both transformers are fit on the full tables here, i.e. before
# the train/test split performed in the next cell, so held-out rows contribute
# to the fitted statistics - consider fitting on the training portion only.
fraud_features = fraud_data.drop(columns=['class'])
creditcard_features = creditcard_data.drop(columns=['Class']).values

X_fraud = preprocessor.fit_transform(fraud_features)
X_creditcard = StandardScaler().fit_transform(creditcard_features)


In [8]:
# Train-Test Split
# Hold out 20% of each dataset with a fixed seed. The split is stratified on the
# label so the train and test partitions keep the same class ratio; fraud labels
# are typically heavily imbalanced (assumption - confirm with value_counts()),
# and an unstratified split can leave the test set with a distorted share of
# positive cases, which makes precision/recall/F1 unstable.
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(
    X_creditcard, y_creditcard, test_size=0.2, random_state=42, stratify=y_creditcard
)



In [9]:
# Model Selection
# Both ensembles are stochastic (bootstrap / feature sub-sampling), so they are
# seeded to make the reported metrics reproducible across kernel restarts.
# The seed matches the one used for the train/test split.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}


In [10]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit on the same instance is replaced.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    # Use a single zero-division policy for all ratio metrics. Scoring an
    # undefined ratio as 0 keeps the three numbers coherent: a model that
    # predicts no positives at all now reports precision 0 / recall 0 / f1 0
    # instead of a misleading precision of 1 next to an f1 of 0.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    return acc, precision, recall, f1


In [None]:
# MLflow tracking
import numpy as np

mlflow.set_experiment("Fraud Detection Models")


def _dense_first_row(X):
    """Return the first sample of ``X`` as a dense array of shape (1, n_features).

    Works for both dense arrays and sparse matrices (anything exposing
    ``toarray()``), so it can build an MLflow input example for either dataset.
    """
    first = X[:1]
    if hasattr(first, "toarray"):  # sparse matrix -> dense ndarray
        first = first.toarray()
    return np.asarray(first).reshape(1, -1)


# One entry per dataset: (metric suffix, artifact-name template, data splits).
# The fraud model keeps the plain model name as its artifact path; the
# credit-card model is stored next to it under a distinct name.
evaluation_sets = [
    ("fraud", "{}", (X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test)),
    ("cc", "{} creditcard", (X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test)),
]

with mlflow.start_run():
    for model_name, model in models.items():
        # Create a nested run for each model
        with mlflow.start_run(run_name=model_name, nested=True):
            mlflow.log_param("model", model_name)

            for suffix, artifact_template, (X_train, X_test, y_train, y_test) in evaluation_sets:
                acc, precision, recall, f1 = train_and_evaluate(model, X_train, X_test, y_train, y_test)

                mlflow.log_metric(f"accuracy_{suffix}", acc)
                mlflow.log_metric(f"precision_{suffix}", precision)
                mlflow.log_metric(f"recall_{suffix}", recall)
                mlflow.log_metric(f"f1_{suffix}", f1)

                # Log the model immediately, while it still holds the fit for
                # this dataset. The same estimator instance is refit on the
                # next dataset, so logging only once at the end would store the
                # last fit and pair it with an input example that has a
                # different number of features.
                mlflow.sklearn.log_model(
                    model,
                    artifact_template.format(model_name),
                    input_example=_dense_first_row(X_train),
                )

print("Model training and evaluation complete. Check MLflow for detailed metrics and logs.")


  "inputs": [
    [
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
 