### 🕵️‍♂️ Comprehensive Guide to Fraud Detection System Workflow

This summary encapsulates the workflow of a fraud detection system, detailing the steps from data loading and preprocessing to model training, evaluation, and outlier detection. It emphasizes the systematic approach to identifying fraud using machine learning techniques and advanced analytics, highlighting the integration of Random Forest and Local Outlier Factor (LOF) models for refined fraud detection and analysis.


This code outlines a process for loading, preprocessing, splitting, and evaluating data for a fraud detection system. It begins by loading data from a specified file path and preprocessing it, which includes encoding categorical columns. The data is then split based on the month, with earlier months used for training and later months for testing. A Random Forest Classifier is trained on the training set. The model's performance is evaluated by predicting fraud probabilities, selecting a threshold for classification based on a desired false positive rate, and calculating recall. Additionally, Local Outlier Factor (LOF) is applied to non-fraud cases to identify outliers. The process includes logging for each major step, saving the trained Random Forest and LOF models, and visualizing results through ROC curves, confusion matrices, LOF scores, and a classification report.

### 📦 Install Packages

In [2]:
!pip install loguru



### ⚙️ Config

In [3]:
# config.py
# Central settings for the fraud-detection workflow. The cells below read
# their file paths and model parameters from this dictionary.
config = {
    # CSV file read by load_and_preprocess_data().
    "FILE_PATH": "/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant I.csv",
    # Log file and rotation setting handed to logger.add().
    "LOG_FILE": "my_log.log",
    "LOG_ROTATION": "10 MB",
    # Seed given to RandomForestClassifier.
    "RANDOM_STATE": 42,
    # Arguments given to LocalOutlierFactor.
    "N_NEIGHBORS": 20,
    "CONTAMINATION": "auto",
}


### 📚 Import Libraries

In [4]:
from loguru import logger
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

import pickle
#from config import config  # Make sure to import config correctly

# Add a loguru sink that writes to the configured log file, using the
# configured rotation setting.
logger.add(config["LOG_FILE"], rotation=config["LOG_ROTATION"])  # Use config dictionary values


1

### 📊 Utils Plots

In [5]:
def plot_roc_auc(y_test, predictions):
    """Plot and display the ROC curve of *predictions* against *y_test*.

    The area under the curve is shown in the legend. Any exception raised
    while computing or drawing the curve is logged and suppressed, so the
    function always returns ``None``.

    Args:
        y_test: True labels, passed to ``roc_curve``.
        predictions: Scores for the positive class, passed to ``roc_curve``.
    """
    try:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, predictions)
        area = auc(false_pos_rate, true_pos_rate)

        plt.figure()
        plt.plot(false_pos_rate, true_pos_rate, label='RF AUC = %0.2f' % area)
        # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
        plt.plot([0, 1], [0, 1], 'k--')

        # Axis limits; the y-axis runs slightly past 1.0.
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])

        plt.title('Receiver operating characteristic')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.show()

        logger.info("ROC/AUC plot generated successfully.")
    except Exception as err:
        logger.error(f"Error generating ROC/AUC plot: {err}")

def plot_confusion_matrix(y_test, preds_binary):
    """Display the confusion matrix of *preds_binary* versus *y_test* as a heatmap.

    Any exception raised while computing or drawing the matrix is logged and
    suppressed, so the function always returns ``None``.

    Args:
        y_test: True labels.
        preds_binary: Predicted labels, already binarised.
    """
    try:
        matrix = confusion_matrix(y_test, preds_binary)

        plt.figure(figsize=(5, 5))
        heatmap_options = dict(
            annot=True,  # write the count inside each cell
            fmt="d",     # counts are rendered as integers
            linewidths=.5,
            cmap='Blues',
            square=True,
        )
        sns.heatmap(matrix, **heatmap_options)

        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

        logger.info("Confusion matrix plotted successfully.")
    except Exception as err:
        logger.error(f"Error plotting confusion matrix: {err}")

def plot_lof_scores(non_fraud_data, lof_scores):
    """Scatter-plot *lof_scores* against the index of *non_fraud_data*.

    Any exception raised while drawing is logged and suppressed, so the
    function always returns ``None``.

    Args:
        non_fraud_data: Object whose ``.index`` supplies the x-coordinates.
        lof_scores: One score per entry of ``non_fraud_data.index``; used as
            the y-coordinates.
    """
    try:
        plt.figure()
        x_values = non_fraud_data.index
        plt.scatter(x_values, lof_scores, c='blue', label='LOF Score')

        plt.title('LOF Scores for Non-Fraud Cases')
        plt.xlabel('Sample Index')
        plt.ylabel('LOF Score')
        plt.legend()
        plt.show()

        logger.info("LOF scores scatter plot generated successfully.")
    except Exception as err:
        logger.error(f"Error generating LOF scores scatter plot: {err}")
        


### 🕵️‍♂️ Fraud Detection System

In [None]:
#def load_and_preprocess_data(file_path, nrows=2000):
def load_and_preprocess_data(file_path, nrows=None):
    """Load the CSV at *file_path* and label-encode its categorical columns.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        nrows: Optional number of rows to read from the start of the file.
            ``None`` (the default) reads the whole file. Previously this
            argument was accepted but silently ignored; it is now forwarded
            to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame. Each column named in ``categorical_cols`` that
        is present in the file is replaced by its ``LabelEncoder`` codes.

    Raises:
        Exception: Any error raised while reading or encoding is logged and
            then re-raised unchanged.
    """
    categorical_cols = ['payment_type', 'source', 'device_os', 'employment_status', 'housing_status']
    try:
        data = pd.read_csv(file_path, nrows=nrows)
        logger.info("Data loaded successfully.")  # Use logger instead of logging

        # Columns missing from the file are skipped rather than treated as an
        # error. A fresh encoder is fitted per column and then discarded.
        for col in categorical_cols:
            if col in data.columns:
                data[col] = LabelEncoder().fit_transform(data[col])
        logger.info("Categorical columns encoded.")
        return data
    except Exception as e:
        logger.error(f"Error loading or preprocessing data: {e}")
        raise
        
def split_data_by_month(data, split_month=6, random_state=None):
    """Split *data* into shuffled train and test sets by its ``month`` column.

    Rows with ``month < split_month`` form the training set and rows with
    ``month >= split_month`` form the test set. Each set is shuffled
    independently (sampling without replacement at ``frac=1``).

    Args:
        data: DataFrame containing a ``month`` column.
        split_month: First month that belongs to the test set. Defaults to 6.
        random_state: Seed forwarded to ``DataFrame.sample`` for both
            shuffles. ``None`` (the default) gives a different row order on
            every call; pass an int for a reproducible order.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.

    Raises:
        KeyError: If *data* has no ``month`` column.
    """
    month = data["month"]
    # Both comparisons are written out (instead of negating one mask) so that
    # rows with a missing month stay out of both sets.
    train_data = data[month < split_month].sample(
        frac=1, replace=False, random_state=random_state
    )
    test_data = data[month >= split_month].sample(
        frac=1, replace=False, random_state=random_state
    )
    return train_data, test_data


def evaluate_model(rf_model, X_test, y_test, max_fpr=0.05):
    """Binarise the model's fraud scores at a threshold chosen by false-positive rate.

    The ROC curve of the positive-class probabilities is computed, and the
    operating point with the largest false-positive rate strictly below
    *max_fpr* is selected. If several ROC points share that rate, the first
    one (highest threshold) is used.

    Args:
        rf_model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels for *X_test*.
        max_fpr: Exclusive upper bound on the false-positive rate of the
            selected operating point. Defaults to 0.05 (5%).

    Returns:
        A ``(preds_binary, recall)`` tuple: the 0/1 predictions at the
        selected threshold and the true-positive rate at that ROC point.

    Raises:
        ValueError: If no ROC point has a false-positive rate below *max_fpr*.
    """
    # Probability of the positive class (column 1 of predict_proba).
    predictions = rf_model.predict_proba(X_test)[:, 1]

    fprs, tprs, thresholds = metrics.roc_curve(y_test, predictions)

    below_limit = fprs < max_fpr
    if not below_limit.any():
        raise ValueError(f"No ROC point has a false-positive rate below {max_fpr}")

    # argmax on a boolean array returns the first True position, i.e. the
    # first ROC point that attains the largest admissible FPR.
    target_fpr = fprs[below_limit].max()
    selected = int(np.argmax(fprs == target_fpr))
    threshold = thresholds[selected]
    recall = tprs[selected]

    # roc_curve counts a sample as positive when score >= threshold, so the
    # comparison here must be inclusive as well. A strict '>' would drop the
    # samples sitting exactly on the threshold and make the predictions
    # disagree with the recall reported below.
    preds_binary = (predictions >= threshold).astype(int)

    logger.info(f"Selected Threshold: {threshold}, Recall at {max_fpr:.0%} FPR: {recall}")

    # Further steps for Aequitas or other analyses would go here

    return preds_binary, recall

def main():
    """Run the fraud-detection workflow end to end.

    Steps: load and encode the data, split it by month, train a Random
    Forest, choose a decision threshold with ``evaluate_model``, fit a Local
    Outlier Factor on the test rows the forest predicts as non-fraud, plot
    the results, log the classification report, and pickle both models.

    Any exception is logged (with traceback) and suppressed, so the function
    returns ``None`` whether or not the workflow succeeded.
    """
    try:
        data = load_and_preprocess_data(config["FILE_PATH"])  # Access FILE_PATH using config dictionary

        # Split data by month instead of using a random train/test split.
        train_data, test_data = split_data_by_month(data)

        # Separate features and labels for training and testing sets
        X_train = train_data.drop('fraud_bool', axis=1)
        y_train = train_data['fraud_bool']
        X_test = test_data.drop('fraud_bool', axis=1)
        y_test = test_data['fraud_bool']

        rf_model = RandomForestClassifier(random_state=config["RANDOM_STATE"])
        rf_model.fit(X_train, y_train)
        logger.info("Random Forest model trained.")

        # The recall is already logged inside evaluate_model.
        preds_binary, _ = evaluate_model(rf_model, X_test, y_test)
        logger.info("Evaluation complete.")

        rf_predictions = rf_model.predict(X_test)
        predictions_proba = rf_model.predict_proba(X_test)[:, 1]

        # "Non-fraud" here means rows the forest's own predict() labels 0,
        # not rows below the threshold chosen by evaluate_model.
        non_fraud_indices = np.where(rf_predictions == 0)[0]
        non_fraud_data = X_test.iloc[non_fraud_indices]

        lof_model = LocalOutlierFactor(n_neighbors=config["N_NEIGHBORS"], contamination=config["CONTAMINATION"])
        lof_model.fit_predict(non_fraud_data)
        # negative_outlier_factor_ is negated so that larger means more anomalous.
        lof_scores = -lof_model.negative_outlier_factor_
        logger.info("LOF model applied to non-fraud cases.")

        # The plot helpers each create their own figure and take no axes
        # argument. The former `ax=axs[...]` keywords referenced an undefined
        # name and aborted the run before the models were saved.
        plot_roc_auc(y_test, predictions_proba)
        plot_confusion_matrix(y_test, preds_binary)
        plot_lof_scores(non_fraud_data, lof_scores)

        classification_text = "Classification Report:\n" + str(classification_report(y_test, preds_binary))
        # Inside a function a bare expression displays nothing, so log the report.
        logger.info(classification_text)

        with open('random_forest_model.pkl', 'wb') as file:
            pickle.dump(rf_model, file)
        logger.info("Random Forest model saved.")

        with open('lof_model_config.pkl', 'wb') as file:
            pickle.dump(lof_model, file)
        logger.info("LOF model configuration saved.")

        logger.info("Processing complete.")
    except Exception as e:
        # Best-effort top-level boundary: record the failure, do not re-raise.
        logger.exception(f"Error in main processing: {e}")

# Run the workflow only when this module is the program entry point.
if __name__ == "__main__":
    main()


2024-02-22 04:38:38.371 | INFO     | __main__:load_and_preprocess_data:5 - Data loaded successfully.
2024-02-22 04:38:39.885 | INFO     | __main__:load_and_preprocess_data:10 - Categorical columns encoded.
