## NullFraud Predictive Modeling

## Load the required libraries

In [25]:
# Install the extra packages only when they are not already available.
# An unconditional install.packages() re-downloads the package on every run
# and fails outright when the session has no network access.
for (pkg in c("aod", "glmnet")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach the libraries used in this analysis (order kept as-is so that
# function masking between packages does not change)
library(tidymodels)
library(tidyverse)
library(dplyr)
library(repr)
library(caret)
library(glmnet)
library(readr)

# Fix the RNG seed so the random steps below (train/test split,
# cross-validation folds) give the same result on every run
set.seed(2005)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



# Cleaning and wrangling the data

In [27]:
# Load the raw transactions from the CSV file
fraud_data <- read_csv("BOLT Data Set.csv")

# Replace the original headers (spaces, parentheses) with snake_case names;
# each argument reads new_name = "original header"
fraud_data <- rename(
  fraud_data,
  Card_Identifier = "Card Identifier",
  Transaction_Date = "Transaction Date",
  Transaction_Time = "Transaction Time",
  Risk_Assessment = "Risk Assessment",
  Payment_Method = "Payment Method",
  Transaction_Value = "Transaction Value",
  Merchant_Location = "Merchant Location",
  Card_Present_Status = "Card Present Status",
  Chip_Usage = "Chip Usage",
  Cross_border_Transaction = "Cross-border Transaction (Yes/No)",
  Acquiring_Institution_ID = "Acquiring Institution ID",
  Merchant_Identifier = "Merchant Identifier",
  Merchant_Category_Code = "Merchant Category Code (MCC)",
  Fraud_Indicator = "Fraud Indicator (Yes/No)"
)
     

[1mRows: [22m[34m100000[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (10): Card Identifier, Transaction Time, Payment Method, Merchant Locat...
[32mdbl[39m   (3): Risk Assessment, Transaction Value, Merchant Category Code (MCC)
[34mdttm[39m  (1): Transaction Date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


The columns are renamed to snake_case names without spaces or parentheses, so that they can be referenced directly in model formulas and tidyverse verbs without quoting, and so that all columns follow one naming convention.

In [28]:
# Columns to be coerced to numeric
numeric_column_names <- c(
  "Risk_Assessment",
  "Transaction_Value"
)

# Columns to be coerced to factor (identifiers, categories, the Yes/No flags
# and the outcome Fraud_Indicator)
factor_column_names <- c(
  "Card_Identifier",
  "Payment_Method",
  "Merchant_Location",
  "Card_Present_Status",
  "Chip_Usage",
  "Cross_border_Transaction",
  "Acquiring_Institution_ID",
  "Merchant_Identifier",
  "Fraud_Indicator"
)

In [29]:
fraud_data <- fraud_data |>
  # Recode the four Yes/No columns: "Yes" -> 1, any other value -> 0, NA stays NA
  mutate(
    across(
      all_of(c(
        "Chip_Usage",
        "Cross_border_Transaction",
        "Fraud_Indicator",
        "Card_Present_Status"
      )),
      \(flag) as.numeric(flag == "Yes")
    )
  ) |>
  # Coerce the listed columns to numeric, then the listed columns to factor
  # (the recoded 0/1 flags are in the factor list, so they become factors)
  mutate(across(all_of(numeric_column_names), as.numeric)) |>
  mutate(across(all_of(factor_column_names), as.factor)) |>
  # Fill missing risk scores with the mean of the observed ones.
  # NOTE(review): the mean is taken over the whole data set, before the
  # train/test split -- confirm this is acceptable for the evaluation.
  mutate(
    Risk_Assessment = replace(
      Risk_Assessment,
      is.na(Risk_Assessment),
      mean(Risk_Assessment, na.rm = TRUE)
    )
  )

# Inspect the resulting column types
glimpse(fraud_data)

Rows: 100,000
Columns: 14
$ Card_Identifier          [3m[90m<fct>[39m[23m card 1, card 2, card 3, card 4, card 5, card …
$ Transaction_Date         [3m[90m<dttm>[39m[23m 2023-05-11, 2023-06-05, 2023-06-05, 2023-07-…
$ Transaction_Time         [3m[90m<chr>[39m[23m "16:22:14.0", "15:16:35.0", "11:57:40.0", "18…
$ Risk_Assessment          [3m[90m<dbl>[39m[23m 362, 602, 482, 947, 1382, 612, 1327, 1037, 49…
$ Payment_Method           [3m[90m<fct>[39m[23m Paypass - Contactless, Online, Unknown, Onlin…
$ Transaction_Value        [3m[90m<dbl>[39m[23m 13.98, 24.64, 15.00, 30.56, 50.85, 75.77, 0.0…
$ Merchant_Location        [3m[90m<fct>[39m[23m USA, USA, USA, USA, USA, USA, USA, USA, USA, …
$ Card_Present_Status      [3m[90m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Chip_Usage               [3m[90m<fct>[39m[23m 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
$ Cross_border_Transaction [3m[90m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1

First, I defined the lists of column names to be converted to numeric and factor data types. Then, I recoded the binary Yes/No variables as 0/1 and converted the listed columns to their respective data types (numeric or factor), so the recoded binary columns end up as factors with levels 0 and 1. Finally, to handle NA values, I imputed the missing values in the "Risk_Assessment" column with the mean of its non-missing values.

## Split the data set into training and testing data 

In [30]:
# Hold out 20% of the rows for testing; sampling is stratified on the
# outcome column so both partitions keep a similar fraud rate
fraud_data_split <- initial_split(
  fraud_data,
  prop = 0.8,
  strata = "Fraud_Indicator"
)

# Extract the two partitions from the split object
fraud_training <- fraud_data_split |> training()
fraud_testing <- fraud_data_split |> testing()

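I split the data into training and testing sets using the initial_split() function; the prop argument sets the proportion of rows assigned to the training set (80% here), and the strata argument stratifies the split on the fraud indicator.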

In [31]:
# Logistic regression on the glmnet engine; penalty and mixture are left as
# tune() placeholders to be filled in by the grid search
log_reg <- logistic_reg(penalty = tune(), mixture = tune()) |>
  set_engine("glmnet")

# Regular grid of candidates: 4 mixture values crossed with 3 penalty values
grid <- grid_regular(
  mixture(),
  penalty(),
  levels = c(mixture = 4, penalty = 3)
)

# Workflow bundling the model formula with the model specification
log_reg_wf <- workflow() |>
  add_formula(Fraud_Indicator ~ Risk_Assessment + Chip_Usage) |>
  add_model(log_reg)

# 5-fold cross-validation resamples drawn from the training set
folds <- vfold_cv(fraud_training, v = 5)

# Evaluate every grid candidate on every fold, keeping the out-of-fold
# predictions
log_reg_tuned <- log_reg_wf |>
  tune_grid(
    resamples = folds,
    grid = grid,
    control = control_grid(save_pred = TRUE)
  )

# Show the candidate with the best ROC AUC
log_reg_tuned |>
  select_best(metric = "roc_auc")

penalty,mixture,.config
<dbl>,<dbl>,<chr>
1,0,Preprocessor1_Model03


We optimized the predictive power of the model by tuning it: the model is fitted many times with different values of the hyperparameters (penalty and mixture) on cross-validation folds, and we select the combination that gives the best predictions as measured by ROC AUC.

In [35]:
# Baseline fit: glmnet logistic regression with penalty = 0 and mixture = 0,
# trained on the training partition
model <- logistic_reg(
  mode = "classification",
  engine = "glmnet",
  penalty = 0,
  mixture = 0
) |>
  fit(Fraud_Indicator ~ Risk_Assessment + Chip_Usage, data = fraud_training)

# Coefficient table of the fitted model
model |> tidy()

term,estimate,penalty
<chr>,<dbl>,<dbl>
(Intercept),-7.177048384,0
Risk_Assessment,0.001059926,0
Chip_Usage1,-0.70084497,0


In [37]:
# Predicted class label for each test row, from the baseline `model`
pred_class <- model |>
  predict(new_data = fraud_testing, type = "class")

# Predicted class probabilities for each test row, from the baseline `model`
pred_proba <- model |>
  predict(new_data = fraud_testing, type = "prob")

In [41]:
# Final model: glmnet logistic regression refitted on the training set with
# penalty = 1 and mixture = 0, the values reported by select_best() above
log_reg_final <- logistic_reg(penalty = 1, mixture = 0) %>%
  set_engine("glmnet") %>%
  set_mode("classification") %>%
  fit(Fraud_Indicator ~ Risk_Assessment + Chip_Usage, data = fraud_training)

# Evaluate the model performance on the testing set.
# Both the class labels and the class probabilities are taken from
# log_reg_final, so every prediction column in `results` describes the same
# model. Reusing a `pred_proba` computed from the earlier baseline `model`
# would pair this model's labels with another model's probabilities.
pred_class <- predict(log_reg_final,
                      new_data = fraud_testing,
                      type = "class")
pred_proba <- predict(log_reg_final,
                      new_data = fraud_testing,
                      type = "prob")

# One row per test transaction: true label, predicted label, probabilities
results <- fraud_testing %>%
  select(Fraud_Indicator) %>%
  bind_cols(pred_class, pred_proba)

# Create confusion matrix
conf_mat(results, truth = Fraud_Indicator,
         estimate = .pred_class)

# Overall accuracy of the predicted classes
accuracy(results, truth = Fraud_Indicator, estimate = .pred_class)

          Truth
Prediction     0     1
         0 19957    43
         1     0     0

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.99785


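Using the best hyperparameters, we trained a logistic regression model and used it to generate predictions on the test set. Then we created a confusion matrix from the true values and the predicted classes, and computed the accuracy. Note that the model predicts "no fraud" for every test transaction, so the 99.8% accuracy only reflects the class imbalance: all 43 fraudulent transactions in the test set are missed.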