## NullFraud Predictive Modeling

## Loaded the required library

In [1]:
library(tidymodels)
library(tidyverse)
library(readxl)
library(dplyr)
library(repr)

# Setting a seed so our analysis is random but reproducible
set.seed(2005)

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrecipes     [39m 1.0.8
[32m✔[39m [34mdials       [39m 1.2.0     [32m✔[39m [34mrsample     [39m 1.2.0
[32m✔[39m [34mdplyr       [39m 1.1.3     [32m✔[39m [34mtibble      [39m 3.2.1
[32m✔[39m [34mggplot2     [39m 3.5.0     [32m✔[39m [34mtidyr       [39m 1.3.0
[32m✔[39m [34minfer       [39m 1.0.5     [32m✔[39m [34mtune        [39m 1.1.2
[32m✔[39m [34mmodeldata   [39m 1.2.0     [32m✔[39m [34mworkflows   [39m 1.1.3
[32m✔[39m [34mparsnip     [39m 1.1.1     [32m✔[39m [34mworkflowsets[39m 1.0.1
[32m✔[39m [34mpurrr       [39m 1.0.2     [32m✔[39m [34myardstick   [39m 1.2.0

“package ‘ggplot2’ was built under R version 4.3.2”
── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32mdiscard()[39m masks [34mscales[39m::disca

## Cleaning and wrangling the data

In [2]:
# Read the data
fraud_data <- read_excel("BOLT Data Set.xlsx")

#Rename the column

fraud_data <- fraud_data |>
rename("Card_Identifier" = "Card Identifier", "Transaction_Date" = "Transaction Date", "Transaction_Time" = "Transaction Time",
       "Risk_Assessment" = "Risk Assessment", "Payment_Method" = "Payment Method", "Transaction_Value" = "Transaction Value",
       "Merchant_Location" = "Merchant Location", "Card_Present_Status" = "Card Present Status", 
       "Chip_Usage" = "Chip Usage" , "Cross_border_Transaction" = "Cross-border Transaction (Yes/No)",
        "Acquiring_Institution_ID" = "Acquiring Institution ID", "Merchant_Identifier" = "Merchant Identifier", 
       "Merchant_Category_Code" = "Merchant Category Code (MCC)", "Fraud_Indicator" = "Fraud Indicator (Yes/No)") 


In [3]:
# Converting to correct data types
numeric_column_names <- c("Risk_Assessment", "Transaction_Value")
factor_column_names <- c("Card_Identifier", "Payment_Method", "Merchant_Location", "Card_Present_Status", "Chip_Usage",
                       "Cross_border_Transaction", "Acquiring_Institution_ID", "Merchant_Identifier", 
                        "Fraud_Indicator")
fraud_data <- fraud_data |>
    mutate(across(all_of(numeric_column_names), as.numeric)) |>
    mutate(across(all_of(factor_column_names), as_factor)) 

fraud_data$Chip_Usage <- ifelse(fraud_data$Chip_Usage == "Yes", 1, 0)
fraud_data$Cross_border_Transaction <- ifelse(fraud_data$Cross_border_Transaction == "Yes", 1, 0)
fraud_data$Fraud_Indicator <- ifelse(fraud_data$Fraud_Indicator == "Yes", 1, 0)
fraud_data$Card_Present_Status <- ifelse(fraud_data$Card_Present_Status == "Yes", 1, 0)
    
glimpse(fraud_data)

Rows: 100,000
Columns: 14
$ Card_Identifier          [3m[90m<fct>[39m[23m card 1, card 2, card 3, card 4, card 5, card …
$ Transaction_Date         [3m[90m<dttm>[39m[23m 2023-05-11, 2023-06-05, 2023-06-05, 2023-07-…
$ Transaction_Time         [3m[90m<chr>[39m[23m "16:22:14.0", "15:16:35.0", "11:57:40.0", "18…
$ Risk_Assessment          [3m[90m<dbl>[39m[23m 362, 602, 482, 947, 1382, 612, 1327, 1037, 49…
$ Payment_Method           [3m[90m<fct>[39m[23m Paypass - Contactless, Online, Unknown, Onlin…
$ Transaction_Value        [3m[90m<dbl>[39m[23m 13.98, 24.64, 15.00, 30.56, 50.85, 75.77, 0.0…
$ Merchant_Location        [3m[90m<fct>[39m[23m USA, USA, USA, USA, USA, USA, USA, USA, USA, …
$ Card_Present_Status      [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Chip_Usage               [3m[90m<dbl>[39m[23m 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
$ Cross_border_Transaction [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [4]:
#Split the data into training and testing set
fraud_data_split <- fraud_data |>
    initial_split(prop = 0.75, strata = "Fraud_Indicator")

fraud_training <- training(fraud_data_split)
fraud_testing <- testing(fraud_data_split)


In [5]:
# Installing libraries
install.packages("aod")
#install.packages("glmnet")
install.packages("glmnet", repos = "https://cran.us.r-project.org")

“installation of package ‘aod’ had non-zero exit status”
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

“unable to access index for repository https://cran.us.r-project.org/src/contrib:
  cannot open URL 'https://cran.us.r-project.org/src/contrib/PACKAGES'”
“package ‘glmnet’ is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”


In [6]:
library(caret)
library(glmnet)

Loading required package: lattice


Attaching package: ‘caret’


The following objects are masked from ‘package:yardstick’:

    precision, recall, sensitivity, specificity


The following object is masked from ‘package:purrr’:

    lift




ERROR: Error in library(glmnet): there is no package called ‘glmnet’


## Making predictions based on the testing data

In [None]:
# Train a logistic regression model
model <- logistic_reg(mixture = double(1), penalty = double(1)) |>
  set_engine("glmnet") |>
  set_mode("classification") |>
  fit(Fraud_Indicator ~ ., data = fraud_training)

# Model summary
tidy(model)

# Class Predictions
pred_class <- predict(model,
                      new_data = fraud_testing,
                      type = "class")

# Class Probabilities
pred_proba <- predict(model,
                      new_data = fraud_testing,
                      type = "prob")