[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/umatter/EDFB/blob/main/notebooks/R/EDFB_Digital_Finance_%26_Banking_Logistic_Regression_R.ipynb)

# EDFB - Digital Finance & Banking - Logistic Regression

---

This notebook demonstrates how to train and evaluate a logistic regression classifier in R, using tidyverse and modern R best practices. The workflow and explanations closely follow the Python version, but use idiomatic R code and packages. The dataset used is `banking.csv` (same as in the Python notebook).

Tested on: R 4.3+ (Colab VM), packages from Posit Package Manager.

In [None]:
# Install and load required libraries (self-bootstrapping with Posit Package Manager).
# The repository URL points at prebuilt packages for Ubuntu "jammy".
options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/jammy/latest"))
required_packages <- c(
  "tidyverse", "janitor", "skimr", "caret", "recipes", "ggplot2",
  "forcats", "pROC", "broom", "readr", "knitr", "kableExtra"
)

# Install only what cannot be loaded. requireNamespace() tests each package
# directly (without attaching it), so a broken installation is reinstalled too,
# and the full installed.packages() scan is avoided.
is_available <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
new_packages <- required_packages[!is_available]
if (length(new_packages) > 0) {
  install.packages(new_packages, dependencies = TRUE, quiet = TRUE)
}

# Attach every package and fail fast with a clear message if one is missing.
# library() signals an error (require() would only return FALSE); invisible()
# keeps the list returned by lapply() out of the cell output.
invisible(lapply(required_packages, function(pkg) {
  tryCatch(
    suppressWarnings(library(pkg, character.only = TRUE, quietly = TRUE)),
    error = function(e) {
      stop(
        paste("Failed to load package:", pkg), " (", conditionMessage(e), ")",
        call. = FALSE
      )
    }
  )
}))

# Use a clean default ggplot2 theme for all plots and record the R version.
theme_set(theme_minimal())
cat("\nR version:", R.version.string, "\n")

## Data Import

We load `data/banking.csv` from the repo if available; otherwise download it from GitHub (umatter/EDFB). This keeps the R notebook aligned with the Python version and runnable in Colab.

In [None]:
# Load the banking data set into `banking`, downloading it first when no local
# copy exists at data/banking.csv.
data_path <- "data/banking.csv"
dir.create("data", showWarnings = FALSE, recursive = TRUE)

if (file.exists(data_path)) {
  message("Loading data from ", data_path)
} else {
  url <- "https://raw.githubusercontent.com/umatter/EDFB/main/data/banking.csv"
  # Download into a temporary file next to the target and move it into place
  # only on success. A failed or interrupted download therefore never leaves a
  # partial data/banking.csv that a later run would load as if it were valid.
  tmp_path <- tempfile(pattern = "banking_", tmpdir = dirname(data_path), fileext = ".csv")
  tryCatch({
    message("Attempting to download banking.csv from ", url)
    status <- utils::download.file(url, destfile = tmp_path, mode = "wb", quiet = TRUE)
    if (status != 0) {
      stop("download.file() returned status ", status)
    }
    if (!file.rename(tmp_path, data_path)) {
      stop("could not move the downloaded file to ", data_path)
    }
    message("Downloaded to ", data_path)
  }, error = function(e) {
    unlink(tmp_path) # remove any partial download
    stop("Could not obtain banking.csv from repo. Please add data/banking.csv and re-run. Error: ", conditionMessage(e))
  })
}

# Single read path for both cases; clean_names() normalises the column names.
banking <- readr::read_csv(data_path, show_col_types = FALSE) %>% janitor::clean_names()

In [None]:
# Quick overview: column types and first values, then per-column summaries
dplyr::glimpse(banking)
skimr::skim(banking)

## Data Structure

Let's check the types of variables and identify which are categorical and which are numeric.

In [None]:
# Convert character columns to factors (categorical)
banking <- banking %>%
  mutate(across(where(is.character), as.factor))

# Identify predictor variable types. The target `y` is removed from BOTH lists,
# whatever type it was read as: later cells compare it with 0/1, so if it is
# numeric it would otherwise be treated as a numeric predictor in the summaries
# and boxplots below.
num_vars <- banking %>% select(where(is.numeric)) %>% names()
cat_vars <- banking %>% select(where(is.factor)) %>% names()
num_vars <- setdiff(num_vars, "y") # y is the target
cat_vars <- setdiff(cat_vars, "y") # y is the target

In [None]:
# Predictor names grouped by type
setNames(list(num_vars, cat_vars), c("numeric", "categorical"))

## Missing Values

Check for missing values in the dataset.

In [None]:
# Count the NA entries in every column (result: a one-row tibble)
banking %>%
  summarise(across(everything(), function(col) sum(is.na(col))))

## Descriptive Statistics

Get summary statistics for the numeric variables.

In [None]:
# Base-R summary() of the columns listed in num_vars
summary(select(banking, all_of(num_vars)))

## Boxplots for Numeric Variables

Visualize the distribution and dispersion of numeric variables using boxplots. We'll standardize the variables for better comparison.

In [None]:
# Standardize each numeric column (centre, then divide by its standard
# deviation), reshape to long format, and draw one horizontal box per variable.
banking %>%
  transmute(across(all_of(num_vars), function(col) as.vector(scale(col)))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(mapping = aes(y = fct_rev(variable), x = value)) +
  geom_boxplot(alpha = 0.7, fill = "skyblue") +
  labs(x = "", y = "", title = "Standardized Boxplots of Numeric Variables")

## Remove Variables Not Used for Modeling

As in the Python notebook, we remove the variables that are not used for modeling: `duration`, `pdays`, `age`, `campaign`, and `previous`.

In [None]:
# Columns excluded from modeling. Keeping the list in one place guarantees the
# data frame and num_vars stay in sync.
drop_vars <- c("duration", "pdays", "age", "campaign", "previous")

# any_of() ignores columns that are already gone, so this cell can be re-run
# without error (same approach as the later correlated-column drop).
banking <- banking %>%
  select(-any_of(drop_vars))
num_vars <- setdiff(num_vars, drop_vars)

## Target Variable Distribution

Visualize the distribution of the target variable `y`.

In [None]:
# Bar chart of the number of rows per target class
ggplot(banking, aes(x = factor(y))) +
  geom_bar(fill = "steelblue") +
  labs(x = "y", y = "Count", title = "Distribution of Target Variable y")

## Address Class Imbalance (Undersampling)

The dataset is imbalanced. We'll undersample the majority class (y=0) to have a 2:1 ratio with the minority class (y=1), as in the Python notebook.

In [None]:
# Undersample the majority class (y == 0) to at most twice the size of the
# minority class (y == 1), then shuffle the combined rows.
set.seed(42) # fixed seed so the sampled subset is reproducible
data_1 <- banking %>% filter(y == 1)
data_0 <- banking %>% filter(y == 0)
n_1 <- nrow(data_1)

# Never request more rows than the majority class has; without this guard a
# data set with fewer than 2 * n_1 majority rows would make sampling fail.
n_0_keep <- min(2 * n_1, nrow(data_0))

# slice_sample() replaces the superseded sample_n() and matches the shuffle below.
data_0_small <- data_0 %>% slice_sample(n = n_0_keep)
banking_bal <- bind_rows(data_1, data_0_small) %>%
  mutate(y = factor(y)) %>% # the target becomes a factor from here on
  slice_sample(prop = 1) # shuffle

In [None]:
# Check new target distribution: one bar per class, height = number of rows
ggplot(count(banking_bal, y), aes(x = y, y = n, fill = y)) +
  geom_col() +
  labs(y = "Count", title = "Target Distribution After Downsampling")

## Boxplots by Target

Visualize the distribution of numeric variables by target class.

In [None]:
# Long format (one row per observation and numeric variable), then one panel
# per variable with a box for each target class. Free scales let every panel
# use its own axis range.
banking_bal %>%
  select(all_of(num_vars), y) %>%
  pivot_longer(cols = !y, names_to = "variable", values_to = "value") %>%
  ggplot(mapping = aes(x = y, y = value, fill = y)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(vars(variable), ncol = 2, scales = "free") +
  labs(x = "y", y = "", title = "Numeric Variable Distributions by Target")

## Categorical Variable Distributions by Target

Visualize the distribution of categorical variables by target class.

In [None]:
# One stacked, 100%-normalised bar chart per categorical predictor showing the
# share of each target class within every level of that predictor.
for (cat_var in cat_vars) {
  # Counts per (level, class), then the class share within each level
  class_shares <- banking_bal %>%
    group_by(.data[[cat_var]], y) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(.data[[cat_var]]) %>%
    mutate(prop = n / sum(n))

  share_plot <- ggplot(class_shares, aes(x = .data[[cat_var]], y = prop, fill = y)) +
    geom_col(position = "fill") +
    labs(title = paste("Target Distribution by", cat_var), x = cat_var, y = "Proportion") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Inside a loop a ggplot object must be printed explicitly to be drawn
  print(share_plot)
}

## Data Preprocessing: Dummy Variables & Standardization

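We'll use the `recipes` package to create dummy variables for the categorical predictors and then standardize all numeric predictors. Because standardization runs after the dummy encoding, the dummy columns are centered and scaled as well.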

In [None]:
# Preprocessing recipe, steps applied in this order:
#   1. dummy-encode every nominal predictor, keeping one column per level
#      (one_hot = TRUE)
#   2. centre every numeric predictor (this now includes the dummy columns)
#   3. scale every numeric predictor
# NOTE(review): full one-hot encoding makes the dummy columns of each factor
# sum to a constant, which is collinear with a model intercept - confirm this
# is intended for alignment with the Python notebook.
rec <- recipe(y ~ ., data = banking_bal)
rec <- step_dummy(rec, all_nominal_predictors(), one_hot = TRUE)
rec <- step_center(rec, all_numeric_predictors())
rec <- step_scale(rec, all_numeric_predictors())

# Estimate the step parameters on banking_bal, then return that same data
# after processing (new_data = NULL).
prep_rec <- prep(rec)
banking_proc <- bake(prep_rec, new_data = NULL)

In [None]:
# Check processed data: column names, types and first values
dplyr::glimpse(banking_proc)

## Correlation Analysis

Check for highly correlated predictors and remove them if necessary.

In [None]:
# Pairwise correlation matrix of all numeric columns in the processed data
cor_mat <- banking_proc %>%
  dplyr::select(where(is.numeric)) %>%
  cor()

# Show the 10 variable pairs with the largest absolute correlation.
# The matrix is symmetric, so each pair appears twice (a-b and b-a). Keeping
# only rows with var1 < var2 drops the diagonal and one copy of every pair, so
# the table lists 10 distinct pairs rather than 5 pairs shown twice.
cor_df <- cor_mat %>%
  as.data.frame() %>%
  tibble::rownames_to_column("var1") %>%
  tidyr::pivot_longer(-var1, names_to = "var2", values_to = "correlation") %>%
  dplyr::filter(var1 < var2) %>%
  dplyr::mutate(abs_corr = abs(correlation)) %>%
  dplyr::arrange(dplyr::desc(abs_corr)) %>%
  dplyr::slice_head(n = 10)
cor_df

In [None]:
# Remove the same columns the Python notebook removes, for alignment.
# any_of() skips names that are not present instead of raising an error.
col_to_drop <- c(
  "emp_var_rate", "cons_price_idx", "euribor3m",
  "nr_employed", "loan_unknown", "housing_unknown"
)
banking_proc <- dplyr::select(banking_proc, !tidyselect::any_of(col_to_drop))

## Train/Test Split

Split the data into training and test sets (80/20 split, stratified by y).

In [None]:
# Stratified 80/20 train/test split on the target y.
set.seed(42) # fixed seed so the partition is reproducible

# With list = FALSE createDataPartition() returns a one-column matrix of row
# numbers. banking_proc is a tibble, and indexing tibble rows with a matrix is
# deprecated, so take the first column to get a plain integer vector.
train_idx <- createDataPartition(banking_proc$y, p = 0.8, list = FALSE)[, 1]
train <- banking_proc[train_idx, ]
test <- banking_proc[-train_idx, ]

In [None]:
# Check split: class counts in the training set, then in the test set
table(train[["y"]])
table(test[["y"]])

## Fit Logistic Regression Model

Fit a logistic regression model using all predictors.

In [None]:
# Fit a binomial GLM of y on every other column of `train` (`y ~ .`);
# binomial() with no arguments uses its default link.
# NOTE(review): y was made a factor earlier in the notebook; for a factor
# response glm() treats the first level as "failure", so this presumably
# models P(y = 1) - confirm the level order is 0, 1.
# NOTE(review): the predictors were one-hot encoded upstream, so some
# coefficients may be reported as NA (aliased) - verify in the summary.
fit <- glm(y ~ ., data = train, family = binomial())
# Coefficient table with standard errors, z values and p-values
summary(fit)

## Model Evaluation: Predictions

Predict on the test set and compare predictions to the true values.

In [None]:
# Predicted probabilities on the response scale for the held-out rows
test$prob <- predict(fit, newdata = test, type = "response")

# Hard class labels at a 0.5 cut-off, stored as a factor with the same levels
# as the true target so the two can be compared directly
test$pred <- factor(ifelse(test$prob > 0.5, 1, 0), levels = levels(test$y))

# First 10 rows: true class, predicted class, predicted probability
kable_styling(kable(head(select(test, y, pred, prob), n = 10)))

## Confusion Matrix

Evaluate the confusion matrix and classification metrics.

In [None]:
# Cross-tabulate predicted against true classes; class "1" is treated as the
# positive class for the derived metrics
conf_mat <- confusionMatrix(data = test$pred, reference = test$y, positive = "1")
conf_mat

## Classification Report

Show precision, recall, F1-score, and accuracy.

In [None]:
# Per-class metrics, then the overall metrics, from the confusion-matrix object
conf_mat[["byClass"]]
conf_mat[["overall"]]

## ROC Curve & AUC

Plot the ROC curve and compute the AUC.

In [None]:
# ROC curve and AUC on the test set.
# levels = (control, case) and direction = "<" (controls are expected to score
# lower than cases) are set explicitly instead of being auto-detected.
roc_obj <- roc(test$y, test$prob, levels = levels(test$y), direction = "<")
auc(roc_obj)

# Build the title with sprintf(): a plain string does not interpolate
# "{...}", so the AUC value has to be formatted in explicitly.
plot(
  roc_obj,
  col = "blue",
  main = sprintf("ROC Curve (AUC = %.2f)", as.numeric(auc(roc_obj)))
)

# Chance line. pROC plots specificity on a reversed x-axis (1 -> 0), so the
# diagonal of a random classifier is sensitivity = 1 - specificity.
abline(a = 1, b = -1, lty = 2, col = "red")

## Model Coefficients (Odds Ratios)

Show the estimated odds ratios for each predictor.

In [None]:
# Exponentiate the fitted coefficients to obtain odds ratios, put them in a
# two-column table (term name, odds ratio), sort from largest to smallest and
# render as a styled table.
coef(fit) %>%
  exp() %>%
  tibble::enframe(name = "Variable", value = "OddsRatio") %>%
  arrange(desc(abs(OddsRatio))) %>%
  kable() %>%
  kable_styling()

## Conclusion

This notebook demonstrated a full workflow for logistic regression in R, including data cleaning, visualization, class balancing, preprocessing, model fitting, and evaluation. The approach and explanations are designed to be directly comparable to the Python version, but use tidyverse and modern R idioms throughout.