# Practical on Real Financial Data (R version)

---
This notebook covers an end-to-end unsupervised and supervised learning task on real financial data, focusing on SMEs applying for loans at a P2P lending platform. The workflow mirrors the Python version, but uses idiomatic R and tidyverse approaches for clarity and comparison.

## Topics covered

* Data description and pre-processing
* K-means clustering with different k values
* Cluster evaluation and selection of best k
* Logistic regression classifier for loan default prediction (full dataset)
* Separate models for each identified cluster


In [ ]:
# Load required libraries
library(tidyverse)
library(cluster)
library(factoextra)
library(caret)
library(ggplot2)
library(gridExtra)
library(broom)
library(pROC)
library(ROCR)
library(scales)


In [ ]:
# Seed the random number generator so that results can be reproduced
set.seed(42L)

## Data Import

We use a CSV file `borrower_companies.csv` with financial ratios and a `status` column (1 = default, 0 = paid back).

In [ ]:
# Load the borrower data; the CSV is expected to sit in the working directory.
# glimpse() then prints a compact per-column preview of what was read.
dataset <- read_csv("borrower_companies.csv")
dataset %>% glimpse()

## Data Exploration

Let's check the structure, missing values, and summary statistics.

In [ ]:
# Shape of the data (rows, columns), followed by the NA count of each column
dim(dataset)
dataset %>%
  is.na() %>%
  colSums()

In [ ]:
# Per-column summary statistics
dataset %>% summary()

## Visualize Feature Distributions

Boxplots (standardized) for all features except `status`.

In [ ]:
# Z-score every feature column (the `status` label is left out), then reshape
# to long format so that one ggplot call draws a boxplot per variable.
features <- select(dataset, -status)
features_scaled <- features %>%
  scale() %>%
  as_tibble()
features_scaled_long <- pivot_longer(
  mutate(features_scaled, row = row_number()),
  cols = -row,
  names_to = "variable",
  values_to = "value"
)

# Horizontal boxplots: one row per feature, all on the standardized scale
features_scaled_long %>%
  ggplot(aes(x = value, y = variable)) +
  geom_boxplot(fill = "skyblue", outlier.alpha = 0.2) +
  labs(title = "Standardized Feature Distributions", x = "", y = "") +
  theme_minimal()

## Outlier Removal (Z-score method)

Remove rows where any feature has |z| ≥ 4 (a row is kept only if every feature satisfies |z| < 4).

In [ ]:
# Z-score outlier filter: a row is kept only when no feature has |z| >= 4.
#
# The comparison is done on the whole matrix at once and counted per row.
# na.rm = TRUE makes NA/NaN z-scores (a missing value, or a constant column
# whose standard deviation is 0) count as "not an outlier". Without it the
# mask itself would contain NA, and indexing rows with an NA logical neither
# cleanly keeps nor drops the row.
z_scores <- as_tibble(scale(features))
outlier_mask <- rowSums(abs(as.matrix(z_scores)) >= 4, na.rm = TRUE) == 0
dataset_o <- dataset[outlier_mask, ]
dim(dataset_o)

In [ ]:
# Same boxplot as before, recomputed on the outlier-free data: drop `status`,
# re-standardize, and reshape to one (row, variable, value) record per cell.
features_o <- select(dataset_o, -status)
features_o_scaled <- features_o %>%
  scale() %>%
  as_tibble()
features_o_scaled_long <- pivot_longer(
  mutate(features_o_scaled, row = row_number()),
  cols = -row,
  names_to = "variable",
  values_to = "value"
)

features_o_scaled_long %>%
  ggplot(aes(x = value, y = variable)) +
  geom_boxplot(fill = "lightgreen", outlier.alpha = 0.2) +
  labs(
    title = "Standardized Feature Distributions (Outliers Removed)",
    x = "",
    y = ""
  ) +
  theme_minimal()

## Prepare Data for Clustering

Standardize features for clustering.

In [ ]:
# Clustering inputs: the label vector is kept aside, the remaining columns
# form the feature table X, and X_scaled is its column-wise standardization.
y <- dataset_o[["status"]]
X <- select(dataset_o, -status)
X_scaled <- scale(X)

## Principal Component Analysis (PCA)

Visualize explained variance to understand dimensionality.

In [ ]:
# PCA on the standardized features. center/scale. are passed explicitly even
# though X_scaled has already been standardized.
pca <- prcomp(X_scaled, center = TRUE, scale. = TRUE)

# Share of total variance carried by each component, and its running total
explained_var <- prop.table(pca$sdev^2)
cum_var <- cumsum(explained_var)

# Bars: per-component share. Red line and points: cumulative share.
tibble(
  PC = seq_along(explained_var),
  Explained = explained_var,
  Cumulative = cum_var
) %>%
  ggplot(aes(x = PC)) +
  geom_col(aes(y = Explained), fill = "steelblue", alpha = 0.6) +
  geom_line(aes(y = Cumulative), color = "red", size = 1) +
  geom_point(aes(y = Cumulative), color = "red", size = 2) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "PCA: Explained Variance",
    y = "Variance",
    x = "Principal Component"
  ) +
  theme_minimal()

## K-means Clustering: Try Different k

Evaluate clusters using silhouette and WCSS (within-cluster sum of squares).

In [ ]:
# Run k-means for k = 2..max_k and record, for every k:
#   - the mean silhouette width (silhouette_scores),
#   - the total within-cluster sum of squares (wcss),
#   - the cluster assignment of every row (labels_list, keyed by "k").
max_k <- 7
k_values <- 2:max_k

# Preallocate the result containers instead of growing them in the loop.
silhouette_scores <- numeric(length(k_values))
wcss <- numeric(length(k_values))
labels_list <- vector("list", length(k_values))
names(labels_list) <- as.character(k_values)

# The pairwise distance matrix does not depend on k, so compute it once
# rather than once per iteration.
dist_matrix <- dist(X_scaled)

for (k in k_values) {
  km <- kmeans(X_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist_matrix)
  # Column 3 of the silhouette object holds the per-observation widths.
  silhouette_scores[k - 1] <- mean(ss[, 3])
  wcss[k - 1] <- km$tot.withinss
  labels_list[[as.character(k)]] <- km$cluster
}

tibble(Clusters = k_values, Silhouette = silhouette_scores, WCSS = wcss)

In [ ]:
# Visualize clusters in PCA space for each k (first two principal components).
#
# The cluster labels are stored as a column of each plot's own data frame.
# ggplot2 evaluates aes() lazily, when the plot is drawn; mapping colour to a
# loop variable would make every panel pick up the labels of the last k.
pca_scores <- as_tibble(pca$x[, 1:2])

plots <- lapply(2:max_k, function(k) {
  plot_data <- mutate(
    pca_scores,
    cluster = factor(labels_list[[as.character(k)]])
  )
  ggplot(plot_data, aes(x = PC1, y = PC2, color = cluster)) +
    geom_point(alpha = 0.6) +
    labs(title = paste('k =', k), color = 'Cluster') +
    theme_minimal()
})

# Lay the per-k panels out on a two-column grid
do.call(grid.arrange, c(plots, ncol = 2))

## Elbow and Silhouette Plots

Choose the optimal number of clusters.

In [ ]:
# Silhouette (blue, left axis) and WCSS (red, right axis) against k.
elbow_df <- tibble(Clusters = 2:max_k, Silhouette = silhouette_scores, WCSS = wcss)

# WCSS is drawn on the silhouette scale via rescale(), a linear map from
# range(WCSS) onto range(Silhouette). The secondary axis must apply the
# inverse of that map so that its tick labels show real WCSS values rather
# than silhouette values.
sil_range <- range(elbow_df$Silhouette)
wcss_range <- range(elbow_df$WCSS)
wcss_per_sil <- diff(wcss_range) / diff(sil_range)

ggplot(elbow_df, aes(x = Clusters)) +
  geom_line(aes(y = Silhouette), color = 'blue') +
  geom_point(aes(y = Silhouette), color = 'blue') +
  geom_line(aes(y = rescale(WCSS, to = sil_range)), color = 'red') +
  geom_point(aes(y = rescale(WCSS, to = sil_range)), color = 'red') +
  scale_y_continuous(
    sec.axis = sec_axis(
      ~ wcss_range[1] + (. - sil_range[1]) * wcss_per_sil,
      name = 'WCSS',
      labels = comma
    )
  ) +
  labs(y = 'Silhouette', x = 'Number of clusters',
       title = 'Elbow & Silhouette for k-means') +
  theme_minimal()

## Inspect Cluster Feature Distributions

Pick k = 3 for illustration.

In [ ]:
# Take the stored assignment for the chosen k and attach it, as a factor, to
# the standardized features.
chosen_k <- 3
cluster_labels <- labels_list[[as.character(chosen_k)]]
X_labeled <- mutate(as_tibble(X_scaled), cluster = factor(cluster_labels))

# Long format: one (cluster, variable, value) record per cell
X_long <- pivot_longer(
  X_labeled,
  cols = -cluster,
  names_to = "variable",
  values_to = "value"
)

# One density panel per feature, overlaid by cluster, with free axes per panel
X_long %>%
  ggplot(aes(x = value, fill = cluster)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  labs(title = "Feature Distributions by Cluster", x = "", y = "") +
  theme_minimal()

# Supervised Learning: Logistic Regression

Compare a model trained on the full dataset vs. one per cluster.

In [ ]:
# Undersample the majority class (status == 0) so that it has at most twice
# as many rows as the minority class (status == 1). The result is a 2:1
# ratio, not a perfectly balanced sample.
library(rsample)
library(recipes)

dataset_o$status <- as.factor(dataset_o$status)
minority <- dataset_o %>% filter(status == 1)
majority <- dataset_o %>% filter(status == 0)
set_size <- nrow(minority) * 2
# sample_n() fails when asked for more rows than exist, so cap the request at
# the number of majority rows available (same guard as the per-cluster loop).
majority_down <- majority %>% sample_n(min(set_size, nrow(majority)))
balanced <- bind_rows(minority, majority_down)
balanced <- balanced %>% sample_frac(1) # shuffle
table(balanced$status)

In [ ]:
# Stratified 80/20 train/test partition of the resampled data
set.seed(42)
split <- initial_split(balanced, prop = 0.8, strata = status)
train <- training(split)
test <- testing(split)

# Build the preprocessing recipe step by step. prep() estimates the centering
# and scaling statistics on `train`; bake() then applies those same statistics
# to whichever data set it is given.
rec <- recipe(status ~ ., data = train)
rec <- step_center(rec, all_predictors())
rec <- step_scale(rec, all_predictors())
rec <- prep(rec)

# Standardized predictors and the untouched labels for each partition
X_train <- select(bake(rec, new_data = train), -status)
y_train <- train[["status"]]
X_test <- select(bake(rec, new_data = test), -status)
y_test <- test[["status"]]

In [ ]:
# Fit logistic regression
# A binomial GLM of `status` on every column of X_train (the centered and
# scaled training predictors). The label vector is re-attached with cbind()
# so that the formula `status ~ .` can refer to it.
# NOTE(review): with a two-level factor response, glm() models the probability
# of the second level; this presumably is "1" (default) — confirm the levels.
model <- glm(status ~ ., data = cbind(X_train, status = y_train), family = binomial())
# Print the fitted call, coefficient table and deviance statistics
summary(model)

In [ ]:
# Predict and evaluate
# Predicted probabilities on the held-out rows, then hard 0/1 labels using a
# 0.5 cut-off.
pred_probs <- predict(model, newdata = X_test, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)

# Cross-tabulate with explicit factor levels on both margins. This keeps the
# table square even when the model predicts only one class, or the test set
# contains only one; a bare table() would silently drop the empty row/column.
class_levels <- levels(y_test)
conf_mat <- table(
  Predicted = factor(pred_class, levels = class_levels),
  Actual = factor(y_test, levels = class_levels)
)
conf_mat

In [ ]:
# Classification report
# The predictions are given the same factor levels as the reference labels.
# as.factor(pred_class) would yield a single-level factor whenever the model
# predicts only one class, which confusionMatrix() does not accept cleanly
# (a level-mismatch warning or an error, depending on the caret version).
caret::confusionMatrix(
  factor(pred_class, levels = levels(y_test)),
  y_test,
  positive = '1'
)

In [ ]:
# ROC curve and AUC
# Controls (0) and cases (1) and the comparison direction are stated
# explicitly: higher predicted probability means "more likely a case". Left to
# its defaults, roc() picks the direction from the data, which can turn a
# worse-than-random model into an AUC above 0.5.
roc_obj <- roc(
  as.numeric(as.character(y_test)),
  pred_probs,
  levels = c(0, 1),
  direction = '<'
)
plot(roc_obj, col = 'blue', main = 'ROC Curve (Full Dataset)')
auc(roc_obj)

# Per-Cluster Logistic Regression

Repeat the above for each cluster in turn; clusters with fewer than 5 samples in either class are skipped.

In [ ]:
# Fit and evaluate one logistic regression per k-means cluster.
#
# For each cluster: take its rows from the outlier-free data, undersample the
# majority class to at most twice the minority size, make a stratified 80/20
# split, standardize with statistics estimated on the training part, fit a
# binomial GLM, then print the confusion table, the caret report and the AUC,
# and plot the ROC curve. Clusters with fewer than 5 rows in either class are
# skipped.
for (cl in seq_len(chosen_k)) {
  cat('---\nCluster', cl, '\n')
  idx <- which(cluster_labels == cl)
  cluster_data <- dataset_o[idx, ]
  cluster_data$status <- as.factor(cluster_data$status)
  minority <- cluster_data %>% filter(status == 1)
  majority <- cluster_data %>% filter(status == 0)

  # `||` is the scalar, short-circuiting operator meant for if() conditions.
  if (nrow(minority) < 5 || nrow(majority) < 5) {
    cat('Too few samples, skipping\n')
    next
  }

  # Cap the request so sample_n() is never asked for more rows than exist.
  set_size <- nrow(minority) * 2
  majority_down <- majority %>% sample_n(min(set_size, nrow(majority)))
  balanced <- bind_rows(minority, majority_down) %>% sample_frac(1)

  split <- initial_split(balanced, prop = 0.8, strata = status)
  train <- training(split)
  test <- testing(split)

  # Centering/scaling statistics are estimated on `train` only, then applied
  # to both partitions.
  rec <- recipe(status ~ ., data = train) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors()) %>%
    prep()
  X_train <- bake(rec, new_data = train) %>% select(-status)
  y_train <- train$status
  X_test <- bake(rec, new_data = test) %>% select(-status)
  y_test <- test$status

  model <- glm(status ~ ., data = cbind(X_train, status = y_train), family = binomial())
  pred_probs <- predict(model, newdata = X_test, type = 'response')
  pred_class <- ifelse(pred_probs > 0.5, 1, 0)

  # Give the predictions the same levels as the reference labels so that the
  # table stays square and confusionMatrix() gets matching factors, even when
  # only one class is predicted.
  pred_factor <- factor(pred_class, levels = levels(y_test))
  conf_mat <- table(Predicted = pred_factor, Actual = y_test)
  print(conf_mat)
  print(caret::confusionMatrix(pred_factor, y_test, positive = '1'))

  # A ROC curve needs both classes in the test labels; small clusters may not
  # provide that after the split.
  if (length(unique(y_test)) < 2) {
    cat('Test set contains a single class, skipping ROC/AUC\n')
    next
  }

  # Controls (0), cases (1) and direction are fixed rather than auto-detected,
  # so a worse-than-random model is not flipped to an AUC above 0.5.
  roc_obj <- roc(
    as.numeric(as.character(y_test)),
    pred_probs,
    levels = c(0, 1),
    direction = '<'
  )
  plot(roc_obj, col = 'red', main = paste('ROC Curve (Cluster', cl, ')'))
  print(auc(roc_obj))
}


----

This notebook demonstrates a full unsupervised + supervised learning workflow in R, closely paralleling the Python version for easy comparison. All code uses tidyverse and modern R best practices.