# Clustering & Company Valuation using Clustering (R Version)

---

This notebook demonstrates unsupervised learning and company valuation using clustering, implemented in R. The structure and flow mirror the Python version, but use idiomatic R and tidyverse approaches for clarity and best practice.

We cover:
- **Introduction to clustering**: Simulate data, apply k-means and DBSCAN, and evaluate clustering.
- **Clustering for company valuation**: Apply clustering to real company data to find comparable companies, then value a target company with the multiples method.


# Introduction to Clustering in R

We start by loading the necessary libraries.

In [ ]:
library(tidyverse)
library(cluster)
library(factoextra)
library(dbscan)
library(ggplot2)
set.seed(50)

In [ ]:
# Simulate a dataset with 4 clusters
# Each point is drawn from an isotropic Gaussian (sd = 1.6) around one of the
# four centres below; `label` records which centre generated the point.
n <- 200
centers <- matrix(c(2, 2, 8, 3, 3, 6, 7, 7), ncol = 2, byrow = TRUE)
# Index of the generating centre for every point: n / 4 points per centre,
# laid out in consecutive blocks (1, 1, ..., 2, 2, ..., 4).
true_center <- rep(seq_len(nrow(centers)), each = n / nrow(centers))
X <- tibble(
  x1 = rnorm(n, mean = centers[true_center, 1], sd = 1.6),
  x2 = rnorm(n, mean = centers[true_center, 2], sd = 1.6),
  label = as.factor(true_center)
)
head(X)

In [ ]:
# Scatter plot of the simulated points, coloured by the `label` column
ggplot(data = X, mapping = aes(x = x1, y = x2, color = label)) +
  geom_point(size = 2) +
  labs(title = "Simulated Data with 4 Clusters") +
  theme_minimal()

In [ ]:
# Apply k-means clustering
# Fit 4 clusters (best of 25 random starts) on the two coordinates only,
# store the assignment on X, and draw the resulting partition.
xy_for_kmeans <- select(X, x1, x2)
kmeans_result <- kmeans(xy_for_kmeans, centers = 4, nstart = 25)
X$cluster <- as.factor(kmeans_result$cluster)
fviz_cluster(
  list(data = xy_for_kmeans, cluster = kmeans_result$cluster),
  geom = "point"
)

In [ ]:
# Evaluate clustering: Silhouette and WCSS
# - mean_sil: average of the silhouette widths (third column of the
#   silhouette object) over all points
# - wcss: total within-cluster sum of squares reported by kmeans()
xy_distances <- dist(select(X, x1, x2))
sil <- silhouette(kmeans_result$cluster, xy_distances)
mean_sil <- mean(sil[, "sil_width"])
wcss <- kmeans_result$tot.withinss
cat(
  "Silhouette (mean):", round(mean_sil, 3),
  "\nWCSS:", round(wcss, 1), "\n"
)

In [ ]:
# Explore different cluster numbers
# For every k in 2..max_n_clusters, fit k-means (25 random starts) and record
# the mean silhouette width and the total within-cluster sum of squares.
max_n_clusters <- 7
cluster_features <- X %>% select(x1, x2)
# The pairwise distances do not depend on k, so compute them once up front
# instead of once per candidate k.
feature_dist <- dist(cluster_features)
# Build one single-row tibble per k and bind them at the end rather than
# growing `results` inside a loop; local names keep earlier objects such as
# `sil` from being overwritten.
results <- lapply(2:max_n_clusters, function(k) {
  km_k <- kmeans(cluster_features, centers = k, nstart = 25)
  sil_k <- silhouette(km_k$cluster, feature_dist)
  tibble(
    Clusters = k,
    Silhouette = mean(sil_k[, "sil_width"]),
    WCSS = km_k$tot.withinss
  )
}) %>%
  bind_rows()
results

In [ ]:
# Plot Silhouette and WCSS (Elbow method)
# WCSS is passed through scale() so that both metrics can share one y-axis.
ggplot(data = results, mapping = aes(x = Clusters)) +
  geom_line(
    mapping = aes(y = Silhouette, color = "Silhouette"),
    size = 1.2
  ) +
  geom_point(
    mapping = aes(y = Silhouette, color = "Silhouette"),
    size = 2
  ) +
  geom_line(
    mapping = aes(y = scale(WCSS), color = "WCSS (scaled)"),
    size = 1.2,
    linetype = "dashed"
  ) +
  geom_point(
    mapping = aes(y = scale(WCSS), color = "WCSS (scaled)"),
    size = 2,
    shape = 17
  ) +
  scale_color_manual(values = c("blue", "red")) +
  labs(
    y = "Metric Value",
    color = "Metric",
    title = "Cluster Evaluation Metrics"
  ) +
  theme_minimal()

In [ ]:
# Try DBSCAN on a non-spherical dataset (moons)
# Build two interleaving half-circles ("moons") with Gaussian noise
# (sd = 0.05). The upper moon is the top half of the unit circle centred at
# (0, 0); the lower moon is the bottom half of the unit circle centred at
# (1, 0.5). mlbench.2dnormals() would instead give spherical Gaussian blobs,
# so the moons are generated directly with base R and mlbench is not needed.
n_moon <- 200
n_upper <- n_moon %/% 2
n_lower <- n_moon - n_upper
angle_upper <- seq(0, pi, length.out = n_upper)
angle_lower <- seq(0, pi, length.out = n_lower)
moon_df <- tibble(
  x1 = c(cos(angle_upper), 1 - cos(angle_lower)) + rnorm(n_moon, sd = 0.05),
  x2 = c(sin(angle_upper), 0.5 - sin(angle_lower)) + rnorm(n_moon, sd = 0.05)
)
ggplot(moon_df, aes(x = x1, y = x2)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Moons Data")

In [ ]:
# k-means on moons data
# Cluster on the two coordinates only. Selecting them explicitly keeps this
# cell re-runnable after label columns (e.g. `cluster`) have been added to
# moon_df, which would otherwise be passed to kmeans() as features.
km_moon <- kmeans(moon_df %>% select(x1, x2), centers = 2, nstart = 25)
moon_df$cluster <- as.factor(km_moon$cluster)
ggplot(moon_df, aes(x = x1, y = x2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "K-means on Moons Data")

In [ ]:
# DBSCAN on moons data
# moon_df already carries the k-means label column `cluster` at this point,
# so restrict the input to the two coordinates; otherwise that factor column
# would be handed to DBSCAN as if it were a feature.
# dbscan::dbscan is called with its namespace so that a same-named function
# from another attached package cannot be picked up by accident.
db_moon <- dbscan::dbscan(moon_df %>% select(x1, x2), eps = 0.3, minPts = 5)
moon_df$dbscan <- as.factor(db_moon$cluster)
ggplot(moon_df, aes(x = x1, y = x2, color = dbscan)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "DBSCAN on Moons Data")

# Clustering for Company Valuation

In [ ]:
# Step 1: Data collection
# Load the company financial data from CSV (replace with your file path)
# and show the first 12 rows.
library(readr)
dataset <- read_csv(file = "financialdata_original.csv")
dataset %>% head(n = 12)

In [ ]:
# Step 2: Data preprocessing
# Inspect the column types and summary statistics, and count missing cells.
dataset %>% glimpse()
dataset %>% summary()
dataset %>% is.na() %>% sum()
# Keep only rows without any missing value, then re-check the summary.
dataset <- drop_na(dataset)
dataset %>% summary()

In [ ]:
# Step 3: Model selection - Identify optimal cluster number
# Cluster on the numeric columns only, standardised with scale().
num_data <- dataset %>% select(where(is.numeric))
num_data_scaled <- scale(num_data)
# nstart = 25 is forwarded to kmeans() so that the silhouette curve is
# computed with the same multi-start setting as the final model, rather than
# from a single random start per candidate k.
fviz_nbclust(num_data_scaled, kmeans, method = "silhouette", nstart = 25)

In [ ]:
# Step 4: Clustering
# Seed the RNG before fitting, then attach each company's cluster id to the
# dataset as a factor column.
set.seed(42)
k <- 8 # (choose based on previous plot)
km <- kmeans(x = num_data_scaled, centers = k, nstart = 25)
dataset <- dataset %>% mutate(Cluster = as.factor(km$cluster))

In [ ]:
# Step 5: Identify closest companies
# The peer group is every company assigned to the same cluster as Company_11.
company11_cluster <- dataset %>%
  filter(shortName == "Company_11") %>%
  pull(Cluster)
# Fail early with a clear message if the target is absent (for example because
# its row was removed during preprocessing) instead of an opaque size error
# from the comparison below.
if (length(company11_cluster) == 0) {
  stop("Company_11 was not found in `dataset`.", call. = FALSE)
}
# %in% also copes with Company_11 appearing on more than one row.
similar_companies <- dataset %>% filter(Cluster %in% company11_cluster)
similar_companies

In [ ]:
# Remove Company_11 and merge with market data
# Drop the target from its own peer group, then attach the extra data to each
# remaining peer by company name.
similar_companies <- filter(similar_companies, shortName != "Company_11")
data_extra <- read_csv(file = "financialdata_extra.csv")
merged_data <- similar_companies %>%
  left_join(data_extra, by = "shortName")
merged_data %>% head()

In [ ]:
# Step 6: Valuation
# Market Cap as valuation metric: mean over the peer group, skipping
# missing values.
avg_market_cap <- mean(x = merged_data$marketCap, na.rm = TRUE)
avg_market_cap

In [ ]:
# Calculate EV/EBITDA multiple
# One multiple per peer company.
merged_data <- merged_data %>%
  mutate(EV_to_ebitda = enterpriseValue / ebitda)
# A peer with zero EBITDA produces Inf/-Inf (or NaN for 0/0). na.rm = TRUE
# would drop NA/NaN but keep the infinities, making the average infinite, so
# average over the finite multiples only.
finite_multiples <- merged_data$EV_to_ebitda[
  is.finite(merged_data$EV_to_ebitda)
]
average_EV_ebitda <- mean(finite_multiples)
average_EV_ebitda

In [ ]:
# Estimate Company_11 EV
# Multiples method: Company_11's own EBITDA times the average EV/EBITDA
# multiple computed from its peers.
ebitda_value_company11 <- pull(
  filter(dataset, shortName == "Company_11"),
  ebitda
)
Company11_EV <- average_EV_ebitda * ebitda_value_company11
Company11_EV