
# Clustering & Company Valuation using Clustering

---

This script contains examples of how to run unsupervised learning algorithms in Python. Specifically, the script contains 2 sections:
- **Introduction to clustering**, where we simulate a dataset and run classic clustering techniques like k-means and DBSCAN. We also look at how to formally evaluate a clustering algorithm.
- **Clustering for company valuation**. Here, we use clustering to execute company valuation using the multiples method. In order for you to implement the use case, you need to download two datasets titled "financialdata_original.csv" and "financialdata_extra.csv". The script will show you how to run the 6 steps of company valuation by clustering:
  * Step 1: Data collection & importing;
  * Step 2: Data preprocessing;
  * Step 3: Model selection;
  * Step 4: Clustering;
  * Step 5: Identify closest companies; and
  * Step 6: Valuation.


# Introduction to Clustering in Python

We start by importing the necessary libraries.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from pylab import MaxNLocator # PyLab is a procedural interface to the Matplotlib object-oriented plotting library.
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
# Simulate a small two-feature dataset to experiment with.
from sklearn.datasets import make_blobs

# 200 points spread around 4 centers; the fixed seed keeps the sample reproducible.
blob_settings = {
    'n_samples': 200,
    'n_features': 2,
    'centers': 4,
    'cluster_std': 1.6,
    'random_state': 50,
}
X, y = make_blobs(**blob_settings)

# Show the feature matrix first, then the blob label of every point.
for generated in (X, y):
    print(generated)

In [None]:
# Let's plot the raw (unlabelled) data.
# No `c=` values are passed here, so there is nothing for a colormap to map:
# matplotlib ignores `cmap` in that case and emits a warning, hence it is omitted.
plt.figure(figsize=(20,10))
plt.scatter(X[:,0], X[:,1], s=70)
plt.show()

In [None]:
# Apply k-Means
from sklearn.cluster import KMeans
from sklearn import metrics
# from sklearn.metrics import pairwise_distances


# silhouette: 1=good, 0=overlap, -1=bad
# Within Cluster Sum of Squares: lower is better

def cluster_kmeans(df, nclust, random_state=0):
    """Cluster ``df`` with k-means and score the resulting partition.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Observations to cluster.
    nclust : int
        Number of clusters to fit. Must be at least 2, because the
        silhouette coefficient is not defined for a single cluster.
    random_state : int, default=0
        Seed passed to both ``KMeans`` and ``silhouette_score``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    sil : float
        Mean silhouette coefficient (Euclidean distance) of the partition.
    wcss : float
        Within-cluster sum of squares (``KMeans.inertia_``).
    label : ndarray
        Cluster index assigned to each observation.
    centroids : ndarray
        Coordinates of the fitted cluster centers.

    Raises
    ------
    ValueError
        If ``nclust`` is smaller than 2.
    """
    # Fail before fitting: silhouette_score would reject a single cluster anyway,
    # but only after the (wasted) k-means fit and with a less direct message.
    if nclust < 2:
        raise ValueError(f"nclust must be at least 2 to compute a silhouette score, got {nclust}")

    kmeans = KMeans(n_clusters=nclust, random_state=random_state).fit(df)
    label = kmeans.labels_
    centroids = kmeans.cluster_centers_
    sil = metrics.silhouette_score(df, label, metric='euclidean', random_state=random_state)
    wcss = kmeans.inertia_

    return sil, wcss, label, centroids

# Quick check on the simulated data: returns (silhouette, WCSS, labels, centroids).
cluster_kmeans(X, 4)

In [None]:
# Visualise the 4-cluster solution: every point is coloured by its assigned cluster.
sil, wcss, label, centroid = cluster_kmeans(X, 4)
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(X[:,0], X[:,1], c=label, cmap='Accent', s=40)
plt.show()

In [None]:
# We need to validate the number of clusters. So let's check how the WCSS and the
# Silhouette coefficient change if we consider different numbers of clusters.

max_n_clusters = 7
cluster_counts = range(2, max_n_clusters + 1)  # candidate values of k: 2..max_n_clusters

# One panel per candidate k, laid out on a two-column grid.
fig, ax = plt.subplots(math.ceil(len(cluster_counts) / 2), 2, figsize=(20,20), constrained_layout=True)
ax = ax.flatten()

# Style of the text box holding the scores; it is the same for every panel.
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# Collect one row per k and build the summary table once after the loop.
# Growing a DataFrame with pd.concat from an empty frame copies the table on
# every iteration and relies on how pandas treats empty inputs in concat,
# which recent pandas versions warn about.
rows = []
for i, nclust in enumerate(cluster_counts):

    sil, wcss, label, centroids = cluster_kmeans(X, nclust)
    rows.append([nclust, sil, wcss])

    ax[i].scatter(X[:,0], X[:,1], c=label, cmap='Accent', s=40)
    ax[i].scatter(centroids[:,0], centroids[:,1], c=range(nclust), cmap='Accent', s=300, marker='P')
    ax[i].set_title('Clusters: ' + str(nclust), fontsize = 30)
    textstr = 'Sil: ' + str(round(sil, 3)) + '\nWCSS: ' + str(int(wcss))
    ax[i].text(0.75, 0.97, textstr, transform=ax[i].transAxes, fontsize=25,
        verticalalignment='top', bbox=props)

# With an odd number of candidate values the last panel of the grid stays empty.
for unused_ax in ax[len(cluster_counts):]:
    unused_ax.set_visible(False)

plt.show()

tab = pd.DataFrame(rows, columns=['Clusters', 'Silhouette(max)', 'WCSS(min)'])
display(tab)

In [None]:
# Determine optimal number of clusters with Elbow method
# Silhouette (left axis, higher is better) and WCSS (right axis, look for the
# "elbow") are drawn against the number of clusters on a shared x-axis.

fig, ax1 = plt.subplots(figsize=(10,5))
# The format string only sets marker and line style; the colour is given once via
# `color=`. A colour letter in the format string ('bx-') combined with `color=`
# makes matplotlib warn about a redundant definition, and for the WCSS curve the
# two requests even disagreed (blue vs. red).
ax1.plot(tab.Clusters, tab['Silhouette(max)'], 'x-', color = 'blue')
ax1.set_xlabel('Number of clusters', fontsize = 20)
ax1.set_ylabel('Silhouette', fontsize = 20, color = 'blue')
ax1.tick_params(axis='y', labelcolor='blue', labelsize=13)

# Second y-axis sharing the same x-axis, because WCSS lives on a different scale.
ax2 = ax1.twinx()
ax2.plot(tab.Clusters, tab['WCSS(min)'], 'x-', color = 'red')
ax2.set_ylabel('WCSS', fontsize = 20, color = 'red')
ax2.tick_params(axis='y', labelcolor='red', labelsize=13)

plt.show()

In [None]:
# K-Means is not the only clustering algo. Let's try DBSCAN - Density-based spatial clustering of applications with noise
# Let's create a dataset with a strange data shape (moons)
from sklearn.datasets import make_moons

X2, y2 = make_moons(200, noise=0.05, random_state=0)

# No `c=` values are passed, so a colormap has nothing to map: matplotlib would
# ignore `cmap` and emit a warning, hence it is omitted.
plt.figure(figsize=(10,5))
plt.scatter(X2[:,0], X2[:,1], s=40)
plt.show()

In [None]:
# Before running the code: how do you think the k-means algorithm will perform
# on data like this? How would it split the data?
# Fit k-means with two clusters on the moons data.
sil, wcss, label, centroid = cluster_kmeans(X2, 2)

fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(X2[:,0], X2[:,1], c=label, cmap='Accent', s=40)
plt.show()

# Report both quality scores of this fit.
for score_name, score in (('Silhouette:', sil), ('WCSS:', wcss)):
    print(score_name, score)

In [None]:
# Try DBSCAN - Density-based spatial clustering of applications with noise.
# DBSCAN starts by identifying, for each observation, the neighbouring observations
# lying within some radius (a hyperparameter). Data points that fall within that
# radius of one another end up in the same cluster.
from sklearn.cluster import DBSCAN

# eps (float, default=0.5): the maximum distance between two samples for one to be
# considered as in the neighborhood of the other.
neighbourhood_radius = 0.3
db = DBSCAN(eps=neighbourhood_radius)
db.fit(X2)

# Colour every point by the cluster DBSCAN assigned to it.
label = db.labels_
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(X2[:,0], X2[:,1], c=label, cmap='Accent', s=40)
plt.show()

# Clustering for company valuation

In [None]:
# Step 1: Data collection. Let's upload fundamentals data on a set of companies.
# For this step, you will need to upload the data "financialdata_original.csv".
# NOTE: `google.colab` is only importable when the notebook runs inside Google Colab.
from google.colab import files
uploaded = files.upload()  # the result is looked up by file name in the next cell

In [None]:
import io

# The uploaded content is held in memory; wrap the raw bytes in a file-like
# object so pandas can parse them as CSV.
original_csv_bytes = uploaded['financialdata_original.csv']
dataset = pd.read_csv(io.BytesIO(original_csv_bytes))

In [None]:
# Preview the first 12 rows of the imported data.
dataset.head(12)

In [None]:
# Step 2: Data preprocessing
# Start by inspecting the first rows of the data (5 rows by default).
dataset.head()

In [None]:
# ... and the last rows, to check that the file was read through to the end.
dataset.tail()

In [None]:
# Summary statistics of the columns before any cleaning.
dataset.describe()

In [None]:
# Data type of each column; only non-object columns are used for clustering later on.
print(dataset.dtypes)

In [None]:
dataset.isna().any() # Check for NAs: True for every column holding at least one missing value

In [None]:
dataset = dataset.dropna() # Drop rows with NAs (the remaining rows keep their original index labels)

In [None]:
# Re-check the summary statistics now that incomplete rows have been removed.
dataset.describe()

In [None]:
# Step 3: Model selection - Identify the optimal number of clusters.
# Only numeric columns can be standardised and clustered. Selecting them explicitly
# (instead of merely excluding `object` columns) also keeps out any other
# non-numeric dtype, e.g. datetime, category or a dedicated string dtype.
dataset_clustering = dataset.select_dtypes(include="number")

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Standardise every feature so that no variable dominates the distance
# computations purely because of its scale.
scaled_values = preprocessing.StandardScaler().fit_transform(dataset_clustering.values)
df = pd.DataFrame(scaled_values, columns=dataset_clustering.columns)

# X is the feature matrix used by the clustering and PCA cells below.
X = df

In [None]:
# Apply k-Means
from sklearn.cluster import KMeans
from sklearn import metrics
# from sklearn.metrics import pairwise_distances


# silhouette: 1=good, 0=overlap, -1=bad
# Within Cluster Sum of Squares: lower is better

def cluster_kmeans(df, nclust, random_state=0):
    """Cluster ``df`` with k-means and score the resulting partition.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Observations to cluster.
    nclust : int
        Number of clusters to fit. Must be at least 2, because the
        silhouette coefficient is not defined for a single cluster.
    random_state : int, default=0
        Seed passed to both ``KMeans`` and ``silhouette_score``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    sil : float
        Mean silhouette coefficient (Euclidean distance) of the partition.
    wcss : float
        Within-cluster sum of squares (``KMeans.inertia_``).
    label : ndarray
        Cluster index assigned to each observation.
    centroids : ndarray
        Coordinates of the fitted cluster centers.

    Raises
    ------
    ValueError
        If ``nclust`` is smaller than 2.
    """
    # Fail before fitting: silhouette_score would reject a single cluster anyway,
    # but only after the (wasted) k-means fit and with a less direct message.
    if nclust < 2:
        raise ValueError(f"nclust must be at least 2 to compute a silhouette score, got {nclust}")

    kmeans = KMeans(n_clusters=nclust, random_state=random_state).fit(df)
    label = kmeans.labels_
    centroids = kmeans.cluster_centers_
    sil = metrics.silhouette_score(df, label, metric='euclidean', random_state=random_state)
    wcss = kmeans.inertia_

    return sil, wcss, label, centroids


In [None]:
# Apply k-Means with 4 clusters to the standardised company data.
# Remember: silhouette: 1=good, 0=overlap, -1=bad
# Within Cluster Sum of Squares: lower is better
# The call returns the tuple (silhouette, WCSS, labels, centroids).
cluster_kmeans(X, 4)

In [None]:
# Since we cannot plot the data as it is (it is multidimensional), we use a dimensionality
# reduction technique - Principal Component Analysis (PCA).
# We notice that the first 2 PC account for ~50% of the variations in the dataset.

from sklearn.decomposition import PCA
import plotly.express as px

# Keep as many components as there are features, so the full variance split is visible.
pca = PCA(n_components=X.shape[1], random_state=0).fit(X)
scores = pca.transform(X)  # coordinates of every company on the principal components

exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

# Number the components from 1: the value drawn at position k is then the share of
# variance of the k-th component (bars) or of the first k components (step line),
# which is what the x-axis label promises. Starting at 0 shifted everything by one.
component_numbers = range(1, len(exp_var_pca) + 1)

plt.figure(figsize=(10,5))
plt.bar(component_numbers, exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_numbers, cum_sum_eigenvalues, where='mid',label='Cumulative explained variance')
plt.ylabel('Cumulative Explained Variance', size=15)
plt.xlabel('Number of Principal Components', size=15)
plt.legend(loc='best', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Validate number of clusters - we evaluate clusters on X and draw each solution
# on the first two principal components.

max_n_clusters = 21
cluster_counts = range(2, max_n_clusters + 1)  # candidate values of k: 2..max_n_clusters
label_list = {}  # labels of every solution, keyed by the number of clusters as a string

# One panel per candidate k, laid out on a two-column grid.
fig, ax = plt.subplots(math.ceil(len(cluster_counts) / 2), 2, figsize=(40,40), constrained_layout=True)
ax = ax.flatten()

# Style of the text box holding the scores; it is the same for every panel.
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# Collect one row per k and build the summary table once after the loop.
# Growing a DataFrame with pd.concat from an empty frame copies the table on
# every iteration and relies on how pandas treats empty inputs in concat,
# which recent pandas versions warn about.
rows = []
for i, nclust in enumerate(cluster_counts):

    sil, wcss, label, _ = cluster_kmeans(X, nclust)

    # Centroids in PCA space: average the PCA scores of the points of each cluster.
    # groupby returns the groups sorted by label, so row k belongs to cluster k.
    # A dedicated name is used so the scaled feature frame `df` is not overwritten.
    scores_by_cluster = pd.DataFrame(data=scores, index=label)
    centroids = scores_by_cluster.groupby(level=0).mean().values

    rows.append([nclust, sil, wcss])
    label_list[str(nclust)] = label

    ax[i].scatter(scores[:,0], scores[:,1], c=label, cmap='Accent', s=40)
    ax[i].scatter(centroids[:,0], centroids[:,1], c=range(nclust), cmap='Accent', s=300, marker='P')
    ax[i].set_title('Clusters: ' + str(nclust), fontsize = 30)
    textstr = 'Sil: ' + str(round(sil, 3)) + '\nWCSS: ' + str(int(wcss))
    ax[i].text(0.75, 0.97, textstr, transform=ax[i].transAxes, fontsize=25,
        verticalalignment='top', bbox=props)

# With an odd number of candidate values the last panel of the grid stays empty.
for unused_ax in ax[len(cluster_counts):]:
    unused_ax.set_visible(False)

plt.show()

tab = pd.DataFrame(rows, columns=['Clusters', 'Silhouette(max)', 'WCSS(min)'])
display(tab)

In [None]:
# Determine optimal number of clusters with Elbow method
# Silhouette (left axis, higher is better) and WCSS (right axis, look for the
# "elbow") are drawn against the number of clusters on a shared x-axis.

fig, ax1 = plt.subplots(figsize=(10,5))
# The format string only sets marker and line style; the colour is given once via
# `color=`. A colour letter in the format string ('bx-') combined with `color=`
# makes matplotlib warn about a redundant definition, and for the WCSS curve the
# two requests even disagreed (blue vs. red).
ax1.plot(tab.Clusters, tab['Silhouette(max)'], 'x-', color = 'blue')
ax1.set_xlabel('Number of clusters', fontsize = 20)
ax1.set_ylabel('Silhouette', fontsize = 20, color = 'blue')
ax1.tick_params(axis='y', labelcolor='blue', labelsize=13)

# Second y-axis sharing the same x-axis, because WCSS lives on a different scale.
ax2 = ax1.twinx()
ax2.plot(tab.Clusters, tab['WCSS(min)'], 'x-', color = 'red')
ax2.set_ylabel('WCSS', fontsize = 20, color = 'red')
ax2.tick_params(axis='y', labelcolor='red', labelsize=13)

plt.show()

In [None]:
# Step 4: Once we have identified the optimal number of clusters, let's run the
# clustering and assign the appropriate cluster to each company.
# The seed matches the one used inside cluster_kmeans (random_state=0), so the
# partition assigned here is the same one whose silhouette/WCSS were evaluated and
# plotted during model selection. With a different seed k-means may converge to a
# different partition that was never assessed.
n_final_clusters = 8
kmeans = KMeans(n_clusters=n_final_clusters, random_state=0)
# fit_predict returns one label per row of X, in the same row order as `dataset`.
dataset['Cluster'] = kmeans.fit_predict(X)

In [None]:
# Step 5: Identify Closest Companies
# Let's imagine that Company 11 is not public and we want to value it using the multiples method.
# Let's first find its cluster based on the balance sheet and income statement values.
is_company11 = dataset['shortName'] == 'Company_11'
Company11_cluster = dataset.loc[is_company11, 'Cluster'].iloc[0]

In [None]:
# Extract the data for the companies that are in the same cluster
in_same_cluster = dataset['Cluster'] == Company11_cluster
similar_companies = dataset.loc[in_same_cluster]

In [None]:
# Display the peer group (it still includes Company_11 itself at this point).
similar_companies

In [None]:
# Let's remove Company_11 from the similar companies dataset, so that only the
# publicly traded peers remain; their market data is added in the next cells.
is_target = similar_companies['shortName'] == 'Company_11'
similar_companies = similar_companies[~is_target]

In [None]:
# Let's upload the market data for the other publicly traded companies and add
# them to our similar_companies data.
# For this step, you will need to upload the data "financialdata_extra.csv".
# NOTE: `google.colab` is only importable when the notebook runs inside Google Colab.
from google.colab import files
uploaded = files.upload()  # replaces the previous `uploaded` result

In [None]:
# Import data
import io

# Wrap the uploaded bytes in a file-like object so pandas can parse them as CSV.
extra_csv_bytes = uploaded['financialdata_extra.csv']
data_extra = pd.read_csv(io.BytesIO(extra_csv_bytes))

In [None]:
# Let's merge the datasets.
# A left join on the company name keeps every peer company, even one without a
# matching row in the market data.
merged_data = pd.merge(
    left=similar_companies,
    right=data_extra,
    how='left',
    on='shortName',
)

In [None]:
# Checking the merged data: one row per peer company, with the market columns appended.
merged_data

In [None]:
# Step 6: Valuation
# Assuming 'Market Cap' as the valuation metric:
# take the average market capitalisation across the peer companies.
avg_market_cap = merged_data['marketCap'].mean()
avg_market_cap


In [None]:
# Let's create some other multiples: EV/EBITDA of every peer company.
peer_enterprise_value = merged_data['enterpriseValue']
peer_ebitda = merged_data['ebitda']
merged_data['EV_to_ebitda'] = peer_enterprise_value / peer_ebitda

In [None]:
# Obtain the average EV/ebitda multiple across the peer companies
average_EV_evitda = merged_data["EV_to_ebitda"].mean()
average_EV_evitda

In [None]:
# EBITDA of the company being valued, read from the fundamentals data.
company11_rows = dataset['shortName'] == 'Company_11'
ebitda_value_company11 = dataset.loc[company11_rows, 'ebitda'].values[0]


In [None]:
# Company_11 estimated EV based on EV/Ebitda multiple obtained by clustering:
# estimated EV = average peer EV/EBITDA multiple x Company_11's own EBITDA.
Company11_EV = average_EV_evitda*ebitda_value_company11
Company11_EV