In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff


# Hierarchical Clustering Class

In [19]:
class HeirarchialClustering():
    """Agglomerative hierarchical clustering using Euclidean distance.

    Every point starts as its own cluster; the two closest clusters are then
    merged repeatedly until a single cluster remains.  "Closest" is defined by
    ``linkage_type``:

    * ``'single'``   - smallest pairwise distance between the two clusters
    * ``'complete'`` - largest pairwise distance between the two clusters
    * ``'average'``  - mean of all pairwise distances between the two clusters

    Attributes:
        linkage_type: one of the names listed above.
        linkage_matrix: ``None`` until ``get_linkage_matrix`` has run; afterwards
            a float array of shape (n - 1, 3) with rows
            ``[cluster_a, cluster_b, distance]``.
        clusters: ``None`` until ``get_linkage_matrix`` has run; afterwards a
            list of member-index collections.  The first n entries are the
            singleton clusters, entry ``n + k`` is the cluster created by
            merge ``k``.
    """

    _LINKAGE_TYPES = ('single', 'complete', 'average')

    def __init__(self , linkage_type = 'single'):
        """Create an unfitted model.

        Raises:
            ValueError: if ``linkage_type`` is not a supported name.
        """
        if linkage_type not in self._LINKAGE_TYPES:
            raise ValueError(
                f"Unknown linkage_type {linkage_type!r}; expected one of {self._LINKAGE_TYPES}"
            )
        self.linkage_matrix = None
        self.clusters = None
        self.linkage_type = linkage_type

    def euclidean_distance(self , x1, x2):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum((x1-x2)**2))

    def _pairwise_distances(self , X1 , X2):
        """Return D with D[i, j] = Euclidean distance between X1[i] and X2[j]."""
        first = np.asarray(X1, dtype=float)
        second = np.asarray(X2, dtype=float)
        if first.shape[0] == 0 or second.shape[0] == 0:
            raise ValueError("Cannot compute distances for an empty set of points")
        # Flatten each point to a feature vector so 1-D inputs (one scalar per
        # point) are handled the same way as 2-D inputs.
        first = first.reshape(first.shape[0], -1)
        second = second.reshape(second.shape[0], -1)
        diff = first[:, np.newaxis, :] - second[np.newaxis, :, :]
        return np.sqrt(np.sum(diff ** 2, axis=-1))

    def linkage_distance(self , X1 , X2):
        """Return the distance between two clusters under ``self.linkage_type``.

        Args:
            X1: points of the first cluster, one point per row.
            X2: points of the second cluster, one point per row.

        Returns:
            The minimum ('single'), maximum ('complete') or mean ('average')
            of all pairwise Euclidean distances, as a float.

        Raises:
            ValueError: if either cluster is empty or ``self.linkage_type`` is
                not a supported name.
        """
        pairwise = self._pairwise_distances(X1, X2)

        if self.linkage_type == 'single':
            return float(pairwise.min())
        if self.linkage_type == 'complete':
            return float(pairwise.max())
        if self.linkage_type == 'average':
            return float(pairwise.mean())

        # Reached only if linkage_type was reassigned after construction.
        raise ValueError(
            f"Unknown linkage_type {self.linkage_type!r}; expected one of {self._LINKAGE_TYPES}"
        )

    def initialise_distance_matrix(self , X):
        """Return the (n, n) matrix of Euclidean distances between the rows of X."""
        X = np.asarray(X)
        n = X.shape[0]
        if n == 0:
            return np.zeros((0, 0))
        return self._pairwise_distances(X, X)

    def get_min_distance(self, rem_clusters , clusters , X):
        """Find the closest pair among the clusters that are still active.

        Args:
            rem_clusters: ids of the active clusters (indices into ``clusters``).
            clusters: for every cluster id, the indices of its member points.
            X: the data, one point per row.

        Returns:
            ``(id_a, id_b, distance)`` for the pair with the smallest linkage
            distance; on ties the pair encountered first wins.

        Raises:
            ValueError: if no pair can be selected (fewer than two active
                clusters, or no finite, comparable distance such as with NaN
                data).
        """
        X = np.asarray(X)
        # Gather every active cluster's points once instead of once per pair.
        members = [X[np.asarray(clusters[cluster_id], dtype=int)] for cluster_id in rem_clusters]

        best_pair = None
        min_distance = np.inf
        for i in range(len(rem_clusters)):
            for j in range(i+1 , len(rem_clusters)):
                distance = self.linkage_distance(members[i] , members[j])
                if distance < min_distance:
                    min_distance = distance
                    best_pair = (rem_clusters[i] , rem_clusters[j])

        if best_pair is None:
            raise ValueError(
                "No pair of clusters to merge: need at least two clusters with a finite distance"
            )
        return best_pair[0] , best_pair[1] , min_distance

    def get_linkage_matrix(self , X):
        """Run the agglomerative merge loop on X and record every merge.

        Args:
            X: array with one point per row.

        Returns:
            Float array of shape (n - 1, 3); row k is
            ``[cluster_a, cluster_b, distance]`` for merge k.  Ids below n are
            the original points; id ``n + k`` is the cluster created by merge
            k.  With fewer than two points the array is empty, shape (0, 3).

        Side effects:
            Stores the result in ``self.linkage_matrix`` and the membership of
            every cluster (singletons first, then merges) in ``self.clusters``.
        """
        X = np.asarray(X)
        n = X.shape[0]
        merges = []  # one [cluster_a, cluster_b, distance] row per merge
        clusters = [[i] for i in range(n)]  # member indices of every cluster ever created
        rem_clusters = list(range(n))  # ids of the clusters still waiting to be merged

        # `> 1` (not `!= 1`) so that an empty input ends the loop immediately.
        while len(rem_clusters) > 1:
            min_clust1 , min_clust2 , min_distance = self.get_min_distance(rem_clusters , clusters , X)
            merges.append([min_clust1 , min_clust2 , min_distance])
            rem_clusters.remove(min_clust1)
            rem_clusters.remove(min_clust2)
            # The merged cluster gets the next free id and becomes active.
            clusters.append(np.concatenate((clusters[min_clust1], clusters[min_clust2])))
            rem_clusters.append(len(clusters) - 1)

        self.clusters = clusters
        self.linkage_matrix = np.array(merges, dtype=float).reshape(-1, 3)
        return self.linkage_matrix

    def _scipy_linkage(self):
        """Return the recorded merges as a 4-column SciPy-style linkage matrix.

        The fourth column is the number of points in the cluster created by
        each merge, read from ``self.clusters``.

        Raises:
            ValueError: if no merges have been recorded yet.
        """
        if self.linkage_matrix is None or len(self.linkage_matrix) == 0:
            raise ValueError(
                "No merges recorded: call get_linkage_matrix on at least two points first"
            )
        merges = np.asarray(self.linkage_matrix, dtype=float)
        n_points = merges.shape[0] + 1
        sizes = np.array([len(members) for members in self.clusters[n_points:]], dtype=float)
        return np.column_stack((merges[:, :3], sizes))

    def plot_dendrogram(self , title = 'Dendrogram'):
        """Show a dendrogram of the hierarchy built by ``get_linkage_matrix``.

        ``ff.create_dendrogram`` treats its first argument as raw observations
        and clusters them itself.  To draw the merges computed by this class,
        the precomputed linkage is injected through ``linkagefun`` and the
        observation argument is only a placeholder.

        Raises:
            ValueError: if ``get_linkage_matrix`` has not produced any merges.
        """
        linkage = self._scipy_linkage()
        placeholder = np.zeros((linkage.shape[0] + 1, 1))
        fig = ff.create_dendrogram(
            placeholder,
            distfun=lambda observations: observations,  # skip plotly's own distance step
            linkagefun=lambda _: linkage,  # use the merges computed by this class
        )
        fig.update_layout(
            width=2400,
            height=400,
            xaxis_title='Data Points',
            yaxis_title='Distance',
            title=title
        )
        fig.show()

# New Customers Dataset


In [3]:
# Load the customers table; the clustering features are the columns at
# positions 3 and 4.
data = pd.read_csv('../Data/SMAI-Dataset-hc-dataset/new_customers.csv')
# NOTE(review): this binds the `.iloc` indexer itself and is not referenced by
# any visible cell - confirm whether it is still needed.
preprocess_data = data.iloc
X_data = data.iloc[:, [3, 4]].to_numpy()
print(X_data.shape)

(200, 2)


## Varying the number of features

In [4]:
# Baseline run: both selected columns, single linkage (the class default).
clus_model = HeirarchialClustering(linkage_type='single')
linkage_matrix = clus_model.get_linkage_matrix(X_data)
clus_model.plot_dendrogram('Dendogram taking column 3 and 4')

In [5]:
# Cluster on the column at position 4 only (the plot title counts it as
# column 5); reshape it into an (n, 1) matrix so each row is one point.
X_data_2 = data.iloc[:, 4].to_numpy().reshape(-1, 1)

clus_model_2 = HeirarchialClustering(linkage_type='single')
linkage_matrix_2 = clus_model_2.get_linkage_matrix(X_data_2)
clus_model_2.plot_dendrogram('Dendogram for column 5 data')


In [6]:
# Cluster on the column at position 3 only (the plot title counts it as
# column 4); reshape it into an (n, 1) matrix so each row is one point.
X_data_3 = data.iloc[:, 3].to_numpy().reshape(-1, 1)

clus_model_3 = HeirarchialClustering(linkage_type='single')
linkage_matrix_3 = clus_model_3.get_linkage_matrix(X_data_3)
clus_model_3.plot_dendrogram('Dendogram for column 4 data')

## Varying the linkage type

In [20]:
# Single linkage: clusters are compared by their closest pair of points.
clus_model = HeirarchialClustering('single')
linkage_matrix = clus_model.get_linkage_matrix(X_data)
clus_model.plot_dendrogram('Dendogram using Single Linkage')

In [21]:
# Complete linkage: clusters are compared by their farthest pair of points.
clus_model = HeirarchialClustering('complete')
linkage_matrix = clus_model.get_linkage_matrix(X_data)
clus_model.plot_dendrogram('Dendogram using Complete Linkage')

In [22]:
# Average linkage: clusters are compared by the mean distance over all
# cross-cluster pairs of points.
clus_model = HeirarchialClustering('average')
linkage_matrix = clus_model.get_linkage_matrix(X_data)
clus_model.plot_dendrogram('Dendogram using Average Linkage')

## Observations

### Varying columns
1. Column 4 on its own produces fairly balanced clusters.
2. Column 5 on its own produces long, unbalanced clusters.
3. Overall, combining both columns produces balanced clusters.

### Varying linkages
1. Single linkage produced long but well-separated clusters.
2. Complete linkage produced fewer, but balanced, clusters.
3. Average linkage produced a moderate number of clusters, which were also balanced.

# Gene Dataset

In [7]:
# Read the gene-expression table and preview its first five rows.
gene_data = pd.read_csv('../Data/SMAI-Dataset-gene-expression/gene.csv')
gene_data.head(5)

Unnamed: 0,ID_REF,GSM613412,GSM613413,GSM613414,GSM613415,GSM613416,GSM613417,GSM613418,GSM613419,GSM613420,GSM613421,GSM613422,GSM613423
0,10338001,5192.23,5974.65,5393.02,5158.65,5244.96,5143.31,4621.77,5173.65,5811.77,5465.11,4972.57,4528.7
1,10338002,301.503,307.677,362.596,372.776,299.98,308.912,311.883,331.533,344.161,365.172,357.146,372.436
2,10338003,1850.11,2037.54,1891.18,1752.97,1718.8,1762.98,1630.54,1905.87,2141.35,1891.99,1652.14,1593.69
3,10338004,927.983,1140.82,936.433,918.44,930.313,926.496,793.729,936.184,1041.49,942.377,891.359,826.811
4,10338005,5.24974,5.0173,5.09585,3.76912,5.54982,5.27357,3.79682,4.66623,5.68881,5.25432,4.66148,3.93062


In [8]:
# Cluster the gene table on every column after the first, using the default
# (single) linkage.
X_gene = gene_data.iloc[:, 1:].values
gene_model = HeirarchialClustering()
linkage_matrix = gene_model.get_linkage_matrix(X_gene)
# Plot the model that was just fitted on the gene data; `clus_model` holds a
# model fitted on the customers data.
gene_model.plot_dendrogram(title = 'Gene Data Dendogram taking all columns')

## Varying the number of features

In [9]:
# Sweep over the number of feature columns: 1, 2, ..., all columns after the
# first.  The count comes from `gene_data` itself so the cell does not depend
# on whatever `X_gene` currently holds (the loop below reassigns it).
num_columns = np.arange(1, gene_data.shape[1])

for num_col in num_columns:
    # Use the first `num_col` columns after the leading one.
    X_gene = gene_data.iloc[:, 1:num_col+1].values
    gene_model = HeirarchialClustering()
    linkage_matrix = gene_model.get_linkage_matrix(X_gene)
    # Plot the model fitted in this iteration, not the customers model.
    gene_model.plot_dendrogram(title = f'Gene Data Dendogram taking {num_col} columns')

## Varying the linkage type

In [24]:
# Single linkage on all gene columns after the first.
X_gene = gene_data.iloc[:, 1:].values
gene_model = HeirarchialClustering(linkage_type='single')
linkage_matrix = gene_model.get_linkage_matrix(X_gene)
# Plot the gene model fitted above; `clus_model` is the customers model.
gene_model.plot_dendrogram(title = 'Gene Data Dendogram using Single Linkage')

In [25]:
# Complete linkage on all gene columns after the first.
X_gene = gene_data.iloc[:, 1:].values
gene_model = HeirarchialClustering(linkage_type='complete')
linkage_matrix = gene_model.get_linkage_matrix(X_gene)
# Plot the gene model fitted above; `clus_model` is the customers model.
gene_model.plot_dendrogram(title = 'Gene Data Dendogram using Complete Linkage')

In [26]:
# Average linkage on all gene columns after the first.
X_gene = gene_data.iloc[:, 1:].values
gene_model = HeirarchialClustering(linkage_type='average')
linkage_matrix = gene_model.get_linkage_matrix(X_gene)
# Plot the gene model fitted above; `clus_model` is the customers model.
gene_model.plot_dendrogram(title = 'Gene Data Dendogram using Average Linkage')

## Observations

### Varying features
Increasing the number of columns leads to more balanced and better-separated clusters.

### Varying linkages
There was not much visible difference between the dendrograms for the different linkage types.