This notebook demonstrates the performance of different clustering algorithms on simulated datasets where cluster memberships are known. 
Reference: https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html

### Import Packages

In [None]:
# data processing
import numpy as np
import pandas as pd

# modeling
from sklearn.preprocessing import StandardScaler
from sklearn import cluster

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

### Set-up

In [None]:
# color palette for plotting: at most four clusters can be colored
# (red, dark slate, green, royal blue)
colors = [
    "#e74c3c",
    "#34495e",
    "#2ecc71",
    "#4169e1",
]

# use seaborn's plain white background for every figure in this notebook
sns.set_style('white')

### Read data

The following datasets are generated for demonstrative purposes. The goal is to show the characteristics of different clustering algorithms on datasets with different 2D shapes.

In [None]:
git_url = 'https://raw.githubusercontent.com/vishal-git/dapt-631/main/data'

# Load the four simulated datasets in one pass:
#   1. varied       - blobs with varied variances
#   2. aniso        - anisotropically distributed data
#   3. blobs        - Gaussian blobs
#   4. no_structure - no structure
varied, aniso, blobs, no_structure = (
    pd.read_csv(f'{git_url}/{name}.csv')
    for name in ('varied', 'aniso', 'blobs', 'no_structure')
)

# the unstructured data gets a single constant label in column 'y'
no_structure = no_structure.assign(y=0)

### View datasets

In [None]:
datasets = [varied, aniso, blobs, no_structure]

for dataset in datasets:

    # standardize the two feature columns (zero mean, unit variance) so all
    # datasets are on a comparable scale for the fixed axis limits below
    X = StandardScaler().fit_transform(dataset[['X1', 'X2']])

    # count the number of (known) clusters in the dataset
    n_clus = dataset['y'].nunique()

    # scatter plot, one color per known cluster label
    plt.figure(figsize=(5, 5))
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=dataset['y'], palette=colors[:n_clus])

    # adjust/clean the plot: fixed limits, no tick marks
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())

### Agglomerative Clustering using Ward Linkage Method

In [None]:
# number of clusters requested from the algorithm (reused by later cells)
req_clus = 3

for dataset in datasets:

    # standardize the dataset
    X = StandardScaler().fit_transform(dataset[['X1', 'X2']])

    # fit a model using Ward's linkage
    ward = cluster.AgglomerativeClustering(n_clusters=req_clus, linkage='ward').fit(X)

    # predict cluster: one integer label per data point
    y_pred = ward.labels_.astype(int)

    # count the number of _predicted_ clusters
    n_pred_clus = len(np.unique(y_pred))

    # scatter plot, color each data point according to the predicted cluster
    plt.figure(figsize=(5, 5))
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=colors[:n_pred_clus])

    # adjust/clean the plot
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(());

### Comparisons of two linkage methods for Agglomerative Clustering

In [None]:
# one row per dataset, one column per panel: true labels, Ward, complete linkage
plt.figure(figsize=(15, 20))
plot_num = 1

for i, dataset in enumerate(datasets):

    X = StandardScaler().fit_transform(dataset[['X1', 'X2']])

    for criterion in ['none', 'ward', 'complete']:

        plt.subplot(len(datasets), 3, plot_num)

        if criterion == 'none':
            # reference panel: color by the known cluster labels
            hue = dataset['y']
            n_clus = hue.nunique()
            palette = colors[:n_clus]
        else:
            # agglomerative clustering with the current linkage criterion
            agglomerative = cluster.AgglomerativeClustering(
                n_clusters=req_clus, linkage=criterion
            ).fit(X)
            y_pred = agglomerative.labels_.astype(int)
            n_pred_clus = len(np.unique(y_pred))
            hue = y_pred
            palette = colors[:n_pred_clus]

        # column titles are only drawn on the first row
        if i == 0:
            plt.title(criterion, size=14)

        sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=hue, palette=palette)

        # same fixed limits and no ticks on every panel
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())

        plot_num += 1

plt.show()

___

### Comparisons: Agglomerative Clustering vs. _k_-means

In [None]:
# clustering criterion/methods to use ('none' = plot the known labels)
criteria = ['none', 'ward', 'kmeans']
plt.figure(figsize=(15, 20))
plot_num = 1

for i, dataset in enumerate(datasets):

    X = StandardScaler().fit_transform(dataset[['X1', 'X2']])

    for criterion in criteria:

        # grid: one row per dataset, one column per criterion
        plt.subplot(len(datasets), len(criteria), plot_num)

        if criterion == 'none':

            n_clus = dataset['y'].nunique()
            if i == 0:
                plt.title(criterion, size=14)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=dataset['y'], palette=colors[:n_clus])

        elif criterion == 'ward':
            agglomerative = cluster.AgglomerativeClustering(n_clusters=req_clus,
                                                            linkage=criterion).fit(X)
            y_pred = agglomerative.labels_.astype(int)
            n_pred_clus = len(np.unique(y_pred))
            if i == 0:
                plt.title(criterion, size=14)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=colors[:n_pred_clus])

        elif criterion == 'kmeans':
            # fixed random_state keeps the k-means result reproducible across runs
            y_pred = cluster.KMeans(n_clusters=req_clus, n_init=10, random_state=314).fit_predict(X)
            n_pred_clus = len(np.unique(y_pred))
            if i == 0:
                plt.title(criterion, size=14)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=colors[:n_pred_clus])

        else:
            print('Invalid clustering criterion/method specified.')

        # same fixed limits and no ticks on every panel
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())

        plot_num += 1

plt.show()

___

### Comparisons: Agglomerative Clustering vs. _k_-means vs. DBSCAN

In [None]:
# clustering criterion/methods to use ('none' = plot the known labels)
criteria = ['none', 'ward', 'kmeans', 'DBSCAN']
plt.figure(figsize=(20, 20))
plot_num = 1

for i, dataset in enumerate(datasets):

    X = StandardScaler().fit_transform(dataset[['X1', 'X2']])

    for criterion in criteria:

        # grid: one row per dataset, one column per criterion
        plt.subplot(len(datasets), len(criteria), plot_num)

        if criterion == 'none':

            n_clus = dataset['y'].nunique()
            if i == 0:
                plt.title(criterion, size=14)
            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=dataset['y'], palette=colors[:n_clus])

        elif criterion == 'ward':
            agglomerative = cluster.AgglomerativeClustering(n_clusters=req_clus, linkage=criterion).fit(X)
            y_pred = agglomerative.labels_.astype(int)
            n_pred_clus = len(np.unique(y_pred))
            if i == 0:
                plt.title(criterion, size=14)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=colors[:n_pred_clus])

        elif criterion == 'DBSCAN':
            # eps is the neighborhood radius on the standardized data; 0.3 is a
            # starting value to tune per dataset, not a universally good choice
            dbscan = cluster.DBSCAN(eps=0.3).fit(X)
            # labels_ uses -1 for points DBSCAN treats as noise
            y_pred = dbscan.labels_.astype(int)
            n_pred_clus = len(np.unique(y_pred))
            if i == 0:
                plt.title(criterion, size=14)

            # DBSCAN picks the number of clusters itself, so there can be more
            # labels than entries in `colors`; fall back to a generated palette
            if n_pred_clus <= len(colors):
                palette = colors[:n_pred_clus]
            else:
                palette = sns.color_palette('husl', n_pred_clus)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=palette)

        elif criterion == 'kmeans':
            # fixed random_state keeps the k-means result reproducible across runs
            y_pred = cluster.KMeans(n_clusters=req_clus, n_init=10, random_state=314).fit_predict(X)
            n_pred_clus = len(np.unique(y_pred))
            if i == 0:
                plt.title(criterion, size=14)

            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred, palette=colors[:n_pred_clus])

        else:
            print('Invalid clustering criterion/method specified.')

        # same fixed limits and no ticks on every panel
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())

        plot_num += 1

plt.show()