# GUC Clustering Project 

**Objective:** 
The objective of this project is to teach students how to apply clustering to real data sets.

The project aims to teach students:
* Which clustering approach to use
* How to compare K-means, Hierarchical, DBSCAN, and Gaussian Mixture clustering
* How to tune the parameters of each data approach
* What is the effect of different distance functions (optional) 
* How to evaluate clustering approaches
* How to display the output
* What is the effect of normalizing the data 

Students in this project will use ready-made functions from Sklearn, plotnine, numpy and pandas 
 



In [None]:
# If plotnine is not installed in Jupyter, install it with the following command:
#   !pip install plotnine

Running this project requires the following imports:

In [None]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
from sklearn.datasets import make_blobs
from plotnine import *   
# StandardScaler is a function to normalize the data 
# You may also check MinMaxScaler and MaxAbsScaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

from sklearn.cluster import DBSCAN

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from sklearn.metrics import silhouette_score

%matplotlib inline

In [None]:
# Helper function that displays data in 2 dimensions and highlights the clusters
def display_cluster(X, km=None, num_clusters=0):
    """Scatter-plot the first two columns of ``X`` on the current pyplot figure.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Data to draw; only columns 0 and 1 are plotted.
    km : object, optional
        Fitted clustering model exposing ``labels_`` and ``cluster_centers_``.
        It is only read when ``num_clusters`` is non-zero.
    num_clusters : int
        Number of cluster labels (``0 .. num_clusters - 1``) to draw. With the
        default of 0 all points are drawn in a single colour.
    """
    color = ['blue', 'red', 'green', 'cyan', 'magenta', 'yellow', 'black', 'brown', 'teal', 'purple']  # List of colors
    alpha = 0.5  # marker opacity
    s = 20  # marker size

    # Convert once so positional slicing works in both branches; previously a
    # DataFrame only worked when num_clusters was non-zero.
    X_np = X.values if isinstance(X, pd.DataFrame) else np.asarray(X)

    if num_clusters == 0:
        plt.scatter(X_np[:, 0], X_np[:, 1], c=color[0], alpha=alpha, s=s, label='Data')
        return

    for i in range(num_clusters):
        # Cycle through the palette so more than len(color) clusters do not
        # raise IndexError.
        cluster_color = color[i % len(color)]
        cluster_indices = np.where(km.labels_ == i)[0]
        plt.scatter(X_np[cluster_indices, 0], X_np[cluster_indices, 1], c=cluster_color, alpha=alpha, s=s)
        # Mark the cluster centre with an 'x' in the same colour.
        plt.scatter(km.cluster_centers_[i][0], km.cluster_centers_[i][1], c=cluster_color, marker='x', s=100)

## Multi Blob Data Set 
* The data set generated below has 6 clusters with varying numbers of points and varying densities
* Cluster the data set below using each of the techniques described in the following sections

In [None]:
# Global plot styling used by the figures drawn after this cell.
plt.rcParams['figure.figsize'] = [8, 8]
sns.set_style("whitegrid")
sns.set_context("talk")

n_bins = 6  # not referenced by any other visible cell

# Blob layout: one centre, one sample count and one spread per blob.
centers = [(-3, -3), (0, 0), (5, 2.5), (-1, 4), (4, 6), (9, 7)]
blob_sizes = [100, 150, 300, 400, 300, 200]
blob_spreads = [1.3, 0.6, 1.2, 1.7, 0.9, 1.7]

# shuffle=False keeps the samples grouped by blob; the fixed seed makes the
# generated points reproducible.
Multi_blob_Data, y = make_blobs(
    n_samples=blob_sizes,
    n_features=2,
    cluster_std=blob_spreads,
    centers=centers,
    shuffle=False,
    random_state=42,
)

# Show the raw, unclustered points.
display_cluster(Multi_blob_Data)


### Kmeans 
* Use Kmeans with different values of K to cluster the above data 
* Display the outcome of each value of K 
* Plot the distortion function versus K and choose the appropriate value of K
* Plot the silhouette_score versus K and use it to choose the best K 
* Store the silhouette_score for the best K for later comparison with other clustering techniques. 

In [None]:
# K-means sweep on the blob data: fit K = 2..7 and record the distortion
# (inertia) and silhouette score of every fit.
K_values = range(2, 8)
distortions = []
silhouette_scores = []

for k in K_values:
    km = KMeans(n_clusters=k, random_state=42).fit(Multi_blob_Data)
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(Multi_blob_Data, km.labels_))

    # One figure per K; display_cluster draws through pyplot onto it.
    plt.figure()
    plt.title(f'KMeans Clustering with K={k}')
    display_cluster(Multi_blob_Data, km, k)

# Distortion curve first, then the silhouette curve, both against K.
for fig_size, curve, heading, y_label in (
    ((10, 6), distortions, 'Distortion vs. Number of Clusters', 'Distortion'),
    ((8, 6), silhouette_scores, 'Silhouette Score vs. Number of Clusters', 'Silhouette Score'),
):
    plt.figure(figsize=fig_size)
    plt.plot(K_values, curve, marker='o')
    plt.title(heading)
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Choose the K whose fit produced the highest silhouette score.
best_position = np.argmax(silhouette_scores)
best_K = K_values[best_position]
print("Best value of K based on silhouette score:", best_K)
print("Best silhouette score:", np.max(silhouette_scores))


### Hierarchal Clustering
* Use the AgglomerativeClustering function to cluster the above data
* In the  AgglomerativeClustering change the following parameters 
    * Affinity (use euclidean, manhattan and cosine)
    * Linkage( use average and single )
    * Distance_threshold (try different)
* For each of these trials plot the dendrogram, calculate the silhouette_score and display the resulting clusters
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques.
* Record your observation

In [None]:
#Hierarchical 
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations
from scipy.cluster.hierarchy import linkage

# Wrap the generated blob array in a DataFrame; the clustering helpers below
# index their input with .iloc / .columns.
data_df = pd.DataFrame(Multi_blob_Data)

def _build_agglomerative(metric, linkage_method, distance_threshold, n_clusters):
    """Create an ``AgglomerativeClustering`` model for one parameter combination.

    The distance keyword is passed as ``metric`` first and as ``affinity`` if
    the constructor rejects it with ``TypeError``.
    NOTE(review): scikit-learn renamed ``affinity`` to ``metric`` (deprecated
    in 1.2, removed in 1.4) -- confirm against the installed version.
    """
    options = dict(linkage=linkage_method, distance_threshold=distance_threshold, n_clusters=n_clusters)
    try:
        return AgglomerativeClustering(metric=metric, **options)
    except TypeError:
        return AgglomerativeClustering(affinity=metric, **options)


def _has_scorable_labels(clusters, n_samples):
    """Return True when the number of distinct labels is strictly between 1 and n_samples."""
    return 1 < len(np.unique(clusters)) < n_samples


def _plot_cluster_pairs(data_df, clusters, affinity, linkage_method, distance_threshold):
    """Draw one scatter plot per pair of feature columns, coloured by cluster label."""
    for first, second in combinations(range(data_df.shape[1]), 2):
        plt.figure(figsize=(8, 6))
        plt.scatter(data_df.iloc[:, first], data_df.iloc[:, second], c=clusters, cmap='viridis')
        plt.title(f"Clusters - Features {first} and {second} - Affinity: {affinity}, Linkage: {linkage_method}, Distance Threshold: {distance_threshold}")
        plt.xlabel(f'Feature {first}')
        plt.ylabel(f'Feature {second}')
        plt.colorbar(label='Cluster')
        plt.grid(True)
        plt.tight_layout()
        plt.show()


def Hierarchical_Clustering(data_df):
    """Sweep agglomerative-clustering settings and report the best silhouette score.

    Two passes are run over every affinity/linkage combination: the first cuts
    the tree at fixed distance thresholds (10 and 20), the second asks for
    exactly two clusters. Each trial that yields a scorable labelling prints
    its silhouette score and shows a dendrogram plus pairwise scatter plots.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Samples in rows, numeric features in columns (accessed via ``.iloc``).

    Returns
    -------
    tuple
        ``(best_score, best_params)``; ``best_score`` stays ``-1`` and
        ``best_params`` stays empty when no trial could be scored.
    """
    # parameters
    affinities = ['euclidean', 'cityblock', 'cosine']
    linkages_methods = ['average', 'single']
    distance_thresholds = [None]
    diff_distance_thresholds = [10, 20]

    n_samples = len(data_df)
    best_score = -1
    best_params = {}

    # Pass 1: cut the hierarchy at explicit distance thresholds.
    for affinity in affinities:
        for linkage_method in linkages_methods:
            # The linkage matrix does not depend on the threshold, so compute
            # it once per affinity/linkage pair.
            Z = linkage(data_df, method=linkage_method, metric=affinity)
            for distance_threshold in diff_distance_thresholds:
                model = _build_agglomerative(affinity, linkage_method, distance_threshold, None)
                clusters = model.fit_predict(data_df)

                # silhouette_score cannot handle a single cluster or one
                # cluster per sample, so such trials are skipped.
                if not _has_scorable_labels(clusters, n_samples):
                    continue

                # NOTE: scored with silhouette_score's default distance,
                # whichever affinity produced the clusters.
                score = silhouette_score(data_df, clusters)
                print(f"Silhouette Score: {score}")

                # plot dendrogram (truncated to the top levels)
                plt.figure(figsize=(10, 5))
                plt.title(f"Dendrogram: Affinity={affinity}, Linkage={linkage_method}, Distance Threshold={distance_threshold}")
                dendrogram(Z, truncate_mode='level', p=3)
                plt.xlabel("Sample Index")
                plt.ylabel("Distance")
                plt.show()

                _plot_cluster_pairs(data_df, clusters, affinity, linkage_method, distance_threshold)

                if score > best_score:
                    best_score = score
                    best_params = {
                        'Affinity': affinity,
                        'Linkage': linkage_method,
                        'Distance Threshold': distance_threshold
                    }

    # Pass 2: no distance threshold, request a fixed number of clusters.
    for affinity in affinities:
        for linkage_method in linkages_methods:
            Z = linkage(data_df, method=linkage_method, metric=affinity)
            for distance_threshold in distance_thresholds:
                # n_clusters and distance_threshold are alternatives here:
                # exactly one of them is set.
                n_clusters = 2 if distance_threshold is None else None

                model = _build_agglomerative(affinity, linkage_method, distance_threshold, n_clusters)
                clusters = model.fit_predict(data_df)

                if not _has_scorable_labels(clusters, n_samples):
                    continue

                score = silhouette_score(data_df, clusters)
                print(f"Silhouette score: {score}")

                # plot dendrogram (full tree)
                plt.figure(figsize=(8, 6))
                plt.title(f'Dendrogram - Affinity: {affinity}, Linkage: {linkage_method}, Distance Threshold: {distance_threshold}')
                dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
                plt.xlabel('Sample Index')
                plt.ylabel('Distance')
                plt.show()

                _plot_cluster_pairs(data_df, clusters, affinity, linkage_method, distance_threshold)

                if score > best_score:
                    best_score = score
                    best_params = {
                        'Affinity': affinity,
                        'Linkage': linkage_method,
                        'Distance Threshold': distance_threshold
                    }

    # Report once, after every trial has run (this used to sit inside the
    # affinity loop and printed partial results several times).
    print("Best Silhouette Score:", best_score)
    print("Best Parameters:", best_params)
    return best_score, best_params

# Run the hierarchical-clustering sweep on the blob data.
Hierarchical_Clustering(data_df)

### DBScan
* Use the DBSCAN function to cluster the above data
* In the  DBscan change the following parameters 
    * EPS (from 0.1 to 3)
    * Min_samples (from 5 to 25)
* Plot the silhouette_score versus the variation in the EPS and the min_samples
* Plot the resulting Clusters in this case 
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques.
* Record your observations and comments

In [None]:
#  DBSCAN 
from sklearn.cluster import DBSCAN

def _plot_dbscan_pairs(data_df, clusters, eps, min_samples):
    """Draw one scatter plot per pair of feature columns, coloured by DBSCAN label."""
    n_features = data_df.shape[1]
    for i in range(n_features):
        for j in range(i + 1, n_features):
            plt.figure(figsize=(8, 6))
            plt.scatter(data_df.iloc[:, i], data_df.iloc[:, j], c=clusters, cmap='viridis', s=50, alpha=0.5)
            plt.title(f'DBSCAN clusters (eps={eps:.2f}, min_samples={min_samples})')
            plt.xlabel(data_df.columns[i])
            plt.ylabel(data_df.columns[j])
            plt.colorbar(label='Cluster')
            plt.grid(True)
            plt.tight_layout()
            plt.show()


def DBSCAN_Clustering(data_df, plot_all_trials=False):
    """Grid-search DBSCAN over ``eps`` and ``min_samples`` and report the best silhouette score.

    ``eps`` takes 10 evenly spaced values in [0.1, 3] and ``min_samples``
    every integer from 5 to 25. Trials producing a single label are skipped.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Samples in rows, numeric features in columns.
    plot_all_trials : bool, optional
        When True, draw the pairwise cluster plots for every scored trial (the
        previous behaviour: up to 210 sets of figures). By default only the
        best-scoring clustering is drawn.

    Returns
    -------
    tuple
        ``(best_score, best_params)``; ``best_score`` stays ``-1`` and
        ``best_params`` stays empty when no trial produced two or more labels.
    """
    # parameters
    eps_num = np.linspace(0.1, 3, 10)
    min_samples_num = range(5, 26)

    best_score = -1
    best_params = {}
    best_clusters = None
    silhouette_scores = []

    for eps in eps_num:
        for min_samples in min_samples_num:
            clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data_df)

            # silhouette_score needs at least two distinct labels.
            if len(np.unique(clusters)) <= 1:
                continue

            # NOTE(review): every distinct label, including any noise label
            # DBSCAN assigns, is scored as a cluster -- confirm this is intended.
            score = silhouette_score(data_df, clusters)
            silhouette_scores.append(score)

            if score > best_score:
                best_score = score
                best_params = {
                    'EPS': eps,
                    'Min Samples': min_samples,
                }
                best_clusters = clusters

            if plot_all_trials:
                _plot_dbscan_pairs(data_df, clusters, eps, min_samples)

    # Show the winning clustering (best_clusters used to be stored but never used).
    if best_clusters is not None and not plot_all_trials:
        _plot_dbscan_pairs(data_df, best_clusters, best_params['EPS'], best_params['Min Samples'])

    # Silhouette score of every scored trial, in grid order (skipped trials
    # are omitted, so the x position is not a direct grid index).
    plt.figure(figsize=(8, 6))
    plt.plot(range(len(silhouette_scores)), silhouette_scores, marker='o', linestyle='-')
    plt.title('Silhouette Score vs (EPS and Min Samples)')
    plt.xlabel('Parameter Combination')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_score)
    print("Best Parameters:", best_params)
    return best_score, best_params


# Run the DBSCAN parameter sweep on the blob data. The call previously used
# the undefined name ``dbscan_clustering`` and raised NameError.
DBSCAN_Clustering(data_df)

### Gaussian Mixture
* Use GaussianMixture function to cluster the above data 
* In GMM change the covariance_type and check the difference in the resulting probability fit
* Use a 2D contour plot to plot the resulting distribution (the components of the GMM) as well as the total Gaussian mixture

In [None]:
from sklearn.mixture import GaussianMixture

def GMM_Clustering(data_df, n_components=3, random_state=42):
    """Fit Gaussian mixtures with each covariance type and report the best silhouette score.

    For the best-scoring covariance type, every pair of features gets a scatter
    plot of the cluster labels next to a filled contour of the log-likelihood
    of a mixture refitted on just that pair.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Samples in rows, numeric features in columns.
    n_components : int, optional
        Number of mixture components (previously hard-coded to 3).
    random_state : int or None, optional
        Seed passed to every ``GaussianMixture`` so repeated runs give the
        same result; pass ``None` to restore unseeded fits.

    Returns
    -------
    tuple
        ``(best_score, best_params)``; ``best_score`` stays ``-1`` and
        ``best_params`` stays empty when no fit predicted two or more labels.
    """
    covariance_types = ['full', 'tied', 'diag', 'spherical']
    best_score = -1
    best_params = {}
    best_cluster_labels = None
    silhouette_scores = []

    for covariance_type in covariance_types:
        gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type,
                              random_state=random_state)
        cluster_labels = gmm.fit_predict(data_df)

        # silhouette_score needs at least two distinct labels; record NaN so
        # the bar chart stays aligned with covariance_types.
        if len(np.unique(cluster_labels)) < 2:
            silhouette_scores.append(np.nan)
            continue

        score = silhouette_score(data_df, cluster_labels)
        silhouette_scores.append(score)

        if score > best_score:
            best_score = score
            best_params = {'Covariance Type': covariance_type}
            best_cluster_labels = cluster_labels

    # Pairwise plots are only possible once some fit has been scored.
    if best_cluster_labels is not None:
        n_features = data_df.shape[1]
        for i in range(n_features):
            for j in range(i + 1, n_features):
                plt.figure(figsize=(8, 6))

                # scatter plot of the best labelling
                plt.subplot(1, 2, 1)
                plt.scatter(data_df.iloc[:, i], data_df.iloc[:, j], c=best_cluster_labels, cmap='viridis', s=50, alpha=0.5)
                plt.xlabel(f'Feature {i}')
                plt.ylabel(f'Feature {j}')
                plt.title('Scatter Plot')

                # Refit on just this pair so the density can be evaluated on a
                # 2-D grid; plain arrays are used for both fit and scoring.
                gmm_pair = GaussianMixture(n_components=n_components,
                                           covariance_type=best_params['Covariance Type'],
                                           random_state=random_state)
                X_pair = data_df.iloc[:, [i, j]].to_numpy()
                gmm_pair.fit(X_pair)

                # contour plot of the mixture's per-point log-likelihood
                plt.subplot(1, 2, 2)
                x_min, x_max = X_pair[:, 0].min() - 1, X_pair[:, 0].max() + 1
                y_min, y_max = X_pair[:, 1].min() - 1, X_pair[:, 1].max() + 1
                xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                                     np.linspace(y_min, y_max, 100))
                Z = gmm_pair.score_samples(np.column_stack([xx.ravel(), yy.ravel()]))
                Z = Z.reshape(xx.shape)
                plt.contourf(xx, yy, Z, cmap='viridis', levels=20, alpha=0.5)
                plt.xlabel(f'Feature {i}')
                plt.ylabel(f'Feature {j}')
                plt.title('Contour Plot')

                plt.tight_layout()
                plt.show()

    # plot silhouette score versus covariance type
    plt.figure(figsize=(8, 6))
    plt.bar(covariance_types, silhouette_scores, color='teal')
    plt.xlabel('Covariance Type')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score versus Covariance Type')
    plt.show()

    # print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_score)
    print("Best Parameters:", best_params)
    return best_score, best_params
    
# Run the Gaussian-mixture comparison on the blob data. The call previously
# used the undefined name ``gaussian_mixture_clustering`` and raised NameError.
GMM_Clustering(data_df)

## iris data set 
The iris data set is a test data set that is part of the Sklearn module
which contains 150 records each with 4 features. All the features are represented by real numbers 

The data represents three classes 


In [None]:
from sklearn.datasets import load_iris
# Load the iris bunch bundled with scikit-learn.
iris_data = load_iris()
# The statements that used to follow were inert REPL-style snippets (their
# values were discarded, and one line was pasted output); kept as notes only:
#   iris_data.target[[10, 25, 50]]   -> array([0, 0, 1])
#   list(iris_data.target_names)     -> ['setosa', 'versicolor', 'virginica']
# Build the feature table, one named column per iris feature.
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

* Repeat all the above clustering approaches and steps on the above data 
* Normalize the data then repeat all the above steps 
* Compare between the different clustering approaches

In [None]:
# K-means sweep on the raw iris features: fit K = 2..7 and record the
# distortion (inertia) and silhouette score of every fit.
K_values = range(2, 8)
distortions, silhouette_scores = [], []

for k in K_values:
    km = KMeans(n_clusters=k, random_state=42).fit(iris_df)
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(iris_df, km.labels_))

    # One figure per K; display_cluster draws the first two columns onto it.
    plt.figure()
    plt.grid(True)
    plt.title("K-means clustering with K={}".format(k))
    display_cluster(iris_df, km, k)

# Distortion curve first, then the silhouette curve, both against K.
for curve, y_label, heading in (
    (distortions, 'Distortion', 'Distortion versus Number of Clusters'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs. Number of Clusters'),
):
    plt.figure(figsize=(10, 6))
    plt.plot(K_values, curve, marker='o')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.grid(True)

# The best K is the one with the highest silhouette score.
best_position = np.argmax(silhouette_scores)
best_K = K_values[best_position]
print(f"The best K is {best_K} with a silhouette score of {np.max(silhouette_scores)}")

plt.show()



In [None]:
# Hierarchical-clustering sweep on the raw (un-normalized) iris features.
Hierarchical_Clustering(iris_df)

In [None]:
# DBSCAN parameter sweep on the raw (un-normalized) iris features.
DBSCAN_Clustering(iris_df)

In [None]:
# Gaussian-mixture comparison on the raw (un-normalized) iris features.
GMM_Clustering(iris_df)

In [None]:
# Normalize: rescale every iris feature with min-max scaling.
scalar = MinMaxScaler()
iris_Normalized = scalar.fit_transform(iris_df)
# fit_transform returns a bare array; carry the feature names and row index
# over so plots of the normalized data keep meaningful axis labels.
iris_df_Normalized = pd.DataFrame(iris_Normalized, columns=iris_df.columns, index=iris_df.index)


In [None]:

# K-means sweep on the normalized iris features (K = 2..7).
K_values = range(2, 8)
distortions = []
silhouette_scores = []

for k in K_values:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(iris_df_Normalized)

    # Keep both quality measures of this fit for the curves below.
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(iris_df_Normalized, km.labels_))

    # Show the clusters found for this K on a fresh figure.
    plt.figure()
    plt.grid(True)
    plt.title("K-means clustering with K={}".format(k))
    display_cluster(iris_df_Normalized, km, k)

# (values, y-axis label, title) for the two curves drawn against K.
curve_specs = [
    (distortions, 'Distortion', 'Distortion versus Number of Clusters'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs. Number of Clusters'),
]
for curve_values, curve_label, curve_title in curve_specs:
    plt.figure(figsize=(8, 6))
    plt.plot(K_values, curve_values, marker='o')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel(curve_label)
    plt.title(curve_title)
    plt.grid(True)

# Pick the K with the highest silhouette score and report it.
top_index = np.argmax(silhouette_scores)
best_K = K_values[top_index]
print(f"The best K is {best_K} with a silhouette score of {np.max(silhouette_scores)}")

plt.show()



In [None]:
# Hierarchical-clustering sweep on the min-max-normalized iris features.
Hierarchical_Clustering(iris_df_Normalized)

In [None]:
# DBSCAN parameter sweep on the min-max-normalized iris features.
DBSCAN_Clustering(iris_df_Normalized)

In [None]:
# Gaussian-mixture comparison on the min-max-normalized iris features.
GMM_Clustering(iris_df_Normalized)

* When comparing the different clustering approaches before normalization, Hierarchical Clustering and DBSCAN stood out with the highest silhouette score of 0.687, closely followed by K Means Clustering at 0.681. However, GMM lagged behind with a lower silhouette score of 0.553 compared to the other methods.

* After normalization, all methods exhibited similar silhouette scores around 0.630. Interestingly, Hierarchical Clustering, DBSCAN, and K Means Clustering demonstrated identical silhouette scores and parameters, suggesting comparable performance. Conversely, GMM yielded a slightly lower silhouette score of 0.507 with a different covariance type, indicating some variation in its clustering results.

## Customer dataset
Repeat all the above on the customer data set

In [None]:
# Load the customer table and use its 'ID' column as the row index.
customer_dataset = pd.read_csv('C:/Users/Adham/Desktop/Customer data.csv').set_index('ID')

In [None]:
# K-means sweep on the raw customer data (K = 2..7).
K_values = range(2, 8)
distortions, silhouette_scores = [], []

for k in K_values:
    km = KMeans(n_clusters=k, random_state=42).fit(customer_dataset)
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(customer_dataset, km.labels_))

    # display clusters for this K (first two columns only)
    plt.figure()
    plt.grid(True)
    plt.title("K-means clustering with K={}".format(k))
    display_cluster(customer_dataset, km, k)

# Draw the distortion curve, then the silhouette curve, each on its own figure.
curve_titles = ('Distortion versus Number of Clusters', 'Silhouette Score vs. Number of Clusters')
curve_labels = ('Distortion', 'Silhouette Score')
for series, axis_label, heading in zip((distortions, silhouette_scores), curve_labels, curve_titles):
    plt.figure(figsize=(8, 6))
    plt.plot(K_values, series, marker='o')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel(axis_label)
    plt.title(heading)
    plt.grid(True)

# The best K is the one that maximises the silhouette score.
winner = np.argmax(silhouette_scores)
best_K = K_values[winner]
print(f"The best K is {best_K} with a silhouette score of {np.max(silhouette_scores)}")

plt.show()

In [None]:
# Hierarchical-clustering sweep on the raw (un-normalized) customer data.
Hierarchical_Clustering(customer_dataset)

In [None]:
# DBSCAN parameter sweep on the raw (un-normalized) customer data.
DBSCAN_Clustering(customer_dataset)

In [None]:
# Gaussian-mixture comparison on the raw (un-normalized) customer data.
GMM_Clustering(customer_dataset)

In [None]:
# Normalize the customer data with min-max scaling.
# No ``scaler`` name is defined in any earlier visible cell (the call below
# used to raise NameError), so create a dedicated scaler here.
scaler = MinMaxScaler()
customer_dataset_Normalized = scaler.fit_transform(customer_dataset)
# fit_transform returns a bare array; carry the column names and the row
# index over to the normalized frame.
customer_dataset_Normalized_df = pd.DataFrame(
    customer_dataset_Normalized,
    columns=customer_dataset.columns,
    index=customer_dataset.index,
)

In [None]:

# K-means sweep on the normalized customer data: fit K = 2..7 and record the
# distortion (inertia) and silhouette score of every fit.
K_values = range(2, 8)
distortions = []
silhouette_scores = []

for k in K_values:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(customer_dataset_Normalized_df)
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(customer_dataset_Normalized_df, km.labels_))

    # Display clusters for each value of K. The helper is named
    # ``display_cluster``; the former ``display_clusters`` raised NameError.
    plt.figure()
    plt.grid(True)
    plt.title("K-means clustering with K={}".format(k))
    display_cluster(customer_dataset_Normalized_df, km, k)

# plot distortion function versus number of clusters
plt.figure(figsize=(10, 6))
plt.plot(K_values, distortions, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Distortion')
plt.title('Distortion versus Number of Clusters')
plt.grid(True)

# plot silhouette score versus number of clusters
plt.figure(figsize=(10, 6))
plt.plot(K_values, silhouette_scores, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')
plt.grid(True)

# find the best value of K based on silhouette score
best_K = K_values[np.argmax(silhouette_scores)]
print(f"The best K is {best_K} with a silhouette score of {np.max(silhouette_scores)}")

plt.show()

In [None]:
# Hierarchical-clustering sweep on the normalized customer data.
Hierarchical_Clustering(customer_dataset_Normalized_df)

In [None]:
# DBSCAN parameter sweep on the normalized customer data.
DBSCAN_Clustering(customer_dataset_Normalized_df)

In [None]:
# Gaussian-mixture comparison on the normalized customer data.
GMM_Clustering(customer_dataset_Normalized_df)

* Before normalization, Hierarchical Clustering yielded the highest silhouette score of 0.759, trailed by K Means Clustering at 0.583. DBSCAN did not manage to identify clusters, resulting in a silhouette score of -1, while GMM achieved a score of 0.483.

* After normalization, DBSCAN emerged as the top performer with a silhouette score of 0.521, followed by K Means Clustering with 0.437. Hierarchical Clustering and GMM displayed lower silhouette scores compared to DBSCAN and K Means Clustering.