# Lab 4: Clustering

## Libraries

In [30]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage

## Utility Functions

In [32]:
def plot_data(fig, X, y, clusters=None, kmean_centers=None, row=1, col=1):
    """Draw a labelled 2-D scatter plot into one subplot of ``fig``.

    One scatter trace is added per cluster label, both axes of the target
    subplot are fixed to [-15, 15], dashed reference lines are drawn through
    the origin and, when given, the cluster centers are overlaid.

    Args:
        fig: Figure to draw into. It is modified in place (traces, shapes,
            axis settings, and the legend is switched off for the figure).
        X: 2-D array of points; column 0 goes on the x axis, column 1 on y.
        y: Array of cluster labels, one per row of ``X``.
        clusters: Number of clusters; labels ``0 .. clusters - 1`` are
            plotted. When ``None``, the distinct labels found in ``y`` are
            plotted instead.
        kmean_centers: Optional 2-D array of center coordinates, laid out
            like ``X``.
        row: Subplot row forwarded to plotly.
        col: Subplot column forwarded to plotly.

    Returns:
        None.
    """
    # Labels run from 0 to clusters - 1, so iterating one step further would
    # only add an empty trace. Without an explicit count, use the labels that
    # actually occur in y.
    cluster_ids = np.unique(y) if clusters is None else range(clusters)

    # Add data points
    for k in cluster_ids:
        mask = y == k
        fig.add_trace(go.Scatter(
            x=X[mask, 0],
            y=X[mask, 1],
            mode='markers',
            name=f'Cluster {k}',
            marker=dict(
                size=8,
                opacity=0.7,
                line=dict(width=1, color='white')
            )
        ), row=row, col=col)

    # Update layout: both axes share the same range and grid styling
    fig.update_xaxes(
        title="X1",
        range=[-15, 15],
        showgrid=True,
        gridwidth=1,
        gridcolor="LightGrey",
        row=row,
        col=col
    )

    fig.update_yaxes(
        title="X2",
        range=[-15, 15],
        showgrid=True,
        gridwidth=1,
        gridcolor="LightGrey",
        row=row,
        col=col
    )

    # Add center lines (vertical, then horizontal) through the origin
    fig.add_shape(
        type="line", x0=0, y0=-10, x1=0, y1=10,
        line=dict(color="Black", width=1, dash="dash"),
        row=row,
        col=col
    )
    fig.add_shape(
        type="line", x0=-10, y0=0, x1=10, y1=0,
        line=dict(color="Black", width=1, dash="dash"),
        row=row,
        col=col
    )

    # Hide legends
    fig.update_layout(showlegend=False)

    # Overlay the centers last so they are added after the data points
    if kmean_centers is not None:
        fig.add_trace(go.Scatter(
            x=kmean_centers[:, 0],
            y=kmean_centers[:, 1],
            mode='markers',
            name='Centers',
            marker=dict(
                size=12,
                opacity=0.7,
                line=dict(width=1, color='white')
            )
        ), row=row, col=col)

In [33]:
def generate_cluster_data(
    n: int = 10000,
    centers: int = 4,
    std: float = 0.6,
    random_state: int = 42
) -> tuple[np.ndarray, np.ndarray]:
    """Generate a synthetic clustering dataset with ``make_blobs``.

    Args:
        n: Forwarded to ``make_blobs`` as ``n_samples``.
        centers: Forwarded to ``make_blobs`` as ``centers``.
        std: Forwarded to ``make_blobs`` as ``cluster_std``.
        random_state: Forwarded unchanged to ``make_blobs``.

    Returns:
        The ``(X, y)`` pair returned by ``make_blobs``.
    """
    # Centers are always drawn from the fixed box (-10, 10).
    blob_options = dict(
        n_samples=n,
        centers=centers,
        center_box=(-10, 10),
        cluster_std=std,
        random_state=random_state,
    )
    points, labels = make_blobs(**blob_options)
    return points, labels

In [34]:
def plot_elbow_method(sse, K):
    """Show the elbow curve: SSE plotted against the number of clusters.

    Args:
        sse: SSE values used as y coordinates, one per entry of ``K``.
        K: Iterable of cluster counts; converted to a list for the x axis.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    elbow_trace = go.Scatter(
        x=list(K),
        y=sse,
        mode='lines+markers',
        name='SSE',
        line=dict(color='royalblue', width=2),
        marker=dict(size=8, color='royalblue'),
        hovertemplate='K=%{x}<br>SSE=%{y:.2f}<extra></extra>'
    )

    # One tick per integer K, starting at 1.
    k_axis = dict(
        title='Number of Clusters (K)',
        tickmode='linear',
        tick0=1,
        dtick=1
    )
    layout_options = dict(
        title='Elbow Method For Optimal K',
        title_x=0.5,
        xaxis=k_axis,
        yaxis=dict(title='Sum of Squared Errors (SSE)'),
        hovermode='x',
        template='plotly_white',
        width=500,
        height=500,
        showlegend=False
    )
    grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')

    fig = go.Figure()
    fig.add_trace(elbow_trace)
    fig.update_layout(**layout_options)
    # Same light-gray grid on both axes.
    for apply_grid in (fig.update_xaxes, fig.update_yaxes):
        apply_grid(**grid_style)
    fig.show()

## Assignment 1

### (a) Generating data

In [35]:
# Dataset parameters for this experiment
clusters = 4
data_points = 300
std = 0.6

X, y = generate_cluster_data(
    n=data_points,
    centers=clusters,
    std=std,
    random_state=1505,
)

# Report the array shapes of the generated data
print("X shape: ", X.shape)
print("y shape: ", y.shape)



X shape:  (300, 2)
y shape:  (300,)


In [36]:
# Plot the generated points, one trace per label in y, in a single subplot
fig = make_subplots(rows=1, cols=1)
plot_data(fig=fig, X=X, y=y, clusters=clusters, row=1, col=1)
fig.show()


### (b) Running K-means

In [37]:
K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_cols = 2
n_rows = -(-len(K) // n_cols)  # ceiling division so every K gets its own subplot
fig = make_subplots(rows=n_rows, cols=n_cols)
sse = []

# Fit K-means once per candidate K, draw each clustering into its own
# subplot (filled row by row) and record the inertia for the elbow plot.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=1505)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    # Position in K decides the subplot; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(fig, X, y_clusters, clusters=k, kmean_centers=centers, row=grid_row + 1, col=grid_col + 1)
    sse.append(kmeans.inertia_)

# Changing the size of the figs
fig.update_layout(height=2000, width=800)

fig.show()
    

### (c) Plot SSE per K - Elbow Method

In [38]:
# Elbow curve for the K-means sweep above
plot_elbow_method(sse=sse, K=K)


### (d) Repeat the process with std = 0.1

In [39]:
# Same setup as before, but with a smaller cluster standard deviation
clusters = 4
data_points = 300
std = 0.1

X, y = generate_cluster_data(
    n=data_points,
    centers=clusters,
    std=std,
    random_state=1505,
)

# Report the array shapes of the generated data
print("X shape: ", X.shape)
print("y shape: ", y.shape)


X shape:  (300, 2)
y shape:  (300,)


In [40]:
K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_cols = 2
n_rows = -(-len(K) // n_cols)  # ceiling division so every K gets its own subplot
fig = make_subplots(rows=n_rows, cols=n_cols)
sse = []

# Fit K-means once per candidate K, draw each clustering into its own
# subplot (filled row by row) and record the inertia for the elbow plot.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=1505)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    # Position in K decides the subplot; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(fig, X, y_clusters, clusters=k, kmean_centers=centers, row=grid_row + 1, col=grid_col + 1)
    sse.append(kmeans.inertia_)

# Changing the size of the figs
fig.update_layout(height=2000, width=800)

fig.show()

In [41]:
# Elbow curve for the std = 0.1 sweep
plot_elbow_method(sse=sse, K=K)


### Repeat the process with std = 2.5 

In [42]:
# Dataset parameters: larger standard deviation than the earlier runs
clusters = 4
data_points = 300
std = 2.5
X, y = generate_cluster_data(n=data_points, centers=clusters, std=std, random_state=1505)

print("X shape: ", X.shape)
print("y shape: ", y.shape)

K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_cols = 2
n_rows = -(-len(K) // n_cols)  # ceiling division so every K gets its own subplot
fig = make_subplots(rows=n_rows, cols=n_cols)
sse = []

# Fit K-means once per candidate K, draw each clustering into its own
# subplot (filled row by row) and record the inertia for the elbow plot.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=1505)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    # Position in K decides the subplot; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(fig, X, y_clusters, clusters=k, kmean_centers=centers, row=grid_row + 1, col=grid_col + 1)
    sse.append(kmeans.inertia_)

# Changing the size of the figs
fig.update_layout(height=2000, width=800)

fig.show()

X shape:  (300, 2)
y shape:  (300,)


In [43]:
# Elbow curve for the std = 2.5 sweep
plot_elbow_method(sse=sse, K=K)

### (e) Repeat the process with `random_state=None`


In [44]:
# No fixed seed here: random_state=None is passed to both the data generator
# and K-means.
clusters = 4
data_points = 300
std = 2.5
X, y = generate_cluster_data(n=data_points, centers=clusters, std=std, random_state=None)

print("X shape: ", X.shape)
print("y shape: ", y.shape)

K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_cols = 2
n_rows = -(-len(K) // n_cols)  # ceiling division so every K gets its own subplot
fig = make_subplots(rows=n_rows, cols=n_cols)
sse = []

# Fit K-means once per candidate K, draw each clustering into its own
# subplot (filled row by row) and record the inertia for the elbow plot.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=None)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    # Position in K decides the subplot; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(fig, X, y_clusters, clusters=k, kmean_centers=centers, row=grid_row + 1, col=grid_col + 1)
    sse.append(kmeans.inertia_)

# Changing the size of the figs
fig.update_layout(height=2000, width=800)

fig.show()

X shape:  (300, 2)
y shape:  (300,)


In [45]:
# Elbow curve for the unseeded std = 2.5 sweep
plot_elbow_method(sse=sse, K=K)

In [46]:
# No fixed seed here: random_state=None is passed to both the data generator
# and K-means.
clusters = 4
data_points = 300
std = 0.1
X, y = generate_cluster_data(n=data_points, centers=clusters, std=std, random_state=None)

print("X shape: ", X.shape)
print("y shape: ", y.shape)

K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_cols = 2
n_rows = -(-len(K) // n_cols)  # ceiling division so every K gets its own subplot
fig = make_subplots(rows=n_rows, cols=n_cols)
sse = []

# Fit K-means once per candidate K, draw each clustering into its own
# subplot (filled row by row) and record the inertia for the elbow plot.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=None)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    # Position in K decides the subplot; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(fig, X, y_clusters, clusters=k, kmean_centers=centers, row=grid_row + 1, col=grid_col + 1)
    sse.append(kmeans.inertia_)

# Changing the size of the figs
fig.update_layout(height=2000, width=800)

fig.show()

X shape:  (300, 2)
y shape:  (300,)


In [47]:
# Elbow curve for the unseeded std = 0.1 sweep
plot_elbow_method(sse=sse, K=K)

## Assignment 2

### Utility Functions

In [17]:
def create_dendrogram(X, Z, labels):
    """Display a dendrogram for ``X`` using a precomputed linkage matrix.

    Args:
        X: Data passed to ``ff.create_dendrogram``.
        Z: Precomputed linkage matrix. It is returned as-is by the linkage
            callback, so the figure factory does not compute its own.
        labels: Leaf labels passed to ``ff.create_dendrogram``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    def precomputed_linkage(_distances):
        # Ignore whatever the figure factory passes in and reuse Z.
        return Z

    # Create dendrogram
    # NOTE(review): color_threshold=0 with a one-entry colorscale is
    # presumably meant to draw every branch in the same color - confirm.
    fig = ff.create_dendrogram(
        X,
        orientation='bottom',
        labels=labels,
        linkagefun=precomputed_linkage,
        color_threshold=0,
        colorscale=['#1f77b4']
    )

    fig.update_layout(height=500, width=800, showlegend=False)
    fig.show()

### Loading Data

In [None]:
# Read the vertebrate dataset and preview up to its first 20 rows
data = pd.read_csv('vertebrate.csv')
data.head(n=20)


Unnamed: 0,Name,Warm-blooded,Gives Birth,Aquatic Creature,Aerial Creature,Has Legs,Hibernates,Class
0,human,1,1,0,0,1,0,mammals
1,python,0,0,0,0,0,1,reptiles
2,salmon,0,0,1,0,0,0,fishes
3,whale,1,1,1,0,0,0,mammals
4,frog,0,0,1,0,1,1,amphibians
5,komodo,0,0,0,0,1,0,reptiles
6,bat,1,1,0,1,1,1,mammals
7,pigeon,1,0,0,1,1,0,birds
8,cat,1,1,0,0,1,0,mammals
9,leopard shark,0,1,1,0,0,0,fishes


In [25]:
# Feature matrix: every column between the first and the last one
# (the original comment names these as 'Name' and 'Class').
X = data.iloc[:, 1:-1].values

# Animal names and their class strings
labels, class_labels = data['Name'].values, data['Class'].values

# Integer-encode the class strings (the original comment says this is for coloring)
le = LabelEncoder()
class_encoded = le.fit_transform(class_labels)

### Single Link

In [26]:
# Single-linkage clustering, shown as a dendrogram
Z = linkage(X, method='single')
create_dendrogram(X=X, Z=Z, labels=labels)


In [19]:
# Complete-linkage clustering, shown as a dendrogram
Z = linkage(X, method='complete')
create_dendrogram(X=X, Z=Z, labels=labels)

In [28]:
# Average-linkage clustering, shown as a dendrogram
Z = linkage(X, method='average')
create_dendrogram(X=X, Z=Z, labels=labels)

## Assignment 3

### Loading Data

In [29]:
# Read the chameleon dataset and preview up to its first 20 rows
data = pd.read_csv('chameleon.csv')
data.head(n=20)

Unnamed: 0,x,y
0,650.914,214.888
1,41.767,179.408
2,509.126,233.749
3,486.403,152.427
4,46.883,367.904
5,539.27,343.616
6,614.93,139.613
7,61.127,99.23
8,334.174,380.344
9,284.575,412.521


In [37]:
# Create and fit DBSCAN model
dbscan = DBSCAN(eps=15.5, min_samples=5)
# Cluster on the coordinate columns only. Fitting on the whole frame would
# feed the 'cluster' column added below back in as a feature whenever this
# cell is re-run.
clusters = dbscan.fit_predict(data[['x', 'y']])
data['cluster'] = clusters

In [41]:
# Scatter the points, coloring each marker by its value in the 'cluster' column
cluster_scatter = go.Scatter(
    x=data['x'],
    y=data['y'],
    mode='markers',
    marker_color=data['cluster'],
)
fig = go.Figure()
fig.add_trace(cluster_scatter)
fig.show()
