# Lab 4: Clustering

## Libraries

In [69]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

## Utility Functions

In [103]:
def plot_data(fig, X, y, clusters=None, kmean_centers=None, row=1, col=1):
    """Draw labelled 2-D points, and optionally cluster centres, into one subplot.

    One scatter trace is added per cluster label, followed by axis styling,
    two dashed guide lines through the origin and, when given, one extra
    trace for the cluster centres. ``fig`` is modified in place.

    Args:
        fig: Plotly figure (typically from ``make_subplots``) to draw into.
        X: Point coordinates; column 0 is plotted on the x-axis and column 1
            on the y-axis.
        y: Label of each point in ``X``.
        clusters: Number of clusters; labels ``0 .. clusters - 1`` are drawn.
            When ``None``, the distinct values present in ``y`` are used.
        kmean_centers: Optional centre coordinates, laid out like ``X``.
        row: 1-based subplot row to draw into.
        col: 1-based subplot column to draw into.

    Returns:
        None.
    """
    points = np.asarray(X)
    labels = np.asarray(y)

    # Labels run from 0 to clusters - 1, so iterating up to `clusters`
    # inclusive would add an empty trace. With no count given, fall back to
    # whatever labels actually occur in the data.
    if clusters is None:
        cluster_ids = np.unique(labels)
    else:
        cluster_ids = range(clusters)

    # Add data points, one trace per cluster.
    for k in cluster_ids:
        mask = labels == k
        fig.add_trace(go.Scatter(
            x=points[mask, 0],
            y=points[mask, 1],
            mode='markers',
            name=f'Cluster {k}',
            marker=dict(
                size=8,
                opacity=0.7,
                line=dict(width=1, color='white')
            )
        ), row=row, col=col)

    # Both axes share the same range and grid styling; only the title differs.
    axis_style = dict(
        range=[-15, 15],
        showgrid=True,
        gridwidth=1,
        gridcolor="LightGrey",
        row=row,
        col=col
    )
    fig.update_xaxes(title="X1", **axis_style)
    fig.update_yaxes(title="X2", **axis_style)

    # Add center lines: a vertical and a horizontal dashed line through the origin.
    guide_lines = (
        dict(x0=0, y0=-10, x1=0, y1=10),
        dict(x0=-10, y0=0, x1=10, y1=0),
    )
    for endpoints in guide_lines:
        fig.add_shape(
            type="line",
            line=dict(color="Black", width=1, dash="dash"),
            row=row,
            col=col,
            **endpoints
        )

    # Hide legends (a figure-wide layout setting, not a per-subplot one).
    fig.update_layout(showlegend=False)

    # Centres are added last, after every cluster trace.
    if kmean_centers is not None:
        center_points = np.asarray(kmean_centers)
        fig.add_trace(go.Scatter(
            x=center_points[:, 0],
            y=center_points[:, 1],
            mode='markers',
            name='Centers',
            marker=dict(
                size=12,
                opacity=0.7,
                line=dict(width=1, color='white')
            )
        ), row=row, col=col)

In [104]:
def generate_cluster_data(
    n: int = 10000,
    centers: int = 4,
    std: float = 0.6,
    random_state: int = 42
) -> tuple[np.ndarray, np.ndarray]:
    """Generate a synthetic clustering dataset with ``make_blobs``.

    Args:
        n: Number of samples to generate (passed as ``n_samples``).
        centers: Passed as ``centers`` to ``make_blobs``.
        std: Passed as ``cluster_std`` to ``make_blobs``.
        random_state: Seed forwarded to ``make_blobs``.

    Returns:
        The ``(X, y)`` pair returned by ``make_blobs``, generated with
        ``center_box=(-10, 10)``.
    """
    blob_options = {
        "n_samples": n,
        "centers": centers,
        "center_box": (-10, 10),
        "cluster_std": std,
        "random_state": random_state,
    }
    features, labels = make_blobs(**blob_options)
    return features, labels

## Assignment 1

### (a) Generating data

In [107]:
# Dataset configuration for this assignment.
data_points = 300
clusters = 4
std = 0.6

# A fixed seed keeps the generated dataset the same on every run.
X, y = generate_cluster_data(
    n=data_points,
    centers=clusters,
    std=std,
    random_state=1505,
)

# Print both array shapes as a quick sanity check.
print("X shape: ", X.shape)
print("y shape: ", y.shape)



X shape:  (300, 2)
y shape:  (300,)


In [108]:
# Show the generated points on a single-panel figure, grouped by their labels.
fig = make_subplots(rows=1, cols=1)
# plot_data's row/col parameters default to 1, i.e. the only panel.
plot_data(fig, X, y, clusters=clusters)
fig.show()


### (b) Running K-means

In [109]:
# Fit K-means for every k in K and draw each clustering in its own panel.
K = list(range(1, 11))

# Derive the grid from len(K) so that changing K cannot overflow the figure.
n_cols = 2
n_rows = (len(K) + n_cols - 1) // n_cols  # ceiling division
fig = make_subplots(rows=n_rows, cols=n_cols)

# enumerate gives the panel position directly; K.index(k) was a linear search
# per iteration and would pick the wrong panel if K held duplicate values.
for idx, k in enumerate(K):
    kmeans = KMeans(n_clusters=k, random_state=1505)
    y_clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_

    # Panels are filled row by row; subplot indices are 1-based.
    grid_row, grid_col = divmod(idx, n_cols)
    plot_data(
        fig,
        X,
        y_clusters,
        clusters=k,
        kmean_centers=centers,
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Changing the size of the figs: 400 px per row (2000 px for the 5 rows used here).
fig.update_layout(height=400 * n_rows, width=800)

fig.show()
    