In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import multivariate_normal

# Draw 300 correlated samples from a zero-mean bivariate normal
np.random.seed(0)
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]
samples = np.random.multivariate_normal(mean, cov, size=300)
x, y = samples.T

# Evaluation grid on [-3, 3) x [-3, 3) with a 0.01 step, and the analytic
# density of the distribution the samples were drawn from
grid_axis = slice(-3, 3, .01)
X, Y = np.mgrid[grid_axis, grid_axis]
pos = np.stack((X, Y), axis=-1)
rv = multivariate_normal(mean, cov)
density = rv.pdf(pos)

# Seaborn joint plot of the samples
g = sns.jointplot(x=x, y=y, space=0, alpha=0.8)

# Overlay the density contours in red on the central (joint) axes
g.ax_joint.contour(X, Y, density, colors='r')

plt.savefig('pictures/joint_dist_2D_vis.pdf')


In [None]:
# Evaluate the bivariate normal density on the (X, Y) grid.
# mean, cov, X and Y are the values defined in the 2D joint-plot cell.
rv = multivariate_normal(mean, cov)
pos = np.stack((X, Y), axis=-1)
Z = rv.pdf(pos)

# Render the density as a surface on 3D axes
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(X, Y, Z, cmap='viridis', linewidth=0)

# Axis labels and title in one call
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='3D Visualization')

plt.savefig('pictures/joint_dist_3D_vis.pdf')


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Generate sample data with 3 clusters
X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.60, random_state=0)

# Apply kmeans to the data. A fixed random_state keeps the centroid
# initialisation (and therefore the saved figure) identical from run to run,
# matching the seeded make_blobs call above.
kmeans = KMeans(n_clusters=3, random_state=0)
y_kmeans = kmeans.fit_predict(X)

fig, ax = plt.subplots(figsize=(6, 5))

# Plot the data points, coloured by their assigned cluster
ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=30, cmap='viridis')

# Plot the centroids of the clusters
centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, alpha=0.5)

ax.set_title("Cluster Analysis Illustration")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")

# NOTE(review): unlike the other figures this file is not written under
# 'pictures/' and its name is misspelled ("anlaysis"). Left unchanged because
# other documents may reference the file by this exact name.
fig.savefig('cluster_anlaysis_illustration.pdf')


In [None]:
from scipy.spatial.distance import cdist

class MyKMeans:
    """Minimal k-means (Lloyd's algorithm) with random data-point initialisation.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters / centers.
    random_state : int or None, default=None
        Value passed to ``np.random.seed`` at the start of ``fit``. Note that
        this re-seeds NumPy's *global* random generator.
    max_iter : int, default=300
        Maximum number of assign/update iterations.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Centers after the last update.
    labels_ : ndarray of shape (n_samples,)
        Assignment computed in the last executed iteration, i.e. against the
        centers that were current at the start of that iteration. Not set if
        ``max_iter`` is 0.
    """

    def __init__(self, n_clusters=3, random_state=None, max_iter=300):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.max_iter = max_iter

    def fit(self, X):
        """Run k-means on ``X`` (n_samples, n_features) and return ``self``.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``n_clusters`` exceeds the
            number of samples.
        """
        X = np.asarray(X, dtype=float)

        # Set the random seed for reproducibility
        np.random.seed(self.random_state)

        # Initialize the cluster centers randomly from the data points
        # (without replacement, so no two centers start on the same row)
        rand_indices = np.random.choice(X.shape[0], size=self.n_clusters, replace=False)
        self.cluster_centers_ = X[rand_indices]

        for _ in range(self.max_iter):
            # Step 1: Assign each data point to the nearest center
            self.labels_ = self.predict(X)

            # Step 2: Compute new center as the mean of the data points assigned to each cluster.
            # A cluster that lost all its points keeps its previous center
            # instead of becoming NaN (mean of an empty slice).
            new_centers = self.cluster_centers_.copy()
            for k in range(self.n_clusters):
                members = X[self.labels_ == k]
                if len(members):
                    new_centers[k] = members.mean(axis=0)

            # Step 3: If the centers do not change, then the algorithm has converged
            if np.all(self.cluster_centers_ == new_centers):
                break

            # Update the centers
            self.cluster_centers_ = new_centers

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the index of the nearest cluster center (Euclidean)."""
        return np.argmin(cdist(np.asarray(X, dtype=float), self.cluster_centers_), axis=1)


In [None]:
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# Create some artificial data: three groups of 30 two-dimensional points drawn
# from unit-variance normals with means 0, 3 and 6 in both coordinates.
# The global RNG is seeded so the data, and hence the saved figure, is the
# same on every run.
np.random.seed(0)
C1 = np.random.normal(0, 1, (30, 2))
C2 = np.random.normal(3, 1, (30, 2))
C3 = np.random.normal(6, 1, (30, 2))
data = np.concatenate((C1, C2, C3), axis=0)

# Create a figure with 2x2 subplots
fig, axs = plt.subplots(2, 2, figsize=(7, 7))

iterations = [1, 2, 3, 20]  # Iterations we want to plot

# Each panel is an independent fit capped at `n_iter` iterations. MyKMeans.fit
# re-seeds with the same random_state every time, so every fit starts from the
# same initial centers and the panels show successive states of one run.
# Only the iteration counts that are actually plotted are fitted. The model
# left in `kmeans` afterwards is the last one in `iterations` (20 iterations).
for ax, n_iter in zip(axs.flat, iterations):
    # Perform given number of steps of KMeans
    kmeans = MyKMeans(n_clusters=3, random_state=1, max_iter=n_iter)
    kmeans.fit(data)

    ax.scatter(data[:, 0], data[:, 1], c=kmeans.labels_, cmap='viridis', alpha=0.6)
    ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c=['purple', 'green', 'yellow'], s=200, edgecolors='black')
    ax.set_title(f'Iteration {n_iter}')

plt.tight_layout()
plt.savefig('pictures/K-means_alg_illustration.pdf')


In [None]:
from scipy.stats import multivariate_normal

# Define the Gaussian distributions corresponding to the cluster centers:
# one unit-covariance Gaussian centred on each k-means center
gaussians = [multivariate_normal(mean=center, cov=np.eye(2)) for center in kmeans.cluster_centers_]

# Create a grid for plotting the Gaussian distributions
x = np.linspace(-3, 9, 500)
y = np.linspace(-3, 9, 500)
X, Y = np.meshgrid(x, y)
pos = np.dstack((X, Y))

# Compute the density of each Gaussian distribution at each point in the grid
Zs = [gaussian.pdf(pos) for gaussian in gaussians]

# Combine the densities into an equal-weight mixture. Averaging (rather than
# summing) keeps the result a proper density that integrates to 1, which is
# what the colorbar label claims. The contour shapes are unaffected.
Z = np.mean(Zs, axis=0)

fig, ax = plt.subplots(figsize=(7, 6))

# Plot the overall density
contourf = ax.contourf(X, Y, Z, levels=20, cmap='inferno', alpha=0.6)

# Plot the data points colored by their cluster assignments
scatter = ax.scatter(data[:, 0], data[:, 1], c=kmeans.labels_, cmap='viridis', alpha=0.6)

# Plot the cluster centers
centers_scatter = ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c=['purple', 'green', 'yellow'], s=200, edgecolors='black')

ax.set_title('KMeans Clusters and Corresponding Gaussian Distributions')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')

# Add a colorbar
fig.colorbar(contourf, ax=ax, label='Probability Density')

plt.savefig('pictures/KMeans_gaussian.pdf')

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

class MySpectralClustering:
    """Unnormalised spectral clustering on a fully connected RBF similarity graph.

    Parameters
    ----------
    n_clusters : int, default=2
        Number of clusters, and number of eigenvectors used for the embedding.
    gamma : float, default=1.0
        RBF kernel coefficient: similarity = exp(-gamma * squared distance).

    Attributes
    ----------
    eigenvalues_ : ndarray of shape (n_samples,) or None
        All eigenvalues of the graph Laplacian in ascending order
        (None until ``fit_predict`` has been called).
    eigenvectors_ : ndarray of shape (n_samples, n_samples) or None
        Matching eigenvectors; column ``i`` belongs to ``eigenvalues_[i]``.
    """

    def __init__(self, n_clusters=2, gamma=1.0):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.eigenvalues_ = None
        self.eigenvectors_ = None

    def laplacian(self, W):
        """Return the unnormalised graph Laplacian ``G - W`` of weight matrix ``W``."""
        W = np.asarray(W)

        # Degree matrix: diagonal matrix of the row sums of W
        G = np.diag(W.sum(axis=1))

        # Laplacian matrix
        L = G - W

        return L

    def fit_predict(self, X):
        """Cluster the rows of ``X`` and return one integer label per row."""
        # Step 1: Create the similarity graph
        # Use gamma as the coefficient for the RBF kernel
        W = np.exp(-self.gamma * cdist(X, X, 'sqeuclidean'))
        np.fill_diagonal(W, 0)  # no self-loops

        # Step 2: Form the graph Laplacian
        L = self.laplacian(W)

        # Step 3: Compute the first k eigenvectors.
        # L is symmetric, so eigh applies: real eigenvalues, returned in
        # ascending order. The full decomposition is kept on the instance and
        # the k eigenvectors with the smallest eigenvalues form the embedding.
        self.eigenvalues_, self.eigenvectors_ = np.linalg.eigh(L)
        Z = self.eigenvectors_[:, :self.n_clusters]

        # Step 4: Cluster the rows of the matrix of eigenvectors
        # (KMeans is the scikit-learn class imported elsewhere in the notebook)
        kmeans = KMeans(self.n_clusters, max_iter=100)
        kmeans.fit(Z)

        return kmeans.labels_

In [None]:
from sklearn.datasets import make_circles

# Generate circular data (seeded so the figure is reproducible)
X, y = make_circles(n_samples=300, factor=.5, noise=.05, random_state=0)

# Perform KMeans clustering
kmeans = MyKMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
labels_kmeans = kmeans.predict(X)

# Perform SpectralClustering
spectral = MySpectralClustering(n_clusters=2, gamma=35.0)
labels_spectral = spectral.fit_predict(X)

fig, axs = plt.subplots(1, 3, figsize=(16, 5))

# One panel per view: raw data, KMeans labels, spectral labels.
# The raw-data panel uses a fixed colour, so no cmap is passed for it
# (matplotlib warns that cmap is ignored when `c` is not numeric data).
panels = [
    ('Actual Data Distribution', {'c': 'black'}),
    ('KMeans Clustering', {'c': labels_kmeans, 'cmap': 'viridis'}),
    ('Spectral Clustering', {'c': labels_spectral, 'cmap': 'viridis'}),
]
for ax, (title, colour_kwargs) in zip(axs, panels):
    ax.scatter(X[:, 0], X[:, 1], alpha=0.7, **colour_kwargs)
    ax.set_title(title)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.savefig('pictures/Spectral_Clustering_and_Kmeans.pdf')
# Alternative output name kept from the original cell:
# plt.savefig('pictures/Kmeans-limitation.pdf')



In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the function for calculating similarity
def similarity(d, c):
    """Return the RBF similarity exp(-c * d**2) for distance(s) ``d`` and scale ``c``."""
    scaled_sq_dist = d**2 * c
    return np.exp(-scaled_sq_dist)

# Create an array of distances
distances = np.linspace(0, 5, 400)

# Calculate similarities for different scale parameters
c_values = [0.2, 1.0, 5.0]
for c in c_values:
    similarities = similarity(distances, c)
    # Raw f-string: '\g' is not a valid escape sequence, so a plain string
    # literal triggers an invalid-escape warning. The resulting text is unchanged.
    plt.plot(distances, similarities, label=rf'$\gamma$ = {c}')

# Decorate the plot
plt.xlabel('Distance (d)')
plt.ylabel('Similarity Score (s)')
plt.title(r'Similarity vs Distance for different scale parameters ($\gamma$)')
plt.legend()

# Save the plot
plt.savefig('pictures/rbf_func.pdf')


In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Create a weighted graph
G = nx.Graph()
weighted_edges = [
    ('A', 'B', 0.6),
    ('A', 'C', 0.2),
    ('C', 'D', 0.1),
    ('C', 'E', 0.7),
    ('C', 'F', 0.9),
    ('A', 'D', 0.3),
]
for u, v, w in weighted_edges:
    G.add_edge(u, v, weight=w)

# Get the node names in the order NetworkX uses internally
node_names = list(G.nodes)

# Positions for all nodes. The layout is randomised, so fix the seed to get
# the same drawing (and the same saved PDF) on every run.
pos = nx.spring_layout(G, seed=0)

# Draw the graph
plt.figure(figsize=(6, 5))
plt.subplot(221)

# Draw the graph nodes
nx.draw_networkx_nodes(G, pos)

# Draw the graph edges
nx.draw_networkx_edges(G, pos)

# Draw the labels for nodes
nx.draw_networkx_labels(G, pos)

# Draw the edge weights
labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)

plt.title('Graph')

# Draw the weight, degree and Laplacian matrices as heatmaps, one per
# remaining subplot, with rows and columns labelled by node name
tick_positions = np.arange(len(node_names))
matrix_panels = [
    (222, nx.adjacency_matrix(G).toarray(), 'Weight Matrix (W)'),
    (223, np.diag([d for n, d in G.degree(weight='weight')]), 'Degree Matrix (G)'),
    (224, nx.laplacian_matrix(G).toarray(), 'Laplacian Matrix (L)'),
]
for subplot_code, matrix, title in matrix_panels:
    plt.subplot(subplot_code)
    plt.imshow(matrix, cmap='viridis')
    plt.xticks(tick_positions, labels=node_names)
    plt.yticks(tick_positions, labels=node_names)
    plt.title(title)
    plt.colorbar()

plt.tight_layout()
plt.savefig('pictures/laplacian_matrix.pdf')


In [None]:
def eigenvectors_explained():
    """Plot 2-D data before and after rotation onto its covariance eigenvectors.

    Generates 200 correlated 2-D points (global NumPy RNG seeded with 0),
    centres them, and eigendecomposes their covariance matrix. The left panel
    shows the data with the eigenvector directions. The right panel shows the
    data expressed in the eigenvector basis. Each red line has length
    3 * sqrt(eigenvalue). The figure is saved to
    'pictures/eigenvector_data_transform.pdf'. Returns None.
    """
    # Create some data
    np.random.seed(0)
    X = np.dot(np.random.random(size=(2, 2)), np.random.normal(size=(2, 200))).T

    # Subtract the mean to center the data at the origin
    X = X - np.mean(X, 0)

    # Compute the covariance matrix
    C = np.cov(X.T)

    # Compute the eigenvalues and eigenvectors of the covariance matrix.
    # C is symmetric, so use eigh (real output, ascending eigenvalues), then
    # reverse so that column 0 is the direction of largest variance. This is
    # what the "Principal Component 1" axis label promises; plain eig gives
    # no ordering guarantee.
    eigvals, eigvecs = np.linalg.eigh(C)
    order = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    # Round-off can make a near-zero eigenvalue slightly negative; clip so the
    # sqrt below cannot produce NaN.
    eigvals = np.clip(eigvals, 0, None)

    # Transform the data
    X_transformed = X @ eigvecs

    fig, ax = plt.subplots(1, 2, figsize=(8, 4))

    # (axes, points, unit directions to draw, title, x label, y label)
    panels = (
        (ax[0], X, eigvecs.T, 'Original Data', 'x[0]', 'x[1]'),
        (ax[1], X_transformed, np.identity(2), 'Transformed Data',
         'Principal Component 1', 'Principal Component 2'),
    )
    for axis, points, directions, title, xlabel, ylabel in panels:
        axis.scatter(points[:, 0], points[:, 1], alpha=0.2)
        for e, v in zip(eigvals, directions):
            axis.plot([0, 3.*np.sqrt(e)*v[0]], [0, 3.*np.sqrt(e)*v[1]], 'r-', lw=2)
        axis.set_title(title)
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        axis.set_xlim(-2.5, 2.5)
        axis.set_ylim(-2.5, 2.5)

    plt.tight_layout()
    plt.savefig('pictures/eigenvector_data_transform.pdf')

In [None]:
# Build the eigenvector illustration; the function saves it to
# 'pictures/eigenvector_data_transform.pdf'
eigenvectors_explained()

In [None]:
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from scipy.sparse import csgraph
from scipy.linalg import eigh

from sklearn.datasets import make_circles

# Generate circular data (seeded so the figure is reproducible)
X, y = make_circles(n_samples=300, factor=.5, noise=.05, random_state=0)

# Initialize the figure
fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# Plot the data scatter. A fixed colour is used, so no cmap is passed
# (matplotlib warns that cmap is ignored when `c` is not numeric data).
axs[0].scatter(X[:, 0], X[:, 1], c='black', alpha=0.7)
axs[0].set_title('Data Scatter')
axs[0].set_xlabel('Feature 1')
axs[0].set_ylabel('Feature 2')

# Perform SpectralClustering with various gamma values
# NOTE(review): `eigenvalues` is collected but not used in this cell. It is
# kept in case a later cell plots it.
gamma_values = np.linspace(1, 50, 50)
eigenvalues = []
for gamma in gamma_values:
    spectral = MySpectralClustering(n_clusters=2, gamma=gamma)
    spectral.fit_predict(X)
    eigenvalues.append(spectral.eigenvalues_[1])  # store the second smallest eigenvalue


# Plot the eigenvector of the second smallest eigenvalue against that of the
# third smallest. `spectral` is the model from the final pass of the loop
# above, i.e. the one fitted with gamma = gamma_values[-1] (50.0).
axs[1].scatter(spectral.eigenvectors_[:, 1], spectral.eigenvectors_[:, 2], alpha=0.7)
axs[1].set_title('2nd vs 3rd Smallest Eigenvector')
axs[1].set_xlabel('2nd Smallest Eigenvector')
axs[1].set_ylabel('3rd Smallest Eigenvector')

plt.tight_layout()
plt.savefig('pictures/spectral_clustering_illustration.pdf')
