In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
import pickle
from kneed import KneeLocator

# Import Seaborn for styling
import seaborn as sns

# Apply seaborn's "whitegrid" theme to all matplotlib figures in this session
sns.set(style="whitegrid")

# Render Plotly figures inline in the notebook. The 'notebook_connected'
# renderer pulls plotly.js from a CDN instead of embedding it in the notebook,
# so displaying figures needs network access.
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

# Suppress warnings for cleaner output.
# NOTE(review): this silences *every* warning category for the whole session,
# including deprecation and convergence warnings from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Global matplotlib defaults.
# 'Heiti TC' is presumably chosen for its CJK glyph coverage (the metadata
# contains Chinese characters) -- verify the font is installed on the host.
plt.rcParams['font.family'] = ['Heiti TC']  # For displaying Chinese characters if needed
plt.rcParams['figure.figsize'] = (10, 6)

  from .autonotebook import tqdm as notebook_tqdm


## Load Preprocessed Data

Load the preprocessed data exported from the preprocessing notebook. This includes both the original and Gaussian-filtered images.

In [2]:
# Directory holding the pickles exported by the preprocessing notebook
preprocessed_data_path = './preprocessed_data'


def _read_preprocessed(filename):
    """Unpickle one DataFrame stored under ``preprocessed_data_path``."""
    return pd.read_pickle(os.path.join(preprocessed_data_path, filename))


# Original images, their Gaussian-filtered counterparts, and the frame that
# joins pixel data with the per-token metadata.
image_df = _read_preprocessed('original_image_df.pkl')
gaussian_image_df = _read_preprocessed('gaussian_image_df.pkl')
merged_df = _read_preprocessed('merged_df.pkl')

# Preview the merged frame
merged_df.head()

Unnamed: 0,label,pixels,scaled_pixels,Word,checklist,token,token_trial,word,bopomofo,consonant,vowel,block
0,r_u_4_rep1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.21...",入 ㄖㄨˋ,True,r_u_4,r_u_4_rep1,入,ㄖㄨˋ,r,u,4
1,ch_apical_2_rep1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.21...",池 ㄔˊ,True,ch_apical_2,ch_apical_2_rep1,池,ㄔˊ,ch,apical,2
2,ch_apical_3_rep1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.21...",恥 ㄔˇ,True,ch_apical_3,ch_apical_3_rep1,恥,ㄔˇ,ch,apical,3
3,sh_a_1_rep1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.21...",沙 ㄕㄚ,True,sh_a_1,sh_a_1_rep1,沙,ㄕㄚ,sh,a,1
4,ch_schwa_1_rep1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.21...",車 ㄔㄜ,True,ch_schwa_1,ch_schwa_1_rep1,車,ㄔㄜ,ch,schwa,1


## Clustering Methods

We will explore four clustering approaches:

1. K-means Clustering with PCA: <br /> 
> Apply K-means clustering on the scaled pixel data and use PCA for visualization.

2. K-means Clustering with UMAP: <br /> 
> Apply K-means clustering on the scaled pixel data and use UMAP for visualization.

3. PCA followed by K-means Clustering: <br /> 
> Reduce dimensionality with PCA before applying K-means clustering.

4. UMAP followed by K-means Clustering: <br /> 
> Reduce dimensionality with UMAP before applying K-means clustering.

Each method involves clustering for k=1 to k=10, plotting the elbow plot, and visualizing the clusters.

### 1. K-means Clustering with PCA

In this approach, we apply K-means clustering directly on the scaled pixel data and use PCA to reduce dimensions for visualization purposes.

#### 1.1 Data Preparation

We stack the scaled pixel arrays vertically to create a 2D NumPy array suitable for clustering algorithms.

In [3]:
# Each entry of 'scaled_pixels' is taken to be one flattened, scaled image.
# Stacking them row-wise yields a single 2D matrix with one row per image
# (the original author's note gives the expected shape as (2700, 4500)).
pixel_rows = merged_df['scaled_pixels'].tolist()
scaled_pixel_data = np.vstack(pixel_rows)

#### 1.2 K-means Clustering for k=1 to k=10

K-means Clustering: <br />
> For each value of k (from 1 to 10), we fit a K-means model to the scaled pixel data.

Inertia Calculation: <br />
> Inertia measures the sum of squared distances of samples to their closest cluster center, useful for determining the optimal k.

Storing Labels: <br />
> Cluster labels for each k are stored for later visualization.

In [4]:
# Candidate cluster counts: 1 through 10 inclusive
k_values = range(1, 11)

# Results collected per k: inertia (in k order) and the label vector keyed by k
inertia_values = []
cluster_labels_dict = {}

# Fit one K-means model per candidate k on the full-dimensional pixel matrix
for n_clusters in k_values:
    # A fixed random_state keeps the centroid initialisation repeatable
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_pixel_data)
    inertia_values.append(kmeans.inertia_)
    cluster_labels_dict[n_clusters] = kmeans.labels_


#### 1.3 Elbow Plot to Determine Optimal k

Elbow Method: <br />

> Plotting inertia against k helps identify the point where adding more clusters doesn't significantly reduce inertia, suggesting an optimal k.


In [13]:
# Locate the elbow of the convex, decreasing inertia curve.
# `kl.elbow` is expected to be None when KneeLocator detects no knee.
kl = KneeLocator(k_values, inertia_values, curve='convex', direction='decreasing')
optimal_k = kl.elbow

# Inertia as a function of k
elbow_fig = go.Figure()

elbow_fig.add_trace(go.Scatter(
    x=list(k_values),
    y=inertia_values,
    mode='lines+markers',
    name='Inertia'
))

# Mark the detected elbow; test against None explicitly rather than relying on
# the truthiness of the returned k.
if optimal_k is not None:
    elbow_fig.add_vline(x=optimal_k, line_dash="dash", line_color="red", annotation_text="Elbow")

elbow_fig.update_layout(
    title="Elbow Plot for K-means Clustering with PCA",
    xaxis_title="Number of Clusters (k)",
    yaxis_title="Inertia",
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)

# <script> tag that loads plotly.js from the CDN; used by the page-building cells
plotly_cdn_script = '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>'

# Render the figure as an embeddable <div> fragment. With full_html=False the
# output has no <head> element, so no script tag can be injected here (the old
# `.replace('<head>', ...)` call never matched anything); the report page that
# embeds this fragment is responsible for loading plotly.js.
elbow_plot_html = pio.to_html(elbow_fig, include_plotlyjs=False, full_html=False)

#### 1.4. PCA for Dimensionality Reduction

Reduces the high-dimensional pixel data to 2 principal components, facilitating 2D visualization of the clusters.

In [6]:
# Project the scaled pixel matrix onto its first two principal components.
# These 2D coordinates are used only as plot axes; the K-means labels in
# `cluster_labels_dict` were fitted on the full-dimensional data.
pca = PCA(n_components=2, random_state=42)
pixel_pca = pca.fit_transform(scaled_pixel_data)

#### 1.5. Visualizing Clustering Results with PCA

##### 1.5.1 Interactive Plot for Each k (k=1 to k=10)

In [9]:
# Output directory (absolute path) for the standalone per-k pages
individual_html_dir = os.path.abspath('./clustering_results/kmeans_pca_individual')
os.makedirs(individual_html_dir, exist_ok=True)

# <script> tag that loads plotly.js from the CDN.
# NOTE(review): the `plotly-latest` alias is reportedly frozen at an old
# plotly.js 1.x release; consider `include_plotlyjs='cdn'` in `pio.to_html`,
# which references the version matching the installed plotly.py -- confirm.
plotly_cdn_script = '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>'

# Accumulates one embeddable <div> fragment per k for the combined report page
individual_plots_html = ""

for k, labels in cluster_labels_dict.items():
    # PCA coordinates as axes, K-means label as colour, token label on hover
    fig = go.Figure(data=go.Scatter(
        x=pixel_pca[:, 0],
        y=pixel_pca[:, 1],
        mode='markers',
        marker=dict(color=labels, colorscale='Viridis', size=5),
        text=merged_df['label'],
        hoverinfo='text'
    ))

    fig.update_layout(
        title=f'K-means Clustering with PCA (k={k})',
        xaxis_title='Principal Component 1',
        yaxis_title='Principal Component 2'
    )

    # Fragment for the combined report: a bare <div>, so the report does not
    # end up with complete <html> documents nested inside its <body>. The
    # report's own <head> loads plotly.js.
    individual_plots_html += pio.to_html(fig, include_plotlyjs=False, full_html=False)

    # Standalone page: a full document with the CDN script injected into <head>
    individual_plot_html = pio.to_html(fig, include_plotlyjs=False, full_html=True)
    individual_plot_html = individual_plot_html.replace('<head>', f'<head>{plotly_cdn_script}')

    # Write as UTF-8 explicitly: the page declares a UTF-8 charset, and the
    # platform default encoding is not guaranteed to be UTF-8.
    output_path = os.path.join(individual_html_dir, f'kmeans_pca_k{k}.html')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(individual_plot_html)

##### 1.5.2. Juxtaposed Plots for All k's

In [12]:
# Grid geometry: five panels per row, with as many rows as the clusterings need
cols = 5
rows = -(-len(cluster_labels_dict) // cols)  # ceiling division; 2 rows for 10 plots

# One titled panel per k
subplot_fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'k={k}' for k in k_values])

# Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based)
for i, (k, labels) in enumerate(cluster_labels_dict.items()):
    row = (i // cols) + 1
    col = (i % cols) + 1
    subplot_fig.add_trace(
        go.Scatter(
            x=pixel_pca[:, 0],
            y=pixel_pca[:, 1],
            mode='markers',
            marker=dict(color=labels, colorscale='Viridis', size=2),
            text=merged_df['label'],
            hoverinfo='text',
            showlegend=False
        ),
        row=row, col=col
    )

subplot_fig.update_layout(
    title_text='K-means Clustering with PCA (k=1 to 10)',
    height=800,  # Adjust height if necessary
    width=1200   # Adjust width if necessary
)

# Standalone page: full document with the plotly.js CDN script injected into <head>
juxtaposed_page_html = pio.to_html(subplot_fig, include_plotlyjs=False, full_html=True)
juxtaposed_page_html = juxtaposed_page_html.replace('<head>', f'<head>{plotly_cdn_script}')

# Fragment for the combined report: a bare <div>, so the report does not nest a
# complete <html> document inside its <body>; the report's <head> loads plotly.js.
juxtaposed_plot_html = pio.to_html(subplot_fig, include_plotlyjs=False, full_html=False)

# Make sure the output directory exists instead of relying on an earlier cell
os.makedirs('./clustering_results/', exist_ok=True)

# Write as UTF-8 explicitly to match the charset declared by the page
juxtaposed_output_path = os.path.join('./clustering_results/', 'kmeans_pca_juxtaposed.html')
with open(juxtaposed_output_path, 'w', encoding='utf-8') as f:
    f.write(juxtaposed_page_html)

##### Exporting Clustering Results for K-means with PCA

In [14]:
# Skeleton of the combined report; the named fields are substituted below
report_template = """
<html>
<head>
    <meta charset='utf-8' />
    {plotly_cdn_script}
</head>
<body>
    <h1>K-means Clustering with PCA</h1>
    <h2>Elbow Plot</h2>
    {elbow_plot_html}
    <h2>Individual Clustering Plots (k=1 to k=10)</h2>
    {individual_plots_html}
    <h2>Juxtaposed Clustering Plots (k=1 to k=10)</h2>
    {juxtaposed_plot_html}
</body>
</html>
"""

# Fill the skeleton with the HTML produced by the plotting cells
kmeans_pca_html = report_template.format(
    plotly_cdn_script=plotly_cdn_script,
    elbow_plot_html=elbow_plot_html,
    individual_plots_html=individual_plots_html,
    juxtaposed_plot_html=juxtaposed_plot_html,
)

# Destination of the combined report
kmeans_pca_output_html = './clustering_results/kmeans_pca_results.html'

# Persist the page as UTF-8, matching the charset declared in its <head>
with open(kmeans_pca_output_html, 'w', encoding='utf-8') as report_file:
    report_file.write(kmeans_pca_html)

print(f"K-means with PCA clustering results have been exported to {kmeans_pca_output_html}")


K-means with PCA clustering results have been exported to ./clustering_results/kmeans_pca_results.html


### 2. K-means Clustering with UMAP

In this approach, we apply K-means clustering on the scaled pixel data and use UMAP for dimensionality reduction to visualize the clusters.

#### 2.1 UMAP for Dimensionality Reduction

UMAP Application: Reduces the high-dimensional pixel data to 2 dimensions using UMAP, which often preserves both local and global data structure better than PCA.

In [15]:
# Embed the scaled pixel matrix into two UMAP dimensions.
# The embedding is used only as plot axes; the cluster labels shown on it come
# from the K-means models fitted on the full-dimensional data.
# random_state is fixed so that repeated runs use the same seed.
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_embedding = umap_reducer.fit_transform(scaled_pixel_data)

#### 2.2. Visualizing Clustering Results with UMAP

##### 2.2.1. Interactive Plot for Each k (k=1 to k=10)

In [16]:
# Output directory for the standalone per-k pages
umap_individual_html_dir = './clustering_results/kmeans_umap_individual'
os.makedirs(umap_individual_html_dir, exist_ok=True)

# Accumulates one embeddable <div> fragment per k for the combined report page
umap_individual_plots_html = ""

for k, labels in cluster_labels_dict.items():
    # UMAP coordinates as axes, K-means label as colour, token label on hover
    fig = go.Figure(data=go.Scatter(
        x=umap_embedding[:, 0],
        y=umap_embedding[:, 1],
        mode='markers',
        marker=dict(color=labels, colorscale='Viridis', size=5),
        text=merged_df['label'],
        hoverinfo='text'
    ))

    fig.update_layout(
        title=f'K-means Clustering with UMAP (k={k})',
        xaxis_title='UMAP Dimension 1',
        yaxis_title='UMAP Dimension 2'
    )

    # Fragment for the combined report: a bare <div>, so the report does not
    # end up with complete <html> documents nested inside its <body>. The
    # report's own <head> loads plotly.js.
    umap_individual_plots_html += pio.to_html(fig, include_plotlyjs=False, full_html=False)

    # Standalone page: a full document with the CDN script injected into <head>
    individual_plot_html = pio.to_html(fig, include_plotlyjs=False, full_html=True)
    individual_plot_html = individual_plot_html.replace('<head>', f'<head>{plotly_cdn_script}')

    # Write as UTF-8 explicitly: the page declares a UTF-8 charset, and the
    # platform default encoding is not guaranteed to be UTF-8.
    output_path = os.path.join(umap_individual_html_dir, f'kmeans_umap_k{k}.html')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(individual_plot_html)


##### 2.2.2. Juxtaposed Plots for All k's

In [17]:
# Grid geometry: five panels per row, with as many rows as the clusterings need
cols = 5
rows = -(-len(cluster_labels_dict) // cols)  # ceiling division; 2 rows for 10 plots

# One titled panel per k
umap_subplot_fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'k={k}' for k in k_values])

# Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based)
for i, (k, labels) in enumerate(cluster_labels_dict.items()):
    row = (i // cols) + 1
    col = (i % cols) + 1
    umap_subplot_fig.add_trace(
        go.Scatter(
            x=umap_embedding[:, 0],
            y=umap_embedding[:, 1],
            mode='markers',
            marker=dict(color=labels, colorscale='Viridis', size=2),
            text=merged_df['label'],
            hoverinfo='text',
            showlegend=False
        ),
        row=row, col=col
    )

umap_subplot_fig.update_layout(
    title_text='K-means Clustering with UMAP (k=1 to 10)',
    height=800,  # Adjust height if necessary
    width=1200   # Adjust width if necessary
)

# Standalone page: full document with the plotly.js CDN script injected into <head>
umap_juxtaposed_page_html = pio.to_html(umap_subplot_fig, include_plotlyjs=False, full_html=True)
umap_juxtaposed_page_html = umap_juxtaposed_page_html.replace('<head>', f'<head>{plotly_cdn_script}')

# Fragment for the combined report: a bare <div>, so the report does not nest a
# complete <html> document inside its <body>; the report's <head> loads plotly.js.
umap_juxtaposed_plot_html = pio.to_html(umap_subplot_fig, include_plotlyjs=False, full_html=False)

# Make sure the output directory exists instead of relying on an earlier cell
os.makedirs('./clustering_results/', exist_ok=True)

# Write as UTF-8 explicitly to match the charset declared by the page
umap_juxtaposed_output_path = os.path.join('./clustering_results/', 'kmeans_umap_juxtaposed.html')
with open(umap_juxtaposed_output_path, 'w', encoding='utf-8') as f:
    f.write(umap_juxtaposed_page_html)

##### Exporting Clustering Results for K-means with UMAP

In [18]:
# Skeleton of the combined report; the named fields are substituted below
report_template = """
<html>
<head>
    <meta charset='utf-8' />
    {plotly_cdn_script}
</head>
<body>
    <h1>K-means Clustering with UMAP</h1>
    <h2>Elbow Plot</h2>
    {elbow_plot_html}
    <h2>Individual Clustering Plots (k=1 to k=10)</h2>
    {umap_individual_plots_html}
    <h2>Juxtaposed Clustering Plots (k=1 to k=10)</h2>
    {umap_juxtaposed_plot_html}
</body>
</html>
"""

# Fill the skeleton. The elbow plot is the one built for the pixel-space
# K-means run, since this section visualises those same cluster labels.
kmeans_umap_html = report_template.format(
    plotly_cdn_script=plotly_cdn_script,
    elbow_plot_html=elbow_plot_html,
    umap_individual_plots_html=umap_individual_plots_html,
    umap_juxtaposed_plot_html=umap_juxtaposed_plot_html,
)

# Destination of the combined report
kmeans_umap_output_html = './clustering_results/kmeans_umap_results.html'

# Persist the page as UTF-8, matching the charset declared in its <head>
with open(kmeans_umap_output_html, 'w', encoding='utf-8') as report_file:
    report_file.write(kmeans_umap_html)

print(f"K-means with UMAP clustering results have been exported to {kmeans_umap_output_html}")


K-means with UMAP clustering results have been exported to ./clustering_results/kmeans_umap_results.html


### 3. PCA followed by K-means Clustering

In this method, we first reduce the dimensionality of the data using PCA and then apply K-means clustering on the reduced data.

#### 3.1. PCA for Dimensionality Reduction

PCA reduces the data to 2 principal components, preparing it for clustering.

In [19]:
# Reduce the scaled pixel matrix to two principal components; in this section
# K-means is fitted on these 2D coordinates rather than on the raw pixels.
# NOTE(review): same input and parameters as the earlier `pca` / `pixel_pca`
# fit, so this presumably reproduces that projection -- confirm before reusing it.
pca_for_clustering = PCA(n_components=2, random_state=42)
pca_data = pca_for_clustering.fit_transform(scaled_pixel_data)

#### 3.2. K-means Clustering on PCA-reduced Data

In [20]:
# Results collected per k: inertia (in k order) and the label vector keyed by k
inertia_pca_clustering = []
cluster_labels_pca_clustering = {}

# Fit one K-means model per candidate k on the 2D PCA coordinates
for n_clusters in k_values:
    # A fixed random_state keeps the centroid initialisation repeatable
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(pca_data)
    inertia_pca_clustering.append(kmeans.inertia_)
    cluster_labels_pca_clustering[n_clusters] = kmeans.labels_


#### 3.3. Elbow Plot for PCA + K-means

In [21]:
# Locate the elbow of the convex, decreasing inertia curve.
# `.elbow` is expected to be None when KneeLocator detects no knee.
kl_pca_clustering = KneeLocator(k_values, inertia_pca_clustering, curve='convex', direction='decreasing')
optimal_k_pca_clustering = kl_pca_clustering.elbow

# Inertia as a function of k
elbow_pca_clustering_fig = go.Figure()

elbow_pca_clustering_fig.add_trace(go.Scatter(
    x=list(k_values),
    y=inertia_pca_clustering,
    mode='lines+markers',
    name='Inertia'
))

# Mark the detected elbow; test against None explicitly rather than relying on
# the truthiness of the returned k.
if optimal_k_pca_clustering is not None:
    elbow_pca_clustering_fig.add_vline(x=optimal_k_pca_clustering, line_dash="dash", line_color="red", annotation_text="Elbow")

elbow_pca_clustering_fig.update_layout(
    title="Elbow Plot for PCA + K-means Clustering",
    xaxis_title="Number of Clusters (k)",
    yaxis_title="Inertia",
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)

# Render the figure as an embeddable <div> fragment. With full_html=False the
# output has no <head> element, so no script tag can be injected here (the old
# `.replace('<head>', ...)` call never matched anything); the report page that
# embeds this fragment is responsible for loading plotly.js.
elbow_pca_clustering_html = pio.to_html(elbow_pca_clustering_fig, include_plotlyjs=False, full_html=False)


#### 3.4. Visualizing Clustering Results on PCA-reduced Data

##### 3.4.1. Interactive Plot for Each k (k=1 to k=10)

In [22]:
# Output directory (absolute path) for the standalone per-k pages
pca_kmeans_individual_html_dir = os.path.abspath('./clustering_results/pca_kmeans_individual')
os.makedirs(pca_kmeans_individual_html_dir, exist_ok=True)

# Accumulates one embeddable <div> fragment per k for the combined report page
pca_kmeans_individual_plots_html = ""

for k, labels in cluster_labels_pca_clustering.items():
    # PCA coordinates as axes, K-means label as colour, token label on hover
    fig = go.Figure(data=go.Scatter(
        x=pca_data[:, 0],
        y=pca_data[:, 1],
        mode='markers',
        marker=dict(color=labels, colorscale='Viridis', size=5),
        text=merged_df['label'],
        hoverinfo='text'
    ))

    fig.update_layout(
        title=f'PCA + K-means Clustering (k={k})',
        xaxis_title='Principal Component 1',
        yaxis_title='Principal Component 2'
    )

    # Fragment for the combined report: a bare <div>, so the report does not
    # end up with complete <html> documents nested inside its <body>. The
    # report's own <head> loads plotly.js.
    pca_kmeans_individual_plots_html += pio.to_html(fig, include_plotlyjs=False, full_html=False)

    # Standalone page: a full document with the CDN script injected into <head>
    individual_plot_html = pio.to_html(fig, include_plotlyjs=False, full_html=True)
    individual_plot_html = individual_plot_html.replace('<head>', f'<head>{plotly_cdn_script}')

    # Write as UTF-8 explicitly: the page declares a UTF-8 charset, and the
    # platform default encoding is not guaranteed to be UTF-8.
    output_path = os.path.join(pca_kmeans_individual_html_dir, f'pca_kmeans_k{k}.html')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(individual_plot_html)


##### 3.4.2. Juxtaposed Plots for All k's

In [23]:
# Grid geometry: five panels per row, with as many rows as the clusterings need
cols = 5
rows = -(-len(cluster_labels_pca_clustering) // cols)  # ceiling division; 2 rows for 10 plots

# One titled panel per k
pca_kmeans_subplot_fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'k={k}' for k in k_values])

# Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based)
for i, (k, labels) in enumerate(cluster_labels_pca_clustering.items()):
    row = (i // cols) + 1
    col = (i % cols) + 1
    pca_kmeans_subplot_fig.add_trace(
        go.Scatter(
            x=pca_data[:, 0],
            y=pca_data[:, 1],
            mode='markers',
            marker=dict(color=labels, colorscale='Viridis', size=2),
            text=merged_df['label'],
            hoverinfo='text',
            showlegend=False
        ),
        row=row, col=col
    )

pca_kmeans_subplot_fig.update_layout(
    title_text='PCA + K-means Clustering (k=1 to 10)',
    height=800,  # Adjust height if necessary
    width=1200   # Adjust width if necessary
)

# Standalone page: full document with the plotly.js CDN script injected into <head>
pca_kmeans_juxtaposed_page_html = pio.to_html(pca_kmeans_subplot_fig, include_plotlyjs=False, full_html=True)
pca_kmeans_juxtaposed_page_html = pca_kmeans_juxtaposed_page_html.replace('<head>', f'<head>{plotly_cdn_script}')

# Fragment for the combined report: a bare <div>, so the report does not nest a
# complete <html> document inside its <body>; the report's <head> loads plotly.js.
pca_kmeans_juxtaposed_plot_html = pio.to_html(pca_kmeans_subplot_fig, include_plotlyjs=False, full_html=False)

# Make sure the output directory exists instead of relying on an earlier cell
os.makedirs('./clustering_results/', exist_ok=True)

# Write as UTF-8 explicitly to match the charset declared by the page
pca_kmeans_juxtaposed_output_path = os.path.join('./clustering_results/', 'pca_kmeans_juxtaposed.html')
with open(pca_kmeans_juxtaposed_output_path, 'w', encoding='utf-8') as f:
    f.write(pca_kmeans_juxtaposed_page_html)

##### Exporting Clustering Results for PCA + K-means

In [24]:
# Assemble the PCA + K-means report: elbow plot, per-k plots, and the
# juxtaposed grid, all produced by the cells of this section.
# The juxtaposed slot must use `pca_kmeans_juxtaposed_plot_html`; the previous
# reference to `juxtaposed_plot_html` embedded the grid from the "K-means with
# PCA" section, i.e. labels from clustering on the raw pixels.
pca_kmeans_html = f"""
<html>
<head>
    <meta charset='utf-8' />
    {plotly_cdn_script}
</head>
<body>
    <h1>PCA + K-means Clustering</h1>
    <h2>Elbow Plot</h2>
    {elbow_pca_clustering_html}
    <h2>Individual Clustering Plots (k=1 to k=10)</h2>
    {pca_kmeans_individual_plots_html}
    <h2>Juxtaposed Clustering Plots (k=1 to k=10)</h2>
    {pca_kmeans_juxtaposed_plot_html}
</body>
</html>
"""

# Destination of the combined report
pca_kmeans_output_html = './clustering_results/pca_kmeans_results.html'

# Persist the page as UTF-8, matching the charset declared in its <head>
with open(pca_kmeans_output_html, 'w', encoding='utf-8') as f:
    f.write(pca_kmeans_html)

print(f"PCA + K-means clustering results have been exported to {pca_kmeans_output_html}")

PCA + K-means clustering results have been exported to ./clustering_results/pca_kmeans_results.html


### 4. UMAP followed by K-means Clustering

In this approach, we first reduce the dimensionality of the data using UMAP and then apply K-means clustering on the reduced data.

#### 4.1. UMAP for Dimensionality Reduction

We reduce the data to 2 UMAP dimensions, preparing it for clustering.

In [25]:
# Embed the scaled pixel matrix into two UMAP dimensions; in this section
# K-means is fitted on these 2D coordinates rather than on the raw pixels.
# NOTE(review): same input and parameters as the earlier `umap_reducer` /
# `umap_embedding` fit, so this presumably yields the same embedding --
# confirm before reusing it to skip the second fit.
umap_reducer_for_clustering = umap.UMAP(n_components=2, random_state=42)
umap_data = umap_reducer_for_clustering.fit_transform(scaled_pixel_data)

#### 4.2. K-means Clustering on UMAP-reduced Data

In [26]:
# Results collected per k: inertia (in k order) and the label vector keyed by k
inertia_umap_clustering = []
cluster_labels_umap_clustering = {}

# Fit one K-means model per candidate k on the 2D UMAP coordinates
for n_clusters in k_values:
    # A fixed random_state keeps the centroid initialisation repeatable
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_data)
    inertia_umap_clustering.append(kmeans.inertia_)
    cluster_labels_umap_clustering[n_clusters] = kmeans.labels_


#### 4.3. Elbow Plot for UMAP + K-means

In [27]:
# Locate the elbow of the convex, decreasing inertia curve.
# `.elbow` is expected to be None when KneeLocator detects no knee.
kl_umap_clustering = KneeLocator(k_values, inertia_umap_clustering, curve='convex', direction='decreasing')
optimal_k_umap_clustering = kl_umap_clustering.elbow

# Inertia as a function of k
elbow_umap_clustering_fig = go.Figure()

elbow_umap_clustering_fig.add_trace(go.Scatter(
    x=list(k_values),
    y=inertia_umap_clustering,
    mode='lines+markers',
    name='Inertia'
))

# Mark the detected elbow; test against None explicitly rather than relying on
# the truthiness of the returned k.
if optimal_k_umap_clustering is not None:
    elbow_umap_clustering_fig.add_vline(x=optimal_k_umap_clustering, line_dash="dash", line_color="red", annotation_text="Elbow")

elbow_umap_clustering_fig.update_layout(
    title="Elbow Plot for UMAP + K-means Clustering",
    xaxis_title="Number of Clusters (k)",
    yaxis_title="Inertia",
    xaxis=dict(tickmode='linear', tick0=1, dtick=1)
)

# Render the figure as an embeddable <div> fragment. With full_html=False the
# output has no <head> element, so no script tag can be injected here (the old
# `.replace('<head>', ...)` call never matched anything); the report page that
# embeds this fragment is responsible for loading plotly.js.
elbow_umap_clustering_html = pio.to_html(elbow_umap_clustering_fig, include_plotlyjs=False, full_html=False)


#### 4.4. Visualizing Clustering Results on UMAP-reduced Data

##### 4.4.1. Interactive Plot for Each k (k=1 to k=10)

In [28]:
# Output directory (absolute path) for the standalone per-k pages
umap_kmeans_individual_html_dir = os.path.abspath('./clustering_results/umap_kmeans_individual')
os.makedirs(umap_kmeans_individual_html_dir, exist_ok=True)

# Accumulates one embeddable <div> fragment per k for the combined report page
umap_kmeans_individual_plots_html = ""

for k, labels in cluster_labels_umap_clustering.items():
    # UMAP coordinates as axes, K-means label as colour, token label on hover
    fig = go.Figure(data=go.Scatter(
        x=umap_data[:, 0],
        y=umap_data[:, 1],
        mode='markers',
        marker=dict(color=labels, colorscale='Viridis', size=5),
        text=merged_df['label'],
        hoverinfo='text'
    ))

    fig.update_layout(
        title=f'UMAP + K-means Clustering (k={k})',
        xaxis_title='UMAP Dimension 1',
        yaxis_title='UMAP Dimension 2'
    )

    # Fragment for the combined report: a bare <div>, so the report does not
    # end up with complete <html> documents nested inside its <body>. The
    # report's own <head> loads plotly.js.
    umap_kmeans_individual_plots_html += pio.to_html(fig, include_plotlyjs=False, full_html=False)

    # Standalone page: a full document with the CDN script injected into <head>
    individual_plot_html = pio.to_html(fig, include_plotlyjs=False, full_html=True)
    individual_plot_html = individual_plot_html.replace('<head>', f'<head>{plotly_cdn_script}')

    # Write as UTF-8 explicitly: the page declares a UTF-8 charset, and the
    # platform default encoding is not guaranteed to be UTF-8.
    output_path = os.path.join(umap_kmeans_individual_html_dir, f'umap_kmeans_k{k}.html')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(individual_plot_html)


##### 4.4.2. Juxtaposed Plots for All k's

In [29]:
# Grid geometry: five panels per row, with as many rows as the clusterings need
cols = 5
rows = -(-len(cluster_labels_umap_clustering) // cols)  # ceiling division; 2 rows for 10 plots

# One titled panel per k
umap_kmeans_subplot_fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'k={k}' for k in k_values])

# Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based)
for i, (k, labels) in enumerate(cluster_labels_umap_clustering.items()):
    row = (i // cols) + 1
    col = (i % cols) + 1
    umap_kmeans_subplot_fig.add_trace(
        go.Scatter(
            x=umap_data[:, 0],
            y=umap_data[:, 1],
            mode='markers',
            marker=dict(color=labels, colorscale='Viridis', size=2),
            text=merged_df['label'],
            hoverinfo='text',
            showlegend=False
        ),
        row=row, col=col
    )

umap_kmeans_subplot_fig.update_layout(
    title_text='UMAP + K-means Clustering (k=1 to 10)',
    height=800,  # Adjust height if necessary
    width=1200   # Adjust width if necessary
)

# Standalone page: full document with the plotly.js CDN script injected into <head>
umap_kmeans_juxtaposed_page_html = pio.to_html(umap_kmeans_subplot_fig, include_plotlyjs=False, full_html=True)
umap_kmeans_juxtaposed_page_html = umap_kmeans_juxtaposed_page_html.replace('<head>', f'<head>{plotly_cdn_script}')

# Fragment for the combined report: a bare <div>, so the report does not nest a
# complete <html> document inside its <body>; the report's <head> loads plotly.js.
umap_kmeans_juxtaposed_plot_html = pio.to_html(umap_kmeans_subplot_fig, include_plotlyjs=False, full_html=False)

# Make sure the output directory exists instead of relying on an earlier cell
os.makedirs('./clustering_results/', exist_ok=True)

# Write as UTF-8 explicitly to match the charset declared by the page
umap_kmeans_juxtaposed_output_path = os.path.join('./clustering_results/', 'umap_kmeans_juxtaposed.html')
with open(umap_kmeans_juxtaposed_output_path, 'w', encoding='utf-8') as f:
    f.write(umap_kmeans_juxtaposed_page_html)


##### Exporting Clustering Results for UMAP + K-means

In [30]:
# Skeleton of the combined report; the named fields are substituted below
report_template = """
<html>
<head>
    <meta charset='utf-8' />
    {plotly_cdn_script}
</head>
<body>
    <h1>UMAP + K-means Clustering</h1>
    <h2>Elbow Plot</h2>
    {elbow_umap_clustering_html}
    <h2>Individual Clustering Plots (k=1 to k=10)</h2>
    {umap_kmeans_individual_plots_html}
    <h2>Juxtaposed Clustering Plots (k=1 to k=10)</h2>
    {umap_kmeans_juxtaposed_plot_html}
</body>
</html>
"""

# Fill the skeleton with the HTML produced by the plotting cells of this section
umap_kmeans_html = report_template.format(
    plotly_cdn_script=plotly_cdn_script,
    elbow_umap_clustering_html=elbow_umap_clustering_html,
    umap_kmeans_individual_plots_html=umap_kmeans_individual_plots_html,
    umap_kmeans_juxtaposed_plot_html=umap_kmeans_juxtaposed_plot_html,
)

# Destination of the combined report
umap_kmeans_output_html = './clustering_results/umap_kmeans_results.html'

# Persist the page as UTF-8, matching the charset declared in its <head>
with open(umap_kmeans_output_html, 'w', encoding='utf-8') as report_file:
    report_file.write(umap_kmeans_html)

print(f"UMAP + K-means clustering results have been exported to {umap_kmeans_output_html}")


UMAP + K-means clustering results have been exported to ./clustering_results/umap_kmeans_results.html
