# Clustering Analysis: Mental Health in Tech 2016

## 1. Load Reduced Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings('ignore')

# Plotting theme applied to every figure created after this point.
plt.style.use('seaborn-v0_8-whitegrid')

# Name of the run whose data and plot directories are used throughout.
RUN = 'run_03'

# Folder containing one '<method>_2d.csv' file per reduction technique.
data_dir = f'../../data/{RUN}/reduced'
methods = ['pca', 'mds', 'lle', 'tsne', 'umap']

# Technique name -> DataFrame read from that technique's CSV file.
reduced_data = {
    name: pd.read_csv(f'{data_dir}/{name}_2d.csv')
    for name in methods
}

print(f"Loaded {len(methods)} reduced datasets")

## 2. Comprehensive Cluster Evaluation

In [None]:
def evaluate_clustering(data, k_range=range(2, 8)):
    """Score k-means and Gaussian-mixture fits over several cluster counts.

    For every ``k`` a ``KMeans`` model (``n_init=10``, ``random_state=42``)
    is fitted to ``data`` and its labels are scored with the silhouette,
    Calinski-Harabasz and Davies-Bouldin metrics. A ``GaussianMixture`` with
    ``k`` components (``random_state=42``) is fitted separately to obtain
    BIC and AIC.

    Args:
        data: Feature matrix handed unchanged to the scikit-learn estimators
            and metric functions.
        k_range: Iterable of cluster counts to evaluate. It is materialised
            exactly once, so one-shot iterables (generators, iterators) are
            handled correctly.

    Returns:
        dict with the keys ``'k'``, ``'inertia'``, ``'silhouette'``,
        ``'calinski_harabasz'``, ``'davies_bouldin'``, ``'bic'`` and
        ``'aic'``. Every value is a list with one entry per evaluated ``k``,
        in the order given by ``'k'``.
    """
    # Materialise first: iterating k_range again after list() would yield
    # nothing for a generator and leave the metric lists empty.
    ks = list(k_range)
    results = {
        'k': ks,
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': [],
        'bic': [],
        'aic': []
    }

    for k in ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(data)

        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(data, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(data, labels))
        results['davies_bouldin'].append(davies_bouldin_score(data, labels))

        # The information criteria come from a mixture model fitted
        # independently of the k-means labels.
        gmm = GaussianMixture(n_components=k, random_state=42)
        gmm.fit(data)
        results['bic'].append(gmm.bic(data))
        results['aic'].append(gmm.aic(data))

    return results

In [None]:
# Evaluate candidate cluster counts on the UMAP embedding and draw one panel
# per metric in a 2x3 grid.
umap_data = reduced_data['umap'].values
eval_results = evaluate_clustering(umap_data)

# Cluster count highlighted with a dashed vertical line on every panel.
marked_k = 4

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# One entry per panel, in row-major grid order:
# (key in eval_results, line format, y-axis label, panel title).
panels = [
    ('inertia', 'bo-', 'Inertia', 'Elbow Method'),
    ('silhouette', 'go-', 'Silhouette Score', 'Silhouette Score (higher = better)'),
    ('calinski_harabasz', 'mo-', 'Calinski-Harabasz Index', 'Calinski-Harabasz (higher = better)'),
    ('davies_bouldin', 'co-', 'Davies-Bouldin Index', 'Davies-Bouldin (lower = better)'),
    ('bic', 'ro-', 'BIC', 'BIC (lower = better)'),
    ('aic', 'yo-', 'AIC', 'AIC (lower = better)'),
]

for panel_idx, (panel_ax, (metric, fmt, y_label, title)) in enumerate(zip(axes.flat, panels)):
    panel_ax.plot(eval_results['k'], eval_results[metric], fmt, linewidth=2, markersize=8)
    panel_ax.set_xlabel('k')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(title)

    # Only the first panel carries a legend entry for the marker line.
    is_first = panel_idx == 0
    label_kwargs = {'label': f'k={marked_k}'} if is_first else {}
    panel_ax.axvline(x=marked_k, color='r', linestyle='--', alpha=0.5, **label_kwargs)
    if is_first:
        panel_ax.legend()

plt.suptitle('Cluster Evaluation Metrics (UMAP)', fontsize=14, fontweight='bold')
plt.tight_layout()
os.makedirs(f'../../plots/{RUN}', exist_ok=True)
# bbox_inches='tight' matches the other figures saved by this notebook.
plt.savefig(f'../../plots/{RUN}/cluster_evaluation_metrics.png', dpi=150, bbox_inches='tight')
plt.show()


## 3. k-Means Clustering (k=4)

In [None]:
# Number of clusters used by every clustering algorithm from here on.
K_CLUSTERS = 4

# Fit k-means on each embedding and keep the labels, centroids and scores.
kmeans_results = {}
for method in methods:
    data = reduced_data[method].values
    kmeans = KMeans(n_clusters=K_CLUSTERS, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data)
    kmeans_results[method] = {
        'labels': labels,
        'centroids': kmeans.cluster_centers_,
        'silhouette': silhouette_score(data, labels),
        'inertia': kmeans.inertia_
    }

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One panel per embedding: points coloured by cluster, centroids as black X.
for ax, method in zip(axes, methods):
    data = reduced_data[method].values
    labels = kmeans_results[method]['labels']
    centroids = kmeans_results[method]['centroids']

    ax.scatter(data[:, 0], data[:, 1], c=labels, cmap='tab10', alpha=0.6, s=20)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='black', marker='X', s=200, edgecolors='white', linewidths=2)
    ax.set_title(f'{method.upper()} (Sil: {kmeans_results[method]["silhouette"]:.3f})')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

# The grid has more cells than embeddings; hide the cells left without data
# so the figure does not show an empty frame.
for spare_ax in axes[len(methods):]:
    spare_ax.axis('off')

plt.suptitle(f'k-Means Clustering (k={K_CLUSTERS})', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f'../../plots/{RUN}/kmeans_clustering.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. GMM Clustering

In [None]:
from matplotlib.patches import Ellipse

def draw_ellipse(position, covariance, ax, n_std=2.0, **kwargs):
    """Add an ellipse describing ``covariance`` around ``position`` to ``ax``.

    The covariance matrix is decomposed with an SVD. The first left-singular
    vector gives the rotation angle, and the square roots of the singular
    values, scaled by ``2 * n_std``, give the width and height.

    Args:
        position: Centre of the ellipse, forwarded to ``Ellipse``.
        covariance: Matrix to decompose; exactly two singular values are
            expected because they are unpacked into width and height.
        ax: Object the patch is attached to through ``add_patch``.
        n_std: Multiplier applied to the square-rooted singular values.
        **kwargs: Extra keyword arguments forwarded to ``Ellipse``.
    """
    left_vectors, singular_values, _ = np.linalg.svd(covariance)

    # Orientation of the first left-singular vector, in degrees.
    principal = left_vectors[:, 0]
    rotation_deg = np.degrees(np.arctan2(principal[1], principal[0]))

    # Full extents along the two singular directions.
    extents = 2 * n_std * np.sqrt(singular_values)
    width, height = extents

    patch = Ellipse(position, width, height, angle=rotation_deg, **kwargs)
    ax.add_patch(patch)

# Fit a Gaussian mixture on each embedding and keep labels, component
# parameters and scores.
gmm_results = {}
for method in methods:
    data = reduced_data[method].values
    gmm = GaussianMixture(n_components=K_CLUSTERS, random_state=42)
    labels = gmm.fit_predict(data)
    gmm_results[method] = {
        'labels': labels,
        'means': gmm.means_,
        'covariances': gmm.covariances_,
        'silhouette': silhouette_score(data, labels),
        'bic': gmm.bic(data),
        'aic': gmm.aic(data)
    }

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One panel per embedding: points coloured by component, an ellipse per
# component, and the component means as black X markers.
for ax, method in zip(axes, methods):
    data = reduced_data[method].values
    fitted = gmm_results[method]
    labels = fitted['labels']
    means = fitted['means']
    covs = fitted['covariances']

    points = ax.scatter(data[:, 0], data[:, 1], c=labels, cmap='tab10', alpha=0.6, s=20)
    for component, (mean, cov) in enumerate(zip(means, covs)):
        # scatter() rescales the label values onto the whole colormap, so
        # label i is generally NOT drawn with tab10(i). Ask the scatter's own
        # colour mapping so each ellipse matches the points it encloses.
        draw_ellipse(mean, cov, ax, n_std=2.0, alpha=0.2,
                     facecolor=points.to_rgba(component), edgecolor='black', linewidth=2)
    ax.scatter(means[:, 0], means[:, 1], c='black', marker='X', s=200, edgecolors='white', linewidths=2)
    ax.set_title(f'{method.upper()} (Sil: {fitted["silhouette"]:.3f})')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

# Hide the grid cells that have no embedding to display.
for spare_ax in axes[len(methods):]:
    spare_ax.axis('off')

plt.suptitle(f'GMM Clustering (k={K_CLUSTERS})', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f'../../plots/{RUN}/gmm_clustering.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Hierarchical Clustering

In [None]:
# Ward linkage on a random subsample of the PCA embedding, drawn as a
# truncated dendrogram.
pca_points = reduced_data['pca'].values

# Sampling without replacement raises when more rows are requested than
# exist, so cap the sample at the number of available rows.
sample_size = min(200, len(pca_points))
np.random.seed(42)
sample_idx = np.random.choice(len(pca_points), sample_size, replace=False)
sample_data = pca_points[sample_idx]

linkage_matrix = linkage(sample_data, method='ward')

# Column 2 of the linkage matrix holds the merge heights, which are
# non-decreasing for Ward linkage. Exactly K_CLUSTERS clusters exist after
# the merge in row -K_CLUSTERS and before the merge in row -(K_CLUSTERS - 1),
# so the midpoint of those two heights cuts the tree into K_CLUSTERS groups.
# Assumes 2 <= K_CLUSTERS < sample_size.
merge_heights = linkage_matrix[:, 2]
cut_height = (merge_heights[-K_CLUSTERS] + merge_heights[-(K_CLUSTERS - 1)]) / 2

fig, ax = plt.subplots(figsize=(14, 5))
dendrogram(linkage_matrix, ax=ax, truncate_mode='lastp', p=30, leaf_rotation=90, leaf_font_size=8)
ax.set_title(f'Dendrogram (Ward Method, PCA Data, n={sample_size})')
ax.set_xlabel('Cluster')
ax.set_ylabel('Distance')
ax.axhline(y=cut_height, color='r', linestyle='--', alpha=0.7, label=f'Cut for k={K_CLUSTERS}')
ax.legend()
plt.tight_layout()
plt.savefig(f'../../plots/{RUN}/dendrogram.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Ward agglomerative clustering on every embedding; labels and silhouette
# scores are stored per embedding name.
hierarchical_results = {}
for method in methods:
    points = reduced_data[method].values
    ward = AgglomerativeClustering(n_clusters=K_CLUSTERS, linkage='ward')
    assignments = ward.fit_predict(points)
    score = silhouette_score(points, assignments)
    hierarchical_results[method] = {'labels': assignments, 'silhouette': score}

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One panel per embedding: points coloured by cluster, with the mean of each
# cluster's points marked by a black X.
for panel, method in zip(axes, methods):
    points = reduced_data[method].values
    outcome = hierarchical_results[method]
    assignments = outcome['labels']

    panel.scatter(points[:, 0], points[:, 1], c=assignments, cmap='tab10', alpha=0.6, s=20)

    cluster_means = [
        points[assignments == cluster_id].mean(axis=0)
        for cluster_id in range(K_CLUSTERS)
    ]
    for center in cluster_means:
        panel.scatter(center[0], center[1], c='black', marker='X', s=200, edgecolors='white', linewidths=2)

    panel.set_title(f'{method.upper()} (Sil: {outcome["silhouette"]:.3f})')
    panel.set_xlabel('Dimension 1')
    panel.set_ylabel('Dimension 2')

# The sixth grid cell has no embedding to show.
axes[5].axis('off')
plt.suptitle(f'Hierarchical Clustering - Ward (k={K_CLUSTERS})', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f'../../plots/{RUN}/hierarchical_clustering.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Algorithm Comparison

In [None]:
# Compare silhouette scores across every (embedding, algorithm) pair and
# remember the best-scoring combination in best_config / best_score.
algo_results = [
    ('kmeans', kmeans_results),
    ('gmm', gmm_results),
    ('hierarchical', hierarchical_results),
]

# -inf rather than -1: -1 is itself a valid silhouette value, so it is not a
# safe "nothing seen yet" sentinel.
best_score = float('-inf')
best_config = None

# Header row so the three score columns are identifiable in the output.
print(f"{'Method':<10} {'k-Means':<15} {'GMM':<15} {'Hierarchical':<15}")
print("-"*55)

for method in methods:
    scores = [(algo, results[method]['silhouette']) for algo, results in algo_results]
    km_sil, gmm_sil, hier_sil = (score for _, score in scores)

    print(f"{method.upper():<10} {km_sil:<15.4f} {gmm_sil:<15.4f} {hier_sil:<15.4f}")

    # Strict '>' keeps the first combination encountered when scores tie.
    for algo, score in scores:
        if score > best_score:
            best_score = score
            best_config = (method, algo)

print("-"*55)
print(f"\nBest: {best_config[0].upper()} + {best_config[1]} (Silhouette: {best_score:.4f})")

## 7. Save Results

In [None]:
# Attach the labels of the best-scoring configuration to the preprocessed
# table and write the result to the run's 'clustered' directory.
output_dir = f'../../data/{RUN}/clustered'
os.makedirs(output_dir, exist_ok=True)

df_preprocessed = pd.read_csv(f'../../data/{RUN}/processed/mental_health_preprocessed.csv')

best_method, best_algo = best_config

# Any algorithm name other than 'kmeans' or 'gmm' falls back to the
# hierarchical results.
results_by_algo = {'kmeans': kmeans_results, 'gmm': gmm_results}
chosen_results = results_by_algo.get(best_algo, hierarchical_results)
best_labels = chosen_results[best_method]['labels']

# assign() returns a new frame, leaving df_preprocessed untouched.
df_clustered = df_preprocessed.assign(cluster=best_labels)

df_clustered.to_csv(f'{output_dir}/clustered_data.csv', index=False)