# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import warnings
import os
import joblib
import json

# Suppress every warning so the notebook output stays uncluttered
warnings.filterwarnings(action='ignore')

# Set style for visualizations
plt.style.use("default")
sns.set_palette("husl")

# Announce what this notebook does (one line per message)
print(
    "=== Heart Disease Prediction - Unsupervised Learning ===",
    "Applying K-Means and Hierarchical Clustering for pattern discovery...",
    sep="\n",
)

# Load preprocessed data
try:
    # The three CSVs are read from ../data; a missing file triggers the fallback below
    X_scaled = pd.read_csv('../data/X_scaled.csv')
    y = pd.read_csv('../data/y.csv')['target']
    X_top_features = pd.read_csv('../data/X_top_features.csv')
except FileNotFoundError:
    print("❌ Data files not found. Please run previous notebooks first.")
    # Fall back to reproducible random data so the rest of the notebook can run
    print("Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 303
    feature_names = [f'feature_{i}' for i in range(13)]
    # Features are drawn before the target so the random stream order is fixed
    X_scaled = pd.DataFrame(np.random.randn(n_samples, 13), columns=feature_names)
    X_top_features = X_scaled.iloc[:, :8]
    y = pd.Series(np.random.choice([0, 1], n_samples), name='target')
    print("✅ Sample data created")
else:
    print("✅ Data loaded successfully")
    print(f"Original features: {X_scaled.shape[1]}")
    print(f"Top selected features: {X_top_features.shape[1]}")

print(f"\nDataset shape: {X_scaled.shape}")
print(f"Target distribution: {dict(y.value_counts())}")

# Datasets to cluster, keyed by the display name used in titles and file names
data_to_cluster = dict(Original=X_scaled, Top_Features=X_top_features)

# 1. OPTIMAL NUMBER OF CLUSTERS - ELBOW METHOD
# Section banner: blank line, rule, title, rule
print(
    "\n" + "=" * 70,
    "1. DETERMINING OPTIMAL NUMBER OF CLUSTERS",
    "=" * 70,
    sep="\n",
)

def plot_elbow_method(X, dataset_name, max_clusters=10):
    """Compute K-Means inertia and silhouette score for k = 2..max_clusters.

    Despite the name, nothing is drawn here; the caller plots the returned
    series.

    Args:
        X: Feature matrix passed to ``KMeans.fit_predict``.
        dataset_name: Label of the dataset (not used by the computation).
        max_clusters: Largest k to evaluate, inclusive.

    Returns:
        Tuple ``(k_range, inertias, silhouette_scores)`` where ``k_range`` is
        ``range(2, max_clusters + 1)`` and the two lists hold one value per k.
    """

    def _evaluate(n_clusters):
        # Fixed seed and n_init so repeated calls give the same result
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = model.fit_predict(X)
        return model.inertia_, silhouette_score(X, labels)

    k_range = range(2, max_clusters + 1)
    per_k = [_evaluate(n_clusters) for n_clusters in k_range]
    inertias = [inertia for inertia, _ in per_k]
    silhouette_scores = [score for _, score in per_k]

    return k_range, inertias, silhouette_scores

# Plot elbow method for each dataset
# The grid gets one column per dataset (instead of a hard-coded 2x2) so that
# adding or removing an entry in data_to_cluster cannot index past the axes.
# squeeze=False keeps `axes` two-dimensional even with a single dataset.
n_datasets = len(data_to_cluster)
fig, axes = plt.subplots(2, n_datasets, figsize=(7.5 * n_datasets, 10), squeeze=False)
fig.suptitle('Optimal Number of Clusters Analysis', fontsize=16, fontweight='bold')

# Maps dataset name -> k with the highest silhouette score
optimal_k = {}

for i, (dataset_name, X) in enumerate(data_to_cluster.items()):
    print(f"📊 Analyzing {dataset_name} dataset...")

    k_range, inertias, silhouette_scores = plot_elbow_method(X, dataset_name)

    # Top row: inertia (elbow method)
    elbow_ax = axes[0, i]
    elbow_ax.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
    elbow_ax.set_xlabel('Number of Clusters (k)')
    elbow_ax.set_ylabel('Inertia (WCSS)')
    elbow_ax.set_title(f'Elbow Method - {dataset_name}')
    elbow_ax.grid(True, alpha=0.3)

    # Bottom row: silhouette scores
    silhouette_ax = axes[1, i]
    silhouette_ax.plot(k_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
    silhouette_ax.set_xlabel('Number of Clusters (k)')
    silhouette_ax.set_ylabel('Silhouette Score')
    silhouette_ax.set_title(f'Silhouette Analysis - {dataset_name}')
    silhouette_ax.grid(True, alpha=0.3)

    # Pick the k with the highest silhouette score (first one wins on ties)
    optimal_k[dataset_name] = k_range[int(np.argmax(silhouette_scores))]
    print(f"  Optimal k (silhouette): {optimal_k[dataset_name]}")

plt.tight_layout()
plt.show()

# 2. K-MEANS CLUSTERING
print("\n" + "=" * 70, "2. K-MEANS CLUSTERING", "=" * 70, sep="\n")

# Maps dataset name -> fitted model, labels and evaluation metrics
kmeans_results = {}

for dataset_name, X in data_to_cluster.items():
    print(f"\n🔧 Applying K-Means to {dataset_name} dataset...")

    # Use the k chosen by the silhouette analysis, defaulting to 2
    n_clusters = optimal_k.get(dataset_name, 2)
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(X)

    # Silhouette is label-free; ARI and NMI compare clusters against y
    result = {
        'model': model,
        'labels': labels,
        'k': n_clusters,
        'silhouette': silhouette_score(X, labels),
        'inertia': model.inertia_,
        'ari': adjusted_rand_score(y, labels),
        'nmi': normalized_mutual_info_score(y, labels),
        'centers': model.cluster_centers_,
    }
    kmeans_results[dataset_name] = result

    print(f"  Number of clusters: {result['k']}")
    print(f"  Silhouette score: {result['silhouette']:.3f}")
    print(f"  ARI (vs actual labels): {result['ari']:.3f}")
    print(f"  NMI (vs actual labels): {result['nmi']:.3f}")

# 3. HIERARCHICAL CLUSTERING
print("\n" + "=" * 70, "3. HIERARCHICAL CLUSTERING", "=" * 70, sep="\n")

# Maps dataset name -> fitted model, labels, linkage matrix and metrics
hierarchical_results = {}

for dataset_name, X in data_to_cluster.items():
    print(f"\n🌳 Applying Hierarchical Clustering to {dataset_name} dataset...")

    # SciPy Ward linkage, stored so the dendrogram section can draw it
    ward_tree = linkage(X, method='ward')

    # Same k as K-Means so the two methods are compared like for like
    n_clusters = optimal_k.get(dataset_name, 2)
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    labels = model.fit_predict(X)

    result = {
        'model': model,
        'labels': labels,
        'linkage_matrix': ward_tree,
        'k': n_clusters,
        'silhouette': silhouette_score(X, labels),
        'ari': adjusted_rand_score(y, labels),
        'nmi': normalized_mutual_info_score(y, labels),
    }
    hierarchical_results[dataset_name] = result

    print(f"  Number of clusters: {result['k']}")
    print(f"  Silhouette score: {result['silhouette']:.3f}")
    print(f"  ARI (vs actual labels): {result['ari']:.3f}")
    print(f"  NMI (vs actual labels): {result['nmi']:.3f}")

# 4. DENDROGRAM VISUALIZATION
print("\n" + "=" * 70, "4. DENDROGRAM VISUALIZATION", "=" * 70, sep="\n")

n_datasets = len(data_to_cluster)
fig, axes = plt.subplots(1, n_datasets, figsize=(8 * n_datasets, 6))
if n_datasets == 1:
    # A single subplot comes back as a bare Axes; wrap it so it can be iterated
    axes = [axes]

# One dendrogram per dataset, truncated to the top 5 levels of the tree
for ax, dataset_name in zip(axes, data_to_cluster):
    ward_tree = hierarchical_results[dataset_name]['linkage_matrix']
    dendrogram(ward_tree, ax=ax, truncate_mode='level', p=5)
    ax.set_title(f'Hierarchical Clustering Dendrogram - {dataset_name}')
    ax.set_xlabel('Sample Index or Cluster Size')
    ax.set_ylabel('Distance')

plt.tight_layout()
plt.show()

# 5. CLUSTER VISUALIZATION
print("\n" + "=" * 70, "5. CLUSTER VISUALIZATION", "=" * 70, sep="\n")

# 2-component PCA used only to project the data for plotting
pca_2d = PCA(n_components=2, random_state=42)

n_datasets = len(data_to_cluster)
fig, axes = plt.subplots(2, n_datasets, figsize=(8 * n_datasets, 12))
if n_datasets == 1:
    # Keep (row, column) indexing valid when there is a single column
    axes = axes.reshape(-1, 1)

for col, (dataset_name, X) in enumerate(data_to_cluster.items()):
    kmeans_entry = kmeans_results[dataset_name]
    hierarchical_labels = hierarchical_results[dataset_name]['labels']

    # Refit the projection for this dataset; axis labels report its variance share
    X_pca = pca_2d.fit_transform(X)
    pc1_share, pc2_share = pca_2d.explained_variance_ratio_[:2]
    pc1_label = f'PC1 ({pc1_share:.1%} variance)'
    pc2_label = f'PC2 ({pc2_share:.1%} variance)'

    # Top row: K-Means labels with the centroids projected into the same plane
    kmeans_ax = axes[0, col]
    kmeans_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_entry['labels'], cmap='viridis', alpha=0.7, s=50)
    kmeans_ax.set_title(f'K-Means Clustering - {dataset_name}')
    kmeans_ax.set_xlabel(pc1_label)
    kmeans_ax.set_ylabel(pc2_label)
    centers_pca = pca_2d.transform(kmeans_entry['centers'])
    kmeans_ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3, label='Centroids')
    kmeans_ax.legend()

    # Bottom row: hierarchical labels on the same projection
    hierarchical_ax = axes[1, col]
    hierarchical_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=hierarchical_labels, cmap='viridis', alpha=0.7, s=50)
    hierarchical_ax.set_title(f'Hierarchical Clustering - {dataset_name}')
    hierarchical_ax.set_xlabel(pc1_label)
    hierarchical_ax.set_ylabel(pc2_label)

plt.tight_layout()
plt.show()

# 6. COMPARE CLUSTERS WITH ACTUAL LABELS
print("\n" + "=" * 70, "6. CLUSTER COMPARISON WITH ACTUAL LABELS", "=" * 70, sep="\n")

n_datasets = len(data_to_cluster)
fig, axes = plt.subplots(3, n_datasets, figsize=(8 * n_datasets, 15))
if n_datasets == 1:
    # Keep (row, column) indexing valid when there is a single column
    axes = axes.reshape(-1, 1)

for col, (dataset_name, X) in enumerate(data_to_cluster.items()):
    # Project this dataset with the PCA object created in the previous section
    X_pca = pca_2d.fit_transform(X)
    pc1_share, pc2_share = pca_2d.explained_variance_ratio_[:2]
    pc1_label = f'PC1 ({pc1_share:.1%} variance)'
    pc2_label = f'PC2 ({pc2_share:.1%} variance)'

    # One panel per row: (point colours, colormap, title)
    panels = (
        (y, 'coolwarm', f'Actual Labels - {dataset_name}'),
        (kmeans_results[dataset_name]['labels'], 'viridis', f'K-Means Clusters - {dataset_name}'),
        (hierarchical_results[dataset_name]['labels'], 'viridis', f'Hierarchical Clusters - {dataset_name}'),
    )
    for row, (point_colors, colormap, title) in enumerate(panels):
        ax = axes[row, col]
        ax.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap=colormap, alpha=0.7, s=50)
        ax.set_title(title)
        ax.set_xlabel(pc1_label)
        ax.set_ylabel(pc2_label)

plt.tight_layout()
plt.show()


# 7. CLUSTER ANALYSIS AND INTERPRETATION
print("\n" + "=" * 70, "7. CLUSTER ANALYSIS AND INTERPRETATION", "=" * 70, sep="\n")

for dataset_name, X in data_to_cluster.items():
    print(f"\n📊 Cluster Analysis - {dataset_name} Dataset:")
    kmeans_labels = kmeans_results[dataset_name]['labels']
    hierarchical_labels = hierarchical_results[dataset_name]['labels']

    # Contingency tables: rows are cluster ids, columns are the actual target values
    kmeans_crosstab = pd.crosstab(kmeans_labels, y)
    print("\n🔍 K-Means Cluster Composition:")
    print(kmeans_crosstab)

    hierarchical_crosstab = pd.crosstab(hierarchical_labels, y)
    print("\n🔍 Hierarchical Cluster Composition:")
    print(hierarchical_crosstab)

    # Per-cluster feature means, computed on a copy so X itself is left untouched
    print("\n📈 K-Means Cluster Statistics (Feature Means):")
    X_with_clusters = X.assign(Cluster=kmeans_labels)
    cluster_stats = X_with_clusters.groupby('Cluster').mean().round(3)
    print(cluster_stats)

# 8. SILHOUETTE ANALYSIS
print("\n" + "=" * 70, "8. DETAILED SILHOUETTE ANALYSIS", "=" * 70, sep="\n")

from sklearn.metrics import silhouette_samples


def _draw_silhouette_panel(ax, X, labels, n_clusters, mean_score, title):
    """Draw one silhouette plot: a sorted band per cluster plus the mean line."""
    sample_scores = silhouette_samples(X, labels)
    band_start = 10  # vertical offset of the first band
    for cluster_id in range(n_clusters):
        band = np.sort(sample_scores[labels == cluster_id])
        band_size = band.shape[0]
        band_end = band_start + band_size
        shade = plt.cm.viridis(float(cluster_id) / n_clusters)
        ax.fill_betweenx(np.arange(band_start, band_end), 0, band, facecolor=shade, edgecolor=shade, alpha=0.7)
        # Label the band with its cluster id, vertically centred
        ax.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))
        band_start = band_end + 10  # 10-unit gap between bands
    ax.set_title(title)
    # Dashed red line marks the average silhouette score
    ax.axvline(x=mean_score, color="red", linestyle="--")


n_datasets = len(data_to_cluster)
fig, axes = plt.subplots(n_datasets, 2, figsize=(15, 6 * n_datasets))
if n_datasets == 1:
    # Keep (row, column) indexing valid when there is a single row
    axes = axes.reshape(1, -1)

for row, (dataset_name, X) in enumerate(data_to_cluster.items()):
    # Left column: K-Means silhouette
    kmeans_entry = kmeans_results[dataset_name]
    _draw_silhouette_panel(
        axes[row, 0], X, kmeans_entry['labels'], kmeans_entry['k'],
        kmeans_entry['silhouette'], f'K-Means Silhouette Plot - {dataset_name}',
    )

    # Right column: hierarchical silhouette
    hierarchical_entry = hierarchical_results[dataset_name]
    _draw_silhouette_panel(
        axes[row, 1], X, hierarchical_entry['labels'], hierarchical_entry['k'],
        hierarchical_entry['silhouette'], f'Hierarchical Silhouette Plot - {dataset_name}',
    )

plt.tight_layout()
plt.show()

# 9. CLUSTERING PERFORMANCE COMPARISON
print("\n" + "=" * 70, "9. CLUSTERING PERFORMANCE COMPARISON", "=" * 70, sep="\n")

# One row per (dataset, method), K-Means first within each dataset
comparison_data = [
    [
        dataset_name,
        method_name,
        results[dataset_name]['k'],
        results[dataset_name]['silhouette'],
        results[dataset_name]['ari'],
        results[dataset_name]['nmi'],
    ]
    for dataset_name in data_to_cluster
    for method_name, results in (
        ('K-Means', kmeans_results),
        ('Hierarchical', hierarchical_results),
    )
]

comparison_columns = ['Dataset', 'Method', 'K', 'Silhouette', 'ARI', 'NMI']
comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)
print("📊 Clustering Performance Comparison:")
print(comparison_df.round(4))

# 10. SAVE CLUSTERING RESULTS
print("\n" + "="*70)
print("10. SAVING CLUSTERING RESULTS")
print("="*70)


def _to_jsonable(value):
    """Return `value` as a plain Python object that json.dump can encode.

    NumPy arrays become (nested) lists and NumPy scalars become Python
    scalars; anything else is returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    return value


try:
    os.makedirs('../models', exist_ok=True)
    os.makedirs('../results', exist_ok=True)

    for dataset_name in data_to_cluster:
        file_tag = dataset_name.lower()
        joblib.dump(kmeans_results[dataset_name]['model'], f'../models/kmeans_{file_tag}.pkl')
        cluster_labels_df = pd.DataFrame({
            'KMeans_Labels': kmeans_results[dataset_name]['labels'],
            'Hierarchical_Labels': hierarchical_results[dataset_name]['labels'],
            'Actual_Labels': y
        })
        # Write into ../results, the directory created above (a bare
        # 'results/' path may not exist and would abort the whole save)
        cluster_labels_df.to_csv(f'../results/cluster_labels_{file_tag}.csv', index=False)

    comparison_df.to_csv('../results/clustering_comparison.csv', index=False)

    # Drop the entries that are not meant for JSON (fitted models, centroids,
    # linkage matrices) and convert the rest: label arrays and NumPy scalars
    # would otherwise make json.dump raise TypeError
    clustering_analysis = {
        'optimal_k': {ds: _to_jsonable(best_k) for ds, best_k in optimal_k.items()},
        'kmeans_results': {
            ds: {key: _to_jsonable(value) for key, value in res.items() if key not in ('model', 'centers')}
            for ds, res in kmeans_results.items()
        },
        'hierarchical_results': {
            ds: {key: _to_jsonable(value) for key, value in res.items() if key not in ('model', 'linkage_matrix')}
            for ds, res in hierarchical_results.items()
        }
    }

    with open('../results/clustering_analysis.json', 'w') as f:
        json.dump(clustering_analysis, f, indent=4)

    print("✅ Clustering results saved successfully!")

except Exception as e:
    # Saving is best-effort: report the problem but let the notebook finish
    print(f"⚠️ Error saving files: {e}")

# 11. UNSUPERVISED LEARNING SUMMARY
print("\n" + "=" * 70, "11. UNSUPERVISED LEARNING SUMMARY", "=" * 70, sep="\n")

print("✅ Unsupervised learning analysis completed successfully!")
print(f"📊 Datasets analyzed: {len(data_to_cluster)}")

print("\n🏆 Best clustering results (by Silhouette Score):")
for dataset_name in data_to_cluster:
    kmeans_score = kmeans_results[dataset_name]['silhouette']
    hierarchical_score = hierarchical_results[dataset_name]['silhouette']

    # K-Means wins only on a strictly higher score; ties go to Hierarchical
    if kmeans_score > hierarchical_score:
        best_method = "K-Means"
    else:
        best_method = "Hierarchical"
    best_score = max(kmeans_score, hierarchical_score)

    print(f"  - {dataset_name}: {best_method} (Silhouette: {best_score:.3f})")

print("\n🎉 Analysis complete!")