# Roshan Gautam
# Data Mining Lab 5
# University of the Cumberlands



import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Plot styling shared by every figure produced by this script.
plt.style.use('default')
sns.set_palette("husl")

## Step 1: Data Preparation and Exploration

print("=== STEP 1: DATA PREPARATION AND EXPLORATION ===\n")

# Load the Wine dataset: X holds the feature matrix, y the true class labels.
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

# Wrap the features in a DataFrame (plus a 'target' column) so the summary
# helpers below can be used for inspection.
df = pd.DataFrame(X, columns=wine_data.feature_names)
df['target'] = y

print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Features: {len(wine_data.feature_names)}")
print(f"Classes: {len(np.unique(y))}")
print(f"Class distribution: {np.bincount(y)}")

print("\nDataset Structure:")
print(df.head())

print("\nDataset Statistics:")
print(df.describe())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

# Standardize the features so the distance-based clustering steps that follow
# are not dominated by the features with the largest numeric range.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nFeatures standardized for clustering analysis.")

## Step 2: Hierarchical Clustering

print("\n=== STEP 2: HIERARCHICAL CLUSTERING ===\n")

# Candidate cluster counts; each one is scored with the silhouette coefficient.
n_clusters_range = [2, 3, 4, 5]
hierarchical_results = {}

for n_clusters in n_clusters_range:
    # Ward-linkage agglomerative clustering on the standardized features.
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = hierarchical.fit_predict(X_scaled)

    sil_score = silhouette_score(X_scaled, cluster_labels)

    # Keep the labels as well as the score so later steps can reuse them.
    hierarchical_results[n_clusters] = {
        'labels': cluster_labels,
        'silhouette_score': sil_score
    }

    print(f"n_clusters={n_clusters}: Silhouette Score = {sil_score:.3f}")

# The "best" configuration is the one with the highest silhouette score.
best_n_clusters = max(hierarchical_results,
                      key=lambda k: hierarchical_results[k]['silhouette_score'])
print(f"\nBest n_clusters for Hierarchical: {best_n_clusters}")

# Use best configuration for visualization
best_hierarchical_labels = hierarchical_results[best_n_clusters]['labels']

# Project onto the first two principal components purely for plotting; the
# clustering itself was done on the full standardized feature space.
# (X_pca and pca are reused by the DBSCAN plots in Step 3.)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(15, 5))

# Panel 1: clusters found by the best hierarchical configuration.
plt.subplot(1, 3, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=best_hierarchical_labels, cmap='viridis')
plt.title(f'Hierarchical Clustering (n_clusters={best_n_clusters})')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.colorbar()

# Panel 2: ground-truth wine classes for visual comparison.
plt.subplot(1, 3, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.title('True Labels')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.colorbar()

# Panel 3: dendrogram built from the same Ward linkage, truncated to the top
# 5 levels so it stays readable.
plt.subplot(1, 3, 3)
linkage_matrix = linkage(X_scaled, method='ward')
dendrogram(linkage_matrix, truncate_mode='level', p=5)
plt.title('Hierarchical Clustering Dendrogram')
# In a truncated dendrogram, leaves that stand for several merged samples are
# labeled with their size in parentheses rather than with a sample index.
plt.xlabel('Sample index or (cluster size)')
plt.ylabel('Distance')

plt.tight_layout()
plt.show()

## Step 3: DBSCAN Clustering

print("\n=== STEP 3: DBSCAN CLUSTERING ===\n")

# Parameter grid: every (eps, min_samples) combination is tried.
eps_values = [0.5, 1.0, 1.5, 2.0]
min_samples_values = [3, 5, 7, 10]

dbscan_results = {}
# Start below every attainable silhouette value (its range is [-1, 1]) so the
# first scored configuration is always accepted as the provisional best.
best_dbscan_score = float('-inf')
best_dbscan_params = None

print("Testing DBSCAN parameters:")
print("eps\tmin_samples\tn_clusters\tn_noise\tsilhouette")

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = dbscan.fit_predict(X_scaled)

        # DBSCAN labels noise points -1; that label is not a real cluster, so
        # it is excluded from the cluster count.
        n_noise = int(np.count_nonzero(cluster_labels == -1))
        n_clusters = len(set(cluster_labels)) - (1 if n_noise else 0)

        # Configurations with fewer than two real clusters are reported but
        # not scored.
        if n_clusters <= 1:
            print(f"{eps}\t{min_samples}\t\t{n_clusters}\t\t{n_noise}\tN/A")
            continue

        # NOTE: the scores below are computed on all points, so the noise
        # label (-1) is treated like one more cluster by these metrics.
        sil_score = silhouette_score(X_scaled, cluster_labels)
        homogeneity = homogeneity_score(y, cluster_labels)
        completeness = completeness_score(y, cluster_labels)

        dbscan_results[(eps, min_samples)] = {
            'labels': cluster_labels,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'silhouette_score': sil_score,
            'homogeneity_score': homogeneity,
            'completeness_score': completeness
        }

        print(f"{eps}\t{min_samples}\t\t{n_clusters}\t\t{n_noise}\t{sil_score:.3f}")

        # Track the configuration with the highest silhouette score.
        if sil_score > best_dbscan_score:
            best_dbscan_score = sil_score
            best_dbscan_params = (eps, min_samples)

# best_dbscan_params stays None when no configuration produced >= 2 clusters.
if best_dbscan_params is not None:
    print(f"\nBest DBSCAN parameters: eps={best_dbscan_params[0]}, min_samples={best_dbscan_params[1]}")

    best_dbscan_result = dbscan_results[best_dbscan_params]
    best_dbscan_labels = best_dbscan_result['labels']

    # Print detailed metrics for best configuration
    print("\nBest DBSCAN Results:")
    print(f"Number of clusters: {best_dbscan_result['n_clusters']}")
    print(f"Number of noise points: {best_dbscan_result['n_noise']}")
    print(f"Silhouette Score: {best_dbscan_result['silhouette_score']:.3f}")
    print(f"Homogeneity Score: {best_dbscan_result['homogeneity_score']:.3f}")
    print(f"Completeness Score: {best_dbscan_result['completeness_score']:.3f}")

    # Visualize DBSCAN results in the PCA projection computed in Step 2.
    plt.figure(figsize=(15, 5))

    # Panel 1: raw DBSCAN labels (noise included as label -1).
    plt.subplot(1, 3, 1)
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=best_dbscan_labels, cmap='viridis')
    plt.title(f'DBSCAN (eps={best_dbscan_params[0]}, min_samples={best_dbscan_params[1]})')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.colorbar(scatter)

    # Panel 2: clustered points vs. noise points drawn as red crosses.
    plt.subplot(1, 3, 2)
    noise_mask = best_dbscan_labels == -1
    plt.scatter(X_pca[~noise_mask, 0], X_pca[~noise_mask, 1],
                c=best_dbscan_labels[~noise_mask], cmap='viridis', alpha=0.7, label='Clusters')
    plt.scatter(X_pca[noise_mask, 0], X_pca[noise_mask, 1],
                c='red', marker='x', s=50, label='Noise')
    plt.title('DBSCAN with Noise Points Highlighted')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.legend()

    # Panel 3: ground-truth wine classes for visual comparison.
    plt.subplot(1, 3, 3)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
    plt.title('True Labels')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

## Step 4: Analysis and Insights

print("\n=== STEP 4: ANALYSIS AND INSIGHTS ===\n")

print("COMPARISON BETWEEN HIERARCHICAL AND DBSCAN CLUSTERING:")
print("-" * 60)

# The side-by-side comparison is only possible when Step 3 found a usable
# DBSCAN configuration (best_dbscan_params is None otherwise).
if best_dbscan_params is not None:
    # Compare the best results
    hier_sil = hierarchical_results[best_n_clusters]['silhouette_score']
    dbscan_sil = best_dbscan_result['silhouette_score']

    print(f"Hierarchical Clustering (n_clusters={best_n_clusters}):")
    print(f"  - Silhouette Score: {hier_sil:.3f}")
    print(f"  - Number of clusters: {best_n_clusters}")
    print("  - All points assigned to clusters")

    print(f"\nDBSCAN (eps={best_dbscan_params[0]}, min_samples={best_dbscan_params[1]}):")
    print(f"  - Silhouette Score: {dbscan_sil:.3f}")
    print(f"  - Number of clusters: {best_dbscan_result['n_clusters']}")
    print(f"  - Noise points: {best_dbscan_result['n_noise']}")
    print(f"  - Homogeneity Score: {best_dbscan_result['homogeneity_score']:.3f}")
    print(f"  - Completeness Score: {best_dbscan_result['completeness_score']:.3f}")

print("\nPARAMETER INFLUENCE:")
print("-" * 20)
print("Hierarchical Clustering:")
print("- Number of clusters directly controls the granularity of clustering")
print("- Ward linkage minimizes within-cluster variance")
print("- Always produces exactly n_clusters, regardless of data structure")

print("\nDBSCAN:")
print("- eps controls the neighborhood size for core points")
print("- min_samples determines minimum density for cluster formation")
# DBSCAN uses one global eps, so it does NOT cope well with clusters of
# different densities; this line must agree with the weakness listed below.
print("- Can identify noise points, but a single eps struggles with clusters of varying density")
print("- Number of clusters emerges from the data structure")

print("\nSTRENGTHS AND WEAKNESSES:")
print("-" * 25)
print("Hierarchical Clustering:")
print("Strengths:")
print("  + Deterministic results")
print("  + Provides hierarchical structure via dendrogram")
print("  + Works well with compact, well-separated clusters")
print("  + No noise points - all data assigned")
print("Weaknesses:")
print("  - Requires pre-specifying number of clusters")
print("  - Sensitive to outliers")
print("  - Assumes spherical clusters")

print("\nDBSCAN:")
print("Strengths:")
print("  + Automatically determines number of clusters")
print("  + Robust to outliers (identifies as noise)")
print("  + Can find arbitrarily shaped clusters")
print("  + No assumption about cluster shape")
print("Weaknesses:")
print("  - Sensitive to parameter selection")
print("  - Struggles with varying densities")
print("  - May not work well with high-dimensional data")

# Summary statistics
print("\nSUMMARY:")
print(f"- Wine dataset has {X.shape[0]} samples with {X.shape[1]} features")
print(f"- True number of wine classes: {len(np.unique(y))}")
print(f"- Best Hierarchical clustering found {best_n_clusters} clusters")
if best_dbscan_params is not None:
    print(f"- Best DBSCAN found {best_dbscan_result['n_clusters']} clusters with {best_dbscan_result['n_noise']} noise points")
    # Report a winner only when one score is strictly higher; an exact tie
    # must not be credited to either method.
    if hier_sil > dbscan_sil:
        print(f"- Hierarchical clustering achieved better silhouette score ({hier_sil:.3f} vs {dbscan_sil:.3f})")
    elif dbscan_sil > hier_sil:
        print(f"- DBSCAN achieved better silhouette score ({dbscan_sil:.3f} vs {hier_sil:.3f})")
    else:
        print(f"- Both methods achieved the same silhouette score ({hier_sil:.3f})")
else:
    print("- DBSCAN did not find suitable clustering with tested parameters")
