# Diabetes Dataset Clustering Analysis

This notebook performs comprehensive clustering analysis on a diabetes dataset using various dimensionality reduction techniques and clustering algorithms to identify patterns and relationships in the data.

## 1. Import Libraries

In [None]:
# %pip install scikit-learn hdbscan umap-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import umap
import hdbscan
import warnings
# Silence all warnings so cell output stays readable
warnings.filterwarnings('ignore')

# Plot appearance: whitegrid theme, viridis palette, large default canvas
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({'figure.figsize': (16, 10)})

## 2. Load and Explore the Dataset

In [None]:
# Read the raw CSV (adjust the path if the file lives elsewhere)
data = pd.read_csv("diabetes_dataset.csv")

# The literal string "None" comes back from read_csv as NaN; restore it so it
# stays a real Alcohol_Consumption category rather than a missing value
alcohol_filled = data['Alcohol_Consumption'].fillna('None')
data['Alcohol_Consumption'] = alcohol_filled

# Preview the first rows
data.head()

In [None]:
# Size of the table and the names of its columns
print(f"Dataset shape: {data.shape}")
print("\nColumns:", data.columns.tolist(), sep="\n")

# Number of missing entries per column
print("\nMissing values:", data.isna().sum(), sep="\n")

In [None]:
# Print the DataFrame summary (column dtypes and non-null counts)
data.info()

In [None]:
# Descriptive statistics for the numerical columns, shown as the cell output
data.describe()

In [None]:
# Categorical variables whose value distributions are inspected below
categorical_cols = [
    'Sex',
    'Ethnicity',
    'Physical_Activity_Level',
    'Alcohol_Consumption',
    'Smoking_Status',
]

# Absolute counts followed by percentage shares for every categorical column
for col in categorical_cols:
    counts = data[col].value_counts()
    shares = data[col].value_counts(normalize=True) * 100
    print(f"\nDistribution of {col}:")
    print(counts)
    print(f"Percentage:\n{shares}")

# Sanity check: 'None' must show up as an ordinary Alcohol_Consumption category
print("\nSpecific check for Alcohol_Consumption values:")
alcohol_values = data['Alcohol_Consumption'].unique()
print(alcohol_values)
print("\nVerifying 'None' is treated as a valid category (not missing):")
none_count = (data['Alcohol_Consumption'] == 'None').sum()
print(f"Count of 'None' values: {none_count}")

## 3. Data Preprocessing

In [None]:
# Data preprocessing
# Remove a leftover CSV index column if one was read in (only the first match)
for index_col in ('', 'Unnamed: 0'):
    if index_col in data.columns:
        data = data.drop(index_col, axis=1)
        break

# Split the columns into categorical, binary and (everything else) numerical
categorical_cols = [
    'Sex',
    'Ethnicity',
    'Physical_Activity_Level',
    'Alcohol_Consumption',
    'Smoking_Status',
]
binary_cols = ['Family_History_of_Diabetes', 'Previous_Gestational_Diabetes']
non_numerical = set(categorical_cols) | set(binary_cols)
numerical_cols = [col for col in data.columns if col not in non_numerical]

print("Numerical features:", numerical_cols)
print("\nCategorical features:", categorical_cols)
print("\nBinary features:", binary_cols)

In [None]:
from sklearn.impute import SimpleImputer

# Numerical features: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical and binary features: fill gaps with the most frequent value, then
# one-hot encode every category ('None' in Alcohol_Consumption is an ordinary
# category here, not a missing value)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 forces a dense array regardless of how sparse the one-hot
# block is; the estimators applied later are fed this matrix directly and not
# all of them accept sparse input
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols + binary_cols)
    ],
    sparse_threshold=0
)

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Apply preprocessing
X_processed = preprocessing_pipeline.fit_transform(data)

print(f"Shape of processed data: {X_processed.shape}")

## 4. Define Utility Functions for Clustering and Visualization

In [None]:
def evaluate_clustering(X, labels, method_name):
    """Score a clustering with the silhouette coefficient and print the result.

    Parameters
    ----------
    X : array-like
        Feature matrix the labels were computed on.
    labels : array-like
        Cluster label for each sample. Every distinct value counts as a
        cluster (NOTE(review): this presumably includes a noise label such as
        -1 when the estimator emits one — confirm that is intended).
    method_name : str
        Name used in the printed message.

    Returns
    -------
    float
        The silhouette score, or -1 when it is undefined for these labels.
    """
    n_labels = len(np.unique(labels))
    n_samples = len(labels)

    # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels
    if n_labels < 2:
        print(f"{method_name} - Only one cluster found, cannot calculate silhouette score")
        return -1
    if n_labels >= n_samples:
        print(f"{method_name} - Every sample is its own cluster, cannot calculate silhouette score")
        return -1

    silhouette = silhouette_score(X, labels)
    print(f"{method_name} - Silhouette Score: {silhouette:.4f}")
    return silhouette

def plot_clusters_2d(X_2d, labels, method_name, title):
    """Draw a 2D scatter of the embedding, coloured by cluster label.

    The first two columns of ``X_2d`` are used as coordinates. Returns the
    ``plt`` module so the caller can call ``.show()`` on the result.
    """
    fig, ax = plt.subplots(figsize=(14, 10))
    points = ax.scatter(
        X_2d[:, 0],
        X_2d[:, 1],
        c=labels,
        cmap='viridis',
        alpha=0.7,
        s=40,
        edgecolors='w',
        linewidth=0.5,
    )

    fig.colorbar(points, ax=ax, label='Cluster')
    ax.set_title(f"{title}\n{method_name}", fontsize=16)
    ax.set_xlabel('Dimension 1', fontsize=12)
    ax.set_ylabel('Dimension 2', fontsize=12)
    fig.tight_layout()
    return plt

def run_dimensionality_reduction(X, methods):
    """Fit every reducer in ``methods`` on ``X`` and collect the embeddings.

    ``methods`` maps a display name to an object with ``fit_transform``; the
    returned dict maps the same names, in the same order, to the outputs.
    """
    embeddings = {}

    for label in methods:
        print(f"Applying {label}...")
        embeddings[label] = methods[label].fit_transform(X)

    return embeddings

def run_clustering(X, methods):
    """Fit every clusterer in ``methods`` on ``X`` and summarise each result.

    ``methods`` maps a display name to an object with ``fit_predict``. For each
    name the returned dict holds the labels, the score reported by
    ``evaluate_clustering`` and the number of distinct label values.
    """
    outcomes = {}

    for label, estimator in methods.items():
        print(f"Applying {label}...")
        assignments = estimator.fit_predict(X)
        score = evaluate_clustering(X, assignments, label)
        outcomes[label] = dict(
            labels=assignments,
            silhouette=score,
            n_clusters=len(np.unique(assignments)),
        )

    return outcomes

## 5. Dimensionality Reduction

In [None]:
# Define dimensionality reduction methods (each projects the data to 2D)
dim_reduction_methods = {
    'PCA n_components=2': PCA(n_components=2),
    'UMAP n_components=2, n_neighbors=15, min_dist=0.1': umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42),
    # t-SNE runs 1000 iterations by default. The explicit `n_iter` keyword is not
    # passed because scikit-learn 1.5 renamed it to `max_iter` and later removed
    # it; relying on the default keeps this cell working on old and new releases.
    't-SNE n_components=2, perplexity=30, n_iter=1000': TSNE(n_components=2, perplexity=30, random_state=42)
}

# Run dimensionality reduction
reduced_data = run_dimensionality_reduction(X_processed, dim_reduction_methods)

In [None]:
# Side-by-side comparison of all dimensionality reduction techniques.
# One panel per embedding; squeeze=False keeps `axes` two-dimensional even when
# only a single technique is present.
n_panels = len(reduced_data)
fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 8), squeeze=False)

for ax, (dr_name, X_reduced) in zip(axes[0], reduced_data.items()):
    # Points are drawn uncoloured: no cluster labels exist at this stage, so a
    # colorbar would carry no information
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], alpha=0.6, s=0.5)
    ax.set_title(f"{dr_name}", fontsize=16)
    ax.set_xlabel('X', fontsize=14)
    ax.set_ylabel('Y', fontsize=14)

fig.suptitle("Comparison of Dimensionality Reduction Techniques ", fontsize=20)
fig.tight_layout()
fig.subplots_adjust(top=0.85)  # leave room for the suptitle

# Save the comparison image if needed (do this before plt.show())
# fig.savefig('dim_reduction_comparison.png', dpi=300, bbox_inches='tight')
# print("Comparison image saved as 'dim_reduction_comparison.png'")

plt.show()

## 6. Clustering Analysis

In [None]:
# Candidate clusterers: one hierarchical, one density-based ...
clustering_methods = dict([
    ('Agglomerative (k=5)', AgglomerativeClustering(n_clusters=5)),
    ('HDBSCAN', hdbscan.HDBSCAN(min_cluster_size=50, min_samples=5)),
])
# ... plus K-Means for k = 2..5
for k in (2, 3, 4, 5):
    kmeans_label = f'K-Means (k={k})'
    clustering_methods[kmeans_label] = KMeans(n_clusters=k, random_state=42)

# Fit every candidate on the full preprocessed (high-dimensional) matrix
clustering_results = run_clustering(X_processed, clustering_methods)

In [None]:
# Pick the method with the highest silhouette score (first one wins on ties)
best_method = max(
    clustering_results,
    key=lambda name: clustering_results[name]['silhouette'],
)
best_labels = clustering_results[best_method]['labels']
print(f"\nBest clustering method: {best_method} with silhouette score: {clustering_results[best_method]['silhouette']:.4f}")

## 7. Visualize Clusters with Different Dimensionality Reduction Techniques

In [None]:
# Show the winning clustering on top of each 2D embedding
for dr_name, X_reduced in reduced_data.items():
    figure_title = f"Diabetes Dataset Clusters Visualized with {dr_name}"
    plot_clusters_2d(X_reduced, best_labels, f"{best_method}", figure_title).show()

## 8. Analyze Cluster Characteristics

In [None]:
# Add cluster labels to the original data
data['Cluster'] = best_labels

# Per-cluster mean of every numeric column. numeric_only=True is required
# because the frame also holds string-valued categorical columns, which
# pandas >= 2.0 refuses to average (TypeError) instead of silently skipping.
cluster_summary = data.groupby('Cluster').mean(numeric_only=True)
print("Cluster Characteristics (mean values):")
cluster_summary

In [None]:
# Function to visualize feature distributions by cluster
def plot_feature_distributions(data, feature_list, n_cols=3):
    """Draw one boxplot per feature, grouped by the 'Cluster' column.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'Cluster' column and every column in ``feature_list``.
    feature_list : list of str
        Features to plot, one subplot each.
    n_cols : int, default 3
        Number of subplot columns; rows are added as needed.

    Returns
    -------
    module
        The ``plt`` module, so callers can call ``.show()`` on the result.

    Raises
    ------
    ValueError
        If ``feature_list`` is empty.
    """
    n_features = len(feature_list)
    if n_features == 0:
        raise ValueError("feature_list must contain at least one feature")
    n_rows = (n_features + n_cols - 1) // n_cols

    # squeeze=False always yields a 2D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, feature_list):
        sns.boxplot(x='Cluster', y=feature, data=data, ax=ax)
        ax.set_title(f'Distribution of {feature} by Cluster')

    # Hide unused subplots in the last row
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    return plt

In [None]:
# Features shown in the per-cluster boxplots
important_features = [
    'Age',
    'BMI',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Waist_Circumference',
    'Blood_Pressure_Systolic',
    'Cholesterol_Total',
    'Cholesterol_HDL',
    'Cholesterol_LDL',
]

# One boxplot per feature, split by cluster
plot_feature_distributions(data, important_features).show()

## 9. Feature Importance Analysis using PCA

In [None]:
# Fit a full PCA (all components) on the preprocessed matrix
pca = PCA().fit(X_processed)
variance_ratios = pca.explained_variance_ratio_

# Tabulate each component's share of the total variance
pca_df = pd.DataFrame({
    'Variance Explained': variance_ratios,
    'Principal Component': [f'PC{rank}' for rank in range(1, len(variance_ratios) + 1)]
})

# Bar chart of the first ten components
top_components = pca_df.head(10)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_components, x='Principal Component', y='Variance Explained')
plt.title('Explained Variance by Principal Components')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 10. Visualize Cluster Centers (for K-Means)

In [None]:
# Heatmap of the cluster centers for the numerical features.
# Only runs when K-Means won: the cell reads `cluster_centers_` from the
# already-fitted estimator stored in `clustering_methods`.
if 'K-Means' in best_method:
    kmeans = clustering_methods[best_method]
    cluster_centers = kmeans.cluster_centers_

    # Names of the processed-matrix columns, in ColumnTransformer output order
    feature_names = []
    for name, trans, cols in preprocessor.transformers_:
        if name == 'num':
            feature_names.extend(cols)
        elif name == 'cat':
            # `trans` is the fitted imputer -> one-hot Pipeline, so the learned
            # categories live on its 'onehot' step. The encoder is built without
            # `drop`, so every category produces a column.
            encoder = trans.named_steps['onehot']
            for col, cats in zip(cols, encoder.categories_):
                feature_names.extend(f"{col}_{cat}" for cat in cats)

    # The 'num' transformer comes first, so the leading columns of the centers
    # are exactly the (standardised) numerical features
    centers_df = pd.DataFrame(cluster_centers[:, :len(numerical_cols)],
                              columns=numerical_cols)

    # Normalize each feature across clusters for heatmap visualization
    centers_norm = (centers_df - centers_df.mean()) / centers_df.std()

    # Plot heatmap
    plt.figure(figsize=(14, 8))
    sns.heatmap(centers_norm, annot=True, cmap='coolwarm', linewidths=.5,
                yticklabels=[f'Cluster {i}' for i in range(centers_norm.shape[0])])
    plt.title('Normalized Cluster Centers for Numerical Features')
    plt.tight_layout()
    plt.show()

## 11. Distribution of Categorical Variables by Cluster

In [None]:
# Distribution of each categorical variable across clusters, as stacked bars
for cat_col in categorical_cols:
    # Row-normalised crosstab: percentage of each category within a cluster
    cross_tab = pd.crosstab(data['Cluster'], data[cat_col], normalize='index') * 100

    # Draw on an explicit Axes. DataFrame.plot() without `ax` opens its own
    # figure, which would leave an empty figure behind for every column.
    fig, ax = plt.subplots(figsize=(14, 6))
    cross_tab.plot(kind='bar', stacked=True, ax=ax, rot=0)
    ax.set_title(f'Distribution of {cat_col} by Cluster (%)')
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Percentage')
    ax.legend(title=cat_col)
    fig.tight_layout()
    plt.show()

## 12. Summary of Findings

In [None]:
# Headline numbers for the selected clustering
best_result = clustering_results[best_method]
print(f"Best clustering method: {best_method}")
print(f"Number of clusters: {best_result['n_clusters']}")
print(f"Silhouette score: {best_result['silhouette']:.4f}")

print("\nCluster sizes:")
print(data['Cluster'].value_counts().sort_index())

print("\nKey characteristics of each cluster:")
# Dataset-wide statistics are the same for every cluster, so compute them once
overall_means = data[numerical_cols].mean()
overall_stds = data[numerical_cols].std()

# For each cluster, list the five numerical features whose mean deviates most
# from the overall mean (measured in overall standard deviations)
for cluster in sorted(data['Cluster'].unique()):
    print(f"\nCluster {cluster}:")
    members = data.loc[data['Cluster'] == cluster, numerical_cols]
    cluster_means = members.mean()

    z_scores = (cluster_means - overall_means) / overall_stds

    ranked = z_scores.abs().sort_values(ascending=False)
    top_features = ranked.head(5).index.tolist()
    for feature in top_features:
        if z_scores[feature] > 0:
            direction = "higher"
        else:
            direction = "lower"
        print(f"  - {feature}: {direction} than average (z-score: {z_scores[feature]:.2f})")