In [None]:
import pandas as pd
import numpy as np
from kneed import KneeLocator
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings

import gc  
warnings.filterwarnings('ignore')

def load_and_sample_data(file_path='merged_weather_demand_final.csv', sample_size=5000, random_state=42):
    """
    Read the CSV at ``file_path`` and return a random subset of its rows.

    At most ``sample_size`` rows are drawn (every row when the file holds
    fewer), with ``random_state`` used as the sampling seed. Sampled rows
    keep the index they had in the full file.
    """
    print(f"Loading data from {file_path}...")

    full_df = pd.read_csv(file_path)
    print(f"Full dataset shape: {full_df.shape}")

    # Never ask for more rows than the file actually contains.
    n_rows = min(sample_size, len(full_df))
    sampled = full_df.sample(n=n_rows, random_state=random_state)

    print(f"Sample dataset shape: {sampled.shape}")
    return sampled

def prepare_features_for_clustering(df, features_to_use=None):
    """
    Select the clustering feature columns from ``df`` and fill missing values.

    ``features_to_use`` is the list of column names to keep; when it is None
    the default set ``['Demand (MW)', 'Hour', 'Weekend']`` is used. Requested
    columns that do not exist in ``df`` are reported with a warning and
    skipped. Missing values in numeric columns are replaced by that column's
    median; non-numeric columns are left as they are.

    Returns a new DataFrame (a copy) containing only the available features;
    ``df`` itself is not modified.
    """
    # Default features for clustering
    if features_to_use is None:
        features_to_use = [
            'Demand (MW)',  # Main target variable
            'Hour',
            'Weekend',
        ]

    available_features = [f for f in features_to_use if f in df.columns]

    if len(available_features) < len(features_to_use):
        missing = set(features_to_use) - set(available_features)
        print(f"Warning: Some requested features are not available: {missing}")

    # Selecting only the specified features
    df_features = df[available_features].copy()

    missing_values = df_features.isnull().sum()
    if missing_values.sum() > 0:
        print("Warning: Missing values detected:")
        print(missing_values[missing_values > 0])
        print("Filling missing values with median...")
        # numeric_only=True: a plain median() raises on string/object columns
        # in recent pandas; restricting it keeps the numeric fill working.
        df_features = df_features.fillna(df_features.median(numeric_only=True))

    print(f"Features selected for clustering: {available_features}")
    return df_features

def normalize_features(df_features):
    """
    Standardize every column of ``df_features`` with ``StandardScaler``.

    The result is a new DataFrame that carries the same column labels and
    index as the input.
    """
    print("Normalizing features...")

    # Fit the scaler and transform in a single pass.
    scaled_values = StandardScaler().fit_transform(df_features)

    return pd.DataFrame(
        scaled_values,
        index=df_features.index,
        columns=df_features.columns,
    )

def reduce_dimensions(df_scaled, n_components_pca=2):
    """
    Project the scaled features with PCA and embed a subsample with t-SNE.

    PCA is fitted on all rows of ``df_scaled`` with ``n_components_pca``
    components. t-SNE (2 components) is fitted on a random subsample of at
    most 1000 rows, drawn with a fixed seed, to limit memory use.

    Returns a tuple ``(pca_df, tsne_df, explained_variance)``:
    - ``pca_df``: columns ``PCA1..PCAk``, indexed like ``df_scaled``.
    - ``tsne_df``: columns ``tSNE1``/``tSNE2``, indexed by the subsampled rows
      only, so it may contain fewer rows than ``pca_df``.
    - ``explained_variance``: the PCA explained-variance ratios.
    """
    print("Performing dimensionality reduction...")

    # PCA
    pca = PCA(n_components=n_components_pca)
    pca_result = pca.fit_transform(df_scaled)

    # Name the columns from the actual output width, so a non-integer
    # n_components_pca accepted by PCA still yields correctly sized labels.
    pca_df = pd.DataFrame(
        data=pca_result,
        columns=[f'PCA{i+1}' for i in range(pca_result.shape[1])],
        index=df_scaled.index
    )

    # Calculate explained variance
    explained_variance = pca.explained_variance_ratio_
    print(f"PCA explained variance: {explained_variance}")
    print(f"Total variance explained: {sum(explained_variance):.2f}")

    # t-SNE
    print("Applying t-SNE on a subset of data to save memory...")
    # Subsample for t-SNE to reduce memory usage
    max_tsne_samples = min(1000, len(df_scaled))
    df_tsne = df_scaled.sample(max_tsne_samples, random_state=42)

    # Perplexity scales with the subsample size; clamp to >= 1 because the
    # integer division yields 0 for fewer than five rows.
    perplexity = max(1, min(30, len(df_tsne) // 5))

    # The iteration count is left at the library default: the original passed
    # n_iter=1000, which equals the default, and that keyword was renamed to
    # max_iter in newer scikit-learn releases.
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=perplexity
    )
    tsne_result = tsne.fit_transform(df_tsne)

    # Creating a DataFrame for t-SNE results
    tsne_df = pd.DataFrame(
        data=tsne_result,
        columns=['tSNE1', 'tSNE2'],
        index=df_tsne.index
    )

    # Clean up to free memory
    del tsne
    gc.collect()

    return pca_df, tsne_df, explained_variance

def kmeans_clustering(df_scaled, max_k=8):
    """
    Run K-Means for k = 2..max_k and keep the k with the best silhouette score.

    For every k the inertia and silhouette score are printed, and both curves
    are saved to ``kmeans_elbow.png``. The upper bound is clamped to
    ``len(df_scaled) - 1`` because the silhouette score is undefined when
    every sample is its own cluster.

    Returns ``(labels, optimal_k)``: the labels of the best-scoring fit and
    its cluster count. On a tie the smallest k wins.

    Raises ValueError when no k >= 2 can be evaluated.
    """
    print("Performing K-Means clustering...")

    upper_k = min(max_k, len(df_scaled) - 1)
    if upper_k < 2:
        raise ValueError(
            f"K-Means model selection needs max_k >= 2 and at least 3 samples "
            f"(got max_k={max_k}, n_samples={len(df_scaled)})"
        )
    k_values = range(2, upper_k + 1)

    # Determine optimal K using elbow method and silhouette scores
    inertia = []
    silhouette_scores = []
    best_score = -np.inf
    optimal_k = None
    best_labels = None

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(df_scaled)
        score = silhouette_score(df_scaled, labels)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(score)
        print(f"K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette Score={score:.4f}")

        # Keep the winning labels from the sweep. The fit is seeded, so
        # refitting the best k afterwards would only repeat this work.
        if score > best_score:
            best_score = score
            optimal_k = k
            best_labels = labels

        # Clean up memory after each iteration
        del kmeans
        gc.collect()

    # Plotting elbow curve
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(k_values, inertia, marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(k_values, silhouette_scores, marker='o')
    plt.title('Silhouette Score Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('kmeans_elbow.png', dpi=100)
    plt.close()  # Close to free memory

    print(f"Optimal number of clusters based on silhouette score: {optimal_k}")

    return best_labels, optimal_k

def dbscan_clustering(df_scaled):
    """
    Run DBSCAN over a small parameter grid and return the best labelling.

    Steps:
    1. Build a k-distance curve (k=5) on up to 1000 sampled rows and save it
       to ``dbscan_k_distance_plot.png``.
    2. Take the knee of that curve as the eps candidate; if no usable knee is
       found, fall back to the 90th percentile of the k-distances.
    3. Try eps in [0.8, 1.0, 1.2] x candidate and three ``min_samples`` values
       starting from twice the number of features.
    4. Among runs with 2-10 clusters and under 30% noise, keep the one with
       the highest ``silhouette * (1 - noise_ratio)``.

    Returns the label array of the best run (-1 marks noise). If no run
    qualifies, returns the labels from a single run with the eps candidate
    and the base ``min_samples``.
    """
    print("Performing DBSCAN clustering with optimized parameters...")

    # Calculate k-distance plot to determine optimal eps
    def calculate_k_distance(df, k=5, sample_size=1000):
        """Return the distances to the k-th nearest neighbour, sorted descending."""
        from sklearn.neighbors import NearestNeighbors

        if len(df) > sample_size:
            df_sample = df.sample(sample_size, random_state=42)
        else:
            df_sample = df

        # k+1 neighbours are requested and column k is read; presumably this
        # is because each query point is returned as its own nearest
        # neighbour in column 0 - confirm against the NearestNeighbors docs.
        nbrs = NearestNeighbors(n_neighbors=k+1).fit(df_sample)
        distances, _ = nbrs.kneighbors(df_sample)
        return np.sort(distances[:, k])[::-1]

    k_distances = calculate_k_distance(df_scaled, k=5)

    plt.figure(figsize=(10, 6))
    plt.plot(np.arange(len(k_distances)), k_distances, 'b-')
    plt.title('k-Distance Graph (k=5)')
    plt.xlabel('Points sorted by distance to 5th nearest neighbor')
    plt.ylabel('5th nearest neighbor distance')
    plt.grid(True)
    plt.savefig('dbscan_k_distance_plot.png', dpi=100)
    plt.close()

    # Automatically determine eps from the k-distance plot
    from kneed import KneeLocator

    try:
        kneedle = KneeLocator(np.arange(len(k_distances)), k_distances, curve='convex', direction='decreasing')
        eps_candidate = kneedle.knee_y
    except Exception as exc:
        print(f"Knee detection failed ({type(exc).__name__}: {exc})")
        eps_candidate = None

    # A missing knee is reported as None rather than as an exception, and
    # DBSCAN needs a strictly positive eps, so both cases are checked here.
    if eps_candidate is None or not np.isfinite(eps_candidate) or eps_candidate <= 0:
        eps_candidate = np.percentile(k_distances, 90)
        print(f"Using fallback eps value: {eps_candidate:.4f}")
    else:
        print(f"Automatically detected eps candidate: {eps_candidate:.4f}")

    # Set min_samples based on dimensionality
    min_samples = 2 * df_scaled.shape[1]
    print(f"Using min_samples: {min_samples}")

    # Try multiple parameter combinations
    eps_values = np.linspace(eps_candidate * 0.8, eps_candidate * 1.2, 3)
    min_samples_values = [min_samples, min_samples + 5, min_samples + 10]

    best_score = -1
    best_params = None
    best_labels = None

    for eps in eps_values:
        # A separate loop name keeps the base min_samples intact for the
        # fallback run below.
        for candidate_min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=candidate_min_samples)
            labels = dbscan.fit_predict(df_scaled)

            non_noise_mask = labels != -1
            n_clusters = len(np.unique(labels[non_noise_mask]))
            noise_ratio = 1 - non_noise_mask.sum() / len(labels)

            if not (2 <= n_clusters <= 10 and noise_ratio < 0.3):
                continue
            if non_noise_mask.sum() <= 1:
                continue

            sil_score = silhouette_score(df_scaled[non_noise_mask], labels[non_noise_mask])
            print(f"eps={eps:.3f}, min_samples={candidate_min_samples}: clusters={n_clusters}, "
                  f"noise={noise_ratio:.2%}, silhouette={sil_score:.4f}")

            # Score combines silhouette and low noise preference
            score = sil_score * (1 - noise_ratio)
            if score > best_score:
                best_score = score
                best_params = (eps, candidate_min_samples)
                best_labels = labels

    if best_params is not None:
        n_noise = int(np.count_nonzero(best_labels == -1))
        n_found = len(set(best_labels)) - (1 if n_noise else 0)
        print(f"\nBest DBSCAN parameters: eps={best_params[0]:.3f}, min_samples={best_params[1]}")
        print(f"Number of clusters: {n_found}")
        print(f"Noise points: {n_noise}/{len(best_labels)} "
              f"({n_noise/len(best_labels):.2%})")
        return best_labels

    print("\nNo optimal parameters found, using fallback parameters")
    dbscan = DBSCAN(eps=eps_candidate, min_samples=min_samples)
    return dbscan.fit_predict(df_scaled)

def hierarchical_clustering(df_scaled, sample_size=500, n_clusters=4, random_state=42):
    """
    Plot a Ward dendrogram on a row sample, then cluster the full dataset.

    The dendrogram is built from at most ``sample_size`` randomly chosen rows
    and saved to ``hierarchical_dendrogram.png``. ``AgglomerativeClustering``
    is then fitted on all rows of ``df_scaled`` with ``n_clusters`` clusters.

    ``random_state`` seeds the dendrogram sample so repeated runs draw the
    same figure; pass None for an unseeded sample. It does not affect the
    returned labels.

    Returns ``(labels, n_clusters)``.
    """
    print("Performing hierarchical clustering...")

    # Take a smaller sample for the dendrogram to save memory
    sample_size = min(sample_size, len(df_scaled))
    rng = np.random.RandomState(random_state)
    sample_indices = rng.choice(len(df_scaled), size=sample_size, replace=False)
    df_sample = df_scaled.iloc[sample_indices]

    # Compute the linkage matrix
    Z = linkage(df_sample, method='ward')

    # Plot dendrogram
    plt.figure(figsize=(12, 7))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Data points')
    plt.ylabel('Distance')

    dendrogram(
        Z,
        truncate_mode='lastp',
        p=30,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True
    )

    # Fixed visual reference line; it is not derived from n_clusters.
    plt.axhline(y=15, c='k', linestyle='--', alpha=0.5)
    plt.savefig('hierarchical_dendrogram.png', dpi=100)
    plt.close()

    print(f"Applying Agglomerative Clustering with {n_clusters} clusters to full dataset...")
    agg = AgglomerativeClustering(n_clusters=n_clusters)
    labels = agg.fit_predict(df_scaled)

    # Clean up
    del Z, agg
    gc.collect()

    return labels, n_clusters

def _cluster_sort_key(raw_label):
    """Order clusters numerically by raw label, with the noise label (-1) last."""
    return (raw_label == -1, raw_label)


def _build_viz_frame(coords_df, label_series, label_map, axis_names):
    """Pair the first two coordinate columns with the cluster label of each labelled row."""
    has_label = coords_df.index.isin(label_series.index)
    viz = coords_df.loc[has_label].iloc[:, :2].copy()
    viz.columns = axis_names
    raw = label_series.reindex(viz.index).to_numpy()
    viz['Cluster'] = [label_map[lbl] for lbl in raw]
    viz['label_raw'] = raw
    return viz


def _scatter_by_cluster(viz_df, x_col, y_col, color_map, label_map):
    """Draw one scatter series per cluster on the current matplotlib axes."""
    for raw_label in sorted(viz_df['label_raw'].unique(), key=_cluster_sort_key):
        mask = viz_df['label_raw'] == raw_label
        plt.scatter(
            viz_df.loc[mask, x_col],
            viz_df.loc[mask, y_col],
            s=30, alpha=0.7,
            label=label_map[raw_label],
            color=color_map.get(raw_label, (0, 0, 0, 1))
        )


def visualize_clusters(df_original, pca_df, tsne_df, labels, method_name):
    """
    Plot the clusters in PCA and t-SNE space and attach labels to the data.

    ``labels`` is expected to be in the same row order as ``df_original``;
    a label of -1 is drawn in grey and named 'Noise', every other label ``x``
    is named ``'Cluster {x+1}'``. Rows of ``pca_df`` / ``tsne_df`` are matched
    to labels through their index, so ``tsne_df`` may cover only a subset of
    the rows.

    Files written:
    - ``{method_name}_clusters.png``: PCA scatter and, when t-SNE rows are
      available, a t-SNE scatter.
    - ``{method_name}_features.png``: box plots of 'Demand (MW)',
      'Temperature (F)' and 'Hour' per cluster, for whichever of those
      columns exist in ``df_original``.

    Returns a copy of ``df_original`` with an added 'Cluster' column holding
    the display names.
    """
    print(f"Visualizing {method_name} clusters...")

    labels_arr = np.asarray(labels)
    n_labelled = min(len(labels_arr), len(df_original))

    # Key colours and names by the labels actually present, so the mapping
    # does not depend on labels being exactly 0..n-1.
    unique_labels = np.unique(labels_arr)
    cluster_labels = [lbl for lbl in unique_labels if lbl != -1]
    colors = plt.cm.tab20(np.linspace(0, 1, max(len(cluster_labels), 1)))
    color_map = {lbl: colors[pos] for pos, lbl in enumerate(cluster_labels)}
    label_map = {lbl: f'Cluster {lbl+1}' for lbl in cluster_labels}
    if -1 in unique_labels:  # DBSCAN case
        color_map[-1] = (0.7, 0.7, 0.7, 1)
        label_map[-1] = 'Noise'
    ordered_labels = sorted(label_map, key=_cluster_sort_key)

    # Index -> raw label lookup; on a duplicated index the last label wins.
    label_series = pd.Series(labels_arr[:n_labelled], index=df_original.index[:n_labelled])
    label_series = label_series[~label_series.index.duplicated(keep='last')]

    pca_for_viz = _build_viz_frame(pca_df, label_series, label_map, ['PCA1', 'PCA2'])

    tsne_for_viz = None
    if len(tsne_df) > 0:
        tsne_candidate = _build_viz_frame(tsne_df, label_series, label_map, ['tSNE1', 'tSNE2'])
        if len(tsne_candidate) > 0:
            tsne_for_viz = tsne_candidate

    # Plot PCA and t-SNE results
    plt.figure(figsize=(12, 10))

    # PCA plot
    plt.subplot(2, 1, 1)
    _scatter_by_cluster(pca_for_viz, 'PCA1', 'PCA2', color_map, label_map)
    plt.title(f'{method_name}: Clusters in PCA Space')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.grid(True)

    # t-SNE plot if available
    if tsne_for_viz is not None:
        plt.subplot(2, 1, 2)
        _scatter_by_cluster(tsne_for_viz, 'tSNE1', 'tSNE2', color_map, label_map)
        plt.title(f'{method_name}: Clusters in t-SNE Space')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'{method_name}_clusters.png', dpi=100)
    plt.close()

    # Adding cluster labels to original data; rows beyond the label array
    # (if any) are left without a cluster.
    df_with_clusters = df_original.copy()
    cluster_names = [label_map[lbl] for lbl in labels_arr[:n_labelled]]
    df_with_clusters['Cluster'] = cluster_names + [np.nan] * (len(df_original) - n_labelled)

    # Visualizing key features by cluster
    key_features = ['Demand (MW)', 'Temperature (F)', 'Hour']
    key_features = [f for f in key_features if f in df_with_clusters.columns]

    if key_features:
        box_order = [label_map[lbl] for lbl in ordered_labels]
        plt.figure(figsize=(15, 5 * len(key_features)))
        for i, feature in enumerate(key_features):
            plt.subplot(len(key_features), 1, i+1)
            sns.boxplot(x='Cluster', y=feature, data=df_with_clusters, order=box_order)
            plt.title(f'{feature} Distribution by Cluster')
            plt.grid(True)

        plt.tight_layout()
        plt.savefig(f'{method_name}_features.png', dpi=100)
        plt.close()

    return df_with_clusters

def interpret_clusters(df_with_clusters):
    """
    Print per-cluster statistics and build a short text summary per cluster.

    ``df_with_clusters`` must have a 'Cluster' column. Any of 'Demand (MW)',
    'Temperature (F)', 'Humidity', 'Hour', 'Weekday' and 'Month' that are
    present are summarised (mean / median / std per cluster). Rows whose
    'Cluster' value is missing are ignored when building the text summaries.

    Each summary contains the cluster size plus, where the column exists:
    - a demand level relative to the overall mean demand,
    - a temperature band,
    - the hours that account for more than 10% of the cluster's rows,
    - whether the cluster is mostly weekdays or weekends.

    Returns a dict mapping each cluster value to its summary string.
    """
    print("Interpreting clusters...")

    features_to_interpret = [
        'Demand (MW)', 'Temperature (F)', 'Humidity',
        'Hour', 'Weekday', 'Month'
    ]
    features_to_interpret = [f for f in features_to_interpret if f in df_with_clusters.columns]

    if features_to_interpret:
        cluster_stats = df_with_clusters.groupby('Cluster')[features_to_interpret].agg(['mean', 'median', 'std'])
        print("\nCluster Statistics:")
        print(cluster_stats)

    cluster_interpretations = {}

    # The overall mean is only needed (and only computable) when the demand
    # column exists; the per-cluster code below applies the same guard.
    has_demand = 'Demand (MW)' in df_with_clusters.columns
    overall_avg_demand = df_with_clusters['Demand (MW)'].mean() if has_demand else None

    # dropna(): a missing label would select zero rows and divide by zero below.
    for cluster in df_with_clusters['Cluster'].dropna().unique():
        cluster_data = df_with_clusters[df_with_clusters['Cluster'] == cluster]

        # Display names such as 'Cluster 1' or 'Noise' are used verbatim;
        # only bare (e.g. numeric) labels get the 'Cluster ' prefix.
        header = cluster if isinstance(cluster, str) else f"Cluster {cluster}"
        interpretation = f"{header}:\n"
        interpretation += f"- Size: {len(cluster_data)} points ({len(cluster_data)/len(df_with_clusters):.1%})\n"

        # Demand level relative to the overall mean
        if has_demand:
            avg_demand = cluster_data['Demand (MW)'].mean()
            if avg_demand > overall_avg_demand * 1.5:
                interpretation += "- Peak demand periods\n"
            elif avg_demand > overall_avg_demand * 1.2:
                interpretation += "- High demand periods\n"
            elif avg_demand < overall_avg_demand * 0.7:
                interpretation += "- Low demand periods\n"
            elif avg_demand < overall_avg_demand * 0.9:
                interpretation += "- Below average demand\n"
            else:
                interpretation += "- Typical demand periods\n"

        # Temperature interpretation
        if 'Temperature (F)' in cluster_data.columns:
            avg_temp = cluster_data['Temperature (F)'].mean()
            if avg_temp > 80:
                interpretation += "- Hot weather conditions\n"
            elif avg_temp > 65:
                interpretation += "- Warm weather conditions\n"
            elif avg_temp < 45:
                interpretation += "- Cold weather conditions\n"
            else:
                interpretation += "- Moderate temperatures\n"

        # Time interpretation: hours holding more than 10% of the cluster
        if 'Hour' in cluster_data.columns:
            hour_counts = cluster_data['Hour'].value_counts(normalize=True)
            peak_hours = hour_counts[hour_counts > 0.1].index.tolist()
            if peak_hours:
                interpretation += f"- Peak hours: {sorted(peak_hours)}\n"

        # Day type interpretation
        # NOTE(review): treats Weekday values below 5 as weekdays, presumably
        # a 0=Monday..6=Sunday encoding - confirm against the dataset.
        if 'Weekday' in cluster_data.columns:
            weekday_ratio = (cluster_data['Weekday'] < 5).sum() / len(cluster_data)
            if weekday_ratio > 0.7:
                interpretation += "- Primarily weekdays\n"
            elif weekday_ratio < 0.3:
                interpretation += "- Primarily weekends\n"
            else:
                interpretation += "- Mixed weekdays/weekends\n"

        cluster_interpretations[cluster] = interpretation

    print("\nCluster Interpretations:")
    for interpretation in cluster_interpretations.values():
        print(interpretation)
        print("---")

    return cluster_interpretations

def run_clustering_analysis(file_path='merged_weather_demand_final.csv', sample_size=5000, features_to_use=None):
    """
    Run the entire clustering analysis pipeline.

    Loads and samples the CSV, prepares and scales the features, computes
    PCA / t-SNE projections, runs K-Means, DBSCAN and hierarchical
    clustering, saves the plots, prints cluster interpretations and reports
    a silhouette score per method.

    Returns a dict with the keys 'kmeans', 'dbscan' and 'hierarchical'. Each
    entry holds 'labels', 'silhouette', 'df_with_clusters' and
    'interpretations' (plus 'optimal_k' / 'n_clusters' where applicable).
    A 'silhouette' value is a float, or an "N/A (...)" string when the score
    is undefined for that labelling.

    Any exception raised along the way is printed and None is returned.
    """
    def calculate_silhouette(labels, data):
        """Silhouette over non-noise points, or an 'N/A' string if undefined."""
        labels = np.asarray(labels)
        non_noise = labels != -1
        if non_noise.sum() <= 1:
            return "N/A (too much noise)"
        kept_labels = labels[non_noise]
        n_distinct = len(np.unique(kept_labels))
        # silhouette_score raises unless 2 <= n_clusters <= n_samples - 1;
        # report that case instead of discarding the whole analysis.
        if n_distinct < 2 or n_distinct >= len(kept_labels):
            return "N/A (fewer than 2 usable clusters)"
        if non_noise.all():
            return silhouette_score(data, labels)
        return silhouette_score(data[non_noise], kept_labels)

    def format_score(score):
        """Render a numeric score with four decimals; pass 'N/A' strings through."""
        return score if isinstance(score, str) else f"{score:.4f}"

    try:
        # 1. Loading and sampling data
        df_sample = load_and_sample_data(file_path, sample_size)

        # 2. Selecting and preparing features
        df_features = prepare_features_for_clustering(df_sample, features_to_use)

        # 3. Normalizing features
        df_scaled = normalize_features(df_features)

        # 4. Dimensionality reduction
        pca_df, tsne_df, explained_variance = reduce_dimensions(df_scaled)

        # 5-7. Performing clustering
        kmeans_labels, optimal_k = kmeans_clustering(df_scaled)
        dbscan_labels = dbscan_clustering(df_scaled)
        hierarchical_labels, n_hierarchical_clusters = hierarchical_clustering(df_scaled)

        # 8. Visualizing clusters
        kmeans_with_clusters = visualize_clusters(df_sample, pca_df, tsne_df, kmeans_labels, 'K-Means')
        dbscan_with_clusters = visualize_clusters(df_sample, pca_df, tsne_df, dbscan_labels, 'DBSCAN')
        hierarchical_with_clusters = visualize_clusters(df_sample, pca_df, tsne_df, hierarchical_labels, 'Hierarchical')

        # 9. Interpretation of clusters
        kmeans_interpretations = interpret_clusters(kmeans_with_clusters)
        dbscan_interpretations = interpret_clusters(dbscan_with_clusters)
        hierarchical_interpretations = interpret_clusters(hierarchical_with_clusters)

        # 10. Evaluating clustering (same guarded helper for every method)
        kmeans_silhouette = calculate_silhouette(kmeans_labels, df_scaled)
        dbscan_silhouette = calculate_silhouette(dbscan_labels, df_scaled)
        hierarchical_silhouette = calculate_silhouette(hierarchical_labels, df_scaled)

        print("\nClustering Evaluation:")
        print(f"K-Means Silhouette Score: {format_score(kmeans_silhouette)}")
        print(f"DBSCAN Silhouette Score: {dbscan_silhouette}")
        print(f"Hierarchical Silhouette Score: {format_score(hierarchical_silhouette)}")

        print("\nClustering analysis complete!")
        print("Visualizations saved as PNG files.")

        return {
            'kmeans': {
                'labels': kmeans_labels,
                'optimal_k': optimal_k,
                'silhouette': kmeans_silhouette,
                'df_with_clusters': kmeans_with_clusters,
                'interpretations': kmeans_interpretations
            },
            'dbscan': {
                'labels': dbscan_labels,
                'silhouette': dbscan_silhouette,
                'df_with_clusters': dbscan_with_clusters,
                'interpretations': dbscan_interpretations
            },
            'hierarchical': {
                'labels': hierarchical_labels,
                'n_clusters': n_hierarchical_clusters,
                'silhouette': hierarchical_silhouette,
                'df_with_clusters': hierarchical_with_clusters,
                'interpretations': hierarchical_interpretations
            }
        }
    except MemoryError as e:
        print("\n*** MEMORY ERROR ***")
        print("Try reducing sample_size or using fewer features")
        print(f"Error: {e}")
        return None
    except Exception as e:
        # Top-level boundary of the pipeline: report the failure and signal
        # it to the caller with None instead of propagating.
        print(f"\n*** ERROR: {type(e).__name__} ***")
        print(f"Details: {e}")
        return None

if __name__ == "__main__":
    # Columns handed to the clustering pipeline.
    clustering_features = ['Demand (MW)', 'Hour', 'Weekend']

    # Run the full analysis on a 16000-row sample of the merged dataset.
    results = run_clustering_analysis(
        'merged_weather_demand_final.csv',
        sample_size=16000,
        features_to_use=clustering_features,
    )

Loading data from merged_weather_demand_final.csv...
Full dataset shape: (165192, 21)
Sample dataset shape: (3000, 21)
Features selected for clustering: ['Demand (MW)', 'Hour', 'Weekend']
Normalizing features...
Performing dimensionality reduction...
PCA explained variance: [0.3454848  0.33229899]
Total variance explained: 0.68
Applying t-SNE on a subset of data to save memory...
Performing K-Means clustering...
K=2: Inertia=5996.37, Silhouette Score=0.4023
K=3: Inertia=4265.49, Silhouette Score=0.4261
K=4: Inertia=2967.05, Silhouette Score=0.4204
K=5: Inertia=2375.27, Silhouette Score=0.4379
K=6: Inertia=1884.38, Silhouette Score=0.4661
K=7: Inertia=1573.56, Silhouette Score=0.4625
K=8: Inertia=1327.84, Silhouette Score=0.4191
Optimal number of clusters based on silhouette score: 6
Performing DBSCAN clustering...
eps=0.040, min_samples=5: clusters=159, noise=46.73%, silhouette=0.7974
eps=0.060, min_samples=5: clusters=174, noise=34.60%, silhouette=0.6762
Best DBSCAN parameters: eps=0.