<a href="https://colab.research.google.com/github/thakurakanksha288/AI-ASSIGNMENT-PYTHON-/blob/main/demo_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score,
    silhouette_samples,
    davies_bouldin_score,
    calinski_harabasz_score
)

from scipy import stats
import glob, os, warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence / deprecation
# warnings, which can mask real problems in the clustering cells below —
# confirm that a blanket filter is intended.
warnings.filterwarnings("ignore")

# Apply the "seaborn-v0_8-darkgrid" style sheet to all figures drawn below.
# NOTE(review): the "-v0_8" style names presumably require a recent matplotlib
# release — confirm against the environment this notebook is run in.
plt.style.use("seaborn-v0_8-darkgrid")

In [3]:
def load_data(data_path):
    """Read every ``block_*.csv`` file in ``data_path`` into one DataFrame.

    Parameters
    ----------
    data_path : str or path-like
        Directory that directly contains the ``block_*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matched files with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file in ``data_path`` matches ``block_*.csv``.
    """
    # glob returns paths in arbitrary, filesystem-dependent order. Sorting
    # (lexicographically, so "block_10" precedes "block_2") makes the row order
    # of the combined frame reproducible from run to run.
    block_files = sorted(glob.glob(os.path.join(data_path, "block_*.csv")))
    if not block_files:
        raise FileNotFoundError("No block_*.csv files found")

    return pd.concat(
        [pd.read_csv(path) for path in block_files],
        ignore_index=True,
    )

In [4]:
def engineer_features(df):
    """Summarise each meter's energy readings into one row of statistics.

    The id column is the first column whose lower-cased name is one of
    ``lclid``, ``building_id``, ``id`` or ``meter_id``. The energy column is the
    first column whose name matches the regex ``energy|kwh|consumption``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format readings: one row per reading, with an id column and an
        energy column as described above.

    Returns
    -------
    pandas.DataFrame
        One row per id that has at least 10 rows in ``df``, in order of first
        appearance, with columns ``building_id``, ``mean_consumption``,
        ``max_demand``, ``std_consumption``, ``peak_to_avg_ratio``,
        ``load_factor``, ``total_consumption``, ``skewness`` and ``kurtosis``.
        The columns are present even when no id qualifies.

    Raises
    ------
    ValueError
        If no id column or no energy column can be found.
    """
    id_candidates = ("lclid", "building_id", "id", "meter_id")
    # Supplying a default avoids leaking a bare StopIteration to the caller.
    id_col = next(
        (c for c in df.columns if str(c).lower() in id_candidates),
        None,
    )
    if id_col is None:
        raise ValueError(
            f"No id column found; expected one of {list(id_candidates)} "
            f"(case-insensitive), got columns {list(df.columns)}"
        )

    # The energy column is the same for every meter, so resolve it once
    # instead of re-filtering the frame inside the loop.
    energy_candidates = df.filter(regex="energy|kwh|consumption", axis=1)
    if energy_candidates.shape[1] == 0:
        raise ValueError(
            "No energy column found; expected a column name matching "
            f"'energy|kwh|consumption', got columns {list(df.columns)}"
        )
    # Readings stored as text (e.g. a literal "Null" placeholder) become NaN so
    # the statistics below skip them instead of raising.
    energy_all = pd.to_numeric(energy_candidates.iloc[:, 0], errors="coerce")

    columns = [
        "building_id",
        "mean_consumption",
        "max_demand",
        "std_consumption",
        "peak_to_avg_ratio",
        "load_factor",
        "total_consumption",
        "skewness",
        "kurtosis",
    ]

    records = []
    # One grouped pass replaces a full boolean mask per meter; sort=False keeps
    # the order of first appearance, matching iteration over .unique().
    for bid, energy in energy_all.groupby(df[id_col], sort=False):
        if len(energy) < 10:
            continue

        avg = energy.mean()
        peak = energy.max()
        records.append({
            "building_id": bid,
            "mean_consumption": avg,
            "max_demand": peak,
            "std_consumption": energy.std(),
            # The 1e-9 term guards against division by zero for all-zero meters.
            "peak_to_avg_ratio": peak / (avg + 1e-9),
            "load_factor": avg / (peak + 1e-9),
            "total_consumption": energy.sum(),
            "skewness": energy.skew(),
            "kurtosis": energy.kurtosis(),
        })

    # Passing columns explicitly keeps the schema stable when records is empty.
    return pd.DataFrame(records, columns=columns)

In [5]:
def preprocess(features_df):
    """Separate the ids from the features, impute gaps and scale the features.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Feature table containing a ``building_id`` column plus feature columns.

    Returns
    -------
    tuple
        ``(X_scaled, building_ids, X)``: the ``RobustScaler``-transformed
        feature matrix, the ``building_id`` column, and the imputed but
        unscaled feature frame.
    """
    ids = features_df["building_id"]
    feature_table = features_df.drop(columns=["building_id"])

    # Treat +/-inf as missing, then fill every gap with that column's median.
    feature_table = feature_table.replace([np.inf, -np.inf], np.nan)
    column_medians = feature_table.median()
    feature_table = feature_table.fillna(column_medians)

    scaled = RobustScaler().fit_transform(feature_table)
    return scaled, ids, feature_table

In [6]:
def apply_pca(X_scaled, variance=0.9):
    """Project ``X_scaled`` onto the fewest principal components that reach
    the requested cumulative explained-variance ratio, and plot that ratio.

    Parameters
    ----------
    X_scaled : array-like
        Scaled feature matrix passed to ``PCA.fit``.
    variance : float, default 0.9
        Target cumulative explained-variance ratio. If the target is never
        reached, every component is kept.

    Returns
    -------
    tuple
        ``(X_pca, pca)``: the transformed data and the fitted ``PCA`` object
        restricted to the selected number of components.
    """
    pca_full = PCA().fit(X_scaled)
    cumulative = np.cumsum(pca_full.explained_variance_ratio_)

    # np.argmax on an all-False mask returns 0, which would silently select a
    # single component whenever the target is never reached (e.g. variance=1.0
    # combined with floating-point round-off). Keep all components instead.
    reached = cumulative >= variance
    if reached.any():
        n_comp = int(np.argmax(reached)) + 1
    else:
        n_comp = len(cumulative)

    pca = PCA(n_components=n_comp)
    X_pca = pca.fit_transform(X_scaled)

    # Cumulative-variance plot. Plot against 1..n so the x position equals the
    # number of components; plotting y alone would start the axis at 0.
    component_counts = np.arange(1, len(cumulative) + 1)
    plt.figure(figsize=(6,4))
    plt.plot(component_counts, cumulative, marker="o")
    plt.axhline(variance, linestyle="--", color="red")
    plt.xlabel("Components")
    plt.ylabel("Cumulative Variance")
    plt.title("PCA Scree Plot")
    plt.show()

    return X_pca, pca

In [7]:
def find_optimal_k(X_pca, max_k=10):
    """Pick the cluster count with the highest silhouette score.

    Fits K-Means for every k from 2 up to ``max_k`` (capped at
    ``n_samples - 1``), plots inertia and silhouette score against k, and
    returns the k whose silhouette score is largest.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_components)
        Data to cluster.
    max_k : int, default 10
        Largest cluster count to try.

    Returns
    -------
    int
        The k with the highest silhouette score.

    Raises
    ------
    ValueError
        If no k in the range 2..min(max_k, n_samples - 1) exists.
    """
    n_samples = len(X_pca)
    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1,
    # so cap the search instead of crashing part-way through on small inputs.
    upper_k = min(max_k, n_samples - 1)
    if upper_k < 2:
        raise ValueError(
            "Need max_k >= 2 and at least 3 samples to compare cluster counts "
            f"(got max_k={max_k}, n_samples={n_samples})"
        )

    ks, sil, inertias = [], [], []

    for k in range(2, upper_k + 1):
        km = KMeans(n_clusters=k, n_init=20, random_state=42)
        labels = km.fit_predict(X_pca)
        ks.append(k)
        sil.append(silhouette_score(X_pca, labels))
        inertias.append(km.inertia_)

    # Diagnostic plots: inertia (elbow) on the left, silhouette on the right.
    fig, ax = plt.subplots(1,2, figsize=(10,4))
    ax[0].plot(ks, inertias, marker="o")
    ax[0].set_title("Elbow Method")
    ax[0].set_xlabel("k")
    ax[0].set_ylabel("Inertia")

    ax[1].plot(ks, sil, marker="o")
    ax[1].set_title("Silhouette Analysis")
    ax[1].set_xlabel("k")
    ax[1].set_ylabel("Silhouette score")
    plt.show()

    return ks[int(np.argmax(sil))]

In [8]:
def apply_kmeans(X_pca, k):
    """Fit K-Means with ``k`` clusters on ``X_pca`` and score the partition.

    Parameters
    ----------
    X_pca : array-like
        Data to cluster.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(labels, metrics)``: the cluster label per row and a dict with keys
        ``silhouette``, ``davies_bouldin``, ``calinski_harabasz`` and
        ``inertia``.
    """
    model = KMeans(n_clusters=k, n_init=50, random_state=42)
    assignments = model.fit_predict(X_pca)

    # Label-based scores share one call signature, so build them in one pass.
    scorers = (
        ("silhouette", silhouette_score),
        ("davies_bouldin", davies_bouldin_score),
        ("calinski_harabasz", calinski_harabasz_score),
    )
    metrics = {name: scorer(X_pca, assignments) for name, scorer in scorers}
    metrics["inertia"] = model.inertia_

    return assignments, metrics

In [9]:
def apply_kmeans(X_pca, k):
    """Cluster ``X_pca`` into ``k`` groups with K-Means and report quality scores.

    Returns ``(labels, metrics)``, where ``labels`` holds one cluster index per
    row and ``metrics`` is a dict with the keys ``silhouette``,
    ``davies_bouldin``, ``calinski_harabasz`` and ``inertia``.
    """
    # NOTE(review): this cell repeats the apply_kmeans definition from the
    # previous cell. Being later, this is the definition in effect after
    # Run All; one of the two cells can be deleted.
    estimator = KMeans(n_clusters=k, n_init=50, random_state=42)
    cluster_ids = estimator.fit_predict(X_pca)

    sil = silhouette_score(X_pca, cluster_ids)
    dbi = davies_bouldin_score(X_pca, cluster_ids)
    chi = calinski_harabasz_score(X_pca, cluster_ids)

    return cluster_ids, {
        "silhouette": sil,
        "davies_bouldin": dbi,
        "calinski_harabasz": chi,
        "inertia": estimator.inertia_,
    }

In [10]:
def analyze_clusters(X_original, labels):
    """Average every feature within each cluster and show the result as a heatmap.

    Parameters
    ----------
    X_original : pandas.DataFrame
        Unscaled feature table, one row per clustered item.
    labels : array-like
        Cluster label for each row of ``X_original``.

    Returns
    -------
    pandas.DataFrame
        Per-cluster feature means, indexed by ``Cluster``.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    labelled = X_original.assign(Cluster=labels)
    profiles = labelled.groupby("Cluster").mean()

    # Features on the rows, clusters on the columns.
    plt.figure(figsize=(8,5))
    sns.heatmap(
        profiles.T,
        cmap="RdYlGn_r",
        annot=True,
        fmt=".2f",
    )
    plt.title("Cluster Feature Profiles")
    plt.show()

    return profiles

In [11]:
def analyze_clusters(X_original, labels):
    """Compute per-cluster feature means and draw them as an annotated heatmap.

    ``labels`` is attached to a copy of ``X_original`` as a ``Cluster`` column;
    the returned frame holds the mean of every feature for each cluster.
    """
    # NOTE(review): this cell repeats the analyze_clusters definition from the
    # previous cell. Being later, this is the definition in effect after
    # Run All; one of the two cells can be deleted.
    with_cluster = X_original.copy()
    with_cluster["Cluster"] = labels
    cluster_means = with_cluster.groupby("Cluster").mean()

    heatmap_options = {"cmap": "RdYlGn_r", "annot": True, "fmt": ".2f"}
    plt.figure(figsize=(8,5))
    sns.heatmap(cluster_means.T, **heatmap_options)
    plt.title("Cluster Feature Profiles")
    plt.show()

    return cluster_means

In [12]:
def plot_clusters(X_pca, labels):
    """Scatter the first two principal components, coloured by cluster label.

    Parameters
    ----------
    X_pca : numpy.ndarray of shape (n_samples, n_components)
        PCA-transformed data. With a single component the points are drawn on
        a horizontal line instead of raising ``IndexError``.
    labels : array-like
        Cluster label for each row, used as the point colour.
    """
    pc1 = X_pca[:, 0]
    if X_pca.shape[1] > 1:
        pc2, y_label = X_pca[:, 1], "PC2"
    else:
        # apply_pca keeps only one component when PC1 alone reaches the
        # variance target, so there is no second column to index.
        pc2, y_label = np.zeros(len(pc1)), "(single component)"

    plt.figure(figsize=(6,5))
    plt.scatter(pc1, pc2, c=labels, cmap="viridis", s=40)
    plt.xlabel("PC1")
    plt.ylabel(y_label)
    plt.title("K-Means Clusters in PCA Space")
    plt.colorbar(label="Cluster")
    plt.show()

In [14]:
def main(data_path):
    """Run the whole pipeline: load, featurise, scale, PCA, K-Means, report.

    Parameters
    ----------
    data_path : str or path-like
        Directory passed to ``load_data``.

    Returns
    -------
    dict
        ``assignments`` (DataFrame of ``building_id`` and ``cluster``),
        ``optimal_k``, ``metrics``, ``profiles`` and the fitted ``pca``.
        Earlier versions returned ``None`` and discarded these values; callers
        that ignore the return value are unaffected.
    """
    data = load_data(data_path)
    features = engineer_features(data)
    X_scaled, building_ids, X_original = preprocess(features)

    X_pca, pca = apply_pca(X_scaled)
    optimal_k = find_optimal_k(X_pca)

    labels, metrics = apply_kmeans(X_pca, optimal_k)
    profiles = analyze_clusters(X_original, labels)
    plot_clusters(X_pca, labels)

    print("\nFinal Clustering Metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    # Tie each label back to its building id so the result can be used by the
    # caller; labels are in the same row order as the feature table.
    assignments = pd.DataFrame({
        "building_id": building_ids.to_numpy(),
        "cluster": labels,
    })

    return {
        "assignments": assignments,
        "optimal_k": optimal_k,
        "metrics": metrics,
        "profiles": profiles,
        "pca": pca,
    }