# Data Exploration for Dynamic Influence-Based Clustering

This notebook explores the energy consumption datasets used in the Dynamic Influence-Based Clustering Framework.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the directory above the current working directory importable
# (presumably the project root that holds `config` and `src` - confirm).
sys.path.append(os.path.dirname(os.getcwd()))

import config
from src.preprocessing.data_loader import DataLoader
from src.preprocessing.preprocessor import Preprocessor

# Set plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer accept the old names, so use whichever name
# this installation actually registers.
_whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_whitegrid_style = next(
    (name for name in _whitegrid_candidates if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    # Neither style sheet is registered; fall back to seaborn's own theme.
    sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.2)

## Load Datasets

We'll load and explore each of the datasets used in the framework.

In [None]:
# Names of the datasets explored in this notebook
datasets = ["building_genome", "industrial_site1", "industrial_site2", "industrial_site3"]

# Load every dataset, keyed by its name, and report its shape as it arrives
data_dict = {}

for dataset_name in datasets:
    loaded = DataLoader(dataset_name=dataset_name).load_data()
    data_dict[dataset_name] = loaded

    print(f"\n{dataset_name} dataset loaded with shape {data_dict[dataset_name].shape}")

## Explore Building Genome Dataset

In [None]:
# Pull the Building Genome frame out of the loaded datasets
building_data = data_dict["building_genome"]

# Report dimensions and column names in one call (one item per output line)
print(
    "Building Genome Dataset Information:",
    f"Shape: {building_data.shape}",
    f"Columns: {building_data.columns.tolist()}",
    "\nSample data:",
    sep="\n",
)
# Last expression of the cell: the notebook renders the first rows
building_data.head()

In [None]:
# Count missing entries per column, then show only the columns that have any
missing_values = building_data.isna().sum()
print("Missing values:")
missing_values.loc[missing_values > 0]

In [None]:
# Convert timestamp to datetime if not already.
# building_data is the same object stored in data_dict["building_genome"],
# so this in-place conversion is also seen by later cells that read data_dict.
has_timestamp = 'timestamp' in building_data.columns
if has_timestamp and not pd.api.types.is_datetime64_any_dtype(building_data['timestamp']):
    building_data['timestamp'] = pd.to_datetime(building_data['timestamp'])

# Use the timestamp column for the x-axis when it exists; otherwise fall back
# to the row index instead of failing with a KeyError.
time_axis = building_data['timestamp'] if has_timestamp else building_data.index

# Plot time series for a few buildings (first 5 columns named "building_*").
# Non-string column labels are skipped rather than crashing on .startswith.
building_cols = [
    col for col in building_data.columns
    if isinstance(col, str) and col.startswith('building_')
][:5]

plt.figure(figsize=(15, 8))
for col in building_cols:
    plt.plot(time_axis, building_data[col], label=col)

plt.title('Energy Consumption for Selected Buildings')
plt.xlabel('Time')
plt.ylabel('Energy Consumption')
if building_cols:
    # A legend without any labelled line only produces a warning.
    plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how often each peak_load value occurs (skipped when the column is absent)
if 'peak_load' in building_data.columns:
    peak_fig, peak_ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='peak_load', data=building_data, ax=peak_ax)
    peak_ax.set_title('Peak Load Distribution')
    peak_ax.set_xlabel('Peak Load (0: No, 1: Yes)')
    peak_ax.set_ylabel('Count')
    peak_fig.tight_layout()
    plt.show()

## Explore Industrial Site Datasets

In [None]:
# Function to explore industrial site data
def explore_industrial_site(site_name):
    """Print a summary of one industrial-site dataset and plot its main views.

    Looks up ``data_dict[site_name]``, prints its shape and column names,
    displays the first rows and the per-column missing-value counts (only
    columns with at least one missing entry), plots ``energy_consumption``
    over time when that column exists, and draws a correlation heatmap of
    the numeric columns.

    A non-datetime ``timestamp`` column is converted in place, so the frame
    held in ``data_dict`` is modified and later cells see the converted column.

    Args:
        site_name: Key of the dataset in ``data_dict``.

    Raises:
        KeyError: If ``site_name`` is not a key of ``data_dict``.
    """
    site_data = data_dict[site_name]

    # Display basic information
    print(f"{site_name} Dataset Information:")
    print(f"Shape: {site_data.shape}")
    print(f"Columns: {site_data.columns.tolist()}")
    print("\nSample data:")
    # NOTE(review): `display` is not imported in this notebook; presumably it
    # is the one the IPython/Jupyter session injects - confirm.
    display(site_data.head())

    # Check for missing values
    missing_values = site_data.isnull().sum()
    print("\nMissing values:")
    display(missing_values[missing_values > 0])

    # Convert timestamp to datetime if not already
    has_timestamp = 'timestamp' in site_data.columns
    if has_timestamp and not pd.api.types.is_datetime64_any_dtype(site_data['timestamp']):
        site_data['timestamp'] = pd.to_datetime(site_data['timestamp'])

    # Plot time series for energy consumption
    if 'energy_consumption' in site_data.columns:
        # Fall back to the row index when there is no timestamp column,
        # instead of failing with a KeyError.
        x_values = site_data['timestamp'] if has_timestamp else site_data.index

        plt.figure(figsize=(15, 6))
        plt.plot(x_values, site_data['energy_consumption'], marker='o')
        plt.title(f'{site_name} Energy Consumption')
        plt.xlabel('Time')
        plt.ylabel('Energy Consumption')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Plot correlation matrix
    numeric_cols = site_data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Without numeric columns the correlation matrix is empty; skip the
        # heatmap rather than hand seaborn an empty matrix.
        print("\nNo numeric columns available for a correlation matrix.")
        return

    corr_matrix = site_data[numeric_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(f'{site_name} Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

In [None]:
# Explore Industrial Site 1: summary printout, energy time series, correlation heatmap
explore_industrial_site("industrial_site1")

In [None]:
# Explore Industrial Site 2: summary printout, energy time series, correlation heatmap
explore_industrial_site("industrial_site2")

In [None]:
# Explore Industrial Site 3: summary printout, energy time series, correlation heatmap
explore_industrial_site("industrial_site3")

## Preprocess Data

Now let's preprocess the data using our framework's preprocessor.

In [None]:
# Function to preprocess and visualize data
def preprocess_and_visualize(dataset_name):
    """Run the preprocessor on one dataset and plot the resulting distributions.

    Passes ``data_dict[dataset_name]`` through a fresh ``Preprocessor``,
    prints the shapes of the four outputs, draws histograms (with KDE) of up
    to the first five feature columns, and draws a histogram of the target.

    Assumes ``X`` is a 2-D array indexable as ``X[:, i]`` - verify against
    ``Preprocessor.preprocess``.

    Args:
        dataset_name: Key of the dataset in ``data_dict``.

    Returns:
        The tuple ``(X, y, t, c)`` produced by ``Preprocessor.preprocess``.
    """
    raw_frame = data_dict[dataset_name]

    # One-shot preprocessor: build it and run it straight away
    X, y, t, c = Preprocessor().preprocess(raw_frame)

    print(f"\n{dataset_name} Preprocessing Results:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"t shape: {t.shape}")
    print(f"c shape: {c.shape}")

    # Side-by-side histograms, one panel per feature, capped at five panels
    plt.figure(figsize=(15, 8))
    panel_count = min(X.shape[1], 5)
    for i in range(panel_count):
        plt.subplot(1, panel_count, i + 1)
        sns.histplot(X[:, i], kde=True)
        plt.title(f'Feature {i}')

    plt.tight_layout()
    plt.show()

    # Separate figure for the target values
    plt.figure(figsize=(10, 6))
    sns.histplot(y, kde=True)
    plt.title(f'{dataset_name} Target Distribution')
    plt.xlabel('Target Value')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    return X, y, t, c

In [None]:
# Preprocess and visualize each dataset
# The (X, y, t, c) outputs are stored per dataset name so that later cells
# can look them up in preprocessed_data.
preprocessed_data = {}

for dataset_name in datasets:
    print(f"\nPreprocessing {dataset_name} dataset...")
    X, y, t, c = preprocess_and_visualize(dataset_name)
    preprocessed_data[dataset_name] = (X, y, t, c)

## Temporal Patterns

Let's explore temporal patterns in the data.

In [None]:
# Function to explore temporal patterns
def explore_temporal_patterns(dataset_name):
    """Show how the target varies by hour of day, day of week, and month.

    Reads the ``(X, y, t, c)`` tuple stored in ``preprocessed_data`` under
    ``dataset_name``, pairs the timestamps ``t`` with the targets ``y``, and
    draws one box plot of the target per calendar grouping.

    If the timestamps are not datetime-like, no calendar fields can be
    derived; a notice is printed and the function returns without plotting.

    Args:
        dataset_name: Key of the dataset in ``preprocessed_data``.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``preprocessed_data``.
    """
    _, y, t, _ = preprocessed_data[dataset_name]

    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame({
        'timestamp': t,
        'target': y
    })

    # Accessing `.dt` on a Series that is not datetime-like raises
    # AttributeError, so the accessor lookup itself must be guarded; wrapping
    # it in hasattr(series.dt, ...) is too late because `.dt` is evaluated first.
    try:
        dt_accessor = df['timestamp'].dt
    except AttributeError:
        dt_accessor = None

    if dt_accessor is None or not hasattr(dt_accessor, 'hour'):
        print(f"{dataset_name}: timestamps are not datetime-like; skipping temporal plots.")
        return

    # Add time-based features
    df['hour'] = dt_accessor.hour
    df['month'] = dt_accessor.month
    df['dayofweek'] = dt_accessor.dayofweek

    # (grouping column, title suffix, x-axis label) for each box plot
    groupings = (
        ('hour', 'Hour of Day', 'Hour of Day'),
        ('dayofweek', 'Day of Week', 'Day of Week (0=Monday, 6=Sunday)'),
        ('month', 'Month', 'Month'),
    )

    for column, title_suffix, x_label in groupings:
        plt.figure(figsize=(12, 6))
        sns.boxplot(x=column, y='target', data=df)
        plt.title(f'{dataset_name} - Target by {title_suffix}')
        plt.xlabel(x_label)
        plt.ylabel('Target Value')
        plt.tight_layout()
        plt.show()

In [None]:
# Explore temporal patterns for each dataset
# Relies on preprocessed_data having an entry for every name in datasets.
for dataset_name in datasets:
    print(f"\nExploring temporal patterns in {dataset_name} dataset...")
    explore_temporal_patterns(dataset_name)

## Summary

In this notebook, we've explored the energy consumption datasets used in the Dynamic Influence-Based Clustering Framework. We've examined their basic properties, preprocessed them using our framework's utilities, and explored temporal patterns in the data.

Key observations:
- The Building Genome dataset contains hourly energy consumption data for multiple buildings, with a binary target indicating peak load periods.
- The Industrial Site datasets contain quarterly energy consumption data with various features related to electrical and operational parameters.
- All datasets exhibit temporal patterns, with variations by hour of day, day of week, and month.
- The preprocessing pipeline successfully handles missing values and normalizes features.

These insights will inform our approach to influence space transformation and dynamic clustering in the subsequent notebooks.