# Energy Consumption Anomaly Detection

This notebook demonstrates anomaly detection techniques for energy consumption data using machine learning algorithms.

In [None]:
# Import necessary libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Set plotting style
# Order matters here: the seaborn theme is applied after the matplotlib style
# sheet, so it takes precedence wherever the two configure the same settings.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")
# IPython line magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline

## 1. Data Loading and Exploration

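Generate a synthetic hourly energy consumption series (daily and weekly cycles, random noise, and injected anomalies) as a stand-in for real data, and explore its characteristics. In a real deployment this step would load data from your database or data files.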

In [None]:
# Build a synthetic hourly consumption series for demonstration.
# In a real scenario, this cell would read from your database or data files instead.

N_HOURS = 1000            # length of the series, one sample per hour
N_ANOMALIES = 20          # number of samples that receive an injected disturbance
HOURS_PER_DAY = 24
HOURS_PER_WEEK = 24 * 7

# Seed the global NumPy RNG so the generated series is reproducible.
# The order of the random draws below (noise -> indices -> magnitudes) determines
# the exact values produced, so keep it stable.
np.random.seed(42)

# A Timedelta frequency is accepted by every pandas version; the 'H' string
# alias emits a FutureWarning on pandas >= 2.2.
dates = pd.date_range(start='2023-01-01', periods=N_HOURS, freq=pd.Timedelta(hours=1))

hour_index = np.arange(N_HOURS)
base_consumption = 100 + 20 * np.sin(hour_index * (2 * np.pi / HOURS_PER_DAY))  # Daily cycle
weekly_pattern = 15 * np.sin(hour_index * (2 * np.pi / HOURS_PER_WEEK))         # Weekly cycle
noise = np.random.normal(0, 5, N_HOURS)                                         # Random noise

# Inject disturbances at distinct random positions. Magnitudes are drawn from
# N(0, 50), so some of them are small and comparable to the background noise.
anomalies = np.zeros(N_HOURS)
anomaly_indices = np.random.choice(range(N_HOURS), size=N_ANOMALIES, replace=False)
anomalies[anomaly_indices] = np.random.normal(0, 50, N_ANOMALIES)

# Combine components
consumption = base_consumption + weekly_pattern + noise + anomalies

# Create DataFrame
df = pd.DataFrame({
    'timestamp': dates,
    'energy_consumption': consumption
})

# Display the first few rows
df.head()

In [None]:
# Plot the whole series so the daily/weekly cycles and any spikes are visible
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df['timestamp'], df['energy_consumption'])
ax.set_title('Energy Consumption Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Energy Consumption (kWh)')
fig.tight_layout()
plt.show()

## 2. Feature Engineering

Extract relevant features from the time series data for anomaly detection.

In [None]:
# Size of the trailing window used for the rolling statistics (hourly samples,
# so 24 covers one full day). The derived column names keep the "_24h" suffix.
ROLLING_WINDOW_HOURS = 24

# Extract time-based features
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day_of_week'] = timestamp_parts.dayofweek
df['month'] = timestamp_parts.month

# Calculate rolling statistics over a trailing window that includes the current sample
rolling_consumption = df['energy_consumption'].rolling(window=ROLLING_WINDOW_HOURS)
df['rolling_mean_24h'] = rolling_consumption.mean()
df['rolling_std_24h'] = rolling_consumption.std()

# Calculate the difference from expected patterns
df['consumption_diff'] = df['energy_consumption'] - df['rolling_mean_24h']

# Drop the leading rows whose rolling window is incomplete (NaN statistics).
# Take an explicit copy so later cells can add columns to df_clean without
# pandas raising SettingWithCopyWarning.
df_clean = df.dropna().copy()

# Display the engineered features
df_clean.head()

## 3. Anomaly Detection Model

Implement and train an anomaly detection model using Isolation Forest.

In [None]:
# Prepare features for anomaly detection
features = ['energy_consumption', 'hour', 'day_of_week', 'rolling_mean_24h', 'rolling_std_24h', 'consumption_diff']
X = df_clean[features].copy()

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train Isolation Forest model.
# contamination is the expected share of anomalous samples; 0.02 matches the
# 20 disturbances injected into the 1000 synthetic samples.
model = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
model.fit(X_scaled)

# Attach the predictions. assign() builds a new frame instead of writing into
# df_clean, which avoids SettingWithCopyWarning (df_clean is derived from df)
# and keeps this cell safe to re-run.
df_clean = df_clean.assign(
    anomaly_score=model.decision_function(X_scaled),
    is_anomaly=model.predict(X_scaled) == -1,  # -1 for anomalies, 1 for normal points
)

In [None]:
# Overlay the flagged samples on top of the consumption curve
flagged_points = df_clean[df_clean['is_anomaly']]

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df_clean['timestamp'], df_clean['energy_consumption'], label='Energy Consumption')
ax.scatter(flagged_points['timestamp'],
           flagged_points['energy_consumption'],
           color='red', label='Anomalies')
ax.set_title('Energy Consumption with Detected Anomalies')
ax.set_xlabel('Time')
ax.set_ylabel('Energy Consumption (kWh)')
ax.legend()
fig.tight_layout()
plt.show()

## 4. Model Evaluation

Summarize the model's output: count how many points were flagged as anomalies and compare the consumption statistics of flagged and normal points.

In [None]:
# Tally how many samples the model flagged
flagged_mask = df_clean['is_anomaly']
anomaly_count = flagged_mask.sum()
print(f"Number of detected anomalies: {anomaly_count}")
print(f"Percentage of data points flagged as anomalies: {anomaly_count / len(df_clean) * 100:.2f}%")

# Keep the flagged rows on their own, then compare consumption statistics
# between the normal and the anomalous group
anomalies_df = df_clean.loc[flagged_mask]
consumption_by_flag = df_clean.groupby('is_anomaly')['energy_consumption']
print("\nStatistics of normal vs. anomalous points:")
print(consumption_by_flag.describe())

## 5. Real-time Anomaly Detection Function

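Create a function that can be used for real-time anomaly detection in the production system. The incoming data must already contain the same engineered feature columns that the model was trained on.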

In [None]:
def detect_anomalies(new_data, model, scaler, feature_columns):
    """
    Detect anomalies in new energy consumption data.

    The input frame is left untouched; scores and flags are written to a copy.

    Parameters:
    -----------
    new_data : pandas.DataFrame
        New data points to check for anomalies. Must contain every column
        listed in ``feature_columns``.
    model : trained anomaly detection model
        Pre-trained model exposing ``decision_function`` and ``predict``,
        where ``predict`` returns -1 for anomalies
    scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler for feature standardization (only ``transform`` is called)
    feature_columns : list
        List of feature column names to use, in the order used for training

    Returns:
    --------
    pandas.DataFrame
        Copy of ``new_data`` with two columns added: ``anomaly_score``
        (output of ``model.decision_function``) and ``is_anomaly``
        (True where ``model.predict`` returned -1)

    Raises:
    -------
    KeyError
        If any of ``feature_columns`` is missing from ``new_data``
    """
    # Copy first so the caller's frame (possibly a slice of a larger frame)
    # is never mutated and no SettingWithCopyWarning is triggered.
    result = new_data.copy()

    missing = [col for col in feature_columns if col not in result.columns]
    if missing:
        raise KeyError(f"new_data is missing required feature columns: {missing}")

    # An empty batch has nothing to score; return it with correctly typed,
    # empty output columns instead of passing zero rows to the scaler.
    if len(result) == 0:
        result['anomaly_score'] = np.array([], dtype=float)
        result['is_anomaly'] = np.array([], dtype=bool)
        return result

    # Standardize the features with the scaler fitted at training time
    X_scaled = scaler.transform(result[feature_columns])

    # Predict anomalies
    result['anomaly_score'] = model.decision_function(X_scaled)
    result['is_anomaly'] = model.predict(X_scaled) == -1

    return result

# Example of using the function with new data
# In a real system, this would be called with new incoming data
# new_predictions = detect_anomalies(new_data, model, scaler, features)

## 6. Save Model for Production

Save the trained model and scaler for use in the production system.

In [None]:
# Directory for the persisted artifacts, relative to the notebook's working directory
MODEL_DIR = Path('../models')

# joblib.dump does not create missing directories, so make sure the target exists
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Save the model, the fitted scaler, and the ordered feature list together;
# all three are required to score new data consistently with training.
# NOTE: these files are pickle-based; only load them back from trusted locations.
joblib.dump(model, MODEL_DIR / 'isolation_forest_model.pkl')
joblib.dump(scaler, MODEL_DIR / 'feature_scaler.pkl')
joblib.dump(features, MODEL_DIR / 'feature_list.pkl')

print("Model and preprocessing objects saved successfully.")

## 7. Conclusion and Next Steps

This notebook demonstrated a basic approach to anomaly detection in energy consumption data. For production use, consider:

1. Collecting and using real historical data
2. Implementing more sophisticated models (LSTM, Prophet, etc.)
3. Adding domain-specific features
4. Setting up automated retraining
5. Implementing real-time alerting based on detected anomalies