# Hybrid Energy System Data Generation Example

This notebook demonstrates how to use `HybridSystemDataGenerator` to create a synthetic hybrid energy system dataset for fault prediction, and then explores the generated data.

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_generator import HybridSystemDataGenerator

# Set plotting style
# NOTE(review): the matplotlib style call below is disabled; presumably the
# 'seaborn' style name is not available in the installed matplotlib version —
# confirm before re-enabling or deleting it.
#plt.style.use('seaborn')
# Use seaborn's 'husl' palette as the default colors for the plots in this notebook.
sns.set_palette('husl')

## Generate Dataset

Let's generate a 2-year dataset starting from January 1st, 2023.

In [3]:
# Initialize the generator with a fixed seed (42)
generator = HybridSystemDataGenerator(seed=42)

# Path passed to the generator as `output_file` (presumably where the dataset
# is written — confirm against src/data_generator.py). Create the parent
# directory up front so a fresh checkout without ../data does not fail on write.
output_file = '../data/hybrid_system_data.parquet'
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Generate a 2-year dataset starting on 2023-01-01
df = generator.generate_dataset(
    start_date='2023-01-01',
    periods_years=2,
    output_file=output_file
)

print(f"Generated dataset with {len(df)} rows and {len(df.columns)} features")
# deep=True also counts the contents of object-dtype columns; without it pandas
# only counts the pointer size per element and under-reports memory.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Generating time series base...
Generating weather conditions...


  dates = pd.date_range(start=start_date, periods=hours, freq='H')


Generating load profile...


  season = df['season'][i]
  reactive_power = np.sqrt(apparent_power**2 - load_demand**2)


Simulating solar PV system...


KeyError: 'cloud_cover'

## Explore Generated Data

Let's look at some key aspects of the generated data.

In [None]:
# Solar generation vs load demand over one week.
# Label-based slice of the rows from 2023-01-01 through 2023-01-07
# (assumes df is indexed by timestamp — TODO confirm).
week_data = df.loc['2023-01-01':'2023-01-07']

fig, ax = plt.subplots(figsize=(15, 6))

# Draw both series on the same axes so they can be compared directly.
for column, label in (
    ('solar_power_output', 'Solar Output (kW)'),
    ('load_active_power', 'Load Demand (kW)'),
):
    ax.plot(week_data.index, week_data[column], label=label)

ax.set_title('Solar Generation vs Load Demand - First Week of 2023')
ax.set_xlabel('Date')
ax.set_ylabel('Power (kW)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Plot battery state of charge and fault events.
# Uses `week_data`, the one-week slice created in the previous cell.
fig, ax = plt.subplots(figsize=(15, 6))

# Plot SOC, scaled by 100 for display as a percentage
# (presumably battery_soc is stored as a 0-1 fraction — confirm).
soc_percent = week_data['battery_soc'] * 100
ax.plot(week_data.index, soc_percent, label='Battery SOC (%)')

# Highlight fault periods.
# Cast the flag to bool explicitly: indexing a DataFrame with a non-boolean
# (e.g. 0/1 integer) Series is treated as a list of column labels and raises
# KeyError instead of filtering rows.
fault_mask = week_data['fault_occurred'].astype(bool)
fault_soc = soc_percent.loc[fault_mask]
ax.scatter(fault_soc.index, fault_soc,
           color='red', marker='x', s=100, label='Fault Events')

ax.set_title('Battery State of Charge and Fault Events - First Week of 2023')
ax.set_xlabel('Date')
ax.set_ylabel('State of Charge (%)')
ax.legend()
ax.grid(True)
plt.show()

## Analyze Fault Distribution

Let's look at the distribution of the different fault types in our dataset, along with the total number of fault hours and the resulting system availability.

In [None]:
# Count fault occurrences per fault type.
# A single .loc[rows, column] lookup avoids the chained df[mask]['col'] form,
# which first materialises a filtered copy of the whole frame. The explicit
# bool cast keeps the mask a row filter even if the flag is stored as 0/1.
fault_mask = df['fault_occurred'].astype(bool)
fault_counts = df.loc[fault_mask, 'fault_types'].value_counts()

if fault_counts.empty:
    # Plotting an empty Series raises, so report the situation instead.
    print("No fault events found in the dataset; skipping fault type plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    fault_counts.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of Fault Types')
    ax.set_xlabel('Fault Type')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

# Print fault statistics
# (treats each row as one hour — presumably the dataset is hourly; confirm).
total_hours = len(df)
fault_hours = fault_mask.sum()
print(f"Total fault hours: {fault_hours}")
if total_hours:
    print(f"System availability: {(1 - fault_hours/total_hours)*100:.2f}%")
else:
    # Avoid a 0/0 division when the dataset has no rows.
    print("System availability: n/a (dataset is empty)")

## Save Feature Descriptions

Let's create a reference table of all features in our dataset — each column's dtype, non-null count, and memory usage (in KB) — and save it to CSV.

In [None]:
# Build a per-column summary table: name, dtype, non-null count and memory (KB).
# Start from the dtypes Series (indexed by column name) and attach the other
# per-column statistics, which share that index.
feature_descriptions = (
    df.dtypes.to_frame('Type')
    .assign(
        Feature=df.columns,
        Non_null_count=df.count(),
        # index=False leaves the frame's index out of the per-column report
        Memory_usage=df.memory_usage(index=False, deep=True) / 1024,  # KB
    )
    [['Feature', 'Type', 'Non_null_count', 'Memory_usage']]
)

# Persist the table as CSV, then preview the first ten rows
feature_descriptions.to_csv('../data/feature_descriptions.csv')
feature_descriptions.head(10)