# Complete Electricity Price Forecasting on Google Colab

This notebook provides a comprehensive solution for electricity price forecasting using machine learning and time series models, optimized for the Google Colab environment.

## Features
- Real ENTSO-E data download
- Multiple ML and time series models
- GPU acceleration for deep learning
- Interactive visualizations
- Business impact analysis

## Setup
Run the cells below to install dependencies and set up the environment.


## 1. Install Dependencies and Clone Repository


In [1]:
# Install required packages
!pip install xgboost lightgbm prophet tensorflow torch
!pip install plotly streamlit
!pip install statsmodels scikit-learn pandas numpy matplotlib seaborn
!pip install requests python-dateutil holidays

# Clone the repository
!git clone https://github.com/tommasomalaguti/energy_price_predictor.git

# Change to the project directory
import os
os.chdir('energy_price_predictor')

print("Setup complete!")
print(f"Current directory: {os.getcwd()}")


Cloning into 'energy_price_predictor'...
remote: Enumerating objects: 221, done.[K
remote: Counting objects: 100% (221/221), done.[K
remote: Compressing objects: 100% (152/152), done.[K
remote: Total 221 (delta 104), reused 176 (delta 59), pack-reused 0 (from 0)[K
Receiving objects: 100% (221/221), 559.45 KiB | 96.00 KiB/s, done.
Resolving deltas: 100% (104/104), done.
Setup complete!
Current directory: /Users/tommasomalaguti/Documents/GitHub/energy_price_predictor/notebooks/energy_price_predictor


## 2. Import Libraries and Setup


In [None]:
# Make the project's packages importable. The path is relative, so it only
# resolves when the working directory is the repository root (the setup cell
# changes into the cloned repository before this cell runs).
import sys
sys.path.append('src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import warnings
# Silences every Python warning for the rest of the session, including
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Import our modules
# (expected to resolve through the 'src' entry appended to sys.path above)
from data.entsoe_downloader import ENTSOEDownloader
from data.preprocessor import DataPreprocessor
from models.baseline_models import BaselineModels
from models.ml_models import MLModels
from models.time_series_models import TimeSeriesModels
from evaluation.metrics import EvaluationMetrics
from evaluation.visualization import ModelVisualization

# Set up plotting
# Global matplotlib style and seaborn colour palette for any static figures.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")




## 3. API Token Setup

To download real electricity price data, you need an ENTSO-E API token:
1. Go to https://transparency.entsoe.eu/
2. Register for a free account
3. Get your API token
4. Enter it in the cell below


In [None]:
# Resolve the ENTSO-E API token (REQUIRED for this notebook).
#
# The token is a credential, so it must never be written into the notebook:
# it is read from the ENTSOE_API_TOKEN environment variable and, when that is
# not set, requested through a hidden prompt.
# NOTE(review): a literal token was previously committed in this cell; it
# should be revoked and regenerated on the ENTSO-E transparency platform.
import os
from getpass import getpass

ENTSOE_API_TOKEN = os.getenv('ENTSOE_API_TOKEN', '').strip()

if not ENTSOE_API_TOKEN:
    try:
        ENTSOE_API_TOKEN = getpass("ENTSO-E API token (input hidden): ").strip()
    except (EOFError, RuntimeError):
        # No interactive input is available (e.g. a headless run); leave the
        # token empty so the check below reports the problem.
        ENTSOE_API_TOKEN = ''

# "your_token_here" is the placeholder value that the download cell also
# treats as "not configured".
if not ENTSOE_API_TOKEN or ENTSOE_API_TOKEN == "your_token_here":
    print("ERROR: ENTSO-E API token is REQUIRED for this notebook.")
    print("Please set your ENTSO-E API token above to continue.")
    print("This notebook only works with real electricity price data.")
else:
    # Do not echo any part of the token: cell output is saved with the notebook.
    print("API token set.")
    print("Ready to download real electricity price data!")


## 4. Download Real Electricity Price Data


In [None]:
# Download real electricity price data
print("Downloading electricity price data...")

# Fail early with a clear message when the token cell did not produce a token.
if not ENTSOE_API_TOKEN or ENTSOE_API_TOKEN == "your_token_here":
    raise ValueError("ENTSO-E API token is required for this Colab notebook. Please set your API token in the cell above.")

# Use real data only
downloader = ENTSOEDownloader(api_token=ENTSOE_API_TOKEN)

# Download data for the last 30 days
# NOTE(review): datetime.now() is naive local time — confirm which timezone
# the downloader expects.
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

try:
    price_data = downloader.download_price_data(
        country='IT',  # Italy
        start_date=start_date,
        end_date=end_date
    )
    print(f"Downloaded {len(price_data)} data points")
    print(f"Date range: {price_data.index.min()} to {price_data.index.max()}")
except Exception as e:
    print(f"Error downloading real data: {e}")
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError("Failed to download real electricity price data. Please check your API token and internet connection.") from e

# An empty result would only fail later, and obscurely, in the analysis cells.
if len(price_data) == 0:
    raise RuntimeError("No electricity price data was returned for the requested period.")

print("\nFirst few data points:")
print(price_data.head())
print("\nData statistics:")
print(price_data.describe())


## 5. Data Visualization and Analysis


In [None]:
# Six-panel overview of the raw price series, laid out on a 3x2 grid.
# (acf is imported here as in the original cell; this cell does not call it.)
from statsmodels.tsa.stattools import acf

panel_titles = (
    'Price Time Series', 'Daily Pattern',
    'Weekly Pattern', 'Price Distribution',
    'Autocorrelation', 'Price Changes',
)
fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=panel_titles,
    specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(3)],
)

# Derived series, computed up front so the plotting step is a single loop.
hourly_avg = price_data.groupby(price_data.index.hour).mean()          # mean by hour of day
daily_avg = price_data.groupby(price_data.index.dayofweek).mean()      # mean by day of week
lags = range(1, min(50, len(price_data)//4))                           # lag 1 up to at most 49
autocorr = list(map(price_data.autocorr, lags))
price_changes = price_data.diff().dropna()                             # step-to-step differences

# (trace, row, column) in the same order as the subplot titles.
panels = [
    (go.Scatter(x=price_data.index, y=price_data.values, name='Price', line=dict(color='blue')), 1, 1),
    (go.Scatter(x=hourly_avg.index, y=hourly_avg.values, name='Hourly Average', line=dict(color='red')), 1, 2),
    (go.Scatter(x=daily_avg.index, y=daily_avg.values, name='Daily Average', line=dict(color='green')), 2, 1),
    (go.Histogram(x=price_data.values, name='Price Distribution', nbinsx=30), 2, 2),
    (go.Scatter(x=list(lags), y=autocorr, name='Autocorrelation', line=dict(color='purple')), 3, 1),
    (go.Scatter(x=price_changes.index, y=price_changes.values, name='Price Changes', line=dict(color='orange')), 3, 2),
]
for panel_trace, panel_row, panel_col in panels:
    fig.add_trace(panel_trace, row=panel_row, col=panel_col)

fig.update_layout(height=900, showlegend=False, title_text="Electricity Price Data Analysis")
fig.show()

# Text summary of the same series.
print("\n=== DATA SUMMARY ===")
print(f"Total data points: {len(price_data)}")
print(f"Date range: {price_data.index.min()} to {price_data.index.max()}")
for stat_label, stat_value in (
    ("Mean", price_data.mean()),
    ("Std", price_data.std()),
    ("Min", price_data.min()),
    ("Max", price_data.max()),
):
    print(f"{stat_label} price: {stat_value:.2f} EUR/MWh")


## 6. Data Preprocessing and Feature Engineering


In [None]:
# Turn the raw price series into the feature table used by every model below.
preprocessor = DataPreprocessor()

print("Creating features...")
features_df = preprocessor.create_features(price_data)

# Report what was produced: column count, column names, summary statistics.
print(f"Created {features_df.shape[1]} features")
print("\nFeature columns:", features_df.columns.tolist(), sep="\n")
print("\nFeature statistics:", features_df.describe(), sep="\n")


## 7. Train-Test Split


In [None]:
# Split by position: the first 80% of rows train the models, the remaining
# 20% are held out for testing (no shuffling).
test_size = 0.2
split_idx = int(len(features_df) * (1 - test_size))

train_data, test_data = features_df.iloc[:split_idx], features_df.iloc[split_idx:]

print(
    f"Training data: {len(train_data)} samples",
    f"Test data: {len(test_data)} samples",
    f"Train period: {train_data.index.min()} to {train_data.index.max()}",
    f"Test period: {test_data.index.min()} to {test_data.index.max()}",
    sep="\n",
)


## 8. Baseline Models


In [None]:
# Reference forecasts that the learned models are compared against.
baseline_models = BaselineModels()

print("Training baseline models...")

# Evaluated top to bottom: naive, historical mean, then seasonal naive with a
# season length of 24.
baseline_results = {
    'naive': baseline_models.naive_forecast(train_data['price'], test_data['price']),
    'mean': baseline_models.historical_mean(train_data['price'], test_data['price']),
    'seasonal_naive': baseline_models.seasonal_naive(train_data['price'], test_data['price'], season_length=24),
}

print("Baseline models trained successfully!")

# One metrics block per model.
for model_name, results in baseline_results.items():
    print(
        f"\n{model_name.upper()} Results:",
        f"RMSE: {results['rmse']:.2f}",
        f"MAE: {results['mae']:.2f}",
        f"MAPE: {results['mape']:.2f}%",
        sep="\n",
    )


## 9. Machine Learning Models


In [None]:
# Supervised models trained on the engineered features.
ml_models = MLModels()

# Every column except the target is used as a predictor.
feature_cols = [name for name in features_df.columns if name != 'price']
X_train, y_train = train_data[feature_cols], train_data['price']
X_test, y_test = test_data[feature_cols], test_data['price']

print(f"Training features: {X_train.shape}")
print(f"Test features: {X_test.shape}")

print("\nTraining ML models...")
ml_results = {}

# (result key, display name, training method), run in this order.
ml_trainers = (
    ('linear', "Linear Regression", ml_models.train_linear_regression),
    ('random_forest', "Random Forest", ml_models.train_random_forest),
    ('xgboost', "XGBoost", ml_models.train_xgboost),
)
for result_key, display_name, train_fn in ml_trainers:
    print(f"Training {display_name}...")
    ml_results[result_key] = train_fn(X_train, y_train, X_test, y_test)

print("\nML models trained successfully!")

# One metrics block per model.
for model_name, results in ml_results.items():
    print(
        f"\n{model_name.upper()} Results:",
        f"RMSE: {results['rmse']:.2f}",
        f"MAE: {results['mae']:.2f}",
        f"MAPE: {results['mape']:.2f}%",
        sep="\n",
    )


## 10. Time Series Models


In [None]:
# Classical time series models, fitted on the price column alone.
ts_models = TimeSeriesModels()

print("Training time series models...")
ts_results = {}

# (result key, display name, method name). Each model is best-effort: a failure
# is reported and recorded as None so the remaining cells can still run.
for result_key, display_name, method_name in (
    ('arima', "ARIMA", 'train_arima'),
    ('prophet', "Prophet", 'train_prophet'),
):
    print(f"Training {display_name}...")
    try:
        ts_results[result_key] = getattr(ts_models, method_name)(train_data['price'], test_data['price'])
    except Exception as e:
        print(f"{display_name} failed: {e}")
        ts_results[result_key] = None

print("\nTime series models trained!")

# Metrics for the models that completed.
for model_name, results in ts_results.items():
    if results is None:
        continue
    print(
        f"\n{model_name.upper()} Results:",
        f"RMSE: {results['rmse']:.2f}",
        f"MAE: {results['mae']:.2f}",
        f"MAPE: {results['mape']:.2f}%",
        sep="\n",
    )


## 11. Model Comparison and Visualization


In [None]:
# Merge every successful model's results; time series models that failed
# (stored as None) are left out.
all_results = {
    **baseline_results,
    **ml_results,
    **{name: res for name, res in ts_results.items() if res is not None},
}

# One row per model, ranked by RMSE (lowest first).
comparison_data = [
    {
        'Model': name,
        'RMSE': res['rmse'],
        'MAE': res['mae'],
        'MAPE': res['mape'],
    }
    for name, res in all_results.items()
]
comparison_df = pd.DataFrame(comparison_data).sort_values('RMSE')

print("=== MODEL COMPARISON ===")
print(comparison_df.to_string(index=False))

# 2x2 figure: three metric bar charts plus actual vs. predicted for the winner.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('RMSE Comparison', 'MAE Comparison', 'MAPE Comparison', 'Best Model Predictions'),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"secondary_y": False}]]
)

# (metric column, bar colour, row, column)
for metric, bar_color, grid_row, grid_col in (
    ('RMSE', 'blue', 1, 1),
    ('MAE', 'red', 1, 2),
    ('MAPE', 'green', 2, 1),
):
    fig.add_trace(
        go.Bar(x=comparison_df['Model'], y=comparison_df[metric], name=metric, marker_color=bar_color),
        row=grid_row, col=grid_col
    )

# The first row after sorting is the model with the lowest RMSE.
best_row = comparison_df.iloc[0]
best_model = best_row['Model']
best_predictions = all_results[best_model]['predictions']
actual = test_data['price']

# Both lines share the test-set index on the x axis.
for series_values, series_name, series_color in (
    (actual.values, 'Actual', 'blue'),
    (best_predictions, f'{best_model} Predictions', 'red'),
):
    fig.add_trace(
        go.Scatter(x=actual.index, y=series_values, name=series_name, line=dict(color=series_color)),
        row=2, col=2
    )

fig.update_layout(height=800, showlegend=True, title_text="Model Performance Comparison")
fig.show()

print(f"\nBest performing model: {best_model}")
print(f"Best RMSE: {best_row['RMSE']:.2f}")


## 12. Business Impact Analysis


In [None]:
# Translate forecast quality into money for a constant-load consumer.
print("=== BUSINESS IMPACT ANALYSIS ===")

# Assume industrial consumption of 1 MWh per hour
consumption_mwh = 1.0
test_hours = len(test_data)
total_consumption = consumption_mwh * test_hours

print(f"Analysis period: {test_hours} hours")
print(f"Total consumption: {total_consumption} MWh")
print(f"Average price: {test_data['price'].mean():.2f} EUR/MWh")
print(f"Total cost at average price: {total_consumption * test_data['price'].mean():.2f} EUR")

# Realised cost: the assumed consumption bought at the actual test-period prices
actual_costs = (test_data['price'] * consumption_mwh).sum()
print(f"\nActual total cost: {actual_costs:.2f} EUR")

# Cost implied by the best model's predictions over the same period
best_predictions = all_results[best_model]['predictions']
if isinstance(best_predictions, (list, tuple)):
    # A plain list/tuple cannot be multiplied by a float (TypeError), so
    # convert it to an array first. Arrays and Series pass through untouched.
    best_predictions = np.asarray(best_predictions, dtype=float)
predicted_costs = (best_predictions * consumption_mwh).sum()

# Error on the period total. Hourly over- and under-predictions can cancel
# out here, so this is not a per-hour accuracy measure.
cost_difference = abs(actual_costs - predicted_costs)
cost_accuracy = (1 - cost_difference / actual_costs) * 100

print(f"Predicted total cost: {predicted_costs:.2f} EUR")
print(f"Cost prediction error: {cost_difference:.2f} EUR")
print(f"Cost prediction accuracy: {cost_accuracy:.1f}%")

# Rough savings figure: 10% (the 0.1 factor) of price volatility times the
# total consumption. The 10% share is an assumption, not derived from data.
price_volatility = test_data['price'].std()
print(f"\nPrice volatility (std): {price_volatility:.2f} EUR/MWh")
print(f"Potential savings from perfect forecasting: {price_volatility * total_consumption * 0.1:.2f} EUR (10% of volatility)")


## 13. Future Predictions


In [None]:
# Produce a 24-hour outlook that follows the last observed timestamp.
# NOTE: no trained model is invoked in this cell. The "forecast" is a flat
# persistence value (the last known price repeated), whichever model won.
print(f"Making future predictions with {best_model}...")

# Hourly timestamps for the forecast horizon
future_hours = 24  # Predict next 24 hours
last_timestamp = features_df.index[-1]
one_hour = timedelta(hours=1)
# A timedelta frequency is used instead of the 'H' string alias, which pandas
# has deprecated (since 2.2); the timedelta form works across versions.
future_dates = pd.date_range(start=last_timestamp + one_hour, periods=future_hours, freq=one_hour)

# Create future features (simplified - in practice, you'd need to forecast external features too)
# This frame is built for illustration only; nothing below consumes it.
last_price = features_df['price'].iloc[-1]
future_features = pd.DataFrame(index=future_dates)
future_features['hour'] = future_dates.hour
future_features['day_of_week'] = future_dates.dayofweek
future_features['is_weekend'] = (future_dates.dayofweek >= 5).astype(int)
future_features['price_lag_1'] = last_price  # Last known price
future_features['price_lag_24'] = features_df['price'].iloc[-24] if len(features_df) >= 24 else last_price

# Make predictions
if best_model in ml_results:
    # For ML models, we need the trained model
    # This is a simplified version - in practice, you'd save and load the model
    print("Note: Future predictions require model persistence. Using last known values as approximation.")
# Both the ML and the time series paths use the same persistence forecast.
future_predictions = [last_price] * future_hours

# Create future predictions DataFrame
future_df = pd.DataFrame({
    'timestamp': future_dates,
    'predicted_price': future_predictions
})

print(f"\nFuture predictions for next {future_hours} hours:")
print(future_df.head(10))

# Visualize future predictions
fig = go.Figure()

# Historical data (last 48 hours)
historical_data = features_df['price'].tail(48)
fig.add_trace(go.Scatter(
    x=historical_data.index,
    y=historical_data.values,
    name='Historical Prices',
    line=dict(color='blue')
))

# Future predictions
fig.add_trace(go.Scatter(
    x=future_df['timestamp'],
    y=future_df['predicted_price'],
    name='Future Predictions',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Historical Prices and Future Predictions',
    xaxis_title='Time',
    yaxis_title='Price (EUR/MWh)',
    height=500
)

fig.show()

print(f"\nAverage predicted price: {future_df['predicted_price'].mean():.2f} EUR/MWh")
print(f"Predicted price range: {future_df['predicted_price'].min():.2f} - {future_df['predicted_price'].max():.2f} EUR/MWh")


## 14. Summary and Conclusions


In [None]:
# Final recap, printed one section at a time. All figures come from variables
# computed in earlier cells; the insights and recommendations are fixed text.
top_model_row = comparison_df.iloc[0]  # lowest-RMSE model (comparison_df is sorted by RMSE)

print("=== ELECTRICITY PRICE FORECASTING SUMMARY ===")

print(
    "\nData Analysis:",
    f"- Total data points: {len(price_data)}",
    f"- Date range: {price_data.index.min()} to {price_data.index.max()}",
    f"- Average price: {price_data.mean():.2f} EUR/MWh",
    f"- Price volatility: {price_data.std():.2f} EUR/MWh",
    sep="\n",
)

print(
    "\nModel Performance:",
    f"- Best model: {best_model}",
    f"- Best RMSE: {top_model_row['RMSE']:.2f}",
    f"- Best MAE: {top_model_row['MAE']:.2f}",
    f"- Best MAPE: {top_model_row['MAPE']:.2f}%",
    sep="\n",
)

print(
    "\nBusiness Impact:",
    f"- Cost prediction accuracy: {cost_accuracy:.1f}%",
    f"- Potential savings: {price_volatility * total_consumption * 0.1:.2f} EUR",
    sep="\n",
)

print(
    "\nKey Insights:",
    "- Electricity prices show strong daily and weekly patterns",
    "- Machine learning models generally outperform baseline methods",
    "- Accurate forecasting can lead to significant cost savings",
    "- Model performance varies with data quality and feature engineering",
    sep="\n",
)

print(
    "\nRecommendations:",
    f"- Use {best_model} for production forecasting",
    "- Implement real-time data updates",
    "- Consider ensemble methods for improved accuracy",
    "- Monitor model performance and retrain regularly",
    sep="\n",
)

print("\n=== END OF ANALYSIS ===")
