# Convolutional Autoencoder Training Pipeline

This notebook implements a complete pipeline for training a convolutional autoencoder on weather data using Bayesian optimization for hyperparameter tuning.

## Step 1: Import Required Libraries

In [None]:
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

# Import our custom modules
from bayesian_tuner import MLModel
from data_processing import process_epw_to_parquet_with_dask

# Report the TensorFlow build in use and how many GPU devices it can see.
print("TensorFlow version:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

## Step 2: Generate Weather Data (if it does not already exist)

**Note:** Update the paths below to match the location of your EPW files and the desired output path.

In [None]:
# Build the weather parquet file from raw EPW files, but only when it is
# not already on disk, so re-running the notebook skips the conversion.
parquet_file_path = "weather_data.parquet"

if not os.path.exists(parquet_file_path):
    print("Weather data parquet file not found. Generating from EPW files...")

    # Update this path to match your system
    epw_directory = "path/to/your/epw/files"  # UPDATE THIS PATH

    # Write to the same path that is checked above and loaded in the next
    # step. Deriving it from parquet_file_path (instead of repeating the
    # literal) keeps the existence check, the writer and the reader in sync
    # when the file name is changed.
    output_path = parquet_file_path

    # Generate the parquet file using the project's data processing function
    process_epw_to_parquet_with_dask(epw_directory, output_path)
    print(f"Generated {parquet_file_path}")
else:
    print(f"Found existing parquet file: {parquet_file_path}")

## Step 3: Load and Explore the Weather Data

In [None]:
# Load the weather data produced in the previous step and print a quick
# overview: size, columns, locations, a sample of rows, dtypes and nulls.
print("Loading weather data from parquet file...")
df = pd.read_parquet(parquet_file_path)

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Unique locations: {df['location'].nunique()}")
print(f"Location names: {df['location'].unique()}")

# Display first few rows
print("\nFirst 5 rows:")
print(df.head())

# Check data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
print("\nData info:")
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

## Step 4: Preprocess the Data

In [None]:
# Feature columns: every numeric column of the frame, minus 'timestamp'
# in case it happens to be stored as a number.
numeric_columns = [
    column
    for column in df.select_dtypes(include=[np.number]).columns.tolist()
    if column != 'timestamp'
]

print(f"Selected numeric features ({len(numeric_columns)}): {numeric_columns}")

# Build one fixed-length block of rows per location.
hours_per_year = 8760  # 365 days * 24 hours
locations = df['location'].unique()
location_data = []

for location in locations:
    rows = df.loc[df['location'] == location, numeric_columns]
    shortfall = hours_per_year - len(rows)
    if shortfall <= 0:
        # Enough data: keep only the first year's worth of rows.
        rows = rows.iloc[:hours_per_year]
    else:
        print(f"Warning: {location} has only {len(rows)} hours, padding to 8760")
        # Too short: repeat the last available row until the block is full.
        filler = pd.concat([rows.iloc[-1:]] * shortfall, ignore_index=True)
        rows = pd.concat([rows, filler], ignore_index=True)

    location_data.append(rows.values)

# Combine the per-location blocks into one array of shape
# (num_locations, 8760, num_features).
weather_data = np.asarray(location_data)
print(f"Stacked weather data shape: {weather_data.shape}")

# Scale every feature with MinMaxScaler.
# NOTE(review): the scaler is fit on all locations before the train/test
# split performed later in the notebook, so test data influences the
# scaling statistics - confirm this is acceptable.
print("Normalizing data with MinMaxScaler...")
scaler = MinMaxScaler()

# The scaler works on 2-D input, so flatten to (num_locations * 8760,
# num_features), scale, then restore the 3-D layout.
feature_count = weather_data.shape[-1]
flat_samples = weather_data.reshape(-1, feature_count)
weather_data_normalized = scaler.fit_transform(flat_samples).reshape(weather_data.shape)

print(f"Normalized data shape: {weather_data_normalized.shape}")
print(f"Data range after normalization: [{weather_data_normalized.min():.3f}, {weather_data_normalized.max():.3f}]")

## Step 5: Data Shape for Conv1D Compatibility (Updated for ResNet)

In [None]:
# The normalized array is used as-is, with no extra channel axis appended:
# it is already laid out as (num_locations, 8760, num_features).
weather_data_conv = weather_data_normalized

# Per-sample shape handed to the model builder: (8760, num_features)
input_shape = weather_data_conv.shape[1:]

print(f"Data shape for Conv1D ResNet: {weather_data_conv.shape}")
print(f"Input shape for autoencoder: {input_shape}")

## Step 6: Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the samples for testing. The split is shuffled with a
# fixed seed so that it is reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_test = train_test_split(weather_data_conv, **split_options)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

## Step 7: Setup Model Configuration

In [None]:
# Output locations and names for the tuning run and the trained model.
tuner_directory = "autoencoder_tuner"
project_name = "weather_autoencoder_optimization"
model_path = "trained_models"
model_name = "weather_autoencoder.h5"

# Make sure both output directories exist before anything writes to them.
for directory in (tuner_directory, model_path):
    os.makedirs(directory, exist_ok=True)

# Echo the configuration so it is recorded in the notebook output.
print(f"Tuner directory: {tuner_directory}")
print(f"Project name: {project_name}")
print(f"Model path: {model_path}")
print(f"Model name: {model_name}")
print(f"Input shape: {input_shape}")

## Step 8: Initialize MLModel and Start Bayesian Optimization

In [None]:
# Collect the constructor arguments first, then create the project's
# MLModel wrapper configured for the autoencoder model type.
ml_model_options = {
    "tuner_directory": tuner_directory,
    "project_name": project_name,
    "path_model": model_path,
    "model_name": model_name,
    "input_shape": input_shape,
    "model_type": 'Autoencoder',
}
ml_model = MLModel(**ml_model_options)

print("MLModel initialized successfully for autoencoder training")
print(f"Model type: {ml_model.model_type}")
print(f"Input shape: {ml_model.input_shape}")

## Step 9: Run Hyperparameter Tuning

In [None]:
# Run the hyperparameter search through the MLModel wrapper.
# Expect this cell to be slow; the original note says the run time depends
# on the tuner's max_trials setting.
tuning_epochs = 50  # Reduced for faster tuning, increase if needed

print("Starting Bayesian optimization for autoencoder hyperparameters...")
print("This process may take several hours depending on the number of trials.")

# An autoencoder learns to reproduce its input, so the training data is
# passed as both the input and the target.
ml_model.tune_model(train_x=X_train, train_y=X_train, epochs=tuning_epochs)

print("Hyperparameter tuning completed!")

## Step 10: Get Best Model and Evaluate Performance

In [None]:
# Ask the MLModel wrapper for the best model found during tuning.
print("Retrieving best autoencoder model...")

# Inputs double as targets for an autoencoder, for both the training and
# the validation data.
# NOTE(review): the held-out test split is passed as validation data here
# and is evaluated again afterwards - confirm that reuse is intended.
best_model_data = {
    "train_x": X_train,
    "train_y": X_train,
    "val_x": X_test,
    "val_y": X_test,
}
best_model = ml_model.get_best_model(**best_model_data)

print("Best model retrieved and trained successfully!")

## Step 11: Evaluate Reconstruction Performance

In [None]:
# Evaluate the autoencoder's reconstruction performance on the test set:
# compute per-sample MSE/MAE, then aggregate them into overall metrics.
print("Evaluating reconstruction performance on test set...")

# Get reconstructions from the trained model
X_test_reconstructed = best_model.predict(X_test)

# Fail loudly on a shape mismatch: NumPy broadcasting could otherwise turn
# it into silently wrong error metrics.
if X_test_reconstructed.shape != X_test.shape:
    raise ValueError(
        f"Reconstruction shape {X_test_reconstructed.shape} does not match "
        f"input shape {X_test.shape}"
    )

# Average over every axis except the first (sample) axis. The data is 3-D
# (num_locations, 8760, num_features), so the previously hard-coded
# axis=(1, 2, 3) was out of bounds; deriving the axes from ndim works for
# any rank.
reduce_axes = tuple(range(1, X_test.ndim))
reconstruction_error = X_test - X_test_reconstructed
mse_per_sample = np.mean(np.square(reconstruction_error), axis=reduce_axes)
mae_per_sample = np.mean(np.abs(reconstruction_error), axis=reduce_axes)

# Overall metrics
overall_mse = np.mean(mse_per_sample)
overall_mae = np.mean(mae_per_sample)
overall_rmse = np.sqrt(overall_mse)

print("\nReconstruction Performance Metrics:")
print(f"Mean Squared Error (MSE): {overall_mse:.6f}")
print(f"Root Mean Squared Error (RMSE): {overall_rmse:.6f}")
print(f"Mean Absolute Error (MAE): {overall_mae:.6f}")

print("\nPer-sample statistics:")
print(f"MSE - Min: {np.min(mse_per_sample):.6f}, Max: {np.max(mse_per_sample):.6f}, Std: {np.std(mse_per_sample):.6f}")
print(f"MAE - Min: {np.min(mae_per_sample):.6f}, Max: {np.max(mae_per_sample):.6f}, Std: {np.std(mae_per_sample):.6f}")

## Step 12: Visualize Results

In [None]:
# Plot the training history if the wrapper can provide it. This is
# best-effort, so a failure is reported rather than raised. Catching
# Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
# and the message now includes the reason for the failure.
try:
    ml_model.plot_loss()
except Exception as exc:
    print(f"Could not plot training history: {exc}")

# Distribution of the per-sample reconstruction errors, one panel per metric.
fig, error_axes = plt.subplots(1, 2, figsize=(12, 4))
error_panels = (
    (mse_per_sample, 'MSE', 'blue'),
    (mae_per_sample, 'MAE', 'green'),
)
for ax, (errors, metric, color) in zip(error_axes, error_panels):
    ax.hist(errors, bins=30, alpha=0.7, color=color)
    ax.set_xlabel(f'{metric} per Sample')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {metric} per Sample')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Visualize original vs reconstructed data for a sample
sample_idx = 0
feature_idx = 0  # Choose which feature to visualize

# The data is 3-D (sample, hour, feature) with no trailing channel axis, so
# one feature's series is selected with three indices. The previous
# four-index form raised IndexError.
original_series = X_test[sample_idx, :, feature_idx]
reconstructed_series = X_test_reconstructed[sample_idx, :, feature_idx]

fig, (ax_original, ax_reconstructed, ax_comparison) = plt.subplots(1, 3, figsize=(15, 5))

# Original data
ax_original.plot(original_series)
ax_original.set_title(f'Original Data (Sample {sample_idx}, Feature {feature_idx})')

# Reconstructed data
ax_reconstructed.plot(reconstructed_series)
ax_reconstructed.set_title(f'Reconstructed Data (Sample {sample_idx}, Feature {feature_idx})')

# Comparison
ax_comparison.plot(original_series, label='Original', alpha=0.8)
ax_comparison.plot(reconstructed_series, label='Reconstructed', alpha=0.8)
ax_comparison.set_title(f'Comparison (Sample {sample_idx}, Feature {feature_idx})')
ax_comparison.legend()

# Axis labels and grid are identical on all three panels.
for ax in (ax_original, ax_reconstructed, ax_comparison):
    ax.set_xlabel('Time (hours)')
    ax.set_ylabel('Normalized Value')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f"\nSample {sample_idx} reconstruction MSE: {mse_per_sample[sample_idx]:.6f}")
print(f"Sample {sample_idx} reconstruction MAE: {mae_per_sample[sample_idx]:.6f}")

## Step 13: Save Results and Model Information

In [None]:
# Persist the evaluation metrics and the fitted scaler next to the model.
# json and joblib are imported in the notebook's import cell at the top,
# so this cell no longer carries its own mid-notebook imports.


def _summary_stats(values):
    """Return min/max/mean/std of ``values`` as plain Python floats."""
    return {
        'min': float(np.min(values)),
        'max': float(np.max(values)),
        'mean': float(np.mean(values)),
        'std': float(np.std(values)),
    }


# Everything is converted to built-in types so that json can serialize it.
results = {
    'overall_mse': float(overall_mse),
    'overall_rmse': float(overall_rmse),
    'overall_mae': float(overall_mae),
    'mse_per_sample_stats': _summary_stats(mse_per_sample),
    'mae_per_sample_stats': _summary_stats(mae_per_sample),
    'model_info': {
        # JSON has no tuple type; make the list conversion explicit.
        'input_shape': list(input_shape),
        'num_locations': len(locations),
        'num_features': len(numeric_columns),
        'train_samples': len(X_train),
        'test_samples': len(X_test)
    }
}

# Save results to JSON file (explicit encoding keeps the output independent
# of the platform default)
results_file = os.path.join(model_path, 'autoencoder_results.json')
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to: {results_file}")

# Save the scaler for future use
scaler_file = os.path.join(model_path, 'weather_data_scaler.pkl')
joblib.dump(scaler, scaler_file)
print(f"Data scaler saved to: {scaler_file}")

print("\n=== TRAINING COMPLETE ===")
# NOTE(review): this cell does not save the model itself; the path below is
# presumably where MLModel wrote it - confirm against bayesian_tuner.
print(f"Best autoencoder model saved to: {os.path.join(model_path, model_name)}")
print(f"Overall reconstruction MSE: {overall_mse:.6f}")
print(f"Overall reconstruction MAE: {overall_mae:.6f}")