# AIDM Preprocessing Pipeline

This notebook implements the data preprocessing pipeline for the AIDM (Anomaly and Intrusion Detection Model) system.

## Pipeline Steps:
1. Load raw data from the digital twin dataset
2. Align and resample multiple modalities to a common time base
3. Handle missing data
4. Extract engineered features
5. Create LSTM sequences
6. Split data temporally
7. Fit scalers and normalize
8. Save processed data for model training

In [None]:
# Import required libraries
import logging
import os
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

# Make the project's src/ directory importable before loading local modules
sys.path.append('../src')

# Import our custom modules
from data_loader import load_sample_data, DigitalTwinDataLoader
from preprocess import DataPreprocessor, load_config

# Setup plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Libraries imported successfully")

## 1. Load Configuration and Data

In [None]:
# Read the pipeline configuration and echo the settings used by this notebook
config = load_config('../config.yaml')

print(
    "Configuration loaded:",
    f"  Dataset path: {config['data']['dataset_path']}",
    f"  Small data mode: {config['compute']['small_data_mode']}",
    f"  Base sampling rate: {config['preprocessing']['base_sampling_rate']}s",
    f"  Sequence window: {config['preprocessing']['sequence_window']}",
    sep="\n",
)

In [None]:
# Resolve the dataset location from the configuration
dataset_path = config['data']['dataset_path']
print(f"Loading data from: {dataset_path}")

# Load all modalities through the project loader; synthetic_fallback is enabled
# (its exact behaviour lives in data_loader - presumably it generates data when
# the dataset is unavailable; confirm against load_sample_data)
data = load_sample_data(
    dataset_path,
    synthetic_fallback=True,
    small_data_mode=config['compute']['small_data_mode'],
)

# One summary entry per modality: shape, covered index range, first five column names
print("\nLoaded data modalities:")
for modality, df in data.items():
    column_preview = list(df.columns[:5])
    more_marker = '...' if len(df.columns) > 5 else ''
    print(f"  {modality}: {df.shape} - {df.index[0]} to {df.index[-1]}")
    print(f"    Columns: {column_preview}{more_marker}")

## 2. Data Exploration and Visualization

In [None]:
# Plot sample data for each modality: one stacked panel per modality, drawing
# at most the first five columns of each frame.
if not data:
    # plt.subplots() rejects a zero-row grid, so skip plotting instead of crashing
    print("No data modalities loaded - nothing to plot")
else:
    # squeeze=False always returns a 2-D axes array, so the single-modality case
    # needs no special handling; the only column gives one axes per modality.
    fig, axes = plt.subplots(len(data), 1, figsize=(15, 4*len(data)), squeeze=False)
    axes = axes[:, 0]

    for i, (modality, df) in enumerate(data.items()):
        ax = axes[i]

        # Slicing past the end is safe, so frames with fewer than 5 columns work too
        cols_to_plot = df.columns[:5]
        for col in cols_to_plot:
            ax.plot(df.index, df[col], label=col, alpha=0.7)

        ax.set_title(f'{modality.capitalize()} Data')
        ax.set_xlabel('Time')
        ax.set_ylabel('Value')
        # Legend is anchored outside the plotting area, to the right of the axes
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Per-modality quality report: size, covered time span, missing cells, dtypes,
# plus coarse aggregate statistics over the numeric columns
print("Data Quality Assessment:")
print("=" * 50)

for modality, df in data.items():
    print(f"\n{modality.upper()}:")
    print(f"  Shape: {df.shape}")
    print(f"  Time range: {df.index[0]} to {df.index[-1]}")
    print(f"  Duration: {df.index[-1] - df.index[0]}")
    print(f"  Missing values: {df.isna().sum().sum()}")
    print(f"  Data types: {df.dtypes.value_counts().to_dict()}")

    # Aggregate statistics only make sense when numeric columns exist
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        continue

    numeric_frame = df[numeric_cols]
    print(f"  Numeric columns: {len(numeric_cols)}")
    # Both figures are averages of the per-column statistic, not pooled values
    print(f"  Mean values: {numeric_frame.mean().mean():.4f}")
    print(f"  Std deviation: {numeric_frame.std().mean():.4f}")

## 3. Initialize Preprocessor and Run Pipeline

In [None]:
# Build the preprocessor from the loaded configuration
preprocessor = DataPreprocessor(config)
print("✅ Preprocessor initialized")

# Destination for the processed artifacts, as given by the configuration
output_path = config['data']['output_path']
print("Output path:", output_path)

### 3.1 Data Alignment and Resampling

In [None]:
# Step 1: merge all modalities into a single frame via the preprocessor
print("Step 1: Aligning and resampling data...")
aligned_df = preprocessor.align_and_resample(data)

# Report the result: shape, covered index range and the first ten column names
print(
    f"Aligned data shape: {aligned_df.shape}",
    f"Time range: {aligned_df.index[0]} to {aligned_df.index[-1]}",
    f"Columns: {list(aligned_df.columns[:10])}{'...' if len(aligned_df.columns) > 10 else ''}",
    sep="\n",
)

### 3.2 Missing Data Handling

In [None]:
# Step 2: count missing cells, run the preprocessor's missing-data handling,
# then count again to show the effect
print("Step 2: Handling missing data...")
print(f"Missing values before: {aligned_df.isna().sum().sum()}")

clean_df = preprocessor.handle_missing_data(aligned_df)

print(
    f"Missing values after: {clean_df.isna().sum().sum()}",
    f"Data shape after cleaning: {clean_df.shape}",
    sep="\n",
)

### 3.3 Feature Engineering

In [None]:
# Step 3: derive engineered features from the cleaned frame
print("Step 3: Extracting engineered features...")
print(f"Input features: {clean_df.shape[1]}")

feature_df = preprocessor.extract_features(clean_df)

# Feature names are read from the preprocessor after extraction; show the first ten
print(
    f"Output features: {feature_df.shape[1]}",
    f"Feature names: {preprocessor.feature_names[:10]}{'...' if len(preprocessor.feature_names) > 10 else ''}",
    sep="\n",
)

# Summary statistics of the engineered features
print("\nFeature statistics:")
print(feature_df.describe())

### 3.4 Sequence Creation

In [None]:
# Step 4: build the sequence inputs, the targets and the tabular inputs
print("Step 4: Creating LSTM sequences...")
X_sequences, y_next, X_tabular = preprocessor.create_sequences(feature_df)

# Report the shape of each returned array
for label, arr in (("Sequence", X_sequences), ("Target", y_next), ("Tabular", X_tabular)):
    print(f"{label} data shape: {arr.shape}")
print(f"Sequence window: {config['preprocessing']['sequence_window']} timesteps")

### 3.5 Data Splitting

In [None]:
# Step 5: partition the arrays into train / val / test via the preprocessor
print("Step 5: Splitting data temporally...")
splits = preprocessor.split_data(X_tabular, X_sequences, y_next)

# Sample count per partition, read from the tabular array of each split
print("Data splits:")
for split_name in ('train', 'val', 'test'):
    X_key = 'X_' + split_name
    n_samples = splits[X_key].shape[0]
    print(f"  {split_name}: {n_samples} samples")

### 3.6 Scaling and Normalization

In [None]:
# Step 6: fit the scalers on the training partition only
print("Step 6: Fitting scalers on training data...")
preprocessor.fit_scalers(splits['X_train'], splits['X_seq_train'])

# Step 7: apply the fitted scalers to every partition.
# NOTE: the scaled arrays overwrite the entries of `splits`, so re-running this
# cell would fit and transform already-scaled data. Re-run the splitting cell
# first if this cell has to be executed again.
print("Step 7: Scaling all data...")
for split_name in ('train', 'val', 'test'):
    X_tab_key, X_seq_key = f'X_{split_name}', f'X_seq_{split_name}'
    scaled_tab, scaled_seq = preprocessor.transform_data(
        splits[X_tab_key], splits[X_seq_key]
    )
    splits[X_tab_key] = scaled_tab
    splits[X_seq_key] = scaled_seq

print("✅ Data scaling completed")

# Global mean / std of the scaled training arrays as a quick sanity check
print("\nScaled data statistics (training set):")
for label, key in (("Tabular", 'X_train'), ("Sequence", 'X_seq_train')):
    print(f"{label} features - Mean: {splits[key].mean():.4f}, Std: {splits[key].std():.4f}")

## 4. Save Processed Data

In [None]:
# Step 8: persist the splits through the preprocessor
print("Step 8: Saving processed data...")
preprocessor.save_processed_data(splits, output_path)

# List whatever now sits in <output_path>/processed together with its size
processed_dir = Path(output_path) / "processed"
saved_files = [entry for entry in processed_dir.glob("*")]

print("\nSaved files:")
for saved_file in saved_files:
    size_mb = saved_file.stat().st_size / (1024 * 1024)  # bytes -> MiB
    print(f"  {saved_file.name}: {size_mb:.2f} MB")

print(f"\n✅ All processed data saved to: {processed_dir}")

## 5. Data Visualization and Summary

In [None]:
# Visualize processed data distributions in a 2x2 summary figure:
#   [0, 0] histograms of the first few scaled training features
#   [0, 1] number of samples per split
#   [1, 0] the first training sequence, first few features
#   [1, 1] correlation matrix of the first few training features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Feature distributions (at most the first five features)
n_features_to_plot = min(5, splits['X_train'].shape[1])
for i in range(n_features_to_plot):
    axes[0, 0].hist(splits['X_train'][:, i], alpha=0.7, bins=30, label=f'Feature {i}')
axes[0, 0].set_title('Training Feature Distributions')
axes[0, 0].set_xlabel('Scaled Value')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()

# Plot 2: Data split sizes
split_sizes = [len(splits['X_train']), len(splits['X_val']), len(splits['X_test'])]
split_labels = ['Train', 'Validation', 'Test']
axes[0, 1].bar(split_labels, split_sizes, color=['blue', 'orange', 'green'], alpha=0.7)
axes[0, 1].set_title('Data Split Sizes')
axes[0, 1].set_ylabel('Number of Samples')

# Plot 3: Sequence example (first sequence, at most the first three features)
seq_example = splits['X_seq_train'][0, :, :min(3, splits['X_seq_train'].shape[2])]
for i in range(seq_example.shape[1]):
    axes[1, 0].plot(seq_example[:, i], label=f'Feature {i}', marker='o')
axes[1, 0].set_title('Example LSTM Sequence')
axes[1, 0].set_xlabel('Timestep')
axes[1, 0].set_ylabel('Scaled Value')
axes[1, 0].legend()

# Plot 4: Feature correlation heatmap (at most the first ten features).
# np.corrcoef collapses to a 0-d scalar when only one feature is present, which
# imshow cannot draw; np.atleast_2d keeps that case as a 1x1 matrix.
n_corr_features = min(10, splits['X_train'].shape[1])
corr_matrix = np.atleast_2d(np.corrcoef(splits['X_train'][:, :n_corr_features].T))
im = axes[1, 1].imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
axes[1, 1].set_title('Feature Correlation Matrix')
plt.colorbar(im, ax=axes[1, 1])

plt.tight_layout()
plt.show()

## 6. Pipeline Summary

In [None]:
# Final summary: banner, data overview, split proportions, output artifacts
# and suggested follow-up steps
banner = "=" * 60

print("\n" + banner)
print("PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY")
print(banner)

print("\n📊 DATA SUMMARY:")
print(f"  • Original modalities: {len(data)}")
print(f"  • Final features: {len(preprocessor.feature_names)}")
print(f"  • Sequence window: {config['preprocessing']['sequence_window']} timesteps")
print(f"  • Total samples: {len(X_tabular)}")

# Each split is reported as a count and as a share of all tabular samples
print("\n🔄 DATA SPLITS:")
for label, key in (("Training", 'X_train'), ("Validation", 'X_val'), ("Test", 'X_test')):
    print(f"  • {label}: {len(splits[key])} samples ({len(splits[key])/len(X_tabular)*100:.1f}%)")

# File names listed here are the ones this notebook expects in processed_dir
print("\n💾 OUTPUT FILES:")
for label, filename in (
    ("Processed data", 'processed_data.npz'),
    ("Scalers", 'scalers.pkl'),
    ("Config", 'preprocessing_config.yaml'),
):
    print(f"  • {label}: {processed_dir / filename}")

print("\n🎯 NEXT STEPS:")
for next_step in (
    "  1. Run notebook_2_attacks.ipynb to generate adversarial data",
    "  2. Run notebook_3_train_evaluate.ipynb to train AIDM models",
    "  3. Use CLI: python src/attacks.py --type fdia",
    "  4. Use CLI: python src/train_ids.py --model autoencoder",
):
    print(next_step)

print("\n" + banner)

## 7. Load and Verify Saved Data

In [None]:
# Verify we can load the saved data: reopen the artifacts written by the save
# step and list what they contain.
print("Verifying saved data can be loaded...")

# SECURITY NOTE: np.load(..., allow_pickle=True) and joblib.load both unpickle
# arbitrary Python objects and can therefore execute code. Only use them on
# files produced by this pipeline, never on untrusted input.
# allow_pickle=True is presumably needed for the 'feature_names' entry - confirm
# against save_processed_data.
# The archive is opened in a `with` block so its file handle is closed once the
# summary is printed; `loaded_data` must not be read after this cell.
with np.load(processed_dir / "processed_data.npz", allow_pickle=True) as loaded_data:
    loaded_scalers = joblib.load(processed_dir / "scalers.pkl")

    print("\nLoaded data keys:")
    for key in loaded_data.files:
        if key != 'feature_names':
            print(f"  {key}: {loaded_data[key].shape}")
        else:
            # Reported as a count rather than a shape
            print(f"  {key}: {len(loaded_data[key])} features")

# loaded_scalers is iterated as a mapping of scaler name -> scaler object
print("\nLoaded scalers:")
for scaler_name, scaler in loaded_scalers.items():
    print(f"  {scaler_name}: {type(scaler).__name__}")

print("\n✅ Data verification completed successfully!")
print("The preprocessing pipeline is ready for the next stage.")