<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install statsmodels --upgrade
!pip install -U lingam
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import math
import plotly.graph_objects as go
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Input
from keras.models import Model
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
import ast
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import TimeSeriesSplit
from numpy import arange
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.stattools import grangercausalitytests
from importlib.metadata import version
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
import pickle

# ================================================================================
# DATA LOADING AND PREPROCESSING
# ================================================================================

# Load MetroPT-3 dataset.
# CSV column 1 is parsed as dates and used as the index, which is then named
# 'datetime'. This replaces the dict form of `parse_dates`
# (`{'datetime': [1]}`), which is deprecated since pandas 2.2.
df = pd.read_csv(
    r'/content/drive/MyDrive/PHD/metropt+3+dataset (1).zip (Unzipped Files)/MetroPT3(AirCompressor).csv',
    parse_dates=[1],
    index_col=1,
)
df.index.name = 'datetime'

# Handle missing values: '?' placeholders become NaN so they can be filled later.
df = df.replace('?', np.nan)
# Per-column NaN count; the result is only displayed when this is the last
# expression of a notebook cell, otherwise it is discarded.
df.isnull().sum()

def fill_missing(values):
    """Fill NaN cells in place with the value found ``one_day`` rows earlier.

    Parameters
    ----------
    values : numpy.ndarray
        2-D float array (rows = samples, columns = variables). It is modified
        in place.

    Returns
    -------
    None

    Notes
    -----
    Rows are processed top to bottom, so a value filled in an earlier row can
    itself be the source for a later row.
    """
    # NOTE(review): named "one day" but it is simply 144 rows; confirm this
    # matches the sampling interval of the data being filled.
    one_day = 24 * 6
    # Iterate over the array that was passed in, not the module-level `df`.
    n_rows = values.shape[0]
    for row in range(n_rows):
        missing = np.isnan(values[row])
        if not missing.any():
            continue
        source_row = row - one_day
        if source_row < -n_rows:
            # No such row even with negative indexing: leave the NaN in place
            # instead of raising IndexError.
            continue
        # For row < one_day the index is negative and wraps around to the end
        # of the array (same behaviour as the original implementation).
        values[row, missing] = values[source_row, missing]

df = df.astype('float32')

# Fill missing values on an explicit, writable copy and rebuild the frame.
# Writing through `df.values` only works when pandas happens to hand out a
# writable view of its internal storage, which is not guaranteed.
filled_values = df.to_numpy(copy=True)
fill_missing(filled_values)
df = pd.DataFrame(filled_values, index=df.index, columns=df.columns)

# Resample to hourly means to reduce computation; hours without data are
# back-filled from the next available hour (`bfill` replaces the deprecated
# `DataFrame.backfill`).
daily_df = df.resample('1H').mean().bfill()

# Keep everything from 2020-04-01 onwards (there is no upper bound).
# Filtering on the index avoids adding and then dropping a helper 'datetime'
# column; `.copy()` makes the result an independent frame for the column
# assignments below.
daily_df = daily_df.loc[daily_df.index >= '2020-04-01'].copy()

# Drop the first column unconditionally.
# NOTE(review): presumably the ID/row-number column from the CSV - confirm.
daily_df = daily_df.drop(columns=daily_df.columns[0])

# Scaling the values: each column gets its own MinMaxScaler mapping it to
# [-1, 1]. `whole_series` is the same object as `daily_df`, so the scaled
# values overwrite the unscaled ones in both names.
whole_series = daily_df

scalers = {}
for column in daily_df.columns:
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled = scaler.fit_transform(whole_series[column].to_numpy().reshape(-1, 1))
    # Keep the fitted scaler so the transformation can be inverted later.
    scalers['scaler_' + column] = scaler
    whole_series[column] = scaled.ravel()

# ================================================================================
# GRANGER CAUSALITY FUNCTIONS
# ================================================================================

def granger_causation_matrix(data, variables, max_lag=25, test='ssr_chi2test', verbose=False):
    """Build a matrix of minimum Granger-causality p-values for all variable pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing one column per entry of ``variables``.
    variables : iterable
        Column names to test against each other.
    max_lag : int
        Lags 1..max_lag are tested; the smallest p-value is kept.
    test : str
        Key of the statsmodels test statistic whose p-value is read.
    verbose : bool
        Forwarded to ``grangercausalitytests`` and also used to report pairs
        whose test failed.

    Returns
    -------
    pandas.DataFrame
        Square frame with columns ``<var>_x`` and rows ``<var>_y``. The entry
        at row ``r_y`` / column ``c_x`` is the minimum p-value of the test run
        on ``data[[r, c]]`` (statsmodels tests whether the second column
        Granger-causes the first). Pairs whose test raised get 1.0.
    """
    variables = list(variables)
    size = len(variables)
    matrix = pd.DataFrame(np.zeros((size, size)), columns=variables, index=variables)

    for cause in variables:
        for effect in variables:
            try:
                test_result = grangercausalitytests(
                    data[[effect, cause]], maxlag=max_lag, verbose=verbose
                )
                p_values = [
                    round(test_result[lag][0][test][1], 4)
                    for lag in range(1, max_lag + 1)
                ]
                matrix.loc[effect, cause] = np.min(p_values)
            except Exception as exc:
                # Best effort: a failed test is treated as "no causality".
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                if verbose:
                    print(f"Granger test failed for {effect} <- {cause}: {exc}")
                matrix.loc[effect, cause] = 1.0

    matrix.columns = [f'{var}_x' for var in variables]
    matrix.index = [f'{var}_y' for var in variables]
    return matrix

def auto_feature_selection(data, max_lag='adaptive', significance_level=0.05, min_features=3, removal_strategy='moderate'):
    """Automatically remove features with weak Granger-causal relationships.

    Each feature is scored by the number of entries below
    ``significance_level`` in its column plus its row of the Granger p-value
    matrix; features are then kept according to ``removal_strategy``.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per candidate feature.
    max_lag : int or 'adaptive'
        Maximum lag for the Granger tests. 'adaptive' derives it from the
        number of rows (never less than 1).
    significance_level : float
        p-value threshold below which a relationship counts as significant.
    min_features : int
        If the strategy keeps fewer features than this, the top-scoring
        ``min_features`` are kept instead.
    removal_strategy : {'conservative', 'moderate', 'aggressive'}
        'conservative' keeps features with any significant relationship,
        'moderate' keeps those scoring at least half the number of columns,
        'aggressive' keeps the top 75% of columns by score (at least
        ``min_features``).

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the kept columns, ordered by descending score.

    Raises
    ------
    ValueError
        If ``removal_strategy`` is not one of the supported names.
    """
    valid_strategies = ('conservative', 'moderate', 'aggressive')
    # Fail fast: previously an unknown strategy surfaced as UnboundLocalError
    # only after the expensive Granger analysis had already run.
    if removal_strategy not in valid_strategies:
        raise ValueError(
            f"removal_strategy must be one of {valid_strategies}, got {removal_strategy!r}"
        )

    print("Starting automated feature selection...")
    print(f"Input data shape: {data.shape}")

    # Adaptive max_lag based on data length; clamp to >= 1 because a lag of 0
    # makes every Granger test fail.
    if max_lag == 'adaptive':
        data_length = len(data)
        if data_length > 2000:
            max_lag = min(24, data_length // 100)
        elif data_length > 1000:
            max_lag = min(12, data_length // 80)
        else:
            max_lag = min(6, data_length // 50)
        max_lag = max(1, max_lag)

    # Granger causality analysis
    gc_matrix = granger_causation_matrix(data, data.columns, max_lag)
    significant = gc_matrix < significance_level

    # Score = significant entries in the feature's column + in its row.
    feature_scores = {
        col: int(significant[f'{col}_x'].sum() + significant.loc[f'{col}_y'].sum())
        for col in data.columns
    }

    # Highest score first; ties keep the original column order (stable sort).
    sorted_features = sorted(feature_scores.items(), key=lambda item: item[1], reverse=True)

    n_columns = len(data.columns)
    if removal_strategy == 'conservative':
        keep_features = [name for name, score in sorted_features if score > 0]
    elif removal_strategy == 'moderate':
        threshold = n_columns * 0.5
        keep_features = [name for name, score in sorted_features if score >= threshold]
    else:  # 'aggressive'
        keep_count = max(min_features, int(n_columns * 0.75))
        keep_features = [name for name, _ in sorted_features[:keep_count]]

    # Never return fewer than `min_features` (when that many columns exist).
    if len(keep_features) < min_features:
        keep_features = [name for name, _ in sorted_features[:min_features]]

    print(f"Selected {len(keep_features)} features out of {n_columns}")
    return data[keep_features]

# ================================================================================
# DUAL LABELING SYSTEM
# ================================================================================

def create_detection_labels(df):
    """
    Build per-row detection labels: 1 while a failure is in progress, else 0.
    For Agent 3: Current anomaly detection task

    `df` must be indexed by timestamps; a row is labelled 1 when its index
    falls inside any of the failure periods below (both bounds inclusive).
    Returns an integer array with one entry per row of `df`.
    """
    print("Creating DETECTION labels (current failures)...")

    # (start, end, name) for each failure period
    failure_periods = (
        ('2020-04-18 00:00:00', '2020-04-18 23:59:59', 'Air_leak_1'),
        ('2020-05-29 23:30:00', '2020-05-30 06:00:00', 'Air_leak_2'),
        ('2020-06-05 10:00:00', '2020-06-07 14:30:00', 'Air_leak_3'),
        ('2020-07-15 14:30:00', '2020-07-15 19:00:00', 'Air_leak_4'),
    )

    timestamps = df.index
    labels = np.zeros(len(df))

    for period_start, period_end, name in failure_periods:
        in_period = (timestamps >= period_start) & (timestamps <= period_end)
        hits = np.flatnonzero(in_period)
        if hits.size == 0:
            # Period not covered by this frame: nothing to mark or report.
            continue
        labels[hits] = 1
        print(f"  {name}: {len(hits)} points")

    failure_count = np.sum(labels)
    total = len(labels)
    print(f"Detection labels: {failure_count}/{total} ({failure_count/total*100:.2f}%)")
    return labels.astype(int)

def create_prediction_labels(df, horizons=(1, 3, 5, 12)):
    """
    Prediction labels - mark periods that should trigger early warnings
    For Agent 3: Early warning prediction task

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps.
    horizons : iterable of int
        Warning horizons in hours. The default is an immutable tuple (the
        previous list default was a shared mutable object).

    Returns
    -------
    dict
        Maps ``'H<h>'`` to an integer array with one entry per row of ``df``.
        A row is 1 when its timestamp lies in the ``h`` hours immediately
        before the start of a failure period (start itself excluded).
    """
    print("Creating PREDICTION labels (early warnings)...")

    # (start, end, name) for each failure period; only the start is used here.
    failure_periods = [
        ('2020-04-18 00:00:00', '2020-04-18 23:59:59', 'Air_leak_1'),
        ('2020-05-29 23:30:00', '2020-05-30 06:00:00', 'Air_leak_2'),
        ('2020-06-05 10:00:00', '2020-06-07 14:30:00', 'Air_leak_3'),
        ('2020-07-15 14:30:00', '2020-07-15 19:00:00', 'Air_leak_4')
    ]
    failure_starts = [pd.to_datetime(start) for start, _end, _name in failure_periods]

    prediction_labels = {}

    for H in horizons:
        print(f"  Creating H{H} (warn {H}h before failure)...")
        labels = np.zeros(len(df))

        for failure_start in failure_starts:
            warning_start = failure_start - pd.Timedelta(hours=H)
            # Half-open interval [warning_start, failure_start)
            warning_mask = (df.index >= warning_start) & (df.index < failure_start)
            labels[warning_mask] = 1

        warning_count = np.sum(labels)
        prediction_labels[f'H{H}'] = labels.astype(int)
        print(f"    H{H}: {warning_count}/{len(labels)} ({warning_count/len(labels)*100:.2f}%)")

    return prediction_labels

# ================================================================================
# VAR WINDOW SELECTION
# ================================================================================

def smart_var_selection(series, max_lag=None, early_stopping=True, patience=5):
    """Smart VAR model selection for Agent 2 (dynamic windowing).

    Fits VAR models of increasing order and returns the position (1-based) of
    the fit with the lowest AIC.

    Parameters
    ----------
    series : 2-D array-like
        Multivariate series, rows = time steps, columns = variables.
    max_lag : int, optional
        Number of candidate fits. Defaults to
        ``len(series) // (n_columns * 10)``.
    early_stopping : bool
        Stop once ``patience`` consecutive candidates brought no improvement.
    patience : int
        Number of non-improving candidates tolerated before stopping.

    Returns
    -------
    int
        ``lag + 1`` for the best candidate ``lag``; 1 when nothing could be
        fitted or any setup step raised.
    """
    if max_lag is None:
        max_lag = len(series) // (series.shape[1] * 10)

    best_aic = float('inf')
    best_lag = 1

    try:
        # Drop near-constant columns before fitting; fall back to the full
        # series when fewer than two columns survive.
        selector = VarianceThreshold(0.00002)
        series_filtered = selector.fit_transform(series)
        if series_filtered.shape[1] < 2:
            series_filtered = series

        model = VAR(series_filtered)
        no_improvement_count = 0

        # NOTE(review): candidates are fitted with lag = 0..max_lag-1 but
        # reported as lag + 1. Kept as in the original; confirm whether the
        # one-step offset is intended.
        for lag in range(max_lag):
            try:
                current_aic = model.fit(lag).aic
            except Exception:
                # A failed fit can never win and counts as "no improvement".
                current_aic = float('inf')

            if current_aic < best_aic:
                best_aic = current_aic
                best_lag = lag + 1
                no_improvement_count = 0
            else:
                # Also reached for NaN AICs, which never compare as smaller.
                no_improvement_count += 1

            # Checked after every candidate, including failed fits (the
            # original skipped this check whenever the fit raised).
            if early_stopping and no_improvement_count >= patience:
                break

        return best_lag

    except Exception:
        # Deliberate best effort: any setup failure falls back to lag 1.
        return 1

def extract_windows(array, window_size, labels=None, task_type='detection'):
    """Slice `array` into overlapping windows of `window_size` rows (stride 1).

    Returns the stacked windows; when `labels` is given, also returns one
    label per window, taken at the window's last row. `task_type` is accepted
    but not used.
    """
    n_windows = len(array) - window_size + 1

    # Row i holds the indices i, i+1, ..., i+window_size-1
    first_rows = np.arange(n_windows)[:, np.newaxis]
    offsets = np.arange(window_size)[np.newaxis, :]
    sub_windows = (first_rows + offsets).astype(int)

    windows = array[sub_windows]
    if labels is None:
        return windows

    # Use label at end of window for prediction tasks
    last_rows = sub_windows[:, -1]
    return windows, labels[last_rows]

def auto_filter_windows(window_data, sequences, labels=None, percentile_range=(10, 90)):
    """No-op placeholder: VAR-quality window filtering was removed for the agentic system.

    All arguments are ignored and the function always returns None.
    """
    return None

# ================================================================================
# MAIN AGENTIC PIPELINE
# ================================================================================

print("="*60)
print("AGENTIC METROPT-3 PIPELINE")
print("="*60)

# Step 1: Feature selection
print("\n1. FEATURE SELECTION")
whole_series_auto = auto_feature_selection(data=whole_series, removal_strategy='moderate')
n_features = whole_series_auto.shape[1]

# Step 2: Create dual labels for agentic tasks
print(f"\n2. DUAL LABELING FOR AGENTIC TASKS")
detection_labels = create_detection_labels(whole_series_auto)
prediction_labels = create_prediction_labels(whole_series_auto, [1, 3, 5, 12])


# Step 3 (printed as "4."): extract fixed-length windows.
# No VAR selection or window filtering is applied in this cell.
print(f"\n4. WINDOWING AND VAR SELECTION")
K = 50  # Window size for Agent 2
print(f"Creating {K}-length sequences from {len(whole_series_auto)} timesteps with {n_features} features...")

series_values = whole_series_auto.values

# For this pipeline, use prediction task (Agent 3 early warning)
windows, window_labels = extract_windows(
    series_values,
    K,
    prediction_labels['H5'],  # 5-hour early warning
    'prediction'
)

# Create separate datasets for each task.
# (The original calls had a stray second comma after K, which is a SyntaxError.)
detection_windows, detection_window_labels = extract_windows(series_values, K, detection_labels)
h1_windows, h1_window_labels = extract_windows(series_values, K, prediction_labels['H1'])
h3_windows, h3_window_labels = extract_windows(series_values, K, prediction_labels['H3'])
h5_windows, h5_window_labels = extract_windows(series_values, K, prediction_labels['H5'])
h12_windows, h12_window_labels = extract_windows(series_values, K, prediction_labels['H12'])

print(f"Created {len(windows):,} windows")
print(f"Positive windows: {np.sum(window_labels):,} ({np.mean(window_labels)*100:.2f}%)")

# `lofo_splits` is not built anywhere in this cell. Reuse it when an earlier
# notebook cell already defined it; otherwise fall back to an empty list so
# the save step and the summary below do not fail with NameError.
# TODO: restore the code that builds the LOFO splits.
try:
    lofo_splits
except NameError:
    lofo_splits = []

# Step 5: Save results for agentic modules
print(f"\n5. SAVING RESULTS FOR AGENTIC MODULES")
output_dir = r'/content/drive/MyDrive/PHD/2025/AGENTIC_METROPT/'
# Make sure the target folder exists before the first write.
os.makedirs(output_dir, exist_ok=True)

# Save processed data (no filtering applied)
np.save(f'{output_dir}agentic_sequences.npy', windows)
np.save(f'{output_dir}agentic_labels.npy', window_labels)

# Save dual labels (per-timestep, not per-window)
np.save(f'{output_dir}detection_labels.npy', detection_labels)
for horizon_key, pred_labels in prediction_labels.items():
    np.save(f'{output_dir}prediction_labels_{horizon_key}.npy', pred_labels)

# Save other components
with open(f'{output_dir}lofo_splits.pkl', 'wb') as f:
    pickle.dump(lofo_splits, f)

with open(f'{output_dir}scalers.pkl', 'wb') as f:
    pickle.dump(scalers, f)

# Save metadata
metadata = {
    'n_features': n_features,
    'window_size': K,
    'n_sequences': len(windows),
    'feature_names': list(whole_series_auto.columns),
    'date_range': (whole_series_auto.index.min(), whole_series_auto.index.max())
}

with open(f'{output_dir}metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("✅ Saved agentic data:")
print(f"   Sequences: {windows.shape}")
print(f"   Labels: {window_labels.shape}")
print(f"   Detection labels: {detection_labels.shape}")
print(f"   Prediction labels: {len(prediction_labels)} horizons")
print(f"   LOFO splits: {len(lofo_splits)} folds")

# ================================================================================
# CREATE AGENTIC DATA CONTAINER
# ================================================================================

class AgenticMetroPTData:
    """Data container for agentic MetroPT system.

    The constructor takes no arguments: it captures the module-level pipeline
    results (`windows`, `window_labels`, `detection_labels`,
    `prediction_labels`, `lofo_splits`, `scalers`, `metadata`,
    `whole_series_auto`) as they exist when the instance is created.
    """
    def __init__(self):
        self.sequences = windows
        self.labels = window_labels
        self.detection_labels = detection_labels
        self.prediction_labels = prediction_labels
        self.lofo_splits = lofo_splits
        self.scalers = scalers
        self.metadata = metadata
        self.raw_data = whole_series_auto

    def get_sensor_data(self, sensor_idx):
        """Get individual sensor data for Agent 1.

        Returns the column of `raw_data` at position `sensor_idx` together
        with its name from `metadata['feature_names']`.
        """
        sensor_name = self.metadata['feature_names'][sensor_idx]
        sensor_data = self.raw_data.iloc[:, sensor_idx]
        return sensor_data, sensor_name

    def _labels_per_window(self, point_labels):
        """Return one label per sequence, taken at each window's last timestep.

        The stored detection/prediction labels have one entry per timestep,
        while `sequences` holds stride-1 windows, so there are
        `window_size - 1` more labels than sequences. Dropping that leading
        surplus pairs window i with the label of its final row. Labels that
        are not longer than `sequences` are returned as they are.
        """
        point_labels = np.asarray(point_labels)
        surplus = len(point_labels) - len(self.sequences)
        if surplus <= 0:
            return point_labels
        return point_labels[surplus:]

    def get_task_data(self, task_type='prediction', horizon='H5'):
        """Get (sequences, labels) for a specific agentic task.

        `task_type == 'detection'` selects the detection labels; any other
        value selects the prediction labels for `horizon`. The returned labels
        are aligned to the sequences (same length).

        Raises KeyError if `horizon` is not a known prediction horizon.
        """
        if task_type == 'detection':
            return self.sequences, self._labels_per_window(self.detection_labels)
        if horizon not in self.prediction_labels:
            raise KeyError(
                f"Unknown horizon {horizon!r}; available: {sorted(self.prediction_labels)}"
            )
        return self.sequences, self._labels_per_window(self.prediction_labels[horizon])

    def get_fold_data(self, fold_idx):
        """Get train/test data for LOFO fold (the stored split object, as is)."""
        split = self.lofo_splits[fold_idx]
        return split

    def summary(self):
        """Print summary for agents"""
        print("\n" + "="*60)
        print("AGENTIC METROPT DATA SUMMARY")
        print("="*60)
        print(f"📊 Sequences: {self.sequences.shape}")
        print(f"🏷️  Labels: Detection + {len(self.prediction_labels)} prediction horizons")
        print(f"📋 CV Folds: {len(self.lofo_splits)} LOFO splits")
        print(f"🔧 Features: {self.metadata['n_features']} sensors")
        print(f"🪟 Window size: {self.metadata['window_size']}")
        print(f"📅 Date range: {self.metadata['date_range']}")
        print("\n🤖 READY FOR AGENTIC IMPLEMENTATION!")

# Instantiate the container from the pipeline results and print its summary
agentic_data = AgenticMetroPTData()
agentic_data.summary()

# Completion banner followed by the modules that can consume this data
banner = "=" * 60
print("\n" + banner)
print("MODULE 1: DUAL LABELING COMPLETE! ✅")
print(banner)
for ready_line in (
    "🚀 Ready for Module 2: Individual Time Series Agent",
    "🚀 Ready for Module 3: Dynamic Window Agent",
    "🚀 Ready for Module 4: Fusion Agent",
    "🚀 Ready for Module 5: Orchestrator",
):
    print(ready_line)

# Short usage reminder
for usage_line in (
    "\n💡 USAGE:",
    "   agentic_data = AgenticMetroPTData()",
    "   sequences, labels = agentic_data.get_task_data('prediction', 'H5')",
    "   sensor_data, name = agentic_data.get_sensor_data(0)",
):
    print(usage_line)


4. WINDOWING AND VAR SELECTION
Creating 50-length sequences from 3676 timesteps with 13 features...


In [None]:
# Mount Google Drive at /content/drive. The data-loading and saving code in
# this notebook reads from and writes to paths under /content/drive/MyDrive,
# so this cell has to be run before that code.
from google.colab import drive
drive.mount('/content/drive')

# New Section