<!-- # Univariate analysis plots. These plots help reveal trends over time, 
# distribution characteristics, relationships between variables, and comparisons across different facilities.
# Univariate seasonal decomposition plots below
# These plots break down each time series into four components:

# Observed - The original time series data
# Trend - The long-term progression of the series
# Seasonal - The repeating patterns or cycles
# Residual - The random variation in the data
# For both decomposed variables (bottom sensor level and H2S outlet temperature), you can observe:

# Clear trend components showing the overall direction of the measurements
# Seasonal patterns with regular cycles (using a 30-day period for analysis)
# Residual components showing the random fluctuations after accounting for trend and seasonality
# The seasonal decomposition helps identify recurring patterns that might be related to operational cycles, environmental factors, or maintenance schedules.

# Flow rate demonstrates clear differences, with consistent flow during operational periods and minimal flow during non-operational times.
# These trends highlight how facility performance metrics vary significantly between operational and non-operational states, which could be valuable for predictive maintenance, efficiency optimization, and operational planning.
# Now let's analyze key metrics based on operational status -->

In [None]:
import boto3
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from statsmodels.tsa.seasonal import seasonal_decompose

# Logging: send INFO-and-above records through the root logger, each line
# prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

# Matplotlib defaults shared by every figure in this file.
# Fonts:
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'DejaVu Sans',
})
# Axes titles and labels:
plt.rcParams.update({
    'axes.titlesize': 20,
    'axes.titleweight': 'semibold',
    'axes.labelsize': 16,
})
# Tick labels and legend:
plt.rcParams.update({
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 12,
})


def read_csv_from_s3(s3_path, aws_profile=None):
    """Download a CSV object from S3 and parse it into a DataFrame.

    Args:
        s3_path: Object location written as ``s3://bucket/key``. The
            ``s3://`` prefix is optional; ``bucket/key`` is accepted too.
        aws_profile: Optional named AWS profile used to build the boto3
            session. When falsy, ``boto3.client`` is used directly.

    Returns:
        The ``pandas.DataFrame`` parsed from the object's body.

    Raises:
        ValueError: If ``s3_path`` does not contain both a bucket and a key.
    """
    # Remove only a *leading* scheme marker; str.replace would also strip an
    # "s3://" that happens to appear later inside the key.
    scheme = "s3://"
    location = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    bucket, _, key = location.partition("/")
    if not bucket or not key:
        raise ValueError(
            f"Expected an S3 path like 's3://bucket/key', got: {s3_path!r}"
        )

    logger.info("Reading data from S3: %s", s3_path)
    if aws_profile:
        s3 = boto3.Session(profile_name=aws_profile).client('s3')
    else:
        s3 = boto3.client('s3')
    response = s3.get_object(Bucket=bucket, Key=key)

    # Decode as UTF-8 (tolerating a byte-order mark) rather than ASCII: every
    # ASCII file decodes to the same text, while files containing characters
    # such as "°" no longer raise UnicodeDecodeError.
    text = response['Body'].read().decode('utf-8-sig')
    return pd.read_csv(StringIO(text))


def preprocess_timestamp(df):
    """Convert the ``timestamp`` column to datetimes, if the column exists.

    The conversion is written back into ``df`` itself. Values that cannot be
    parsed become ``NaT``. A frame without a ``timestamp`` column is returned
    untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    if 'timestamp' not in df.columns:
        return df

    parsed = pd.to_datetime(df['timestamp'], errors='coerce')
    df['timestamp'] = parsed
    return df


def plot_time_series(df, time_col, value_col, title, ylabel):
    """Show a line chart of ``value_col`` plotted against ``time_col``.

    Rows missing either column are discarded and the remainder is ordered by
    ``time_col`` before drawing. The figure is displayed immediately and
    nothing is returned.
    """
    ordered = df.dropna(subset=[time_col, value_col]).sort_values(time_col)

    fig, ax = plt.subplots(figsize=(9, 6))
    ax.plot(ordered[time_col], ordered[value_col], color='#766CDB')
    ax.set_title(title, pad=15)
    ax.set_xlabel('Timestamp (Date)', labelpad=10)
    ax.set_ylabel(ylabel, labelpad=10)
    ax.grid(color='#E0E0E0')
    # Slant the date labels so neighbouring ticks do not overlap.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


def plot_histogram(df, col, title, xlabel):
    """Show a histogram with a KDE overlay for the non-null values of ``col``.

    The y-axis is always labelled ``Frequency (count)``. The figure is
    displayed immediately and nothing is returned.
    """
    values = df[col].dropna()

    fig, ax = plt.subplots(figsize=(9, 6))
    sns.histplot(values, kde=True, color='#DA847C', ax=ax)
    ax.set_title(title, pad=15)
    ax.set_xlabel(xlabel, labelpad=10)
    ax.set_ylabel('Frequency (count)', labelpad=10)
    ax.grid(color='#E0E0E0')
    fig.tight_layout()
    plt.show()


def plot_correlation_heatmap(df):
    """Show an annotated correlation heatmap of the first ten numeric columns.

    Nothing is drawn when fewer than two numeric columns are available.
    """
    selected = df.select_dtypes(include='number').columns[:10]
    if len(selected) <= 1:
        # A correlation matrix needs at least two columns to be meaningful.
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    corr_matrix = df[selected].corr()
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title("Correlation Heatmap of Key Numeric Features", pad=15)
    fig.tight_layout()
    plt.show()


def plot_boxplot(df, x_col, y_col, title, ylabel):
    """Show box plots of ``y_col`` grouped by the categories in ``x_col``.

    The x-axis label is the title-cased column name ``x_col``; only
    horizontal grid lines are drawn. The figure is displayed immediately and
    nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(9, 6))
    sns.boxplot(x=x_col, y=y_col, data=df, ax=ax)
    ax.set_title(title, pad=15)
    ax.set_xlabel(x_col.title(), labelpad=10)
    ax.set_ylabel(ylabel, labelpad=10)
    ax.grid(color='#E0E0E0', axis='y')
    fig.tight_layout()
    plt.show()


def plot_scatter(df, x_col, y_col, title, xlabel, ylabel):
    """Show a semi-transparent scatter plot of ``y_col`` against ``x_col``.

    The figure is displayed immediately and nothing is returned.
    """
    x_values = df[x_col]
    y_values = df[y_col]

    fig, ax = plt.subplots(figsize=(9, 6))
    ax.scatter(x_values, y_values, alpha=0.6, color='#D9CC8B')
    ax.set_title(title, pad=15)
    ax.set_xlabel(xlabel, labelpad=10)
    ax.set_ylabel(ylabel, labelpad=10)
    ax.grid(color='#E0E0E0')
    fig.tight_layout()
    plt.show()


def seasonal_decompose_plot(series, title, ylabel, freq):
    """Plot the additive seasonal decomposition of a time-indexed series.

    The series is averaged per calendar day and gaps are linearly
    interpolated. The result is decomposed and drawn as four stacked panels
    sharing one x-axis: observed, trend, seasonal and residual.

    Args:
        series: Series whose index supports ``resample('D')`` (a
            datetime-like index).
        title: Prefix used in each panel title.
        ylabel: Y-axis label applied to every panel.
        freq: Seasonal period, in daily steps, passed to
            ``seasonal_decompose`` as ``period``.

    Returns:
        None. If fewer than ``2 * freq`` daily points are available, a
        warning is logged and no figure is drawn.
    """
    daily = series.resample('D').mean().interpolate()
    # interpolate() fills forward only, so days before the first valid
    # reading stay NaN; seasonal_decompose rejects missing values, so drop them.
    daily = daily.dropna()

    # The decomposition needs two complete cycles. Skip instead of raising so
    # one short series does not abort the remaining plots.
    required = 2 * freq
    if len(daily) < required:
        logger.warning(
            "Skipping seasonal decomposition for %s: %d daily points available, %d required",
            title, len(daily), required,
        )
        return

    result = seasonal_decompose(daily, model='additive', period=freq)

    panels = (
        ('observed', '#766CDB'),
        ('trend', '#DA847C'),
        ('seasonal', '#7CD9A5'),
        ('resid', '#D9CC8B'),
    )
    fig, axes = plt.subplots(4, 1, figsize=(9, 8), sharex=True)
    for ax, (component, color) in zip(axes, panels):
        getattr(result, component).plot(ax=ax, color=color)
        ax.set_title(f'{title} - {component.capitalize()}', pad=15, fontsize=16)
        ax.set_ylabel(ylabel)
        ax.grid(color='#E0E0E0')
        # Keep grid lines behind the plotted data.
        ax.set_axisbelow(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def plot_operational_trends(df, key_metrics, units):
    """Plot the daily mean of each metric, split by operational status.

    The ``operational`` column is normalised to booleans from the textual
    forms ``1``/``1.0``/``true`` and ``0``/``0.0``/``false`` (case and
    surrounding whitespace ignored). Rows whose flag cannot be interpreted,
    or whose ``timestamp`` is missing, are dropped. One figure per metric is
    shown.

    Note:
        ``df`` is modified in place: the rows above are removed and the
        columns ``operational_clean``, ``operational_status`` and ``date``
        are added. ``summarize_by_operational_status`` groups on the
        ``operational_status`` column created here.

    Args:
        df: DataFrame with ``operational`` and ``timestamp`` columns plus
            every column named in ``key_metrics``.
        key_metrics: Column names to average per day and plot.
        units: Mapping from metric name to the unit shown in the y-axis
            label; metrics without an entry get an empty unit.
    """
    flag_lookup = {
        '1': True, '1.0': True, 'true': True,
        '0': False, '0.0': False, 'false': False,
    }
    # Colours are keyed by status. With a plain list, the colour given to a
    # status depends on which status appears first in the data.
    status_colors = {'Operational': '#766CDB', 'Non-Operational': '#DA847C'}

    normalised = df['operational'].astype(str).str.strip().str.lower()
    df['operational_clean'] = normalised.map(flag_lookup)
    df.dropna(subset=['operational_clean', 'timestamp'], inplace=True)
    df['operational_status'] = df['operational_clean'].map(
        {True: 'Operational', False: 'Non-Operational'}
    )
    df['date'] = pd.to_datetime(df['timestamp']).dt.date
    daily = df.groupby(['date', 'operational_status'])[key_metrics].mean().reset_index()

    for metric in key_metrics:
        pretty_name = metric.replace("_", " ").title()
        plt.figure(figsize=(9, 6))
        sns.lineplot(data=daily, x='date', y=metric, hue='operational_status', style='operational_status',
                     markers=True, dashes=False, palette=status_colors)
        plt.title(f'Daily Average {pretty_name} by Operational Status', pad=15)
        plt.xlabel('Date', labelpad=10)
        plt.ylabel(f'{pretty_name} ({units.get(metric, "")})', labelpad=10)
        plt.xticks(rotation=45)
        plt.grid(color='#E0E0E0')
        plt.tight_layout()
        plt.show()


def summarize_by_operational_status(df, key_metrics, units):
    """Log mean, std, min and max of each metric per operational status.

    ``df`` must already carry an ``operational_status`` column. Statistics
    are rounded to two decimals and written through the module logger at INFO
    level; nothing is returned.

    Args:
        df: DataFrame containing ``operational_status`` and every column in
            ``key_metrics``.
        key_metrics: Column names to summarise.
        units: Mapping from metric name to the unit shown next to it;
            metrics without an entry get an empty unit.
    """
    stat_labels = (('Mean', 'mean'), ('Std', 'std'), ('Min', 'min'), ('Max', 'max'))
    stat_names = [name for _, name in stat_labels]

    summary = df.groupby('operational_status')[key_metrics].agg(stat_names).round(2)

    logger.info("Summary statistics by operational status:")
    for status in summary.index:
        logger.info("%s:", status)
        for metric in key_metrics:
            # Look up each cell on its own so a value keeps its column's dtype
            # (e.g. an integer min is not upcast to float).
            stats_text = ", ".join(
                f"{label}: {summary.loc[status, (metric, name)]}"
                for label, name in stat_labels
            )
            heading = metric.replace('_', ' ').title()
            unit = units.get(metric, '')
            logger.info("  %s (%s): %s", heading, unit, stats_text)


if __name__ == "__main__":
    # Script entry point: load the processed facility data once, draw the
    # exploratory plots, run the seasonal decompositions, then compare key
    # metrics between operational and non-operational periods.
    s3_path = 's3://sagemaker-us-east-2-426179662034/canvas/processed/facility_merge_processed.csv'
    df = read_csv_from_s3(s3_path)
    df = preprocess_timestamp(df)
    has_timestamp = 'timestamp' in df.columns

    if 'bge_sump_bottomsensor_level' in df.columns:
        # Each plot also needs its x-axis column; skip it when that is absent.
        if has_timestamp:
            plot_time_series(df, 'timestamp', 'bge_sump_bottomsensor_level', 'Time Series of Bottom Sensor Level', 'Bottom Sensor Level (feet)')
        if 'facility' in df.columns:
            plot_boxplot(df, 'facility', 'bge_sump_bottomsensor_level', 'Bottom Sensor Level by Facility', 'Bottom Sensor Level (feet)')

    # NOTE(review): disabled plot kept as found. Its last argument is the
    # y-axis label, so 'Timestamp (Date)' looks misplaced - fix before re-enabling.
    # if 'bge_h2soutlet_temp' in df.columns:
    #     plot_time_series(df, 'timestamp', 'bge_h2soutlet_temp', 'Time Series of H2S Outlet Temperature (°F)', 
    #                      'Timestamp (Date)')

    if 'bge_blowervfd_speed' in df.columns:
        plot_histogram(df, 'bge_blowervfd_speed', 'Distribution of Blower VFD Speed', 'Blower VFD Speed (Hz)')

    if 'bge_blowersuction_pressure' in df.columns and 'bge_blowersuction_temp' in df.columns:
        plot_scatter(df, 'bge_blowersuction_pressure', 'bge_blowersuction_temp',
                     'Blower Suction Pressure vs Temperature', 'Pressure (psi)', 'Temperature (°F)')

    plot_correlation_heatmap(df)

    # Seasonal decomposition
    if has_timestamp:
        # Index a separate frame by timestamp so df keeps its 'timestamp'
        # column for the operational analysis below.
        df_indexed = df.set_index('timestamp')
        if 'bge_sump_bottomsensor_level' in df_indexed.columns:
            seasonal_decompose_plot(df_indexed['bge_sump_bottomsensor_level'], 'Bottom Sensor Level', 'Feet', 30)
        if 'bge_h2soutlet_temp' in df_indexed.columns:
            seasonal_decompose_plot(df_indexed['bge_h2soutlet_temp'], 'H2S Outlet Temperature', '°F', 30)

    # Operational trend and summary
    key_metrics = [
        'bge_sump_bottomsensor_level', 'bge_h2soutlet_temp', 'bge_blowervfd_speed',
        'methane_percent', 'flow_rate'
    ]
    # NOTE(review): blower VFD speed is labelled 'Hz' in the histogram above
    # but '%' here - confirm the real unit and make the two agree.
    units = {
        'bge_sump_bottomsensor_level': 'Feet',
        'bge_h2soutlet_temp': '°F',
        'bge_blowervfd_speed': '%',
        'methane_percent': '%',
        'flow_rate': 'SCFM'
    }

    required_columns = ['operational', 'timestamp']
    missing_columns = [col for col in required_columns if col not in df.columns]
    available_metrics = [metric for metric in key_metrics if metric in df.columns]
    skipped_metrics = [metric for metric in key_metrics if metric not in df.columns]

    if skipped_metrics:
        logger.warning("Metrics not present in the data and skipped: %s", ", ".join(skipped_metrics))

    if missing_columns or not available_metrics:
        logger.warning(
            "Skipping operational analysis (missing columns: %s; usable metrics: %d)",
            ", ".join(missing_columns) or "none", len(available_metrics),
        )
    else:
        # Reuse the frame already in memory instead of downloading the same
        # object again. Work on a copy because plot_operational_trends drops
        # rows and adds columns in place; summarize_by_operational_status
        # then reads the 'operational_status' column it created.
        df_operational = df.copy()
        plot_operational_trends(df_operational, available_metrics, units)
        summarize_by_operational_status(df_operational, available_metrics, units)

    ###################### Summary: ########################
    # Univariate analysis plots. These plots help reveal trends over time, 
    # distribution characteristics, relationships between variables, and comparisons across different facilities.
    # Univariate seasonal decomposition plots below
    # These plots break down each time series into four components:

    # Observed - The original time series data
    # Trend - The long-term progression of the series
    # Seasonal - The repeating patterns or cycles
    # Residual - The random variation in the data
    # For both decomposed variables (bottom sensor level and H2S outlet temperature), you can observe:

    # Clear trend components showing the overall direction of the measurements
    # Seasonal patterns with regular cycles (using a 30-day period for analysis)
    # Residual components showing the random fluctuations after accounting for trend and seasonality
    # The seasonal decomposition helps identify recurring patterns that might be related to operational cycles, environmental factors, or maintenance schedules.
    
    # Flow rate demonstrates clear differences, with consistent flow during operational periods and minimal flow during non-operational times.
    # These trends highlight how facility performance metrics vary significantly between operational and non-operational states, which could be valuable for predictive maintenance, efficiency optimization, and operational planning.
    # Now let's analyze key metrics based on operational status