In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

# OPTIONAL: Load data from local or online source
# If running on Colab and using Google Drive, uncomment below:

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/your_project_folder')

# Otherwise, place your data in the same folder or set the correct relative path

In [None]:
# Load the three per-method input tables in one pass. They are consumed later
# as the 'Standard PCA', 'Kernel PCA' and 'Wavelet PCA' inputs respectively.
# Paths are relative to the current working directory.
pca_recent, kpca_recent, wpca_recent = map(
    pd.read_csv,
    (
        "data/pca_normalized_recent.csv",
        "data/kpca_normalized_recent.csv",
        "data/wpca_normalized_recent.csv",
    ),
)

In [None]:
# Load volcano data.
# The path is relative to the working directory, like the other data files in
# this notebook, so the cell runs on a fresh kernel without extra setup.
volcano_df = pd.read_csv("data/volcano-events.csv")

# Keep only large eruptions (VEI >= 5) from year 1 onward. Rows with a missing
# VEI or Year fail both comparisons and are dropped, which keeps the integer
# cast below safe.
significant_eruptions_recent = volcano_df[
    (volcano_df['VEI'] >= 5) & (volcano_df['Year'] >= 1)
].copy()

# Integer year key used to look eruptions up in the PCA tables' 'Year_Bin' column.
significant_eruptions_recent['Year_Bin'] = significant_eruptions_recent['Year'].astype(int)

eruption_years_recent = significant_eruptions_recent['Year_Bin'].values
num_events_recent = len(eruption_years_recent)

significant_eruptions_recent.tail()

In [None]:
def perform_sea(pca_real_df, eruption_years, pc1_col, pre_years=3, post_years=6):
    """Compute a superposed epoch analysis (SEA) composite for one table.

    For every eruption year, a window running from ``pre_years`` years before
    to ``post_years`` years after the event is extracted, centered on the mean
    of its pre-eruption years, and the centered windows are averaged.

    Parameters
    ----------
    pca_real_df : pandas.DataFrame
        Table containing a ``'Year_Bin'`` column and the ``pc1_col`` column
        (``run_sea_analysis`` passes the rows of a single realization).
    eruption_years : iterable
        Event years, in the same units as ``'Year_Bin'``.
    pc1_col : str
        Name of the column holding the series to composite.
    pre_years : int, optional
        Number of years before the event included in the window and used as
        the centering baseline. Must be at least 1. Default 3.
    post_years : int, optional
        Number of years after the event included in the window. Must be
        non-negative. Default 6.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``pre_years + post_years + 1`` with the mean
        centered response. All-NaN when no eruption has a complete window.

    Raises
    ------
    ValueError
        If ``pre_years < 1`` or ``post_years < 0``.
    """
    if pre_years < 1:
        raise ValueError("pre_years must be at least 1 to define a baseline")
    if post_years < 0:
        raise ValueError("post_years must be non-negative")

    values = pca_real_df.set_index('Year_Bin')[pc1_col]
    window_len = pre_years + post_years + 1
    composite_segments = []
    for y in eruption_years:
        window = np.arange(y - pre_years, y + post_years + 1)
        try:
            segment = values.loc[window].values
        except KeyError:
            # Part of the window lies outside the available years: skip the event.
            continue
        if np.isnan(segment).any():
            # An incomplete window can never yield a NaN-free centered segment;
            # skipping here also avoids nanmean's "empty slice" warning.
            continue
        centered = segment - np.nanmean(segment[:pre_years])  # center on pre-eruption mean
        if np.isnan(centered).any():
            # Still possible without NaN inputs (e.g. inf - inf).
            continue
        composite_segments.append(centered)
    if not composite_segments:
        return np.full(window_len, np.nan)
    return np.mean(composite_segments, axis=0)

def bootstrap_sea(pca_real_df, eruption_years, pc1_col, n=1000, rng=None,
                  pre_years=3, post_years=6):
    """Build a bootstrap null distribution of SEA composites.

    Each iteration draws ``len(eruption_years)`` pseudo-event years (without
    replacement) from the non-eruption years whose full window fits inside the
    record, centers every window on its pre-event mean and averages them.
    An iteration is discarded entirely if any of its windows is missing years
    or contains NaN, so fewer than ``n`` composites may be returned.

    Parameters
    ----------
    pca_real_df : pandas.DataFrame
        Table containing a ``'Year_Bin'`` column and the ``pc1_col`` column.
    eruption_years : array-like
        Real event years; excluded from the pool of pseudo-event years.
    pc1_col : str
        Name of the column holding the series to composite.
    n : int, optional
        Number of bootstrap iterations to attempt. Default 1000.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When ``None`` the global ``np.random`` state is
        used, so ``np.random.seed`` still controls the draws.
    pre_years : int, optional
        Years before the pseudo-event in the window / baseline. Default 3.
    post_years : int, optional
        Years after the pseudo-event in the window. Default 6.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, pre_years + post_years + 1)`` with ``k <= n``.
        ``k`` is 0 (but the array is still 2-D) when no iteration succeeds or
        when there are no eruption years.

    Raises
    ------
    ValueError
        If there are fewer candidate years than eruption years.
    """
    values = pca_real_df.set_index('Year_Bin')[pc1_col]
    all_years = values.index.values
    n_events = len(eruption_years)
    offsets = np.arange(-pre_years, post_years + 1)

    if n_events == 0:
        # Nothing to composite: an empty null is more useful than NaN scalars.
        return np.empty((0, offsets.size))

    if all_years.size:
        # Bounds are computed once instead of once per candidate year.
        first_ok = all_years.min() + pre_years
        last_ok = all_years.max() - post_years
        eligible = (
            (all_years >= first_ok)
            & (all_years <= last_ok)
            & ~np.isin(all_years, eruption_years)
        )
        non_eruption_years = all_years[eligible]
    else:
        non_eruption_years = all_years

    if non_eruption_years.size < n_events:
        raise ValueError(
            f"Cannot draw {n_events} pseudo-event years without replacement "
            f"from only {non_eruption_years.size} candidate years"
        )

    chooser = np.random if rng is None else rng

    bootstrapped = []
    for _ in range(n):
        sampled_years = chooser.choice(non_eruption_years, size=n_events, replace=False)
        segments = []
        for y in sampled_years:
            try:
                segment = values.loc[y + offsets].values
            except KeyError:
                break
            if np.isnan(segment).any():
                # Incomplete window: drop the whole iteration (and avoid
                # nanmean's "empty slice" warning on an all-NaN baseline).
                break
            centered = segment - np.nanmean(segment[:pre_years])
            if np.isnan(centered).any():
                break
            segments.append(centered)
        # Only keep iterations in which every sampled year produced a window.
        if len(segments) == n_events:
            bootstrapped.append(np.mean(segments, axis=0))

    if not bootstrapped:
        # Keep the result 2-D so callers can concatenate / index columns.
        return np.empty((0, offsets.size))
    return np.array(bootstrapped)

def run_sea_analysis(pca_df, method_name, pc1_col, eruption_years, n_boot=5):
    """Run SEA and its bootstrap null for every realization of one method.

    Parameters
    ----------
    pca_df : pandas.DataFrame
        Table with a ``'Realization'`` column plus the columns required by
        ``perform_sea`` / ``bootstrap_sea`` (``'Year_Bin'`` and ``pc1_col``).
    method_name : str
        Label stored under ``'method'`` in the result.
    pc1_col : str
        Name of the column holding the series to composite.
    eruption_years : array-like
        Event years passed through to ``perform_sea`` and ``bootstrap_sea``.
    n_boot : int, optional
        Bootstrap iterations attempted per realization. Default 5. The null
        distribution pools the bootstraps of all realizations.

    Returns
    -------
    dict
        ``'method'``: the given label; ``'years'``: lags -3..6;
        ``'median'``: per-lag median of the realization composites;
        ``'boot_lower'`` / ``'boot_upper'``: 2.5th / 97.5th percentiles of the
        pooled bootstrap composites (NaN when no bootstrap succeeded);
        ``'all_real_sea'``: one composite per realization;
        ``'boot_all'``: the pooled bootstrap composites.
    """
    lags = np.arange(-3, 7)

    real_composites = []
    boot_chunks = []
    for r in pca_df['Realization'].unique():
        pca_real_df = pca_df[pca_df['Realization'] == r]
        real_composites.append(perform_sea(pca_real_df, eruption_years, pc1_col))
        boot_chunks.append(bootstrap_sea(pca_real_df, eruption_years, pc1_col, n=n_boot))
    all_real_sea = np.array(real_composites)

    # Median SEA response across all realizations
    sea_median = np.nanmedian(all_real_sea, axis=0)

    # Pool the bootstrap composites of every realization into one null.
    # Chunks with no surviving iteration are dropped so that an empty result
    # cannot break the concatenation.
    usable_chunks = [np.asarray(chunk) for chunk in boot_chunks if np.size(chunk) > 0]
    if usable_chunks:
        boot_all = np.concatenate(usable_chunks, axis=0)
        # Confidence intervals (used for shaded plot if needed)
        boot_lower = np.percentile(boot_all, 2.5, axis=0)
        boot_upper = np.percentile(boot_all, 97.5, axis=0)
    else:
        # No bootstrap sample available: report an undefined interval.
        boot_all = np.empty((0, lags.size))
        boot_lower = np.full(lags.size, np.nan)
        boot_upper = np.full(lags.size, np.nan)

    return {
        'method': method_name,
        'years': lags,
        'median': sea_median,
        'boot_lower': boot_lower,
        'boot_upper': boot_upper,
        'all_real_sea': all_real_sea,
        'boot_all': boot_all
    }

def compute_sea_significance_proportions(all_real_sea, boot_all):
    """Fraction of realizations falling outside the bootstrap 95% envelope.

    Parameters
    ----------
    all_real_sea : numpy.ndarray
        2-D array, one row per realization and one column per lag.
    boot_all : numpy.ndarray
        2-D array of bootstrap composites with at least as many columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(above, below)``: for every lag, the share of realizations above the
        97.5th and below the 2.5th bootstrap percentile. NaN values never
        count as exceedances but do count in the denominator.
    """
    realization_count, lag_count = all_real_sea.shape
    null_by_lag = boot_all[:, :lag_count]

    # Per-lag envelope of the null distribution, computed for all lags at once.
    upper_bounds = np.nanpercentile(null_by_lag, 97.5, axis=0)
    lower_bounds = np.nanpercentile(null_by_lag, 2.5, axis=0)

    above = (all_real_sea > upper_bounds).sum(axis=0) / realization_count
    below = (all_real_sea < lower_bounds).sum(axis=0) / realization_count
    return above, below

In [None]:
# Method label -> (input table, name of its PC1 column).
method_inputs_recent = {
    'Standard PCA': (pca_recent, 'PC1_norm'),
    'Kernel PCA': (kpca_recent, 'KPC1_norm'),
    'Wavelet PCA': (wpca_recent, 'WPC1_norm'),
}

# Run the SEA for each method, keeping the results in the order listed above.
sea_results_recent = []
for method, (df, pc1_col) in method_inputs_recent.items():
    sea_result = run_sea_analysis(df, method, pc1_col, eruption_years_recent)
    sea_results_recent.append(sea_result)

In [None]:
def plot_falster_style_sea_cleaned(sea_results, max_realizations_to_plot=100):
    """Plot SEA medians, the bootstrap envelope and null-exceedance bars.

    The left axis shows, per lag, overlaid bars with the proportion of
    realizations above (positive) or below (negative) the bootstrap 95%
    envelope for each method. The right axis shows the median composite of
    every method, plus the bootstrap envelope and the individual realization
    composites of the FIRST entry of ``sea_results`` only.

    Relies on the module-level ``compute_sea_significance_proportions`` and on
    the notebook's top-level ``numpy`` / ``pandas`` / ``matplotlib`` imports.

    Parameters
    ----------
    sea_results : sequence of dict
        Results as returned by ``run_sea_analysis``; each needs the keys
        ``'method'``, ``'years'``, ``'median'``, ``'boot_lower'``,
        ``'boot_upper'``, ``'all_real_sea'`` and ``'boot_all'``. Every
        ``'method'`` must be one of ``'Standard PCA'``, ``'Kernel PCA'`` or
        ``'Wavelet PCA'`` (a ``KeyError`` is raised otherwise).
    max_realizations_to_plot : int, optional
        Upper bound on the number of individual realization lines drawn; a
        random subset is used when there are more. Default 100.

    Raises
    ------
    ValueError
        If ``sea_results`` is empty.
    """
    if len(sea_results) == 0:
        raise ValueError("sea_results must contain at least one SEA result to plot")

    def stack_segments(df_year, year, column, sign):
        """Convert one year's per-method proportions into overlaid bar segments.

        Methods are visited from smallest to largest proportion; each one only
        contributes the part of its bar that extends beyond the previous ones.
        ``sign`` is +1 for bars drawn upward and -1 for bars drawn downward.
        """
        segments = []
        base = 0
        for _, row in df_year.sort_values(by=column, ascending=True).iterrows():
            contrib = max(0, row[column] - base)
            segments.append({'Year': year, 'Method': row['Method'],
                             'Value': sign * contrib, 'Base': sign * base})
            base += contrib
        return segments

    def gather_stacked_bar_data(results):
        """Return (positive, negative) bar-segment tables for all methods."""
        records = []
        for res in results:
            prop_pos, prop_neg = compute_sea_significance_proportions(
                res['all_real_sea'], res['boot_all'])
            for i, year in enumerate(res['years']):
                records.append({'Year': year, 'Method': res['method'],
                                'Positive': prop_pos[i], 'Negative': prop_neg[i]})
        df = pd.DataFrame(records)

        # Sort methods by contribution so that taller bars go last
        stacked_pos = []
        stacked_neg = []
        for year in sorted(df['Year'].unique()):
            dfy = df[df['Year'] == year]
            stacked_pos.extend(stack_segments(dfy, year, 'Positive', 1))
            stacked_neg.extend(stack_segments(dfy, year, 'Negative', -1))
        return pd.DataFrame(stacked_pos), pd.DataFrame(stacked_neg)

    df_pos, df_neg = gather_stacked_bar_data(sea_results)
    # Envelope and individual realization lines are taken from the first result.
    sea_ref = sea_results[0]
    years = sea_ref['years']

    method_colors = {
        'Standard PCA': '#000004FF',
        'Kernel PCA': '#5F187FFF',
        'Wavelet PCA': '#D3436EFF'
    }

    fig, ax1 = plt.subplots(figsize=(12, 7))

    # Draw stacked bars
    for method, color in method_colors.items():
        dfp = df_pos[df_pos['Method'] == method]
        dfn = df_neg[df_neg['Method'] == method]
        ax1.bar(dfp['Year'], dfp['Value'], bottom=dfp['Base'], color=color, width=0.8, zorder=1)
        ax1.bar(dfn['Year'], dfn['Value'], bottom=dfn['Base'], color=color, width=0.8, zorder=1)

    ax1.axhline(0, color='black', zorder=2)
    ax1.axvline(0, color='black', linestyle='--', zorder=2)
    ax1.set_ylabel('Proportion of Realizations Beyond Null', fontsize=12)
    ax1.set_xlabel('Years Relative to Eruption', fontsize=12)
    ax1.set_ylim(-1, 1)

    # Composite PC1 anomaly
    ax2 = ax1.twinx()
    # Put the bar axis on top of the twin axis; hiding its background patch
    # keeps the twin axis' artists visible underneath.
    ax1.set_zorder(ax2.get_zorder() + 1)
    ax1.patch.set_visible(False)
    ax2.fill_between(years, sea_ref['boot_lower'], sea_ref['boot_upper'], color='#8c9cb0', alpha=0.4, zorder=0)
    all_real = sea_ref['all_real_sea']
    if all_real.shape[0] > max_realizations_to_plot:
        # Too many realizations to draw legibly: plot a random subset.
        sample_idx = np.random.choice(all_real.shape[0], max_realizations_to_plot, replace=False)
        all_real = all_real[sample_idx, :]

    for line in all_real:
        ax2.plot(years, line, color='gray', alpha=0.8, linewidth=0.7, zorder=3)

    # Right-axis limits: span of the plotted realizations and envelope, padded by 10%.
    all_values = np.concatenate([all_real.flatten(),
                                 sea_ref['boot_lower'],
                                 sea_ref['boot_upper']])
    ci_min = np.nanmin(all_values)
    ci_max = np.nanmax(all_values)
    range_pad = 0.1 * (ci_max - ci_min)
    ax2.set_ylim(ci_min - range_pad, ci_max + range_pad)
    ax2.set_ylabel('Composite PC1 Anomaly', fontsize=12)

    median_handles = []
    median_labels = []

    for res in sea_results:
        method = res['method']
        color = method_colors[method]
        line, = ax2.plot(
            res['years'], res['median'],
            label=f'{method} Median',
            color=color, linestyle='--', linewidth=2, zorder=6
        )
        median_handles.append(line)
        median_labels.append(f'{method} Median')

    # Proxy artists so the bars and the envelope appear in the figure legend.
    bar_handles = [plt.Line2D([0], [0], color=color, lw=8) for color in method_colors.values()]
    bar_labels = list(method_colors.keys())

    ci_patch = plt.Rectangle((0, 0), 1, 1, color='#8c9cb0', alpha=0.4, label='95% CI')

    handles = bar_handles + median_handles + [ci_patch]
    labels = bar_labels + median_labels + ['95% CI']
    fig.suptitle("Volcanic Forcing on Indo-Pacific Hydroclimate: SEA Median Response and Null Exceedance Proportions", fontsize=14, y=0.97)
    fig.subplots_adjust(bottom=0.1)
    fig.legend(handles, labels, loc='lower center', ncol=3, frameon=False)

    # Reserve the bottom 10% of the figure for the legend.
    plt.tight_layout(rect=[0, 0.1, 1, 1])
    plt.show()

In [None]:
# Draw the combined SEA figure for the results collected in sea_results_recent.
plot_falster_style_sea_cleaned(sea_results_recent)