In [None]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.ticker import MultipleLocator
from scipy.stats import mannwhitneyu, pearsonr, spearmanr, gaussian_kde

# google.colab only exists inside a Colab runtime. Importing it unconditionally
# makes this cell fail everywhere else, even though the notebook reads its data
# from relative paths and mounting Drive is optional.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab: the optional Drive mount is unavailable

# OPTIONAL: Load data from local or online source
# If running on Colab and using Google Drive, uncomment below:

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/your_project_folder')

# Otherwise, place your data in the same folder or set the correct relative path

In [None]:
# Read the three "early" normalised PC1 tables; the unpacking order is
# standard PCA, kernel PCA, wavelet PCA.
pca_early, kpca_early, wpca_early = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/pca_normalized_early.csv",
        "data/kpca_normalized_early.csv",
        "data/wpca_normalized_early.csv",
    )
)

In [None]:
# ENSO reconstruction: whitespace-delimited text, lines starting with '#' skipped.
# Columns are renamed to the notebook's conventions and a z-scored copy
# (pandas sample standard deviation) is appended as ENSO_norm.
enso1_file_path = 'data/enso1_reconstruction.txt'
enso_df = (
    pd.read_csv(enso1_file_path, sep=r'\s+', comment='#')
    .rename(columns={'age': 'Year_Bin', 'ensoi': 'ENSO'})
    .assign(ENSO_norm=lambda d: (d['ENSO'] - d['ENSO'].mean()) / d['ENSO'].std())
)

In [None]:
# PDO reconstruction: same text layout as the ENSO file. Year_Bin is cast to
# int so it can be used as a merge key, then PDO is z-scored into PDO_norm.
pdo1_file_path = 'data/pdo1_reconstruction.txt'
pdo_df = (
    pd.read_csv(pdo1_file_path, sep=r'\s+', comment='#')
    .rename(columns={'age': 'Year_Bin', 'pdo': 'PDO'})
    .astype({'Year_Bin': int})
    .assign(PDO_norm=lambda d: (d['PDO'] - d['PDO'].mean()) / d['PDO'].std())
)

In [None]:
# IPO reconstruction. The three columns are relabelled positionally, so the
# file must contain exactly three columns in this order.
# NOTE(review): the file is named pdo2_* but is treated as IPO here - confirm.
ipo_file_path = 'data/pdo2_reconstruction.txt'
ipo_df = pd.read_csv(ipo_file_path, sep=r'\s+', comment='#')
ipo_df.columns = ['Year_Bin', 'IPO', 'Std_Dev']
ipo_df = ipo_df.astype({'Year_Bin': int}).assign(
    IPO_norm=lambda d: (d['IPO'] - d['IPO'].mean()) / d['IPO'].std()
)

In [None]:
# Long-term CO2 series: keep only year and concentration, restrict to years
# 1-2016 (inclusive) and add a z-scored column computed over that window.
co2_df = (
    pd.read_csv('data/co2-long-term-concentration.csv')
    .rename(columns={'Annual concentration of atmospheric carbon dioxide': 'CO2', 'Year': 'Year_Bin'})
    [['Year_Bin', 'CO2']]
    .astype({'Year_Bin': int})
)
# Take an explicit copy of the filtered rows: assigning a new column onto a
# boolean-filtered slice triggers SettingWithCopyWarning (chained assignment).
co2_df = co2_df[co2_df['Year_Bin'].between(1, 2016)].copy()
co2_df['CO2_norm'] = (co2_df['CO2'] - co2_df['CO2'].mean()) / co2_df['CO2'].std()

In [None]:
# TSI reconstruction: a text file with a free-form header followed by three
# whitespace-separated columns (YearBP, dTSI, dTSI_sigma).
tsi_file_path = 'data/tsi_reconstruction.txt'
with open(tsi_file_path, 'r', encoding='latin1') as file:
    lines = file.read().splitlines()

# The table is taken to start at the first line whose first non-blank character
# is a digit.
# NOTE(review): a header line that begins with a digit would be picked up as
# data, and leading rows with a negative YearBP would be skipped - confirm
# against the file.
start_idx = next((i for i, line in enumerate(lines) if line.strip()[:1].isdigit()), None)
if start_idx is None:
    raise ValueError(f"No numeric data rows found in {tsi_file_path}")

data_str = '\n'.join(lines[start_idx:])
tsi_df = pd.read_csv(StringIO(data_str), sep=r'\s+', names=['YearBP', 'dTSI', 'dTSI_sigma'])
tsi_df['Year_AD'] = 1950 - tsi_df['YearBP']  # years before 1950 -> calendar year

# Explicit copy so the column assignments below act on an independent frame
# rather than on a filtered slice (avoids SettingWithCopyWarning).
tsi_df = tsi_df[tsi_df['Year_AD'].between(1, 2016)].copy()

# dTSI is stored as an offset; 1365.57 is presumably the reference TSI the
# offsets are relative to - confirm against the data file header.
tsi_df['TSI'] = 1365.57 + tsi_df['dTSI']
tsi_df['Year_Bin'] = np.floor(tsi_df['Year_AD']).astype(int)
tsi_df = tsi_df[['Year_Bin', 'TSI']].copy()
tsi_df['TSI_norm'] = (tsi_df['TSI'] - tsi_df['TSI'].mean()) / tsi_df['TSI'].std()

In [None]:
def summarize(df, colname):
    """Summarise `colname` per Year_Bin across all rows sharing that bin.

    Returns a DataFrame with one row per distinct Year_Bin (ascending) and the
    columns Year_Bin, median, p5 and p95, where p5/p95 are the 0.05 and 0.95
    quantiles of `colname` within the bin.
    """
    per_year = df.groupby("Year_Bin")[colname]
    stats = pd.concat(
        {
            "median": per_year.median(),
            "p5": per_year.quantile(0.05),
            "p95": per_year.quantile(0.95),
        },
        axis=1,
    )
    # The group keys form the index (named Year_Bin); turn them into a column.
    return stats.reset_index()

# Per-year median and 5-95 % envelope of the first component for each PCA variant.
pca_summary, kpca_summary, wpca_summary = (
    summarize(frame, column)
    for frame, column in (
        (pca_early, "PC1_norm"),
        (kpca_early, "KPC1_norm"),
        (wpca_early, "WPC1_norm"),
    )
)

In [None]:
from matplotlib.patches import Patch

# Periods to highlight, as (start year, end year).
investigation_intervals = [
    (500, 550),
    (680, 730),
    (1050, 1100),
    (1140, 1250),
    (1340, 1400),
    (1460, 1550)
]

fig, ax = plt.subplots(figsize=(14, 6))

# One median line plus a translucent 5-95 % band per PCA variant.
for summary, label, color in (
    (pca_summary, 'Standard PCA', 'blue'),
    (kpca_summary, 'Kernel PCA', 'green'),
    (wpca_summary, 'Wavelet PCA', 'orange'),
):
    ax.plot(summary["Year_Bin"], summary["median"], label=label, color=color)
    ax.fill_between(summary["Year_Bin"], summary["p5"], summary["p95"], alpha=0.2, color=color)

# Bracket every interval with dashed verticals and put a star at its midpoint,
# 5 % of the current y-range above the bottom of the axes.
for start, end in investigation_intervals:
    for boundary in (start, end):
        ax.axvline(x=boundary, color='black', linestyle='--', linewidth=1)

    ymin, ymax = ax.get_ylim()
    midpoint = (start + end) / 2
    ax.plot(midpoint, ymin + (ymax - ymin) * 0.05, marker='*', color='black', markersize=8)

# Hand-built legend: one grey patch standing for all bands, one line per variant.
handles = [
    Patch(facecolor='gray', edgecolor='gray', alpha=0.15, label='5–95% Percentile Bound'),
] + [
    plt.Line2D([], [], color=line_color, linewidth=2, label=line_label)
    for line_color, line_label in (
        ('blue', 'Median PC1 (Standard)'),
        ('green', 'Median KPC1 (Kernel)'),
        ('orange', 'Median WPC1 (Wavelet)'),
    )
]

# Labels, limits and tick sizes.
ax.set_title("Hydroclimate Variability Reconstruction (1–1550 CE)", fontsize=14)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("PC1 (Normalized)", fontsize=12)
ax.set_xlim(0, 1550)
ax.tick_params(axis='both', labelsize=12)

# Thin black frame around the plot area.
for spine in ax.spines.values():
    spine.set_linewidth(1)
    spine.set_color('black')

ax.legend(handles=handles, loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Periods of interest as (start year, end year); shaded on every panel below.
investigation_intervals = [
    (500, 550),
    (680, 730),
    (1050, 1100),
    (1140, 1250),
    (1340, 1400),
    (1460, 1550)
]

# Five stacked panels on a shared x-axis; the top panel is twice as tall.
fig, axs = plt.subplots(
    5, 1,
    figsize=(15, 14),
    sharex=True,
    gridspec_kw={'height_ratios': [2, 1, 1, 1, 1]},
)

for ax in axs:
    ax.set_xlim(0, 1550)
    ax.xaxis.set_major_locator(MultipleLocator(100))
    # Keep tick labels on every panel even though the x-axis is shared.
    ax.tick_params(labelbottom=True, labelleft=True, labelsize=12)
    ax.grid(True, axis='x', linestyle='--', color='gray', linewidth=0.8, alpha=0.6)

# Highlight the investigation intervals on each panel.
for ax in axs:
    for start, end in investigation_intervals:
        ax.axvspan(start, end, color='goldenrod', alpha=0.2)

# Panel 0: hydroclimate reconstruction (median line and 5-95 % band per variant).
top = axs[0]
for summary, label, color in (
    (pca_summary, 'Standard PCA', 'blue'),
    (kpca_summary, 'Kernel PCA', 'green'),
    (wpca_summary, 'Wavelet PCA', 'orange'),
):
    top.plot(summary["Year_Bin"], summary["median"], label=label, color=color)
    top.fill_between(summary["Year_Bin"], summary["p5"], summary["p95"], alpha=0.2, color=color)

top.set_ylabel("PC1 (Normalized)", fontsize=12)
top.set_title("Hydroclimate Variability Reconstruction (Pre-1550 CE)", fontsize=14)

# Volcanic eruption markers, drawn just below the current lower y-limit; the
# limit is then lowered so the triangles remain inside the axes.
eruption_years = [653, 683, 710]
eruption_y = top.get_ylim()[0] - 0.05
top.scatter(
    eruption_years,
    [eruption_y] * len(eruption_years),
    marker='^', color='red', s=80, label='Volcanic Eruption'
)
top.set_ylim(eruption_y - 0.1, top.get_ylim()[1])
top.legend(loc='upper left', fontsize=12)

# Panels 1-4: normalised forcing series, restricted to Year_Bin <= 1550.
for panel, frame, column, line_style in (
    (axs[1], enso_df, "ENSO_norm", dict(color='darkcyan')),
    (axs[2], pdo_df, "PDO_norm", dict(color='brown', label='PDO')),
    (axs[2], ipo_df, "IPO_norm", dict(color='olive', label='IPO')),
    (axs[3], tsi_df, "TSI_norm", dict(color='crimson')),
    (axs[4], co2_df, "CO2_norm", dict(color='magenta')),
):
    early = frame[frame["Year_Bin"] <= 1550]
    panel.plot(early["Year_Bin"], early[column], **line_style)

for panel, ylabel, title in (
    (axs[1], "ENSO (Normalized)", "ENSO Reconstructions"),
    (axs[2], "PDO (Normalized)", "PDO/IPO Reconstructions"),
    (axs[3], "TSI (Normalized)", "TSI Reconstruction"),
    (axs[4], "CO2 (Normalized)", "CO2 Reconstruction"),
):
    panel.set_ylabel(ylabel, fontsize=12)
    panel.set_title(title, fontsize=14)

axs[2].legend(loc='upper left', fontsize=12)
axs[4].set_xlabel("Year (CE)", fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

def bin_and_normalize(df, value_col, bin_width=10):
    """Average `value_col` into fixed-width year bins and z-score the bin means.

    Bins are labelled by their first year and start at year 1, so with the
    default width of 10 the labels are 1, 11, 21, ... and years 1-10 fall in
    bin 1. Normalisation happens after binning and uses the population
    standard deviation (ddof=0), which reproduces what
    sklearn.preprocessing.StandardScaler computes without requiring sklearn.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Year_Bin' column and `value_col`. Not modified.
    value_col : str
        Name of the column to average and normalise.
    bin_width : int, default 10
        Width of each bin in years; must be positive.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year_Bin' (bin label) and f'{value_col}_norm', one row per
        non-empty bin, ordered by bin.

    Raises
    ------
    ValueError
        If `bin_width` is not positive.
    KeyError
        If 'Year_Bin' or `value_col` is missing from `df`.
    """
    if bin_width <= 0:
        raise ValueError(f"bin_width must be positive, got {bin_width!r}")
    missing = [col for col in ('Year_Bin', value_col) if col not in df.columns]
    if missing:
        raise KeyError(f"bin_and_normalize: missing column(s) {missing}")

    # Label of the bin each row belongs to; grouping by this Series avoids
    # copying the whole input frame just to overwrite one column.
    bin_label = ((df['Year_Bin'] - 1) // bin_width) * bin_width + 1
    df_binned = df.groupby(bin_label)[value_col].mean().reset_index()

    # Normalize after binning
    bin_means = df_binned[value_col]
    scale = bin_means.std(ddof=0)
    if not scale > 0:
        # Zero variance (or no finite data): leave the centred values unscaled
        # instead of dividing by zero.
        scale = 1.0
    norm_col = f'{value_col}_norm'
    df_binned[norm_col] = (bin_means - bin_means.mean()) / scale
    return df_binned[['Year_Bin', norm_col]]

# Bin and re-normalise every forcing series from its raw value column.
enso_binned, pdo_binned, ipo_binned, tsi_binned, co2_binned = (
    bin_and_normalize(frame, column)
    for frame, column in (
        (enso_df, 'ENSO'),
        (pdo_df, 'PDO'),
        (ipo_df, 'IPO'),
        (tsi_df, 'TSI'),
        (co2_df, 'CO2'),
    )
)

In [None]:
# Periods analysed below, as (start year, end year).
investigation_intervals = [
    (500, 550), (680, 730), (1050, 1100),
    (1140, 1250), (1340, 1400), (1460, 1550),
]

# (panel label, realisation table, name of its first-component column)
pca_info = list(zip(
    ("Standard", "Kernel", "Wavelet"),
    (pca_early, kpca_early, wpca_early),
    ("PC1_norm", "KPC1_norm", "WPC1_norm"),
))

# Binned, normalised forcing series keyed by display name.
forcing_dfs = dict(
    ENSO=enso_binned,
    PDO=pdo_binned,
    IPO=ipo_binned,
    TSI=tsi_binned,
    CO2=co2_binned,
)

# Line colour for each forcing.
forcing_colors = dict(
    ENSO='blue',
    PDO='orange',
    IPO='green',
    TSI='red',
    CO2='magenta',
)

In [None]:
# Distribution of |Pearson r| between every PC1 realisation and each binned
# forcing, drawn as one panel per PCA variant.
KDE_BW_METHOD = 0.75       # one bandwidth for the curve and its shading, so they coincide
SIGNIFICANCE_LEVEL = 0.05  # p-value below which a correlation counts as significant

fig, axes = plt.subplots(1, 3, figsize=(20, 6), sharey=True)

for ax, (pca_label, pca_df, pc1_col) in zip(axes, pca_info):
    all_corrs = {forcing: [] for forcing in forcing_dfs}
    sig_corrs = {forcing: [] for forcing in forcing_dfs}

    # groupby splits the table once instead of re-filtering it per realisation.
    for realization, realization_rows in pca_df.groupby('Realization', sort=False):
        realization_df = realization_rows[['Year_Bin', pc1_col]]

        for forcing_name, forcing_df in forcing_dfs.items():
            merged = pd.merge(realization_df, forcing_df, on='Year_Bin', how='inner').dropna()
            if len(merged) <= 2:
                continue
            # Column 2 of the merge is the forcing value (after Year_Bin and pc1_col).
            corr, pval = pearsonr(merged[pc1_col], merged.iloc[:, 2])
            if not np.isfinite(corr):
                # A constant series gives r = NaN, which would poison the KDE
                # and the median below.
                continue
            abs_corr = abs(corr)
            all_corrs[forcing_name].append(abs_corr)
            if pval < SIGNIFICANCE_LEVEL:
                sig_corrs[forcing_name].append(abs_corr)

    # Plot KDEs
    for i, (forcing_name, corr_values) in enumerate(all_corrs.items()):
        if not corr_values:
            continue
        color = forcing_colors.get(forcing_name, 'gray')

        # gaussian_kde needs at least two distinct values; otherwise only the
        # median line and the annotation are drawn.
        if len(corr_values) >= 2 and np.ptp(corr_values) > 0:
            kde = gaussian_kde(corr_values, bw_method=KDE_BW_METHOD)
            x_vals = np.linspace(0, 1, 500)
            y_vals = kde(x_vals)
            ax.plot(x_vals, y_vals, label=forcing_name, color=color, linewidth=2)

            if sig_corrs[forcing_name]:
                # Shade the same curve over the range spanned by the
                # significant |r| values.
                min_sig = min(sig_corrs[forcing_name])
                max_sig = max(sig_corrs[forcing_name])
                mask = (x_vals >= min_sig) & (x_vals <= max_sig) & (y_vals > 1e-3)
                ax.fill_between(x_vals[mask], 0, y_vals[mask], color=color, alpha=0.3)

        # Median line
        median_corr = np.median(corr_values)
        ax.axvline(median_corr, color=color, linestyle=':', linewidth=1.5, alpha=0.7)

        # Annotate % significant
        percent_sig = 100 * len(sig_corrs[forcing_name]) / len(corr_values)
        ax.annotate(
            f"{forcing_name}: {percent_sig:.1f}%",
            xy=(1.02, 0.95 - 0.08 * i),
            xycoords='axes fraction',
            fontsize=10,
            color=color,
            ha='left',
            va='top'
        )

    ax.set_xlim(0, 0.6)
    ax.set_xticks(np.linspace(0, 0.6, 7))
    ax.set_title(f'{pca_label} PCA', fontsize=14)
    ax.set_xlabel('|Correlation|', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.tick_params(labelsize=11)

axes[0].set_ylabel('Density', fontsize=12)

# Common legend for all forcings
custom_handles = [
    Line2D([0], [0], color=forcing_colors[forcing], lw=2)
    for forcing in forcing_dfs.keys()
]
fig.legend(
    custom_handles,
    list(forcing_dfs.keys()),
    loc='lower center',
    bbox_to_anchor=(0.5, -0.1),
    ncol=5,
    frameon=False,
    fontsize=11,
    title="Forcings",
    title_fontsize=12
)
fig.suptitle(
    "Distribution of Absolute Correlations (|r|) Between PCA Realisations and Climate Forcings\n(Shading: Significant Correlations, p<0.05; Dotted Line: Median)",
    fontsize=14,
    y=0.98
)
plt.tight_layout(rect=[0, 0.05, 1, 0.96])
plt.show()

In [None]:
def _abs_corr_with_forcing(pc1_window, forcing_window, pc1_col, alpha):
    """Return (|r|, is_significant) for one realisation window, or None if r is undefined."""
    merged = pd.merge(pc1_window, forcing_window, on='Year_Bin', how='inner').dropna()
    if len(merged) <= 2:
        return None
    # Column 2 of the merge is the forcing value (after Year_Bin and pc1_col).
    corr, pval = pearsonr(merged[pc1_col], merged.iloc[:, 2])
    if not np.isfinite(corr):
        # A constant series inside a short window gives r = NaN; skip it so it
        # cannot poison the KDE.
        return None
    return abs(corr), bool(pval < alpha)


def _draw_abs_corr_kde(ax, abs_corrs, bw_method, **line_kwargs):
    """Plot a Gaussian KDE of |r| on [0, 1]; draw nothing if the KDE is undefined."""
    # gaussian_kde raises for fewer than two points or zero variance.
    if len(abs_corrs) < 2 or np.ptp(abs_corrs) == 0:
        return
    x_vals = np.linspace(0, 1, 500)
    ax.plot(x_vals, gaussian_kde(abs_corrs, bw_method=bw_method)(x_vals), **line_kwargs)


def plot_kde_baseline_vs_anomaly(start, end, pca_info, forcing_dfs, forcing_colors,
                                 baseline_years=50, alpha=0.05, bw_method=0.75):
    """Compare |Pearson r| distributions in an interval against the window before it.

    For every PCA variant (one panel each) and every forcing, the first
    component of each realisation is correlated with the forcing separately in
    the baseline window [start - baseline_years, start) and in the anomaly
    window [start, end]. The |r| values are shown as Gaussian KDEs (dashed =
    baseline, solid = anomaly) and the share of correlations with p < alpha is
    annotated beside each panel. The figure is displayed with plt.show().

    Parameters
    ----------
    start, end : int
        First and last year (inclusive) of the anomaly window.
    pca_info : iterable of (label, DataFrame, column name)
        One entry per panel; each DataFrame needs 'Year_Bin', 'Realization'
        and the named component column. Three panels are created, so entries
        beyond the third are ignored by zip.
    forcing_dfs : dict of str -> DataFrame
        Forcing tables with 'Year_Bin' followed by the forcing value column.
    forcing_colors : dict of str -> colour
        Line colour per forcing; must contain every key of `forcing_dfs`.
    baseline_years : int, default 50
        Length of the baseline window preceding `start`.
    alpha : float, default 0.05
        Significance threshold applied to the Pearson p-value.
    bw_method : scalar or str, default 0.75
        Bandwidth passed to scipy.stats.gaussian_kde.

    Realisations with at most two overlapping years in a window, or with an
    undefined (NaN) correlation, are left out of that window's statistics.
    """
    baseline_start, baseline_end = start - baseline_years, start
    interval = f"{start}-{end}"
    baseline_label = f"{baseline_start}-{baseline_end}"

    fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharey=True)

    for ax, (pca_label, pca_df, pc1_col) in zip(axes, pca_info):
        # Hydroclimate data
        baseline_pca = pca_df[(pca_df["Year_Bin"] >= baseline_start) & (pca_df["Year_Bin"] < baseline_end)]
        anomaly_pca = pca_df[(pca_df["Year_Bin"] >= start) & (pca_df["Year_Bin"] <= end)]

        # Slice each realisation once per panel rather than once per forcing.
        realization_windows = [
            (
                baseline_pca.loc[baseline_pca["Realization"] == r, ['Year_Bin', pc1_col]],
                anomaly_pca.loc[anomaly_pca["Realization"] == r, ['Year_Bin', pc1_col]],
            )
            for r in anomaly_pca["Realization"].unique()
        ]

        for i, (forcing_name, forcing_df) in enumerate(forcing_dfs.items()):
            color = forcing_colors.get(forcing_name, 'gray')

            # Forcing data
            baseline_forcing = forcing_df[(forcing_df['Year_Bin'] >= baseline_start) & (forcing_df['Year_Bin'] < baseline_end)]
            anomaly_forcing = forcing_df[(forcing_df['Year_Bin'] >= start) & (forcing_df['Year_Bin'] <= end)]

            # Correlations storage
            baseline_corrs, baseline_sig = [], 0
            anomaly_corrs, anomaly_sig = [], 0

            for baseline_r, anomaly_r in realization_windows:
                result = _abs_corr_with_forcing(baseline_r, baseline_forcing, pc1_col, alpha)
                if result is not None:
                    baseline_corrs.append(result[0])
                    baseline_sig += result[1]

                result = _abs_corr_with_forcing(anomaly_r, anomaly_forcing, pc1_col, alpha)
                if result is not None:
                    anomaly_corrs.append(result[0])
                    anomaly_sig += result[1]

            # Plotting: dashed = baseline, solid = anomaly
            _draw_abs_corr_kde(ax, baseline_corrs, bw_method,
                               linestyle='--', color=color, linewidth=1.8, alpha=0.7)
            _draw_abs_corr_kde(ax, anomaly_corrs, bw_method,
                               linestyle='-', color=color, linewidth=2.5, alpha=0.9)

            # Annotate % significant to the right of the panel: two stacked
            # rows per forcing, baseline first.
            for row, (window_name, n_sig, corrs) in enumerate((
                ("Baseline", baseline_sig, baseline_corrs),
                ("Anomaly", anomaly_sig, anomaly_corrs),
            )):
                if not corrs:
                    continue
                ax.annotate(
                    f"{forcing_name} {window_name}: {100 * n_sig / len(corrs):.1f}%",
                    xy=(1.02, 0.95 - 0.08 * (i * 2 + row)),
                    xycoords='axes fraction',
                    fontsize=11,
                    color=color,
                    ha='left',
                    va='top'
                )

        ax.set_xlim(0, 1)
        ax.set_xticks(np.linspace(0, 1, 6))
        ax.tick_params(axis='both', labelsize=11)
        ax.set_title(f"{pca_label} PCA")
        ax.set_xlabel("|Pearson Correlation|", fontsize=12)
        ax.grid(True, linestyle='--', alpha=0.3)

    axes[0].set_ylabel("Density", fontsize=12)

    # Legend: one coloured entry per forcing plus two black entries for the line styles.
    custom_handles = [
        Line2D([0], [0], color=forcing_colors[forcing], lw=2)
        for forcing in forcing_dfs.keys()
    ]
    style_handles = [
        Line2D([0], [0], color='black', lw=2, linestyle='-', label="Anomaly"),
        Line2D([0], [0], color='black', lw=2, linestyle='--', label="Baseline")
    ]
    fig.legend(
        custom_handles + style_handles,
        list(forcing_dfs.keys()) + ["Anomaly", "Baseline"],
        loc='lower center',
        bbox_to_anchor=(0.5, -0.08),
        ncol=7,
        frameon=False,
        fontsize=12
    )

    fig.suptitle(
        f"KDE of |r|: Baseline ({baseline_label}) vs Anomaly ({interval})\n(Solid = Anomaly, Dashed = Baseline)",
        y=0.98,
        fontsize=16
    )

    plt.tight_layout(rect=[0, 0.02, 1, 0.98])
    plt.show()

In [None]:
# Draw one baseline-vs-anomaly figure per investigation interval.
for start, end in investigation_intervals:
    plot_kde_baseline_vs_anomaly(start, end, pca_info, forcing_dfs, forcing_colors)