In [None]:
# Autoload when refreshing notebook
%load_ext autoreload
%autoreload 2

import numpy as np
import h5py
from scipy.io import loadmat
import pandas as pd
import re
import matplotlib.pyplot as plt
from types import SimpleNamespace
import scipy
import warnings
from scipy.ndimage import median_filter, gaussian_filter
from scipy.optimize import curve_fit

# import Python functions 
import sys
sys.path.append('../../')

from Python_Functions.functions import matstruct_to_dict, extractDAQBSAScalars, apply_tcav_zeroing_filter, analyze_eos_and_cher, analyze_SYAG
from Python_Functions.gmm import bigaussian_1d
from Python_Functions.cvae import CVAE

### MLP 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
from scipy.io import loadmat 
import re
import os 
import joblib
# Assumed: commonIndexFromSteps, extractDAQBSAScalars, and other helper functions are available

# ----------------------------------------------------------------------
# 0. Load the trained models and the output scaler
# Ex: MLP_LPS_GMM_E338_12710_20251031_194740
# NOTE: joblib.load / pickle.load execute code stored in the file, so only
# point these paths at model files you trust.

# Regressor: scaled BSA predictors -> scaled output vector (see model.predict below).
joblib_file = '../../model/LPS/MLP_LPS_CVAE_Forest_E300_12427_20251104_075246.pkl'  # Modify as needed
model = joblib.load(joblib_file)
# CVAE whose decode_latent_mu turns a predicted vector into an image.
joblib_file_cvae = '../../model/LPS/MLP_LPS_CVAE_E300_12427_20251104_075246.pkl'  # Modify as needed
model_cvae = joblib.load(joblib_file_cvae)
# Scaler used to inverse-transform the regressor output.
# Opened with a context manager so the file handle is closed right after loading.
iz_scaler_file = '../../model/LPS/E300_12427_iz_scaler_CVAE_20251104_075246.pkl'  # Modify as needed
with open(iz_scaler_file, 'rb') as iz_scaler_fh:
    iz_scaler = pickle.load(iz_scaler_fh)
# ----------------------------------------------------------------------
# 1. Define the list of (experiment, runname, step_identifier) pairs to test the model on.
# ----------------------------------------------------------------------
run_pairs = [
    ('E300', '12405', 1),  # Example pairs, modify this list
    #('E300', '12431', 1),
    #('E300', '12405', 1),
    # Add more pairs here...
]

# ----------------------------------------------------------------------
# 2. Initialize lists for concatenation
# ----------------------------------------------------------------------
all_predictors = []  # one (shots x variables) array per successfully loaded run
all_indices = []     # not filled in this cell; kept so the name stays defined

print("Starting multi-run data loading and concatenation...")

# ----------------------------------------------------------------------
# 3. Loop through runs, load data, and concatenate
# ----------------------------------------------------------------------
# NOTE: experiment, runname, data_struct, bsaScalarData, bsaVars, xtcavAmpl and
# xtcavPhase keep the values of the LAST loaded run after the loop; later cells
# read them directly.
for experiment, runname, step_id in run_pairs:

    # --- Load predictor data (BSA scalars) for this run ---

    # 1. Load data_struct; a missing file skips the run instead of aborting.
    dataloc = f'../../data/raw/{experiment}/{experiment}_{runname}/{experiment}_{runname}.mat'
    try:
        mat = loadmat(dataloc,struct_as_record=False, squeeze_me=True)
        data_struct = mat['data_struct']
    except FileNotFoundError:
        print(f"Skipping {experiment}_{runname}: .mat file not found at {dataloc}")
        continue

    # 2. Extract the full BSA scalars without common-index filtering, then
    #    apply the TCAV zeroing filter helper.
    bsaScalarData, bsaVars = extractDAQBSAScalars(data_struct, filter_index=False)
    bsaScalarData = apply_tcav_zeroing_filter(bsaScalarData, bsaVars)

    # 3. Pull out the XTCAV amplitude and phase rows (first variable whose name
    #    contains the given substring).
    ampl_idx = next(i for i, var in enumerate(bsaVars) if 'TCAV_LI20_2400_A' in var)
    xtcavAmpl = bsaScalarData[ampl_idx, :]

    phase_idx = next(i for i, var in enumerate(bsaVars) if 'TCAV_LI20_2400_P' in var)
    xtcavPhase = bsaScalarData[phase_idx, :]
    xtcavOffShots = xtcavAmpl<0.1
    # NOTE(review): if bsaScalarData is a NumPy array, xtcavPhase is a view, so
    # this zeroing also changes the phase row fed to the model below — confirm
    # that matches the preprocessing used for training.
    xtcavPhase[xtcavOffShots] = 0 #Set this for ease of plotting

    # 4. Diagnostic plot: amplitude (left axis) and phase (right axis) per shot.
    fig, ax1 = plt.subplots()
    ax1.plot(xtcavAmpl, label='Amplitude', color='b')
    ax1.set_ylabel('XTCAV Ampl [MV]', color='b')
    ax1.tick_params(axis='y', labelcolor='b')

    ax2 = ax1.twinx()
    ax2.plot(xtcavPhase, label='Phase', color='r')
    ax2.set_ylabel('XTCAV Phase [deg]', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    plt.title('XTCAV Amplitude and Phase')
    plt.show()

    # 5. No shot filtering is applied here: every shot is kept.
    bsaScalarData_filtered = bsaScalarData

    # 6. Construct the predictor array with shots as rows and variables as columns.
    predictor_current = np.vstack(bsaScalarData_filtered).T

    # 7. Append to master list
    all_predictors.append(predictor_current)

# ----------------------------------------------------------------------
# 4. Concatenate and finalize arrays
# ----------------------------------------------------------------------
# Fail with a clear message when every run was skipped, instead of the opaque
# "need at least one array to concatenate" from NumPy.
if not all_predictors:
    raise ValueError("No runs could be loaded from run_pairs; check the data paths before continuing.")

# Combine all data arrays from the runs (rows = shots from all runs).
predictor_tmp = np.concatenate(all_predictors, axis=0)

# Half-sizes of the decoded image in pixels; the decoder output is reshaped to
# (2*xrange, 2*yrange) in the plotting cells, so these must match preprocessing.
xrange = yrange = 100
# Number of values predicted per shot (used to reshape pred_test_full below).
NCOMP = 8
print("\n--- Final Concatenated Data Shapes ---")
print(f"Total Predictors (predictor): {predictor_tmp.shape}")



### Exclude BSA Variables

In [None]:
from Python_Functions.functions import exclude_bsa_vars
# Drop the predictor columns selected by exclude_bsa_vars; its return value is
# passed to np.delete as the column indices to remove.
# NOTE(review): bsaVars comes from the last run loaded above; this assumes every
# run in run_pairs shares the same variable order — confirm for multi-run use.
excluded_var_idx = exclude_bsa_vars(bsaVars)
predictor_tmp_cleaned = np.delete(predictor_tmp, excluded_var_idx, axis=1)
print(f"Predictor shape after excluding variables: {predictor_tmp_cleaned.shape}")


### Calibration

In [None]:
# Define XTCAV calibration
krf = 239.26
cal = 1167 # um/deg  http://physics-elog.slac.stanford.edu/facetelog/show.jsp?dir=/2025/11/13.03&pos=2025-$
streakFromGUI = cal*krf*180/np.pi*1e-6#um/um
# Multiplied by a pixel count and 1e15 in the plotting cells to label the time axis in fs.
xtcalibrationfactor = 6.35e-15

# Per-variable mask of the charge PV. The mask itself is a list, which is truthy
# whenever bsaVars is non-empty, so reduce it with any() before testing it.
chargePVMask = [bool(re.search(r'TORO_LI20_2452_TMIT', pv)) for pv in bsaVars]
isChargePV = any(chargePVMask)
if isChargePV:
    # Extract charge data (first matching row of the last loaded run)
    pvidx = [i for i, val in enumerate(chargePVMask) if val]
    charge = bsaScalarData[pvidx, :][0] * 1.6e-19  # in C 
    charge_filtered = charge

# Set flag for current profile fitting. If True, the current profile will be used to refine the GMM fit.
do_current_profile = True
NCOMP = 8
# Device that the latent tensors are moved to before decoding.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:

# XTCAV amplitude (left axis, blue) and phase (right axis, red) versus shot
# index, using the arrays left over from the last run of the loading loop.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
for axis, trace, trace_label, axis_label, colour in (
    (ax1, xtcavAmpl, 'Amplitude', 'XTCAV Ampl [MV]', 'b'),
    (ax2, xtcavPhase, 'Phase', 'XTCAV Phase [deg]', 'r'),
):
    axis.plot(trace, label=trace_label, color=colour)
    axis.set_ylabel(axis_label, color=colour)
    axis.tick_params(axis='y', labelcolor=colour)

plt.title('XTCAV Amplitude and Phase')
plt.show()

In [None]:
    
# --- Scale the predictors and run the regressor on every shot ---

# NOTE(review): the input scaler is fitted on this data set rather than loaded
# from training — confirm this reproduces the scaling the model was trained with.
x_scaler = MinMaxScaler()
x_scaled = x_scaler.fit(predictor_tmp_cleaned).transform(predictor_tmp_cleaned)

# No train/test split here: the whole data set is treated as the test set, so
# ntest is simply 0..N-1 and maps a prediction row back to its shot number.
x_test_scaled = x_scaled
ntest = np.arange(len(x_test_scaled))

# Predict from a float32 tensor, then map the scaled outputs back to their
# original range with the loaded output scaler.
X_test = torch.tensor(x_test_scaled, dtype=torch.float32)
pred_test_scaled = model.predict(X_test)
pred_test_full = iz_scaler.inverse_transform(pred_test_scaled)
# Compute R² score
def r2_score(true, pred):
    """Return the coefficient of determination R² of ``pred`` against ``true``.

    Parameters
    ----------
    true, pred : array-like
        Measured and predicted values; they must broadcast against each other.

    Returns
    -------
    float
        ``1 - RSS / TSS``. When ``true`` is constant (``TSS == 0``) the ratio is
        undefined and 0.0 is returned instead.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    RSS = np.sum((true - pred)**2)
    TSS = np.sum((true - np.mean(true))**2)
    if TSS == 0:
        # The original fallback referenced an undefined name here.
        return 0.0
    return 1 - RSS / TSS

# r2_score is not called in this cell: only predictions are computed, there is
# no reference array to compare them with.
print("Test R² is Unknown (Iz_test_true not available)")


### Plot prediction

In [None]:
from ipywidgets import interact, IntSlider, Layout
def plot_xtcav_image_pred(idx):
    """Decode and display the predicted image for prediction row ``idx``.

    Reads the notebook globals ``pred_test_full``, ``model_cvae``, ``device``,
    ``NCOMP``, ``xrange``, ``yrange``, ``xtcalibrationfactor``, ``ntest``,
    ``experiment`` and ``runname``. Draws an image panel with a colorbar and
    prints the index and the parameter vector that was decoded.
    """
    fig, (ax2, cx1) = plt.subplots(1, 2, figsize=(8, 6), gridspec_kw={'width_ratios': [1, 0.02]})

    # Column idx of the (NCOMP, n_shots) view is the parameter vector of that shot.
    params_by_shot = pred_test_full.T.reshape(NCOMP, pred_test_full.shape[0])
    pred_params = params_by_shot[:, idx]

    latent = torch.tensor(pred_params, dtype=torch.float32).to(device)
    decoded = model_cvae.decode_latent_mu(latent)
    pred_im = decoded.cpu().detach().numpy().reshape(2*xrange, 2*yrange)

    # Horizontal extent in fs, symmetric about zero.
    half_width_fs = xrange*xtcalibrationfactor*1e15
    im2 = ax2.imshow(
        pred_im*1e15,
        cmap="jet",
        extent=(-half_width_fs, half_width_fs, 0, 2*yrange),
        aspect='auto',
    )
    ax2.set(
        xlabel="Time [fs]",
        ylabel="y [pix]",
        title=f"LPS Prediction: {experiment}_{runname}, Shot Number: {ntest[idx]}",
    )

    cbar = fig.colorbar(im2, cax=cx1, fraction=0.16, pad=0.04)
    cbar.set_label("Current [A per y pixel]")
    plt.subplots_adjust(wspace=0.4)

    print(f"Displaying prediction for index: {idx}")
    print(f"Parameters: {pred_params}")

# Slider over every prediction row; interact redraws the image on each change.
pred_image_slider = IntSlider(
    value=0,
    min=0,
    max=pred_test_full.shape[0]-1,
    step=1,
    layout=Layout(width='1000px'),
)
interact(plot_xtcav_image_pred, idx=pred_image_slider)

In [None]:
from ipywidgets import interact, IntSlider, Layout
def plot_xtcav_image_pred_current(idx):
    """Plot a 1-D projection of the predicted image for prediction row ``idx``.

    The parameter vector of row ``idx`` is decoded with
    ``model_cvae.decode_latent_mu``, reshaped to ``(2*xrange, 2*yrange)``,
    transposed, summed along axis 0, scaled by 1e15 and plotted against a time
    axis in fs. Reads the notebook globals ``pred_test_full``, ``model_cvae``,
    ``device``, ``NCOMP``, ``xrange``, ``yrange`` and ``xtcalibrationfactor``.

    The previous version had two identical branches on ``isChargePV`` and
    looked up a charge value it never used; no charge normalization was (or is)
    applied here.
    """
    FACTOR = 1e15
    fig, (ax0) = plt.subplots(1,1,figsize=(10, 6))

    half_width = xrange*xtcalibrationfactor*FACTOR
    x_grid = np.linspace(-half_width, half_width, xrange * 2)

    pred_params = pred_test_full.T.reshape(NCOMP,pred_test_full.shape[0])[:,idx]
    latent = torch.tensor(pred_params, dtype=torch.float32).to(device)
    pred_im = model_cvae.decode_latent_mu(latent).cpu().detach().numpy().reshape(2*xrange, 2*yrange)

    # Current profile is relative to zeta; transpose so that the drive bunch center agrees with the fit.
    # NOTE(review): after this transpose the sum over axis 0 runs along the
    # opposite image axis to the projections in plot_eos2_projection and
    # find_current_profile_peak_separation — confirm which axis is time.
    pred_im = pred_im.T
    pred_im_proj = np.sum(pred_im, axis = 0) * FACTOR
    ax0.plot(x_grid, pred_im_proj, label = "prediction")
    # Fix x-axis limits
    ax0.set(xlim = (-half_width, half_width))
    ax0.set(ylim=(1e2, None))
    ax0.legend()
    plt.subplots_adjust(wspace=0.4)

    # Display image_to_bigaussian_params debug info
    #biGaussianTest = image_to_bigaussian_params(images[valid_rows][ntest[idx]].reshape((2*yrange, 2*xrange)), do_current_profile, debug=True)
    #print("Bi-Gaussian Parameters for Test Image:", biGaussianTest)
    

# Slider over every prediction row; the trailing semicolon hides the cell's return value.
current_profile_slider = IntSlider(
    value=0,
    min=0,
    max=pred_test_full.shape[0]-1,
    step=1,
    layout=Layout(width='80%'),
)
interact(plot_xtcav_image_pred_current, idx=current_profile_slider);

### EOS2 Comparison

In [None]:
# Run the EOS/Cherenkov analysis helper on the most recently loaded run
# (data_struct, experiment and runname keep their values from the last iteration
# of the loading loop). The result is read below through the keys
# "EOS2horzProj", "dels", "bc14BLEN" and "shotROI".
eos_data = analyze_eos_and_cher(data_struct, experiment=experiment, runname=runname, skipEOSanalysis=False, EOS2ymin=50, EOS2ymax=250, mindels=90e-6, maxdels=140e-6, goosing=True, debug = True)

In [None]:
# Horizontal EOS2 projections; indexed as [:, shot] below, so axis 1 is the shot axis.
EOS2horzProj = eos_data["EOS2horzProj"]

def find_current_profile_peak_separation(idx, debug = False):
    """Estimate the separation of the two leading peaks in a predicted profile.

    The predicted image for row ``idx`` is decoded, transposed, smoothed with a
    Gaussian filter (sigma=2) and summed along axis 1. Peaks are located with
    ``scipy.signal.find_peaks``; the first two (in pixel order) seed a
    ``bigaussian_1d`` fit and the separation is ``popt[3] - popt[0]`` converted
    to fs with ``xtcalibrationfactor * 1e15``.

    Parameters
    ----------
    idx : int
        Row of ``pred_test_full`` to analyze.
    debug : bool, optional
        When True, plot the projection, the detected peaks and the fit.

    Returns
    -------
    float
        Separation in fs, or ``np.nan`` when fewer than two peaks are found or
        the fit (or the debug plot) raises an exception.
    """
    # Local import: the notebook only does `import scipy`, which does not
    # guarantee that the signal submodule is loaded on every SciPy version.
    from scipy.signal import find_peaks

    pred_params = pred_test_full.T.reshape(NCOMP,pred_test_full.shape[0])[:,idx]
    pred_im = model_cvae.decode_latent_mu(torch.tensor(pred_params, dtype=torch.float32).to(device)).cpu().detach().numpy().reshape(2*xrange, 2*yrange)
    # Current profile is relative to zeta; transpose so that the drive bunch center agrees with the fit.
    pred_im = pred_im.T
    pred_im = gaussian_filter(pred_im, sigma=2)
    pred_im_proj = np.sum(pred_im, axis = 1)

    # Keep peaks above 5% of the maximum with a prominence of at least 10% of it.
    proj_max = np.max(pred_im_proj)
    peaks = find_peaks(pred_im_proj, height=proj_max*0.05, prominence=proj_max * 0.1)[0]
    if len(peaks) < 2:
        return np.nan

    try:
        # Fit bigaussian function
        x_coords = np.arange(2*xrange)
        # Initial guess, in the order handed to bigaussian_1d: for each of the
        # first two peaks, (peak position, width, peak height). The width of
        # 10 pixels is arbitrary.
        p0_x = [x_coords[peaks[0]], 10 ,pred_im_proj[peaks[0]], x_coords[peaks[1]], 10 ,pred_im_proj[peaks[1]]]
        popt_x, _ = curve_fit(bigaussian_1d, x_coords, pred_im_proj, p0=p0_x, maxfev=5000)
        peak_separation = popt_x[3] - popt_x[0]
        time_separation_fs = peak_separation * (xtcalibrationfactor * 1e15)
        if debug:
            # Plot for debugging
            fig, ax = plt.subplots(figsize=(8, 4))
            x_grid = np.linspace(-xrange*xtcalibrationfactor*1e15, xrange*xtcalibrationfactor*1e15, xrange * 2)
            ax.plot(x_grid, pred_im_proj*1e15, label = "prediction")
            ax.plot(x_grid[peaks], pred_im_proj[peaks]*1e15, "x", label="peaks")
            ax.plot(x_grid, bigaussian_1d(x_coords, popt_x[0], popt_x[1], popt_x[2], popt_x[3], popt_x[4], popt_x[5])*1e15, label = "fit")
            ax.set(xlim = (-xrange * xtcalibrationfactor * 1e15, xrange * xtcalibrationfactor * 1e15))
            ax.set(ylim=(1e2, None))
            ax.set_title(f'Current Profile with Peaks: Separation = {time_separation_fs:.2f} fs')
            ax.legend()
            plt.show()
        return time_separation_fs
    except Exception:
        # Best effort: a failed fit yields NaN. Catching Exception (not a bare
        # except) lets KeyboardInterrupt stop a long scan over all shots.
        return np.nan
def plot_eos2_projection(idx):
    """Show the EOS2 data next to the predicted image and profile for shot ``idx``.

    Panels, left to right: the ``shotROI`` image, the log of the flipped EOS2
    horizontal projection (annotated with ``dels`` and ``bc14BLEN``), the
    predicted current profile (annotated with the fitted peak separation) and
    the smoothed predicted image with its colorbar. Reads the notebook globals
    set up in the earlier cells and prints the decoded parameter vector.
    """
    def _boxed_label(axis, y_frac, message):
        # Centered note in axes coordinates on a translucent white box.
        axis.text(0.5, y_frac, message, transform=axis.transAxes, ha='center', va='center',
                  fontsize=12, bbox=dict(facecolor='white', alpha=0.8))

    fig, (axr, ax, axc, ax2, cx1) = plt.subplots(
        1, 5, figsize=(14, 4), gridspec_kw={'width_ratios': [1, 1, 1, 1, 0.02]})

    # --- Raw EOS2 region of interest ---
    eos2_proj = EOS2horzProj[:, idx]
    axr.imshow(eos_data["shotROI"][:,:,idx])
    axr.set(xlim=(100, 175), ylim=(75, 200))

    # --- Log of the flipped horizontal projection ---
    ax.plot(np.log(1+np.flip(eos2_proj)), label='EOS2 Horizontal Projection')
    ax.set(
        xlabel='Time [a.u.]',
        ylabel='Log Intensity [arb. units]',
        title=f'EOS2 Horizontal Projection: Shot Number: {ntest[idx]}',
    )
    _boxed_label(ax, 0.9, f'Dels: {eos_data["dels"][idx]*1e6:.2f} um')
    _boxed_label(ax, 0.8, f'BC14 BLEN: {eos_data["bc14BLEN"][idx]:.2f}')
    ax.set(xlim=(200, 350), ylim=(8, None))

    # --- Predicted image ---
    params_by_shot = pred_test_full.T.reshape(NCOMP, pred_test_full.shape[0])
    pred_params = params_by_shot[:, idx]
    latent = torch.tensor(pred_params, dtype=torch.float32).to(device)
    pred_im = model_cvae.decode_latent_mu(latent).cpu().detach().numpy().reshape(2*xrange, 2*yrange)
    if isChargePV:
        # Rescale so the image sums to the charge recorded for this shot.
        charge_value = charge_filtered[ntest[idx]]
        pred_im = pred_im * (charge_value / np.sum(pred_im))
    pred_im = gaussian_filter(pred_im, sigma=2)

    half_width_fs = xrange*xtcalibrationfactor*1e15
    im2 = ax2.imshow(
        pred_im*1e15,
        cmap="jet",
        extent=(-half_width_fs, half_width_fs, 0, 2*yrange),
        aspect='auto',
    )
    ax2.set(
        xlabel="Time [fs]",
        ylabel="y [pix]",
        title=f"LPS Prediction: {experiment}_{runname}, Shot Number: {ntest[idx]}",
    )

    # --- Predicted current profile: column sums, smoothed a second time ---
    current_profile = gaussian_filter(np.sum(pred_im, axis=0) * 1e15, sigma=2)
    x_grid = np.linspace(-half_width_fs, half_width_fs, xrange * 2)
    axc.plot(x_grid, current_profile, label='LPS Predicted Current Profile', color='orange')
    axc.set(xlabel='Time [fs]', ylabel='Current [A]', title='LPS Predicted Current Profile')
    _boxed_label(axc, 0.9, f'Sep: {find_current_profile_peak_separation(idx):.2f} fs')

    cbar = fig.colorbar(im2, cax=cx1, fraction=0.16, pad=0.04)
    cbar.set_label("Current [A per y pixel]")
    plt.subplots_adjust(wspace=0.4)

    print(f"Displaying prediction for index: {idx}")
    print(f"Parameters: {pred_params}")
    plt.show()
# Create slider for EOS2 projection
# Goosing with step of 2 to skip blank shots: the slider visits indices 1, 3, 5, ...
# The initial value is set to the first valid position (it was 0, below min=1).
interact(plot_eos2_projection, idx=IntSlider(min=1, max=EOS2horzProj.shape[1]-1, step=2, value=1, layout=Layout(width='80%')))

In [None]:
# Predicted peak separation for every EOS2 shot, plotted against the EOS2 dels.
# Shots without two detectable peaks (or with a failed fit) contribute NaN.
peak_separations = [
    find_current_profile_peak_separation(shot)
    for shot in range(EOS2horzProj.shape[1])
]
plt.figure(figsize=(8,6))
plt.scatter(eos_data["dels"]*1e6, peak_separations, alpha=0.7)
plt.xlabel('Dels (um)')
plt.ylabel('Predicted Peak Separation (fs)')
plt.title('Predicted Peak Separation vs EOS2 Dels')
plt.xlim(0, 200)
plt.ylim(0, 220)
plt.show()

# Spot-check a single shot; guarded so runs with fewer shots do not raise IndexError.
spot_check_shot = 399
if spot_check_shot < len(peak_separations):
    print(peak_separations[spot_check_shot])
else:
    print(f"Shot {spot_check_shot} not available: only {len(peak_separations)} shots were analyzed")

In [None]:
# EOS2 dels (converted with the 1e6 factor to um) against the BC14 BLEN reading, one point per shot.
fig_blen, ax_blen = plt.subplots(figsize=(8, 6))
ax_blen.scatter(eos_data["bc14BLEN"], eos_data["dels"]*1e6, alpha=0.5)
ax_blen.set_xlabel('BC14 BLEN')
ax_blen.set_ylabel('Dels [um]')
ax_blen.set_xlim(10000, 30000)
ax_blen.set_ylim(50, 200)
ax_blen.set_title(f'EOS2 Measured Dels vs BC14 BLEN: {experiment}_{runname}')

In [None]:
# Scatter plot of BC14 BLEN vs predicted peak separation
plt.figure(figsize=(8,6))
plt.scatter(eos_data["bc14BLEN"], peak_separations, alpha=0.5)

# peak_separations holds NaN for shots without a usable fit, and a single NaN
# makes np.corrcoef return NaN, so correlate only the shots where both are finite.
blen_values = np.asarray(eos_data["bc14BLEN"], dtype=float)
separation_values = np.asarray(peak_separations, dtype=float)
finite_pairs = np.isfinite(blen_values) & np.isfinite(separation_values)
if np.count_nonzero(finite_pairs) >= 2:
    # Squared Pearson correlation coefficient of the finite pairs.
    r_squared = np.corrcoef(blen_values[finite_pairs], separation_values[finite_pairs])[0, 1]**2
else:
    r_squared = np.nan
print(f"R-squared: {r_squared}")
plt.xlabel('BC14 BLEN')
plt.ylabel('Predicted Peak Separation (fs)')
plt.title(f'Predicted Peak Separation vs BC14 BLEN: {experiment}_{runname}')
plt.show()

In [None]:
# Run the SYAG analysis helper on the most recently loaded run; the result is
# read below through its "dels" key.
# NOTE(review): mindels/maxdels here (300, 500) are on a very different scale
# from the EOS call (90e-6, 140e-6) — confirm the units this helper expects.
syag_data = analyze_SYAG(data_struct, experiment=experiment, runname=runname, skipEOSanalysis=False, SYAGxmin=800, SYAGxmax=950, mindels=300, maxdels=500, goosing=True, debug = False, step_selector=[1,2,3])

In [None]:
syag_dels = syag_data["dels"]
print(syag_dels)

# SYAG dels against BC14 BLEN, keeping only shots whose dels is non-zero.
# NOTE(review): the same mask indexes eos_data["bc14BLEN"], which assumes both
# analyses return one entry per shot in the same order — confirm.
valid_idxs = syag_dels != 0.0
fig_syag, ax_syag = plt.subplots(figsize=(8, 6))
ax_syag.scatter(eos_data["bc14BLEN"][valid_idxs], syag_data["dels"][valid_idxs]*1e6, alpha=0.5)
ax_syag.set_xlabel('BC14 BLEN')
ax_syag.set_ylabel('SYAG Dels [um]')

In [None]:
def xtcav_image_mu(idx):
    """Return the predicted parameter vector for prediction row ``idx``.

    This is column ``idx`` of ``pred_test_full`` viewed as
    ``(NCOMP, n_shots)``, i.e. the vector that the plotting functions hand to
    ``model_cvae.decode_latent_mu``.
    """
    pred_params = pred_test_full.T.reshape(NCOMP,pred_test_full.shape[0])[:,idx]
    return pred_params
# Plot each predicted latent component against shot index for at most the first
# 500 shots; capped at the number of predictions so shorter runs do not raise
# IndexError.
n_shots_to_plot = min(500, pred_test_full.shape[0])
mu_list = [xtcav_image_mu(shot) for shot in range(n_shots_to_plot)]
plt.plot(mu_list)
plt.xlabel('Shot Index')
# The curves are the latent values themselves, not a centroid distance in fs.
plt.ylabel('Latent component value')
plt.title('Predicted Latent Z Components in {}'.format(f"{experiment}_{runname}"))
plt.show()

### PCA of PV Scalars

In [None]:
# Scatter of the XTCAV phase column against the XTCAV amplitude column of the
# (uncleaned) predictor matrix. The names must match entries of bsaVars exactly.
bsaVarNames = bsaVars
var1_name, var2_name = 'TCAV_LI20_2400_P', 'TCAV_LI20_2400_A'
var1_idx, var2_idx = (bsaVarNames.index(pv_name) for pv_name in (var1_name, var2_name))
print(f"Plotting BSA Scalars: {var1_name} (index {var1_idx}) and {var2_name} (index {var2_idx})")

fig_bsa, ax_bsa = plt.subplots(figsize=(8, 6))
ax_bsa.scatter(predictor_tmp[:, var1_idx], predictor_tmp[:, var2_idx], alpha=0.5)
ax_bsa.set_xlabel(var1_name)
ax_bsa.set_ylabel(var2_name)
ax_bsa.set_title('BSA Scalar Scatter Plot')
ax_bsa.grid(True)
plt.show()

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# PCA reconstruction-loss study on the BSA scalars of the last loaded run.
# bsaScalarData is indexed as [variable, shot] above, so it is transposed to put
# shots in rows and variables in columns, then min-max scaled per column.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(bsaScalarData.T)

# Test up to one fifth of the smaller dimension of X.
max_components = min(X.shape) // 5

# For each k: fit a k-component PCA, project and reconstruct X, and record the
# mean squared reconstruction error.
n_components_list = []
reconstruction_losses = []
for k in range(1, max_components + 1):
    pca_study = PCA(n_components=k).fit(X)
    X_reduced = pca_study.transform(X)
    X_reconstructed = pca_study.inverse_transform(X_reduced)
    loss = mean_squared_error(X, X_reconstructed)

    n_components_list.append(k)
    reconstruction_losses.append(loss)

# Loss versus k on a log y-axis; choose the number of components at the visual elbow.
fig_pca, ax_pca = plt.subplots(figsize=(10, 10))
ax_pca.plot(n_components_list, reconstruction_losses, marker='o', linestyle='-')
ax_pca.set_xlabel('Number of Components (k)')
ax_pca.set_ylabel('Reconstruction Loss (Mean Squared Error)')
ax_pca.set_yscale('log')
ax_pca.set_title('PCA of BSA Scalars: Reconstruction Loss vs. Number of Components')
ax_pca.grid(True)
plt.show()

# After plotting, the optimal number of components is the 'elbow' point.

In [None]:

# Fit an 11-component PCA on the scaled BSA matrix and print a markdown table
# with one row per component: its label, explained-variance ratio and the
# loading of every BSA variable.
num_components = 11
pca_comp_study = PCA(n_components=num_components).fit(X)

variance_ratios = pca_comp_study.explained_variance_ratio_
loadings = pca_comp_study.components_  # rows = components, columns = variables

significance_col = 'Significance (Explained Variance Ratio)'
component_labels = [f'PC {i+1}' for i in range(num_components)]

df_loadings = pd.DataFrame(loadings, columns=bsaVars)
df_loadings.insert(0, 'Component', component_labels)
df_loadings.insert(1, significance_col, variance_ratios)

# Render the ratio as "fraction (percent)" and every loading with four decimals.
df_loadings[significance_col] = df_loadings[significance_col].map(
    lambda x: f'{x:.4f} ({x*100:.2f}%)'
)
for col in bsaVars:
    df_loadings[col] = df_loadings[col].map(lambda x: f'{x:.4f}')

composition_table = df_loadings.to_markdown(index=False)
print("Composition of Principal Components:")
print(composition_table)

In [None]:
# 1. Square the loadings (loadings matrix is: Components x Variables)
squared_loadings = loadings**2

# 2. Sum over components to get one total squared loading per variable
total_squared_loadings = np.sum(squared_loadings, axis=0)

# 3. Build the per-variable table, sorted from most to least significant.
#    The significance column stays numeric here; later cells sort on it.
df_var_significance = pd.DataFrame({
    'BSA Variable (PV)': bsaVars,
    'Total Squared Loading (Significance)': total_squared_loadings
}).sort_values(
    by='Total Squared Loading (Significance)',
    ascending=False
).reset_index(drop=True)

# 4. Number of top variables to display
N_top = 20  # Example: display the top few most significant variables

# 5. Format and display the table. Work on an explicit copy so that formatting
#    the column does not assign into a slice of df_var_significance
#    (SettingWithCopyWarning).
df_top_vars = df_var_significance.head(N_top).copy()
df_top_vars['Total Squared Loading (Significance)'] = \
    df_top_vars['Total Squared Loading (Significance)'].map(lambda x: f'{x:.4f}')

inverted_table = df_top_vars.to_markdown(index=False)
print(f"Top {N_top} BSA Variables Ordered by Significance in PCA Components:")
print(inverted_table)


In [None]:

# 1. Keep only the variables whose name does not contain "BPM" (case-insensitive)
df_filtered = df_var_significance[
    ~df_var_significance['BSA Variable (PV)'].str.contains('BPM', case=False, na=False)
]

# 2. Sort the filtered variables in descending order of significance
df_filtered = df_filtered.sort_values(
    by='Total Squared Loading (Significance)', 
    ascending=False
).reset_index(drop=True)

# 3. Select Top N on an explicit copy, so that formatting the column below does
#    not assign into a slice of df_filtered (SettingWithCopyWarning).
N_top = 20 # Display the top few most significant non-BPM variables
df_top_vars = df_filtered.head(N_top).copy()

# 4. Format the significance column
df_top_vars['Total Squared Loading (Significance)'] = \
    df_top_vars['Total Squared Loading (Significance)'].map(lambda x: f'{x:.4f}')

# 5. Display the resulting table
inverted_table = df_top_vars.to_markdown(index=False)

print(f"Top {N_top} BSA Variables (EXCLUDING BPMS) Ordered by Significance in PCA Components:")
print(inverted_table)