##### Author = Trung Huynh, huynh.trung@mayo.edu
### For Elliana's, Voltage API F(t)

# Step A. Aggregate all CSV files. Make sure this script is in the same folder as the CSV files.

In [4]:
import pandas as pd
import glob
import os
from datetime import datetime

# --- Configuration ---
# Maps the metric name found inside a CSV column header, written as
# "<prefix> (<metric>)", to the short label used as the suffix of the output
# sheet name. Insertion order is the order in which sheets are first created.
#
# NOTE(review): an earlier comment here stated that "td50Normalized" and
# "td90Normalized" had been removed to prevent raw extraction, but both are
# listed below, so they ARE exported as-is. Delete those two entries if that
# is not intended.
metric_map = {
    # Amplitude and rate
    "meanFullTransientAmplitude": "Amp",
    "meanBPMorZero": "BPM",
    # Kinetics (raw, not divided by the beat period)
    "meanDecayTime": "Decay_raw",
    "meanRiseTime": "Rise_raw",
    "meanUpstrokeVelocity": "Up_Velocity",
    # Transient durations (raw)
    "td50": "TD50_raw",
    "td90": "TD90_raw",
    # Transient durations as already normalized in the CSV
    "td50Normalized": "TD50_Norm",
    "td90Normalized": "TD90_Norm",
}

# Column-header prefixes to extract; the second word of each prefix
# (here "API") becomes the leading part of the sheet name.
prefixes = [
    "Voltage API F(t)",
]

# --- Processing ---
def extract_group_name(path: str) -> str:
    """Return the group label encoded in a CSV file name.

    The label is the second underscore-separated token of the file name with
    its extension removed, e.g. "plate1_Control_run2.csv" -> "Control".
    When the name contains no underscore the whole stem is used.
    """
    # splitext drops only the real extension (any letter case), so ".CSV"
    # files are handled and a ".csv" in the middle of a name is left alone.
    stem = os.path.splitext(os.path.basename(path))[0]
    tokens = stem.split("_")
    return tokens[1] if len(tokens) > 1 else stem


def find_header_row(path: str, marker: str = "id,") -> int:
    """Return the 0-based index of the first line starting with *marker*.

    Returns 0 when no line matches, i.e. the first line is treated as the
    header. Change *marker* to match the actual header in your CSV.
    """
    # Iterate lazily instead of loading every line; undecodable bytes are
    # replaced so metadata lines above the header cannot abort the scan.
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        for row_idx, text in enumerate(handle):
            if text.startswith(marker):
                return row_idx
    return 0


def add_group_series(collectors: dict, sheet_name: str, group_name: str, series) -> None:
    """Store *series* as column *group_name* of sheet *sheet_name*.

    The series index is reset to 0..n-1. If the group already has data on
    that sheet (several CSV files share a group name), the new rows are
    appended below the existing ones instead of replacing them.
    """
    column = series.reset_index(drop=True)
    sheet_columns = collectors.setdefault(sheet_name, {})
    if group_name in sheet_columns:
        column = pd.concat([sheet_columns[group_name], column], ignore_index=True)
    sheet_columns[group_name] = column


# glob returns matches in arbitrary order; sort so every run processes the
# files (and appends rows of repeated groups) in the same order.
files = sorted(glob.glob("*.csv"))

# Store data as a dictionary of dictionaries (Sheet -> {Column: Series})
data_collectors = {}

# The Voltage API beat period is the only denominator used for normalization.
api_period_col = "Voltage API F(t) (meanFullTransientPeriod)"

# Raw metric -> sheet-name suffix of its period-normalized (metric / period) copy.
period_normalized_metrics = {
    "meanDecayTime": "Decay_Norm",
    "meanRiseTime": "Rise_Norm",
}

# Groups already loaded, used to report when several files feed one column.
seen_groups = set()

print(f"Found {len(files)} CSV files. Processing...")

for file in files:
    # 1. Identify Group Name (becomes the column header in every sheet)
    group_name = extract_group_name(file)

    # 2. Find the header row and load the data; a file that cannot be read
    #    is reported and skipped so the remaining files are still processed.
    try:
        header_idx = find_header_row(file)
        df = pd.read_csv(file, skiprows=header_idx)
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

    if group_name in seen_groups:
        print(f"Note: group '{group_name}' appears in more than one file; "
              f"rows from {file} are appended to the existing column.")
    seen_groups.add(group_name)

    # 3. Period-normalized metrics: Normalized Metric = Raw Metric / API Period
    if api_period_col in df.columns:
        period_series = df[api_period_col]
        # A period of 0 would turn the ratio into +/-inf; mark it as missing.
        period_series = period_series.where(period_series != 0)

        for prefix in prefixes:
            func_type = prefix.split()[1]  # "API" or "Stretch"
            for metric, suffix in period_normalized_metrics.items():
                raw_col = f"{prefix} ({metric})"
                if raw_col in df.columns:
                    add_group_series(
                        data_collectors,
                        f"{func_type}_{suffix}",
                        group_name,
                        df[raw_col] / period_series,
                    )

    # 4. Standard metrics: copied unchanged, one sheet per metric
    for prefix in prefixes:
        func_type = prefix.split()[1]  # "API" or "Stretch"
        for metric, abbr in metric_map.items():
            col_name = f"{prefix} ({metric})"
            if col_name in df.columns:
                add_group_series(
                    data_collectors,
                    f"{func_type}_{abbr}",
                    group_name,
                    df[col_name],
                )

  
# --- Save to Excel ---
# The workbook name carries today's date as MMDDYYYY.
date_suffix = datetime.now().strftime("%m%d%Y")
output_file = f"Prism_Ready_Data_Voltage_{date_suffix}.xlsx"

if not data_collectors:
    # A workbook needs at least one sheet; writing an empty one fails when
    # the writer closes, so report the situation clearly instead.
    print("No matching metric columns were found in the CSV files; "
          "no Excel file was written.")
else:
    with pd.ExcelWriter(output_file) as writer:
        for sheet_name, columns_dict in data_collectors.items():
            # Columns of different length are padded with NaN by pandas.
            df_sheet = pd.DataFrame(columns_dict)

            # Sort the group columns alphabetically so the column order does
            # not depend on the order in which the files were processed.
            df_sheet = df_sheet.reindex(sorted(df_sheet.columns), axis=1)

            # Excel limits sheet names to 31 characters.
            safe_name = sheet_name[:31]
            df_sheet.to_excel(writer, sheet_name=safe_name, index=False)

    print(f"Done! Data saved to '{output_file}' with full row counts.")

Found 8 CSV files. Processing...
Done! Data saved to 'Prism_Ready_Data_Voltage_01132026.xlsx' with full row counts.


# Step B: Comparison of Means (One-way ANOVA)
For the metrics where rate correction is not required (e.g., Amplitude, BPM, CV), only the _Y column from the aggregated Excel output is needed:

1. New Data Table: Select Column graph type.

2. Data Entry: Paste your data into columns: Control, R541C, H222P, R225X.

3. Analyze: Click Analyze > One-way ANOVA (or Kruskal-Wallis if data is non-normal).

- Multiple Comparisons: Choose "Compare the mean of each column with the mean of a control column" (Dunnett's test). Select your Isogenic Control as the reference.

- Output: This will give you significance flags (asterisks) for every variant against the control.

# Step C: Rate Correction (Regression Analysis)
For the 4 rate-dependent metrics (meanDecayTime, meanRiseTime, td50, td90):

1. New Data Table: Select XY graph type.

2. Data Entry:

- X Values: Paste meanFullTransientPeriod (Beat Period) here.

- Y Values: Create Group columns for Control, R541C, H222P, R225X.

- Importantly: You must match the X (Period) and Y (Metric) for each individual cell. In Prism, you will list all X-values in the X column, and the corresponding Y-values in the specific Group column.

3. Analyze: Click Analyze > Simple Linear Regression (or Nonlinear if you have a specific correction formula like Fridericia).

- Compare: Check the box "Test whether the slopes and intercepts are significantly different".

4. Interpretation:

- Different Slopes: The variant reacts differently to changes in beat rate than the control.

- Different Intercepts (same slope): The variant has a fundamentally higher/lower value (e.g., prolonged decay) regardless of beat rate.

5. Visualization: This creates the plot you requested: Metric (Y) vs Period (X), with regression lines color-coded by group.

Each tab (e.g., API_Decay_raw) corresponds to one analysis.

For XY plots: Copy the Control_X and Control_Y columns into Prism's XY table. Prism XY tables usually expect a single shared X column (one X column, multiple Y columns), which only works if the rows are aligned correctly. Because every cell has a different beat period, use one of the following instead: (a) give each group its own "X" and "Y" columns; (b) stack the data by pasting [Period, Value] for Control, then [Period, Value] for R541C below it, adding a "Group" column if importing; or (c) use Prism's "Multiple Variables" format.

Easier Prism XY Method: Paste X (Period) and Y (Metric) for Control into the first 2 columns. Then X and Y for R541C into the next 2 columns. Prism treats these as separate datasets on the same graph.