In [None]:
import math

import numpy as np
import pandas as pd

def load_and_label_pfas_csv(csv_path):
    """
    Read a semicolon-delimited PFAS export and label its columns per compound.

    The first two lines of the file are used as a two-level header: line 1
    supplies the compound names (every cell after the first that contains
    ``Results``, with ``' Results'`` removed) and line 2 supplies the
    per-column field names.  The remaining lines are loaded as data.

    Parameters
    ----------
    csv_path : str
        Path to the raw CSV file.

    Returns
    -------
    pd.DataFrame
        The data rows, with ``RT`` / ``Final Conc.`` / ``Accuracy`` / ``S/N``
        columns renamed to ``<compound>_<field>``.  Surplus data columns are
        named ``Extra_Col_<position>``; surplus header labels are dropped.
    """
    measurement_fields = ('RT', 'Final Conc.', 'Accuracy', 'S/N')

    # Only the two header lines are parsed by hand; pandas reads the rest.
    with open(csv_path, 'r', encoding='utf-8') as handle:
        group_row = handle.readline().strip().split(';')
        field_row = handle.readline().strip().split(';')

    # The first cell of line 1 is skipped; compound titles follow it.
    compounds = [
        title.replace(' Results', '')
        for title in group_row[1:]
        if 'Results' in title
    ]

    labels = []
    position = 0  # index of the compound whose column group is being labelled
    for field in field_row:
        if field in measurement_fields and position < len(compounds):
            labels.append(f"{compounds[position]}_{field}")
            # 'S/N' closes a compound's group, so the next measurement
            # column belongs to the following compound.
            if field == 'S/N':
                position += 1
        else:
            # Metadata columns, unknown fields, and measurement fields left
            # over once the compound names run out keep their raw name.
            labels.append(field)

    # Read the data rows (everything after the two header lines).
    rawdata = pd.read_csv(csv_path, delimiter=';', skiprows=2, header=None)

    # Fit the label list to the actual column count: truncate if there are
    # too many labels, pad with positional placeholders if there are too few.
    n_cols = len(rawdata.columns)
    padding = [f"Extra_Col_{i}" for i in range(len(labels), n_cols)]
    rawdata.columns = labels[:n_cols] + padding

    return rawdata


def calculate_rse(expected, measured, compound_name, p=2):
    """
    Compute the percent RSE of a calibration and print a per-point table.

    The value reported is
    ``100 * sqrt(sum(((measured_i - expected_i) / expected_i) ** 2) / (n - p))``
    where ``n`` is the number of points.  If the function is called with
    ``p == 2`` and the result exceeds 20 %, the value is recomputed with
    ``p = 3`` and the result is labelled "quadratic", provided there are more
    than three points; with exactly three points the linear result is kept
    because ``n - 3`` would be zero.

    Parameters
    ----------
    expected : sequence of float
        Expected concentration for each calibration point; must be non-zero.
    measured : sequence of float
        Measured concentration for each calibration point.
    compound_name : str
        Label used in the printed table and in the returned dictionary.
    p : int, optional
        Number of model parameters subtracted from ``n`` (default 2).

    Returns
    -------
    dict
        Keys ``compound_name``, ``n_points``, ``parameters``,
        ``relationship_type``, ``rse_percent``,
        ``sum_relative_squared_error`` and ``detailed_results`` (one dict per
        calibration point).

    Raises
    ------
    ValueError
        If the inputs differ in length, if ``n <= p`` (the divisor ``n - p``
        would be zero or negative), or if any expected value is zero.
    """
    n_points = len(expected)
    if n_points != len(measured):
        raise ValueError("Expected and measured arrays must have the same length")
    # Fail early with a clear message instead of a ZeroDivisionError or a
    # "math domain error" from sqrt of a negative sum further down.
    if n_points <= p:
        raise ValueError(
            f"Need more than {p} calibration points to compute RSE with p={p} "
            f"(got {n_points})"
        )
    if any(value == 0 for value in expected):
        raise ValueError("Expected concentrations must be non-zero to compute RSE")

    def compute_rse(n_params):
        """Print the per-point table for ``n_params`` and return the totals."""
        degrees_of_freedom = n_points - n_params
        sum_relative_squared_error = 0
        detailed_results = []

        print(f"\n{compound_name} (p={n_params})")
        print("=" * 60)
        print("Point\tExpected\tMeasured\tDifference\t(Diff)²/xi²\t(Diff)²/xi²/(n-p)")
        print("-" * 80)

        for number, (exp_val, meas_val) in enumerate(zip(expected, measured), start=1):
            diff = meas_val - exp_val
            nominator = (diff / exp_val) ** 2
            relative_squared_error = nominator / degrees_of_freedom
            sum_relative_squared_error += relative_squared_error

            detailed_results.append({
                'Point': f'CAL {number}',
                'Expected': exp_val,
                'Measured': meas_val,
                'Difference': diff,
                'Nominator': nominator,
                'Relative_Squared_Error_Term': relative_squared_error
            })

            print(f"CAL {number}\t{exp_val}\t\t{meas_val:.4f}\t\t{diff:.4f}\t\t"
                  f"{nominator:.6f}\t{relative_squared_error:.6f}")

        rse_value = 100 * math.sqrt(sum_relative_squared_error)

        print("-" * 80)
        print(f"Sum of relative squared errors: {sum_relative_squared_error:.6f}")
        print(f"RSE = 100 * √({sum_relative_squared_error:.6f}) = {rse_value:.3f}%")

        return rse_value, detailed_results, sum_relative_squared_error

    rse, details, sum_rse = compute_rse(p)
    relationship_type = "linear" if p == 2 else "quadratic"

    if p == 2 and rse > 20:
        if n_points > 3:
            # NOTE(review): this pass re-uses the same measured values and only
            # shrinks the divisor (n - p), so the recomputed RSE is never
            # smaller than the linear one.  A real quadratic check would need
            # concentrations re-derived from a quadratic fit - confirm intent.
            print("\nLinearity test failed (RSE > 20%), recalculating with p=3 (quadratic)...")
            p = 3
            rse, details, sum_rse = compute_rse(p)
            relationship_type = "quadratic"
        else:
            # With exactly three points n - 3 == 0, so p=3 cannot be evaluated.
            print("\nLinearity test failed (RSE > 20%), but p=3 (quadratic) needs more than "
                  f"3 calibration points (got {n_points}); keeping the linear result.")

    return {
        'compound_name': compound_name,
        'n_points': n_points,
        'parameters': p,
        'relationship_type': relationship_type,
        'rse_percent': rse,
        'sum_relative_squared_error': sum_rse,
        'detailed_results': details
    }

def calculate_eis_rse(df, expected_concentrations, p=2):
    """
    Run ``calculate_rse`` for every EIS compound found in ``df``.

    A compound is treated as EIS when a column containing ``_Final Conc.``
    exists for it and its name starts with one of ``13C``, ``18O``, ``15N``,
    ``2H`` or ``D``.  Only rows whose ``Type`` equals ``'Cal'`` are used, and
    NaN measurements are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Labelled data with a ``Type`` column and ``<compound>_Final Conc.``
        columns.
    expected_concentrations : dict
        Maps compound name to a single expected value, which is repeated for
        every valid calibration point of that compound.
    p : int, optional
        Parameter count forwarded to ``calculate_rse`` (default 2).

    Returns
    -------
    dict
        Maps compound name to the dictionary returned by ``calculate_rse``;
        empty when there are no ``'Cal'`` rows.
    """
    conc_suffix = '_Final Conc.'
    # NOTE(review): the bare 'D' prefix matches any compound whose name begins
    # with a capital D, not only names like "D3-..." - confirm this is intended.
    label_prefixes = ('13C', '18O', '15N', '2H', 'D')

    eis_compounds = [
        name
        for name in (col.replace(conc_suffix, '') for col in df.columns if conc_suffix in col)
        if name.startswith(label_prefixes)
    ]
    print(f"Identified EIS compounds: {eis_compounds}")

    cal_data = df[df['Type'] == 'Cal'].copy()
    if cal_data.empty:
        print("No CAL samples found in the data")
        return {}

    rse_results_eis = {}
    for compound in eis_compounds:
        conc_col = compound + conc_suffix

        if conc_col not in df.columns:
            print(f"Warning: Column {conc_col} not found")
            continue
        if compound not in expected_concentrations:
            print(f"Warning: No expected concentrations provided for {compound}")
            continue

        # Keep only calibration points that actually have a measurement.
        measured = cal_data[conc_col].values
        measured = measured[~np.isnan(measured)]
        if len(measured) == 0:
            print(f"Warning: No valid data points for {compound}")
            continue

        # The same expected value applies to every calibration point.
        nominal = expected_concentrations[compound]
        rse_results_eis[compound] = calculate_rse(
            [nominal] * len(measured), measured.tolist(), compound, p
        )

    return rse_results_eis

def calculate_target_rse(df, expected_concentrations, p=2):
    """
    Run ``calculate_rse`` for every target analyte found in ``df``.

    A compound is treated as a target analyte when a column containing
    ``_Final Conc.`` exists for it and its name is a key of
    ``expected_concentrations``.  Only rows whose ``Type`` equals ``'Cal'``
    are used; points with a NaN measurement are dropped together with their
    expected value.

    Parameters
    ----------
    df : pd.DataFrame
        Labelled data with a ``Type`` column and ``<compound>_Final Conc.``
        columns.
    expected_concentrations : dict
        Maps compound name to a sequence of expected values, one per
        ``'Cal'`` row and in the same order.
    p : int, optional
        Parameter count forwarded to ``calculate_rse`` (default 2).

    Returns
    -------
    dict
        Maps compound name to the dictionary returned by ``calculate_rse``.
        Compounds with no valid points, or whose expected values do not line
        up one-to-one with the ``'Cal'`` rows, are left out (the latter with
        a printed warning).
    """
    conc_suffix = '_Final Conc.'
    target_analytes = [
        name
        for name in (col.replace(conc_suffix, '') for col in df.columns if conc_suffix in col)
        if name in expected_concentrations
    ]

    print(f"Identified target analytes: {target_analytes}")
    cal_data = df[df['Type'] == 'Cal'].copy()

    if cal_data.empty:
        print("No CAL samples found in the data")
        return {}

    rse_results_target = {}

    for compound in target_analytes:
        conc_col = f"{compound}{conc_suffix}"
        if conc_col not in df.columns:
            continue

        actual_concs = cal_data[conc_col].values
        expected_arr = np.asarray(expected_concentrations[compound], dtype=float)

        # The NaN mask built from the measurements is applied to the expected
        # values too, so both arrays must have the same shape.  Without this
        # check a scalar or wrong-length entry fails with an opaque IndexError.
        if expected_arr.shape != actual_concs.shape:
            print(f"Warning: {compound} has {expected_arr.size} expected concentration(s) "
                  f"but {actual_concs.size} CAL sample value(s); skipping")
            continue

        valid_mask = ~np.isnan(actual_concs)
        actual_concs_clean = actual_concs[valid_mask]
        expected_clean = expected_arr[valid_mask]

        if len(actual_concs_clean) == 0:
            continue

        rse_results_target[compound] = calculate_rse(
            expected_clean.tolist(), actual_concs_clean.tolist(), compound, p
        )

    return rse_results_target

def print_rse_summary(rse_results):
    """
    Print a per-compound summary of RSE results.

    Parameters
    ----------
    rse_results : dict
        Maps compound name to a result dictionary providing the keys
        ``rse_percent``, ``n_points``, ``relationship_type`` and
        ``parameters``.

    Returns
    -------
    None
        Output goes to stdout.  For each compound a verdict line follows the
        figures, except for a linear result at or below 20 %, which gets none.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("COMPOUND RSE SUMMARY")
    print(banner)

    for compound in rse_results:
        entry = rse_results[compound]
        print(f"\n{compound}:")
        print(f"  RSE: {entry['rse_percent']:.3f}%")
        print(f"  Calibration Points: {entry['n_points']}")
        print(f"  Relationship type: {entry['relationship_type']} (p = {entry['parameters']})")

        is_quadratic = entry['relationship_type'] == "quadratic"
        over_limit = entry['rse_percent'] > 20

        if is_quadratic:
            if over_limit:
                print("  ⚠ WARNING: Even after quadratic fit (p=3), RSE > 20% — system fails.")
            else:
                print("  ✔ Quadratic fit passed RSE < 20%.")
        elif over_limit:
            print("  ⚠ Linear fit failed RSE > 20%.")
