## Feature engineering
This section adds the formulas and derived quantities that the models will need as input features.

In [3]:
"""
Simple feature engineering script for MOSFET EMI surrogate data.

- Reads all CSVs from:   mosfets_step3_final_cleaned/  (files matching *_cleaned.csv)
- Writes augmented CSVs to: feature_engineered_MOSFET_data/
- Adds **physics-informed input-only** features (no target leakage).
- Keeps the code compact and easy to follow.

Citations (why these features?):
- Ott, H. W., *Electromagnetic Compatibility Engineering*, Wiley, 2009.
  → Overshoot/ringing scale with loop inductance and LC resonance.
- Erickson & Maksimović, *Fundamentals of Power Electronics*, 2nd ed.
  → Slew/energy relations, parasitics’ effects; L/R, R·C time constants.
- Bogatin, E., *Signal and Power Integrity – Simplified*, 3rd ed.
  → LC resonance f0=1/(2π√LC), impedance √(L/C), damping ratio trends.
- Paul, C. R., *Introduction to Electromagnetic Compatibility*, 2nd ed.
  → Coupling paths, effect of capacitances on switching noise.

Note: Only columns that already exist in the headers of the cleaned files are used.
We **do not** touch target columns (rise/fall/overshoot/undershoot/ringing).
"""
import os
import glob
import math
import pandas as pd
import numpy as np

# Folder names; both are resolved relative to the current working directory.
INPUT_DIR = "mosfets_step3_final_cleaned"
OUTPUT_DIR = "feature_engineered_MOSFET_data"

# The destination must exist before any augmented CSV is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Report where inputs are read from, then list every file that was discovered
# (sorted so the processing order is reproducible between runs).
print("Input folder:", os.path.abspath(INPUT_DIR))
files = sorted(glob.glob(os.path.join(INPUT_DIR, "*_cleaned.csv")))
if files:
    for f in files:
        print(" -", os.path.basename(f))
else:
    print("No input files found. Expected pattern: *_cleaned.csv")

# Helper: natural log that is defined only for strictly positive values
def safe_log(series):
    """Return the element-wise natural log of *series*, NaN where undefined.

    Infinite, zero, negative and missing entries all yield NaN. Positive
    values smaller than the smallest normal float are raised to that floor
    before the log is taken. The input Series is not modified.
    """
    floor = np.finfo(float).tiny
    # Drop infinities first, then work in float so NaN can be represented.
    cleaned = series.replace([np.inf, -np.inf], np.nan).astype(float)
    # Anything that is not strictly positive has no real logarithm.
    positive = cleaned.where(cleaned > 0)
    return np.log(positive.clip(lower=floor))

# Feature engineering on one DataFrame (simple & explicit)
def _finite(values):
    """Return *values* with +/-inf (e.g. from division by zero) mapped to NaN."""
    return values.replace([np.inf, -np.inf], np.nan)


def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add physics-informed, input-only features to *df*.

    Columns are added to *df* in place and the same object is returned.
    Each feature is created only when the columns it needs are present, so
    a frame with a partial header is processed without error and without
    fabricated values.

    The features that involve a division or square root (f0, Z0, zeta, the
    capacitance and gate-resistance ratios, the overshoot index and L/R)
    store NaN instead of +/-inf when a row makes the formula undefined, e.g.
    zero capacitance or zero resistance. The log-scaled columns come from
    ``safe_log``.

    Args:
        df: Frame holding the raw input columns (any of ``Ls4``..``Ls11``,
            ``R1``, ``Coss``, ``Crss``, ``Rg``, ``Vbus``, ``Qrr_typ``,
            ``Irrm_typ``). Other columns are left untouched.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    # ---- 1) Loop inductance sum (Ott; affects overshoot & f0) ----
    L_cols = ["Ls4", "Ls5", "Ls6", "Ls7", "Ls8", "Ls9", "Ls10", "Ls11"]
    present_L = [c for c in L_cols if c in df.columns]
    if present_L:
        # Without this guard an empty column list sums to 0 for every row,
        # which would cascade into f0 = inf and Z0 = 0 downstream.
        # min_count=1 keeps a row NaN when none of its inductances is known
        # instead of reporting a fabricated 0 H.
        df["L_loop_total"] = df[present_L].sum(axis=1, min_count=1)

    # ---- 2) Loop series resistance (for damping, Bogatin) ----
    if "R1" in df.columns:
        df["R_loop"] = df["R1"]

    # ---- 3) Effective power-path capacitance (use Coss if present; Paul/Ott) ----
    if "Coss" in df.columns:
        df["C_eff_power"] = df["Coss"].astype(float)

    # ---- 4) LC resonance & impedance (f0, Z0) (Bogatin/Ott) ----
    if {"L_loop_total", "C_eff_power"}.issubset(df.columns):
        L = df["L_loop_total"].astype(float)
        C = df["C_eff_power"].astype(float)
        with np.errstate(divide="ignore", invalid="ignore"):
            df["f0_LC_power_Hz"] = _finite(1.0 / (2.0 * math.pi * np.sqrt(L * C)))
            df["Z0_LC_power"] = _finite(np.sqrt(L / C))

    # ---- 5) Damping ratio (zeta ≈ R / (2*√(L/C))) (Bogatin/Ott) ----
    if {"R_loop", "L_loop_total", "C_eff_power"}.issubset(df.columns):
        L = df["L_loop_total"].astype(float)
        C = df["C_eff_power"].astype(float)
        R = df["R_loop"].astype(float)
        with np.errstate(divide="ignore", invalid="ignore"):
            # Use the NaN-cleaned impedance so an undefined Z0 gives an
            # undefined zeta rather than a misleading 0 (R / inf).
            z0 = _finite(np.sqrt(L / C))
            df["zeta_power_loop"] = _finite(R / (2.0 * z0))

    # ---- 6) Reverse-recovery indicators (Erickson&Maksimović; EMI stress) ----
    if "Qrr_typ" in df.columns:
        df["Qrr"] = df["Qrr_typ"].astype(float)
    if "Irrm_typ" in df.columns:
        df["Irrm"] = df["Irrm_typ"].astype(float)

    # ---- 7) Normalized capacitance ratio (Crss/Coss ~ Cgd/Coss) (Paul) ----
    if {"Crss", "Coss"}.issubset(df.columns):
        with np.errstate(divide="ignore", invalid="ignore"):
            df["Cgd_over_Ceff"] = _finite(
                df["Crss"].astype(float) / df["Coss"].astype(float)
            )

    # ---- 8) Gate resistance vs LC impedance (dimensionless) (Bogatin) ----
    if {"Rg", "Z0_LC_power"}.issubset(df.columns):
        with np.errstate(divide="ignore", invalid="ignore"):
            df["Rg_over_Z0"] = _finite(
                df["Rg"].astype(float) / df["Z0_LC_power"].astype(float)
            )

    # ---- 9) Overshoot risk index (heuristic) (Ott) ----
    if {"Vbus", "Z0_LC_power"}.issubset(df.columns):
        base = df["Vbus"].astype(float) * df["Z0_LC_power"].astype(float)
        if "zeta_power_loop" in df.columns:
            with np.errstate(divide="ignore", invalid="ignore"):
                df["overshoot_risk_index"] = _finite(
                    base / (1.0 + df["zeta_power_loop"].astype(float))
                )
        else:
            df["overshoot_risk_index"] = base

    # ---- 10) Time constants (L/R and R*C) (Erickson&Maksimović) ----
    if {"L_loop_total", "R_loop"}.issubset(df.columns):
        with np.errstate(divide="ignore", invalid="ignore"):
            df["tau_L_over_R"] = _finite(
                df["L_loop_total"].astype(float) / df["R_loop"].astype(float)
            )
    if {"R_loop", "C_eff_power"}.issubset(df.columns):
        df["tau_R_times_C"] = df["R_loop"].astype(float) * df["C_eff_power"].astype(float)

    # ---- 11) Helpful log-scales for wide-range variables (common ML practice) ----
    for col in [
        "L_loop_total", "C_eff_power", "Z0_LC_power", "f0_LC_power_Hz",
        "Qrr", "Irrm", "tau_L_over_R", "tau_R_times_C",
    ]:
        if col in df.columns:
            df[f"log_{col}"] = safe_log(df[col])

    return df

# Process all files
# Batch driver: for each discovered input CSV, load it, append the engineered
# features, and write the result into OUTPUT_DIR under the same base name with
# a "_feature_engineered" suffix.
for path in files:
    # Base name of the input without its ".csv" extension.
    name = os.path.splitext(os.path.basename(path))[0]  
    out_path = os.path.join(OUTPUT_DIR, f"{name}_feature_engineered.csv")
    try:
        df = pd.read_csv(path)
        df_fe = add_features(df)
        # index=False: do not write the DataFrame index as an extra column.
        df_fe.to_csv(out_path, index=False)
        print(f"Saved: {os.path.relpath(out_path)}  (rows={len(df_fe)}, cols={df_fe.shape[1]})")
    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next file.
        print(f"ERROR processing {path}: {e}")

# Printed unconditionally, including when files is empty or a file failed above.
print("\nDone. Feature-engineered files are in:", os.path.abspath(OUTPUT_DIR))


Input folder: c:\Users\pc\Desktop\TRAIL\mosfets_step3_final_cleaned
 - C2M0025120D_cleaned.csv
 - C2M0040120D_cleaned.csv
 - C2M0080120D_cleaned.csv
 - C2M0160120D_cleaned.csv
 - C2M0280120D_cleaned.csv
 - C2M1000170D_cleaned.csv
Saved: feature_engineered_MOSFET_data\C2M0025120D_cleaned_feature_engineered.csv  (rows=119730, cols=71)
Saved: feature_engineered_MOSFET_data\C2M0040120D_cleaned_feature_engineered.csv  (rows=298411, cols=71)
Saved: feature_engineered_MOSFET_data\C2M0080120D_cleaned_feature_engineered.csv  (rows=404934, cols=71)
Saved: feature_engineered_MOSFET_data\C2M0160120D_cleaned_feature_engineered.csv  (rows=158214, cols=71)
Saved: feature_engineered_MOSFET_data\C2M0280120D_cleaned_feature_engineered.csv  (rows=418047, cols=71)
Saved: feature_engineered_MOSFET_data\C2M1000170D_cleaned_feature_engineered.csv  (rows=130528, cols=71)

Done. Feature-engineered files are in: c:\Users\pc\Desktop\TRAIL\feature_engineered_MOSFET_data
