In [7]:
#!/usr/bin/env python
"""
Unified EIS → SoC (classification) & SoH (regression) Model – Realistic v9‑fixed
================================================================================

• Random‑Forest classifier for SoC
• Gaussian‑Process regressor for SoH
• Optional capacity-test refinement & CPP‑based RUL
• End‑to‑end artefacts written to models_eis_phase2_phys/
"""

# ──────────────────────────────────────────────────────────────────────────
# 0. Imports
# ──────────────────────────────────────────────────────────────────────────
from __future__ import annotations
import re, json, math, random, warnings, joblib
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.io import loadmat
from scipy.interpolate import interp1d

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    mean_squared_error, r2_score
)

import matplotlib.pyplot as plt

# ──────────────────────────────────────────────────────────────────────────
# 1. Configuration
# ──────────────────────────────────────────────────────────────────────────
@dataclass
class Config:
    # ---------------------------------------------------------------------
    # paths
    EIS_DIR: Path = Path(r"C:\Users\tmgon\OneDrive - Edith Cowan University\00 - Megallan Power\NMC Batteries Warwick Station\NMC\DIB_Data\.matfiles\EIS_Test")
    CAP_DIR: Path = Path(r"C:\Users\tmgon\OneDrive - Edith Cowan University\00 - Megallan Power\NMC Batteries Warwick Station\NMC\DIB_Data\.matfiles\Capacity_Check")
    MODEL_DIR: Path = Path("models_eis_phase2_phys")

    # evaluate exactly these files after training
    EIS_TEST_FILES: List[Path] = field(default_factory=lambda: [Path("Mazda-Battery-Cell1.xlsx")])

    # ---------------------------------------------------------------------
    # frequency grid
    F_MIN: float = 1e-2
    F_MAX: float = 1e4
    N_FREQ: int  = 60

    # ---------------------------------------------------------------------
    # train / test split
    TEST_FRAC: float = 0.2           # by CellID
    RANDOM_STATE: int = 42

    # ---------------------------------------------------------------------
    # dimensionality options
    USE_PCA_SOC: bool = True
    PCA_SOC_COMPONENTS: int = 25
    USE_PCA_SOH: bool = False
    PCA_SOH_COMPONENTS: int = 30

    # ---------------------------------------------------------------------
    # feature‑toggles  (only BASIC & RAW in this minimal script)
    INCLUDE_RAW_RE_IM:  bool = True
    INCLUDE_BASICS:     bool = True

    # ---------------------------------------------------------------------
    # DRT (disabled in this trimmed version – set True & add code if needed)
    INCLUDE_DRT: bool = False
    DRT_POINTS: int   = 60
    DRT_TAU_MIN: float = 1e-4
    DRT_TAU_MAX: float = 1e4
    DRT_LAMBDA:  float = 1e-2

    # ---------------------------------------------------------------------
    # capacity / RUL
    REFINE_SOH_WITH_CAPACITY: bool = True
    CPP_ROLLING_WINDOW: int = 5
    CPP_MIN_POINTS:    int = 6
    CPP_FALLBACK:      float = 20.0      # cycles‑per‑percent if no cap‑data
    DECISION_SOH_PERCENT: float = 50.0
    ILLUSTRATIVE_MIN_SOH: float = 40.0

    # ---------------------------------------------------------------------
    # misc
    TEST_TEMPERATURE_OVERRIDE: Optional[float] = 25.0
    MAX_GPR_TRAIN_SAMPLES: int = 3500
    FORCE_RETRAIN: bool = True
    VERBOSE: bool = True
    FEATURE_VERSION: int = 8

cfg = Config()
cfg.MODEL_DIR.mkdir(parents=True, exist_ok=True)

random.seed(cfg.RANDOM_STATE)
np.random.seed(cfg.RANDOM_STATE)

CANON_FREQ = np.geomspace(cfg.F_MAX, cfg.F_MIN, cfg.N_FREQ)

# ──────────────────────────────────────────────────────────────────────────
# 2. Regex helpers
# ──────────────────────────────────────────────────────────────────────────
EIS_META_PATTERN = re.compile(
    r"Cell(?P<CellID>\d+)_(?P<SOH>80|85|90|95|100)SOH_"
    r"(?P<Temp>\d+)degC_(?P<SOC>\d+)SOC_(?P<RealSOH>\d+)"
)
CAP_META_PATTERN = re.compile(
    r"Cell(?P<CellID>\d+)_(?P<SOH>80|85|90|95|100)SOH_"
    r"Capacity_Check_(?P<Temp>\d+)degC_(?P<Cycle>\d+)cycle"
)

def parse_eis_metadata(stem: str) -> Optional[Dict[str,Any]]:
    m = EIS_META_PATTERN.search(stem)
    if not m: return None
    d = m.groupdict()
    return {"CellID": f"Cell{d['CellID']}",
            "SOH_stage": int(d["SOH"]),
            "SOC": int(d["SOC"]),
            "Temp": int(d["Temp"]),
            "RealSOH_file": int(d["RealSOH"]) / 100.0}

def parse_cap_metadata(stem: str) -> Optional[Dict[str,Any]]:
    m = CAP_META_PATTERN.search(stem)
    if not m: return None
    d = m.groupdict()
    return {"CellID": f"Cell{d['CellID']}",
            "SOH_stage": int(d["SOH"]),
            "Temp": int(d["Temp"]),
            "CycleIndex": int(d["Cycle"])}

# ──────────────────────────────────────────────────────────────────────────
# 3. Loading helpers
# ──────────────────────────────────────────────────────────────────────────
def _find_matrix(mat_dict: dict):
    for v in mat_dict.values():
        if isinstance(v, np.ndarray) and v.ndim == 2 and v.shape[1] >= 3:
            return v
    return None

def _interp_channel(freq_raw, y_raw, freq_target):
    freq_raw = np.asarray(freq_raw, float)
    y_raw    = np.asarray(y_raw,   float)
    if freq_raw[0] < freq_raw[-1]:           # ensure descending
        freq_raw = freq_raw[::-1]
        y_raw    = y_raw[::-1]
    uniq, idx = np.unique(freq_raw, return_index=True)
    if len(uniq) != len(freq_raw):           # drop dups
        order   = np.argsort(idx)
        freq_raw, y_raw = uniq[order], y_raw[idx][order]
    f = interp1d(freq_raw, y_raw, bounds_error=False,
                 fill_value=(y_raw[0], y_raw[-1]), kind="linear")
    return f(freq_target)

FREQ_CANDS = ["frequency","freq","f","hz","Frequency(Hz)"]
RE_CANDS   = ["zreal","re","real","Zreal","Zreal (ohm)"]
IM_CANDS   = ["-zimag","zimag","im","imag","Zimag","Zimag (ohm)"]

def _sel_col(df, cands):
    lower={c.lower():c for c in df.columns}
    for c in cands:
        if c.lower() in lower: return lower[c.lower()]
    for c in cands:
        for col in df.columns:
            if c.lower() in col.lower(): return col
    return None

def load_mat_eis(path: Path):
    arr=_find_matrix(loadmat(path))
    if arr is None: raise ValueError("no valid matrix")
    return arr[:,0], arr[:,1], arr[:,2]

def load_table_eis(path: Path):
    df = pd.read_csv(path) if path.suffix.lower()==".csv" else pd.read_excel(path)

    fcol = _sel_col(df, FREQ_CANDS)
    rcol = _sel_col(df, RE_CANDS)
    icol = _sel_col(df, IM_CANDS)
    if rcol is None or icol is None:
        raise ValueError(f"Missing Re/Im columns in {path.name}")

    re   = pd.to_numeric(df[rcol], errors="coerce").values
    im   = pd.to_numeric(df[icol], errors="coerce").values
    if fcol:
        freq = pd.to_numeric(df[fcol], errors="coerce").values
    else:
        freq = np.geomspace(cfg.F_MAX, cfg.F_MIN, len(re))

    if np.nanmean(im) > 0:                        # enforce capacitive sign
        im = -im
    n = min(len(freq), len(re), len(im))
    return freq[:n], re[:n], im[:n]

def load_any(path: Path):
    suf = path.suffix.lower()
    if suf == ".mat":  return load_mat_eis(path)
    if suf in (".csv",".xls",".xlsx"): return load_table_eis(path)
    raise ValueError(f"unsupported file type {suf}")

# ──────────────────────────────────────────────────────────────────────────
# 4. Features   (RAW + BASIC only for brevity)
# ──────────────────────────────────────────────────────────────────────────
def basic_feats(re_i, im_i):
    z = np.hypot(re_i, im_i)
    return [re_i[0], re_i[-1], re_i[-1]-re_i[0], z.max(), z.mean(), z.std()]

def build_feature_vector(re_i, im_i, temp, freq):
    parts=[]
    if cfg.INCLUDE_RAW_RE_IM:
        parts += [re_i, im_i]
    if cfg.INCLUDE_BASICS:
        parts.append(basic_feats(re_i, im_i))
    parts.append([temp])
    v = np.concatenate(parts).astype(float)
    return np.nan_to_num(v, nan=0.0, posinf=0.0, neginf=0.0)

# ──────────────────────────────────────────────────────────────────────────
# 5. Capacity helpers (CPP / label refinement)
# ──────────────────────────────────────────────────────────────────────────
def load_capacity_df(cap_dir: Path):
    if not cap_dir.exists(): return pd.DataFrame()
    recs=[]
    for fp in cap_dir.rglob("*.mat"):
        meta = parse_cap_metadata(fp.stem)
        if not meta: continue
        try:
            arr = _find_matrix(loadmat(fp))
            col = np.argmax(np.abs(arr[-50:,:]).mean(axis=0))
            meta["MeasuredCapacity_Ah"] = float(np.nanmax(arr[:,col]))
            recs.append(meta)
        except Exception:
            pass
    df = pd.DataFrame(recs)
    if df.empty: return df
    ref=df.groupby("CellID")["MeasuredCapacity_Ah"].transform("max")
    df["NormCapacity"]=df["MeasuredCapacity_Ah"]/ref
    df["SoH_percent"]=df["NormCapacity"]*100.0
    return df

def estimate_cpp(cap_df: pd.DataFrame):
    cpp={}
    for cid, grp in cap_df.groupby("CellID"):
        if grp.shape[0] < cfg.CPP_MIN_POINTS: continue
        tail = grp.sort_values("CycleIndex").tail(cfg.CPP_ROLLING_WINDOW)
        x, y = tail.CycleIndex.values, tail.SoH_percent.values
        if len(np.unique(x)) < 2: continue
        slope = np.polyfit(x, y, 1)[0]
        if slope < -1e-6:
            cpp[cid] = 1.0/abs(slope)
    return cpp

# ──────────────────────────────────────────────────────────────────────────
# 6. Build dataset
# ──────────────────────────────────────────────────────────────────────────
def load_single_mat(fp: Path):
    meta=parse_eis_metadata(fp.stem)
    freq,re_raw,im_raw = load_mat_eis(fp)
    re_i=_interp_channel(freq,re_raw,CANON_FREQ)
    im_i=_interp_channel(freq,im_raw,CANON_FREQ)
    return meta, re_i, im_i

def build_dataset(eis_dir: Path, cap_df: pd.DataFrame):
    files = sorted(eis_dir.rglob("*.mat"))
    if not files: raise FileNotFoundError("No .mat spectra found")

    rows, feats = [], []
    for fp in tqdm(files, desc="Load spectra"):
        try:
            meta, re_i, im_i = load_single_mat(fp)
            vec = build_feature_vector(re_i, im_i, meta["Temp"], CANON_FREQ)
            rows.append(meta); feats.append(vec)
        except Exception as e:
            if cfg.VERBOSE: print(f"[skip] {fp.name}: {e}")

    meta_df = pd.DataFrame(rows)
    X_raw   = np.vstack(feats)

    # --- refine SoH labels with capacity tests ---------------------------
    if cfg.REFINE_SOH_WITH_CAPACITY and not cap_df.empty:
        lk = cap_df.set_index(["CellID","SOH_stage"])["NormCapacity"].to_dict()
        refined=[]
        for cid, stage, fallback in zip(meta_df.CellID, meta_df.SOH_stage, meta_df.RealSOH_file):
            v = lk.get((cid, stage))
            refined.append(100.0*v if v is not None else fallback*100.0)
        meta_df["SoH_cont"] = refined
    else:
        meta_df["SoH_cont"] = meta_df["RealSOH_file"]*100

    return meta_df, X_raw, meta_df.SOC.values, meta_df.SoH_cont.values

# ──────────────────────────────────────────────────────────────────────────
# 7. Train & plot
# ──────────────────────────────────────────────────────────────────────────
def split_mask(df):
    cells = df.CellID.unique()
    rng   = np.random.default_rng(cfg.RANDOM_STATE)
    test  = rng.choice(cells, size=max(1,int(len(cells)*cfg.TEST_FRAC)), replace=False)
    return df.CellID.isin(test)

def train_models(meta_df, X_raw, y_soc, y_soh):
    mtest = split_mask(meta_df)

    # --- SoC --------------------------------------------------------------
    soc_scaler = StandardScaler().fit(X_raw)
    X_soc_s    = soc_scaler.transform(X_raw)
    soc_pca    = PCA(n_components=min(cfg.PCA_SOC_COMPONENTS,X_soc_s.shape[1]-1),
                     random_state=cfg.RANDOM_STATE).fit(X_soc_s) if cfg.USE_PCA_SOC else None
    X_soc_in   = soc_pca.transform(X_soc_s) if soc_pca else X_soc_s
    soc_model  = RandomForestClassifier(n_estimators=600,class_weight="balanced",
                                        random_state=cfg.RANDOM_STATE).fit(
                     X_soc_in[~mtest], y_soc[~mtest])

    y_pred = soc_model.predict(X_soc_in[mtest])
    cm     = confusion_matrix(y_soc[mtest], y_pred, labels=soc_model.classes_)
    plt.figure(figsize=(4,4))
    plt.imshow(cm, cmap="Blues")
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.xticks(range(len(soc_model.classes_)), soc_model.classes_)
    plt.yticks(range(len(soc_model.classes_)), soc_model.classes_)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j,i,cm[i,j],ha="center",va="center")
    plt.tight_layout(); plt.savefig(cfg.MODEL_DIR/"soc_confusion.png", dpi=150); plt.close()

    if cfg.VERBOSE:
        print("\n[SoC] accuracy:",
              accuracy_score(y_soc[mtest], y_pred))
        print(classification_report(y_soc[mtest], y_pred, digits=4))

    # --- SoH --------------------------------------------------------------
    soh_scaler = StandardScaler().fit(X_raw)
    X_soh_s    = soh_scaler.transform(X_raw)
    soh_pca    = PCA(n_components=min(cfg.PCA_SOH_COMPONENTS,X_soh_s.shape[1]-1),
                     random_state=cfg.RANDOM_STATE).fit(X_soh_s) if cfg.USE_PCA_SOH else None
    X_soh_in   = soh_pca.transform(X_soh_s) if soh_pca else X_soh_s

    kernel = RBF(length_scale=np.ones(X_soh_in.shape[1])*3.0,
                 length_scale_bounds=(1e-1,1e4)) + WhiteKernel(1e-2,(1e-6,1e-1))
    gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
                                   n_restarts_optimizer=3, random_state=cfg.RANDOM_STATE)
    if X_soh_in.shape[0] > cfg.MAX_GPR_TRAIN_SAMPLES:
        idx = np.random.default_rng(cfg.RANDOM_STATE)\
                 .choice(X_soh_in.shape[0], size=cfg.MAX_GPR_TRAIN_SAMPLES, replace=False)
        gpr.fit(X_soh_in[idx], y_soh[idx])
    else:
        gpr.fit(X_soh_in, y_soh)

    # quick scatter plot
    y_pred_test = gpr.predict(X_soh_in[mtest])
    plt.figure(figsize=(4,4))
    plt.scatter(y_soh[mtest], y_pred_test, s=18)
    lo,hi=min(y_soh[mtest].min(),y_pred_test.min()), max(y_soh[mtest].max(),y_pred_test.max())
    plt.plot([lo,hi],[lo,hi],'k--',lw=1)
    plt.xlabel("True SoH"); plt.ylabel("Pred"); plt.tight_layout()
    plt.savefig(cfg.MODEL_DIR/"soh_regression_test.png", dpi=150); plt.close()

    print(f"[SoH] R² test = {r2_score(y_soh[mtest], y_pred_test):.3f}")

    return {"soc_scaler":soc_scaler, "soc_pca":soc_pca, "soc_model":soc_model,
            "soh_scaler":soh_scaler, "soh_pca":soh_pca, "soh_model":gpr,
            "freq_grid":CANON_FREQ}

# ──────────────────────────────────────────────────────────────────────────
# 8. Inference helpers
# ──────────────────────────────────────────────────────────────────────────
def featurize(path: Path, bundle):
    meta = parse_eis_metadata(path.stem) or {}
    f,re,im = load_any(path)
    re_i = _interp_channel(f,re,bundle["freq_grid"])
    im_i = _interp_channel(f,im,bundle["freq_grid"])
    temp = meta.get("Temp", cfg.TEST_TEMPERATURE_OVERRIDE or -1)
    return build_feature_vector(re_i, im_i, temp, bundle["freq_grid"]), meta

def predict(path: Path, bundle):
    v, meta = featurize(path, bundle)

    # SoC
    x = bundle["soc_scaler"].transform(v.reshape(1,-1))
    if bundle["soc_pca"] is not None:
        x = bundle["soc_pca"].transform(x)
    soc_pred = int(bundle["soc_model"].predict(x)[0])
    soc_prob = bundle["soc_model"].predict_proba(x)[0]

    # SoH
    xs = bundle["soh_scaler"].transform(v.reshape(1,-1))
    if bundle["soh_pca"] is not None:
        xs = bundle["soh_pca"].transform(xs)
    soh_mean, soh_std = bundle["soh_model"].predict(xs, return_std=True)
    return {"file": str(path),
            "predicted_SoC": soc_pred,
            "SoC_probabilities": {int(c):float(p) for c,p in zip(bundle["soc_model"].classes_, soc_prob)},
            "predicted_SoH_percent": float(soh_mean[0]),
            "SoH_std_estimate": float(soh_std[0]),
            "parsed_metadata": meta}

# ──────────────────────────────────────────────────────────────────────────
# 9. RUL projection (optional, uses CPP map)
# ──────────────────────────────────────────────────────────────────────────
def build_projection(soh,cpp,lower,exp=1.25,n=150):
    if soh<=lower or cpp<=0: return np.array([0]), np.array([soh])
    total=(soh-lower)*cpp
    cycles=np.linspace(0,total,n)
    curve=lower+(soh-lower)*(1-cycles/total)**exp
    return cycles,curve

def plot_projection(base,soh,std,cpp,out):
    cyc,cur=build_projection(soh,cpp,cfg.ILLUSTRATIVE_MIN_SOH)
    plt.figure(figsize=(5.5,4))
    plt.plot(cyc,cur,lw=2); plt.axhline(cfg.DECISION_SOH_PERCENT,ls="--",color="orange")
    plt.axhline(cfg.ILLUSTRATIVE_MIN_SOH,ls=":",color="red")
    plt.scatter([0],[soh],c="green"); plt.text(0,soh+0.5,f"{soh:.1f}±{std:.1f}%")
    plt.xlabel("Remaining cycles"); plt.ylabel("SoH (%)")
    plt.title(f"RUL projection – {base}"); plt.tight_layout()
    plt.savefig(out,dpi=140); plt.close()

# ──────────────────────────────────────────────────────────────────────────
# 10. Main
# ──────────────────────────────────────────────────────────────────────────
def main():
    # --- capacity data (optional) ----------------------------------------
    cap_df = load_capacity_df(cfg.CAP_DIR) if cfg.REFINE_SOH_WITH_CAPACITY else pd.DataFrame()
    cpp_map = estimate_cpp(cap_df) if not cap_df.empty else {}
    global_cpp = np.median(list(cpp_map.values())) if cpp_map else cfg.CPP_FALLBACK

    # --- dataset + training ----------------------------------------------
    meta_df,X,y_soc,y_soh = build_dataset(cfg.EIS_DIR, cap_df)
    bundle = train_models(meta_df,X,y_soc,y_soh)
    joblib.dump(bundle, cfg.MODEL_DIR/"eis_soc_soh_phys_models.joblib")

    # --- inference on each test file -------------------------------------
    for tf in cfg.EIS_TEST_FILES:
        if not tf.exists():
            print(f"[WARN] {tf} not found"); continue
        res = predict(tf,bundle)
        print("\n",json.dumps(res, indent=2))

        cpp = cpp_map.get(res["parsed_metadata"].get("CellID"), global_cpp)
        plot_projection(tf.stem, res["predicted_SoH_percent"],
                        res["SoH_std_estimate"], cpp,
                        cfg.MODEL_DIR/f"{tf.stem}_projection.png")

        with open(cfg.MODEL_DIR/f"{tf.stem}_prediction.json","w") as f:
            json.dump(res,f,indent=2)

    print("\nArtefacts written to", cfg.MODEL_DIR)

if __name__ == "__main__":
    main()


Load spectra: 100%|██████████| 360/360 [00:00<00:00, 635.49it/s]



[SoC] accuracy: 0.9833333333333333
              precision    recall  f1-score   support

           5     1.0000    0.9167    0.9565        12
          20     1.0000    1.0000    1.0000        12
          50     0.9231    1.0000    0.9600        12
          70     1.0000    1.0000    1.0000        12
          95     1.0000    1.0000    1.0000        12

    accuracy                         0.9833        60
   macro avg     0.9846    0.9833    0.9833        60
weighted avg     0.9846    0.9833    0.9833        60





[SoH] R² test = 0.994

 {
  "file": "Mazda-Battery-Cell1.xlsx",
  "predicted_SoC": 5,
  "SoC_probabilities": {
    "5": 0.34958333333333336,
    "20": 0.30333333333333334,
    "50": 0.04541666666666666,
    "70": 0.12333333333333334,
    "95": 0.17833333333333334
  },
  "predicted_SoH_percent": 9034.75,
  "SoH_std_estimate": 724.4900289733498,
  "parsed_metadata": {}
}

Artefacts written to models_eis_phase2_phys
