┌─[1] Config & constants  → cfg

│
├─[2] Low-level loaders (MAT, CSV/XLSX, interpolation)

│
├─[3] Feature-engineering helpers
│     • raw Re/Im          • “F” heuristics
│     • physical features  • band statistics
│     • diff-segment slopes• DRT features  ←★
│

├─[4] Capacity parser  → cycles-per-percent (CPP) map
│

├─[5] Dataset builder  → X_raw, y_SoC, y_SoH (+ optional shape model)
│

├─[6] Model trainer
│     • RandomForest        → SoC (classification)
│     • GPR / HGB ensemble  → SoH (regression)
│     • optional shape-normalised GPR for SoH
│
├─[7] Bundle saver/loader (.joblib)
│
├─[8] featurize_any()       ← handles *any* test file ext.
│
├─[9] predict_file()        ← single-spectrum inference
│
└─[10] main()
       ↳ build/load bundle → predict test_fp
       ↳ save JSON + RUL projection plot


In [2]:
# ================================================================
# Unified EIS Training + Inference + Dynamic RUL  (v8 – single file)
# ================================================================
#   • Accepts ONE EIS test file (cfg.EIS_TEST_FILE or --test path)
#   • Caps SoH uncertainty to ±5 percentage-points
#   • Outputs the single most-likely SoC class
#   • Back-compatible with legacy model bundles
# ================================================================

from __future__ import annotations
import sys, argparse, json, math, random, re, warnings, joblib
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.io import loadmat
from scipy import linalg
from scipy.interpolate import interp1d

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    mean_squared_error, r2_score
)
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# =========================
# 1. CONFIGURATION
# =========================
@dataclass
class Config:
    """Central, mutable settings object shared by every stage of the pipeline.

    A single module-level instance (``cfg``) is created right after the class
    and read by the loaders, feature builders, trainers and plotting code.
    """
    # --- local folders -------------------------------------------------
    EIS_DIR: Path = Path(r"C:\Users\tgondal0\OneDrive - Edith Cowan University\00 - Megallan Power\NMC Batteries Warwick Station\NMC\DIB_Data\.matfiles\EIS_Test")
    CAP_DIR: Path = Path(r"C:\Users\tgondal0\OneDrive - Edith Cowan University\00 - Megallan Power\NMC Batteries Warwick Station\NMC\DIB_Data\.matfiles\Capacity_Check")
    MODEL_DIR: Path = Path("models_eis_phase2_phys")
    EIS_TEST_FILE: Path = Path(r"C:\Users\tgondal0\OneDrive - Edith Cowan University\00 - Megallan Power\NMC Batteries Warwick Station\NMC\TestFile\Mazda-Battery-Cell5.xlsx")

    # --- spectrum grid -------------------------------------------------
    # Canonical log-spaced frequency grid every spectrum is resampled onto.
    F_MIN: float = 1e-2
    F_MAX: float = 1e4
    N_FREQ: int  = 60

    # --- split & random ------------------------------------------------
    TEST_FRAC: float = 0.2
    # Seed used by set_seed(), the cell-level split, PCA and all estimators.
    # This field was referenced throughout the file but never declared.
    RANDOM_STATE: int = 42

    # --- PCA flags -----------------------------------------------------
    USE_PCA_SOC: bool = True
    USE_PCA_SOH: bool = False
    PCA_SOC_COMPONENTS: int = 25
    PCA_SOH_COMPONENTS: int = 30

    # --- feature toggles ----------------------------------------------
    # Each flag switches one group of columns in build_feature_vector().
    INCLUDE_RAW_RE_IM: bool = True
    INCLUDE_BASICS: bool = True
    INCLUDE_F_FEATS: bool = True
    INCLUDE_PHYSICAL: bool = True
    INCLUDE_DRT: bool = True
    INCLUDE_BAND_STATS: bool = True
    INCLUDE_DIFF_SLOPES: bool = True

    # --- DRT params ----------------------------------------------------
    DRT_POINTS: int = 60
    DRT_TAU_MIN: float = 1e-4
    DRT_TAU_MAX: float = 1e4
    DRT_LAMBDA: float = 1e-2          # ridge weight used in compute_drt()

    # --- SoH / RUL -----------------------------------------------------
    REFINE_SOH_WITH_CAPACITY: bool = True
    MAX_GPR_TRAIN_SAMPLES: int = 3500
    INCLUDE_NORMALIZED_SHAPE_MODEL: bool = True
    ENSEMBLE_SOH: bool = True
    NORMALIZE_SHAPE_BY_HF_RE: bool = True
    DECISION_SOH_PERCENT: float = 50.0
    ILLUSTRATIVE_MIN_SOH: float = 40.0
    CPP_ROLLING_WINDOW: int = 5
    CPP_MIN_POINTS: int = 6
    CPP_FALLBACK: float = 20.0        # cycles-per-percent when none can be estimated

    # --- extras --------------------------------------------------------
    TEST_TEMPERATURE_OVERRIDE: Optional[float] = 25.0
    FORCE_RETRAIN: bool = False
    SAVE_FEATURE_TABLE: bool = True
    VERBOSE: bool = True
    FEATURE_VERSION: int = 8           # ← bumped
    MAHAL_THRESHOLD: float = 10.0
    GP_ARD_NORM_THRESHOLD: float = 6.0
    PLOT_EXPONENT: float = 1.25

# Module-wide settings instance; every function below reads this global.
cfg = Config()
# Explicitly keep retraining off so an existing bundle on disk is reused.
cfg.FORCE_RETRAIN = False
# Make sure the output folder exists before anything is written into it.
cfg.MODEL_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# 2. UTILITIES
# =========================
def set_seed(seed: int):
    """Seed both the stdlib ``random`` module and NumPy's legacy global RNG."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Seed the global RNGs once, at import time, from the configured seed.
set_seed(cfg.RANDOM_STATE)

def to_jsonable(x):
    """Recursively convert *x* into something ``json.dumps`` accepts.

    Dicts keep their keys and have their values converted, lists and tuples
    both become lists, ``Path`` objects become strings, and anything else is
    returned unchanged.
    """
    if isinstance(x, dict):
        return {key: to_jsonable(value) for key, value in x.items()}
    if isinstance(x, (list, tuple)):
        return [to_jsonable(item) for item in x]
    return str(x) if isinstance(x, Path) else x

# Canonical frequency grid: N_FREQ log-spaced points running from F_MAX down to F_MIN.
CANON_FREQ = np.geomspace(cfg.F_MAX, cfg.F_MIN, cfg.N_FREQ)

# =========================
# 3. REGEX & METADATA PARSERS
# =========================
# EIS spectrum stems: Cell<id>_<80|85|90|95|100>SOH_<temp>degC_<soc>SOC_<digits>.
# The trailing digit group is captured as "RealSOH".
EIS_META_PATTERN = re.compile(
    r"Cell(?P<CellID>\d+)_(?P<SOH>80|85|90|95|100)SOH_(?P<Temp>\d+)degC_(?P<SOC>\d+)SOC_(?P<RealSOH>\d+)"
)
# Capacity-check stems: Cell<id>_<80|85|90|95|100>SOH_Capacity_Check_<temp>degC_<n>cycle.
CAP_META_PATTERN = re.compile(
    r"Cell(?P<CellID>\d+)_(?P<SOH>80|85|90|95|100)SOH_Capacity_Check_(?P<Temp>\d+)degC_(?P<Cycle>\d+)cycle"
)

def parse_eis_metadata(stem: str)->Optional[Dict[str,Any]]:
    """Extract cell/SoH/SoC/temperature metadata from an EIS file stem.

    Returns ``None`` when *stem* does not match ``EIS_META_PATTERN``.
    ``RealSOH_file`` is the trailing digit group divided by 100.
    """
    match = EIS_META_PATTERN.search(stem)
    if match is None:
        return None
    return {
        "CellID": f"Cell{match['CellID']}",
        "SOH_stage": int(match["SOH"]),
        "SOC": int(match["SOC"]),
        "Temp": int(match["Temp"]),
        "RealSOH_file": int(match["RealSOH"]) / 100.0,
    }

def parse_cap_metadata(stem:str)->Optional[Dict[str,Any]]:
    """Extract cell/SoH-stage/temperature/cycle metadata from a capacity-check stem.

    Returns ``None`` when *stem* does not match ``CAP_META_PATTERN``.
    """
    match = CAP_META_PATTERN.search(stem)
    if match is None:
        return None
    return {
        "CellID": f"Cell{match['CellID']}",
        "SOH_stage": int(match["SOH"]),
        "Temp": int(match["Temp"]),
        "CycleIndex": int(match["Cycle"]),
    }

# =========================
# 4. LOW-LEVEL LOADERS
# =========================
def _find_matrix(mat_dict:dict):
    """Return the first 2-D ndarray value with >=10 rows and >=3 columns, else ``None``."""
    def _is_table(obj) -> bool:
        return (
            isinstance(obj, np.ndarray)
            and obj.ndim == 2
            and obj.shape[0] >= 10
            and obj.shape[1] >= 3
        )

    return next((value for value in mat_dict.values() if _is_table(value)), None)

def _interp_channel(freq_raw,y_raw,freq_target):
    """Linearly resample one impedance channel onto *freq_target*.

    Parameters
    ----------
    freq_raw, y_raw : array-like
        Measured frequencies and the matching channel values (same length).
    freq_target : array-like
        Frequencies at which the channel is wanted.

    Returns
    -------
    numpy.ndarray
        Channel values at *freq_target*.  Targets outside the measured range
        are held at the nearest measured end (the value at the lowest
        frequency below the range, the value at the highest frequency above).

    Raises
    ------
    ValueError
        If fewer than two distinct finite samples are available.
    """
    freq_raw=np.asarray(freq_raw,float); y_raw=np.asarray(y_raw,float)
    # Drop samples that cannot be interpolated (e.g. NaN from coerced table cells).
    finite=np.isfinite(freq_raw)&np.isfinite(y_raw)
    freq_raw,y_raw=freq_raw[finite],y_raw[finite]
    if freq_raw.size<2:
        raise ValueError("Need at least two finite (frequency, value) samples to interpolate")
    # Orient high→low so the "first occurrence" kept for duplicates is stable.
    if freq_raw[0] < freq_raw[-1]:
        freq_raw=freq_raw[::-1]; y_raw=y_raw[::-1]
    uniq,first=np.unique(freq_raw,return_index=True)
    if len(uniq)!=len(freq_raw):
        keep=np.sort(first)                 # first occurrences, in measured order
        freq_raw=freq_raw[keep]; y_raw=y_raw[keep]
    if freq_raw.size<2:
        raise ValueError("Need at least two distinct frequencies to interpolate")
    # interp1d expects fill_value as (below-range, above-range).  Pick the end
    # values by frequency rather than by array position: the previous
    # (y_raw[0], y_raw[-1]) on a high→low array swapped the two ends.
    lo=int(np.argmin(freq_raw)); hi=int(np.argmax(freq_raw))
    f=interp1d(freq_raw,y_raw,bounds_error=False,
               fill_value=(y_raw[lo],y_raw[hi]),kind="linear")
    return f(freq_target)

# Candidate column headers, in priority order, tried by _select_column()
# (case-insensitive exact match first, then substring match).
FREQ_CANDS=["frequency","freq","f","hz","frequency(hz)","Frequency(Hz)"]
RE_CANDS  =["zreal","re(z)","re","real","z_re","zreal(ohm)","Re (ohm)","re(z) (ohm)","Zreal","Zreal (ohm)"]
IM_CANDS  =["-zimag","zimag","im(z)","im","imag","imaginary","z_im","zimg"," -Zimag (ohm)","-Zimag","Zimag","Zimag (ohm)"]

def _select_column(df:pd.DataFrame,cands:List[str])->Optional[str]:
    """Pick the column of *df* that best matches one of *cands*.

    Matching is case-insensitive and ignores leading/trailing whitespace.
    Pass 1 looks for an exact header match, trying candidates in order;
    pass 2 falls back to "candidate is a substring of the header".

    Column labels are normalised through ``str()`` so tables with numeric or
    otherwise non-string headers no longer raise ``AttributeError``.

    Returns the original column label, or ``None`` when nothing matches.
    """
    labelled=[(str(col).strip().lower(),col) for col in df.columns]
    exact={norm:col for norm,col in labelled}
    wanted=[c.strip().lower() for c in cands]
    for want in wanted:
        if want in exact: return exact[want]
    # NOTE: the substring pass is deliberately loose; short candidates such as
    # "f", "re" or "im" can match unrelated headers, so it only runs when no
    # exact match exists.
    for want in wanted:
        for norm,col in labelled:
            if want in norm: return col
    return None

def load_mat_eis(path:Path):
    """Load a ``.mat`` spectrum and return its first three columns as float arrays.

    The columns are returned in file order; callers treat them as
    (frequency, Re, Im).

    Raises ``ValueError`` when no suitable matrix is found in the file.
    """
    matrix = _find_matrix(loadmat(path))
    if matrix is None:
        raise ValueError(f"No EIS matrix in {path.name}")
    col0, col1, col2 = (matrix[:, k].astype(float) for k in range(3))
    return col0, col1, col2

def load_table_eis(path:Path):
    """Load a CSV/Excel spectrum and return ``(freq, re, im)`` arrays.

    Columns are located by header via ``_select_column``.  When no frequency
    column exists, a log-spaced grid from ``cfg.F_MAX`` to ``cfg.F_MIN`` is
    assumed.  If the mean of the imaginary column is positive the column is
    negated (i.e. it is taken to hold -Im).

    Raises ``ValueError`` for an empty table or missing Re/Im columns.
    """
    is_csv = path.suffix.lower() == ".csv"
    df = pd.read_csv(path) if is_csv else pd.read_excel(path)
    if df.empty:
        raise ValueError("Empty table")

    fcol = _select_column(df, FREQ_CANDS)
    recol = _select_column(df, RE_CANDS)
    imcol = _select_column(df, IM_CANDS)
    if recol is None or imcol is None:
        raise ValueError(f"Missing Re/Im columns in {path.name}")

    def _numeric(column):
        # Non-numeric cells become NaN rather than raising.
        return pd.to_numeric(df[column], errors="coerce").to_numpy()

    re_vals = _numeric(recol)
    im_vals = _numeric(imcol)
    if fcol:
        freq_vals = _numeric(fcol)
    else:
        freq_vals = np.geomspace(cfg.F_MAX, cfg.F_MIN, min(len(re_vals), len(im_vals)))

    # Trim all three channels to a common length.
    n = min(len(freq_vals), len(re_vals), len(im_vals))
    freq_vals, re_vals, im_vals = freq_vals[:n], re_vals[:n], im_vals[:n]

    if np.nanmean(im_vals) > 0:
        im_vals = -im_vals
    return freq_vals, re_vals.astype(float), im_vals.astype(float)

def load_any_inference(path:Path):
    """Dispatch to the right loader based on the file extension (case-insensitive).

    Raises ``ValueError`` for anything other than .mat/.csv/.xls/.xlsx.
    """
    ext = path.suffix.lower()
    if ext == ".mat":
        return load_mat_eis(path)
    if ext in {".csv", ".xls", ".xlsx"}:
        return load_table_eis(path)
    raise ValueError(f"Unsupported test file ext: {ext}")

# =========================
# 5. FEATURE ENGINEERING
# =========================
def compute_F_features(freq,re_i,im_i):
    """Return the seven heuristic "F" features ``[F1..F7]`` of a spectrum.

    F1/F3: Re at the first/last grid point; F2: Re at the -Im maximum;
    F4: Re linearly interpolated at the first sign change of Im (NaN if none);
    F5: F2 - F1 (NaN when the -Im maximum is the first point);
    F6: minimum of Im; F7: Re at the grid point closest to 10 Hz.
    """
    peak = int(np.argmax(-im_i))

    signs = np.sign(im_i)
    crossings = np.flatnonzero(signs[:-1] != signs[1:])
    if crossings.size:
        k = crossings[0]
        im_a, im_b = im_i[k], im_i[k + 1]
        frac = -im_a / (im_b - im_a + 1e-12)      # epsilon guards a zero denominator
        f4 = re_i[k] + frac * (re_i[k + 1] - re_i[k])
    else:
        f4 = np.nan

    f5 = (re_i[peak] - re_i[0]) if peak > 0 else np.nan
    near_10hz = int(np.argmin(np.abs(freq - 10.0)))
    return [re_i[0], re_i[peak], re_i[-1], f4, f5, np.min(im_i), re_i[near_10hz]]

# Column names for physical_features(), in the order the values are returned.
PHYSICAL_FEATURE_NAMES=["Rs","Rct","tau_peak","warburg_sigma","arc_quality",
                        "phase_mean_mid","phase_std_mid","phase_min","lf_slope_negIm","norm_arc"]

def physical_features(freq,re_i,im_i):
    """Compute the ten features named in ``PHYSICAL_FEATURE_NAMES``.

    Entries that cannot be computed from the given spectrum (too few points in
    a band, non-positive peak frequency, ...) are returned as NaN.
    """
    neg_im = -im_i
    peak = int(np.argmax(neg_im))

    # Resistances read off the first point, the -Im peak and the last point.
    r_first = float(re_i[0])
    r_peak = float(re_i[peak])
    r_last = float(re_i[-1])
    rct = max(r_peak - r_first, 0.0)
    norm_arc = (r_last - r_first) / (r_first + 1e-9)

    # Time constant from the frequency of the -Im peak.
    peak_freq = float(freq[peak])
    tau_peak = np.nan
    if peak_freq > 0:
        tau_peak = 1.0 / (2 * math.pi * peak_freq)

    # Slope of Re against omega^-0.5 over the last few grid points.
    tail = min(10, len(freq) // 3)
    sigma = np.nan
    if tail >= 4:
        inv_sqrt_w = (2 * np.pi * freq[-tail:]) ** (-0.5)
        if len(np.unique(inv_sqrt_w)) > 2:
            sigma = float(np.polyfit(inv_sqrt_w, re_i[-tail:], 1)[0])

    # Phase statistics, overall and within 1-100 Hz.
    phase = np.arctan2(-im_i, re_i)
    mid_band = (freq >= 1) & (freq <= 100)
    if mid_band.sum() > 2:
        phase_mean = float(phase[mid_band].mean())
        phase_std = float(phase[mid_band].std())
    else:
        phase_mean = phase_std = np.nan
    phase_min = float(phase.min())

    # Slope of -Im against log10(f) for f <= 1 Hz.
    low_band = freq <= 1.0
    lf_slope = np.nan
    if low_band.sum() >= 4:
        lf_slope = np.polyfit(np.log10(freq[low_band] + 1e-12), neg_im[low_band], 1)[0]

    arc_quality = (neg_im.max() - neg_im.min()) / (abs(neg_im.mean()) + 1e-9)
    return [r_first, rct, tau_peak, sigma, arc_quality,
            phase_mean, phase_std, phase_min, lf_slope, norm_arc]

# (upper, lower) frequency limits in Hz of the six decade-wide bands.
BANDS=[(1e4,1e3),(1e3,1e2),(1e2,10),(10,1),(1,1e-1),(1e-1,1e-2)]
def band_stats(freq,re_i,im_i):
    """Return ``[mean|Z|, std|Z|]`` for each band in ``BANDS``, flattened.

    Band limits are inclusive on both sides.  A band containing fewer than two
    grid points contributes ``[nan, nan]``.
    """
    freq = np.asarray(freq)
    stats = []
    for upper, lower in BANDS:
        in_band = (freq <= upper) & (freq >= lower)
        if in_band.sum() > 1:
            magnitude = np.hypot(re_i[in_band], im_i[in_band])
            stats.extend([magnitude.mean(), magnitude.std()])
        else:
            stats.extend([np.nan, np.nan])
    return stats

def diff_slopes(freq,re_i,im_i,segments=5):
    """Per-segment linear slopes of Re and -Im against log10(frequency).

    The log-frequency range is cut into *segments* equal-width pieces
    (boundaries inclusive on both sides).  For each piece the result holds
    ``[slope_Re, slope_negIm]``; pieces with fewer than three points give
    ``[nan, nan]``.
    """
    log_f = np.log10(freq)
    bounds = np.linspace(log_f.min(), log_f.max(), segments + 1)
    neg_im = -im_i
    slopes = []
    for left, right in zip(bounds[:-1], bounds[1:]):
        sel = (log_f >= left) & (log_f <= right)
        if sel.sum() < 3:
            slopes.extend([np.nan, np.nan])
            continue
        xs = log_f[sel]
        slopes.append(np.polyfit(xs, re_i[sel], 1)[0])
        slopes.append(np.polyfit(xs, neg_im[sel], 1)[0])
    return slopes

# Column names for drt_features(), in the order the values are returned.
DRT_FEATURE_NAMES=["drt_sum","drt_mean_logtau","drt_var_logtau","drt_peak_tau",
                   "drt_peak_gamma","drt_frac_low_tau","drt_frac_high_tau"]

def compute_drt(freq,re_i,im_i,tau_min,tau_max,n_tau,lam):
    """Ridge-regularised distribution-of-relaxation-times fit.

    Builds the real/imaginary kernels ``1/(1+(wt)^2)`` and ``-wt/(1+(wt)^2)``
    on a log-spaced tau grid (from *tau_max* down to *tau_min*, *n_tau*
    points), subtracts the first Re sample as the series offset, solves the
    normal equations with ridge weight *lam*, and clips negative weights to 0.

    Returns ``(tau, gamma)``.
    """
    omega = 2 * np.pi * freq
    tau = np.geomspace(tau_max, tau_min, n_tau)
    wt = omega[:, None] * tau[None, :]

    kernel_re = 1.0 / (1 + wt ** 2)
    kernel_im = -wt / (1 + wt ** 2)
    kernel = np.vstack([kernel_re, kernel_im])
    target = np.concatenate([re_i - re_i[0], im_i])

    # Normal equations of the ridge problem; the matrix is symmetric positive definite.
    lhs = kernel.T @ kernel + lam * np.eye(n_tau)
    rhs = kernel.T @ target
    gamma = linalg.solve(lhs, rhs, assume_a='pos')
    return tau, np.clip(gamma, 0, None)

def drt_features(freq,re_i,im_i):
    """Summarise the DRT of a spectrum into the seven ``DRT_FEATURE_NAMES`` values.

    Best-effort: any exception raised while fitting or summarising yields
    seven NaNs instead of propagating.
    """
    try:
        tau, gamma = compute_drt(
            freq, re_i, im_i,
            cfg.DRT_TAU_MIN, cfg.DRT_TAU_MAX, cfg.DRT_POINTS, cfg.DRT_LAMBDA,
        )
        log_tau = np.log10(tau)
        total = gamma.sum() + 1e-12            # epsilon avoids division by zero
        weights = gamma / total
        centre = float((weights * log_tau).sum())
        spread = float((weights * (log_tau - centre) ** 2).sum())
        peak = int(np.argmax(gamma))
        # Share of weight at or below the median log-tau; the rest is "high".
        low_share = float(weights[log_tau <= np.median(log_tau)].sum())
        return [total, centre, spread, float(tau[peak]), float(gamma[peak]),
                low_share, 1 - low_share]
    except Exception:
        return [np.nan] * 7

def build_feature_vector(re_i,im_i,temp,freq,include_names=False):
    """Assemble the flat feature vector for one spectrum.

    Parameters
    ----------
    re_i, im_i : numpy.ndarray
        Real and imaginary impedance sampled on *freq*.
    temp : float
        Temperature value appended as the last feature.
    freq : numpy.ndarray
        Frequency grid matching *re_i* / *im_i*.
    include_names : bool
        When true, also return the list of column names.

    Returns
    -------
    numpy.ndarray, or ``(numpy.ndarray, list[str])`` when *include_names*.

    Which groups are present is controlled by the ``cfg.INCLUDE_*`` flags, so
    the vector layout depends on the active configuration.  Non-finite entries
    (NaN, +inf, -inf) are replaced by 0.
    """
    parts=[]; names=[]
    if cfg.INCLUDE_RAW_RE_IM:
        parts+=[re_i,im_i]
        names+=[f"Re_{i}" for i in range(len(re_i))]+[f"Im_{i}" for i in range(len(im_i))]
    if cfg.INCLUDE_BASICS:
        z=np.hypot(re_i,im_i)
        basics=[re_i[0],re_i[-1],re_i[-1]-re_i[0],z.max(),z.mean(),z.std()]
        parts.append(np.array(basics)); names+=["hf_re","lf_re","arc_diam","zmag_max","zmag_mean","zmag_std"]
    if cfg.INCLUDE_F_FEATS:
        parts.append(np.array(compute_F_features(freq,re_i,im_i))); names+=[f"F{i}" for i in range(1,8)]
    if cfg.INCLUDE_PHYSICAL:
        parts.append(np.array(physical_features(freq,re_i,im_i))); names+=PHYSICAL_FEATURE_NAMES
    if cfg.INCLUDE_BAND_STATS:
        parts.append(np.array(band_stats(freq,re_i,im_i)))
        for i in range(len(BANDS)): names+=[f"band{i}_mean",f"band{i}_std"]
    if cfg.INCLUDE_DIFF_SLOPES:
        Ds=diff_slopes(freq,re_i,im_i); parts.append(np.array(Ds))
        for i in range(len(Ds)//2): names+=[f"slope_re_seg{i}",f"slope_negIm_seg{i}"]
    if cfg.INCLUDE_DRT:
        parts.append(np.array(drt_features(freq,re_i,im_i))); names+=DRT_FEATURE_NAMES
    parts.append(np.array([temp])); names+=["Temp_feat"]
    vec=np.concatenate(parts).astype(float)
    # Keyword arguments are required here: positionally, nan_to_num's
    # parameters are (x, copy, nan, posinf, neginf), so the old call
    # nan_to_num(vec,0,0,0) left neginf at its default and mapped -inf to the
    # most negative finite float instead of 0.
    vec=np.nan_to_num(vec,nan=0.0,posinf=0.0,neginf=0.0)
    return (vec,names) if include_names else vec

def build_shape_normalized(re_i,im_i):
    """Scale both channels by the first Re sample (left unscaled if that sample is 0)."""
    scale = re_i[0]
    if scale == 0:
        scale = 1.0
    return re_i / scale, im_i / scale

# =========================
# 6. CAPACITY → CPP
# =========================
def load_capacity_info(cap_dir:Path)->pd.DataFrame:
    """Scan *cap_dir* for capacity-check ``.mat`` files and tabulate them.

    Returns an empty DataFrame when the folder is missing, capacity refinement
    is disabled in ``cfg``, or no file could be used.  Otherwise one row per
    file with the parsed metadata plus:

    * ``MeasuredCapacity_Ah`` - NaN-ignoring maximum of the column whose mean
      absolute value over the last 50 rows is largest;
    * ``NormCapacity`` - that value divided by the cell's own maximum;
    * ``SoH_percent`` - ``NormCapacity * 100``.

    Files are visited in sorted order so results do not depend on the
    filesystem's directory ordering.  Unreadable files are skipped and, when
    ``cfg.VERBOSE`` is set, reported rather than silently ignored.
    """
    if not (cap_dir.exists() and cfg.REFINE_SOH_WITH_CAPACITY): return pd.DataFrame()
    recs=[]
    for fp in sorted(cap_dir.rglob("*.mat")):
        meta=parse_cap_metadata(fp.stem)
        if not meta: continue
        try:
            arr=_find_matrix(loadmat(fp))
            if arr is None: continue
            col=np.argmax(np.abs(arr[-50:,:]).mean(axis=0))
            meta["MeasuredCapacity_Ah"]=float(np.nanmax(arr[:,col]))
        except Exception as e:
            # Best-effort scan: one bad file must not abort the whole run,
            # but say so (same "[Skip]" format as the EIS loader).
            if cfg.VERBOSE: print(f"[Skip]{fp.name}: {e}")
            continue
        recs.append(meta)
    df=pd.DataFrame(recs)
    if df.empty: return df
    ref=df.groupby("CellID")["MeasuredCapacity_Ah"].transform("max")
    df["NormCapacity"]=df["MeasuredCapacity_Ah"]/ref; df["SoH_percent"]=df["NormCapacity"]*100.0
    return df

def estimate_cpp_per_cell(cap_df:pd.DataFrame,window:int,min_pts:int)->Dict[str,float]:
    """Estimate cycles-per-percent-SoH for each cell from its latest capacity checks.

    For every ``CellID`` with at least *min_pts* rows, a straight line is
    fitted to ``SoH_percent`` against ``CycleIndex`` over the last *window*
    rows (by cycle index).  Cells whose fitted slope is not clearly negative,
    or whose window has fewer than two distinct cycle indices, are left out.
    The value stored is ``1 / |slope|``.
    """
    cpp = {}
    for cell_id, rows in cap_df.groupby("CellID"):
        ordered = rows.sort_values("CycleIndex")
        if len(ordered) < min_pts:
            continue
        recent = ordered.tail(window)
        cycles = recent["CycleIndex"].values
        soh = recent["SoH_percent"].values
        if np.unique(cycles).size < 2:
            continue
        fade_rate = np.polyfit(cycles, soh, 1)[0]
        if fade_rate >= -1e-6:          # flat or rising: no usable degradation trend
            continue
        cpp[cell_id] = 1.0 / abs(fade_rate)
    return cpp

def build_cpp_map(df:pd.DataFrame):
    """Return ``(per_cell_cpp, global_cpp)`` from the capacity table.

    ``global_cpp`` is the median of the per-cell values, or
    ``cfg.CPP_FALLBACK`` when the table is empty or no cell yields an estimate.
    """
    if df.empty:
        return {}, cfg.CPP_FALLBACK
    per_cell = estimate_cpp_per_cell(
        df[["CellID", "CycleIndex", "SoH_percent"]],
        cfg.CPP_ROLLING_WINDOW,
        cfg.CPP_MIN_POINTS,
    )
    if per_cell:
        typical = np.median(list(per_cell.values()))
    else:
        typical = cfg.CPP_FALLBACK
    return per_cell, float(typical)

def get_cpp(meta:dict,cpp_map:Dict[str,float],global_cpp:float)->float:
    """Look up the cycles-per-percent for the cell named in *meta*.

    Falls back to *global_cpp* when *meta* is empty/``None`` or its ``CellID``
    has no entry in *cpp_map*.
    """
    if meta:
        return cpp_map.get(meta.get("CellID"), global_cpp)
    return global_cpp

# =========================
# 7. DATASET BUILD & TRAIN
# =========================
def load_single_eis_mat(fp:Path):
    """Load one training spectrum and featurise it on the canonical grid.

    Returns ``(feature_vector, metadata, re_interp, im_interp)``.
    Raises ``ValueError`` when the file name does not carry parsable metadata.
    """
    meta = parse_eis_metadata(fp.stem)
    if meta is None:
        raise ValueError(f"Bad filename {fp.name}")
    freq, re_raw, im_raw = load_mat_eis(fp)
    re_i = _interp_channel(freq, re_raw, CANON_FREQ)
    im_i = _interp_channel(freq, im_raw, CANON_FREQ)
    features = build_feature_vector(re_i, im_i, meta["Temp"], CANON_FREQ)
    return features, meta, re_i, im_i

def build_dataset(eis_dir:Path,cap_df:Optional[pd.DataFrame]):
    """Build the training matrices from every ``.mat`` spectrum under *eis_dir*.

    Parameters
    ----------
    eis_dir : Path
        Folder searched recursively for ``*.mat`` spectra.
    cap_df : DataFrame or None
        Output of ``load_capacity_info``; used to refine the SoH target when
        non-empty and ``cfg.REFINE_SOH_WITH_CAPACITY`` is set.

    Returns
    -------
    ``(meta_df, X, (X_shape, feat_names), y_soc, y_soh)`` where ``X_shape`` is
    ``None`` when the shape-normalised model is disabled.

    Raises
    ------
    FileNotFoundError
        No ``.mat`` file found.
    RuntimeError
        Every file was skipped.

    Files that fail to load are skipped (reported when ``cfg.VERBOSE``).  When
    ``cfg.SAVE_FEATURE_TABLE`` is set the feature table is also written to
    ``training_features.parquet`` in ``cfg.MODEL_DIR``.
    """
    files=sorted(eis_dir.rglob("*.mat"))
    if not files: raise FileNotFoundError("No .mat spectra in training dir")
    # Column names are taken from the first file; only the vector layout matters here.
    f0,r0,i0=load_mat_eis(files[0]); re0=_interp_channel(f0,r0,CANON_FREQ); im0=_interp_channel(f0,i0,CANON_FREQ)
    _,feat_names=build_feature_vector(re0,im0,25.0,CANON_FREQ,include_names=True)
    want_shape=cfg.INCLUDE_NORMALIZED_SHAPE_MODEL and cfg.NORMALIZE_SHAPE_BY_HF_RE
    feats,rows,shape_feats=[],[],[]
    for fp in tqdm(files,desc="Loading"):
        try:
            v,m,rei,imi=load_single_eis_mat(fp)
            shape_vec=None
            if want_shape:
                rsh,ish=build_shape_normalized(rei,imi)
                shape_vec=build_feature_vector(rsh,ish,m["Temp"],CANON_FREQ)
        except Exception as e:
            if cfg.VERBOSE: print(f"[Skip]{fp.name}: {e}")
            continue
        # Append only once every piece for this file exists, so X, X_shape and
        # the metadata rows can never get out of step with each other.
        feats.append(v); rows.append(m)
        if shape_vec is not None: shape_feats.append(shape_vec)
    if not rows: raise RuntimeError("No valid spectra")
    X=np.vstack(feats); X_shape=np.vstack(shape_feats) if shape_feats else None
    meta_df=pd.DataFrame(rows)
    # SoH refinement
    # NOTE(review): the fallback RealSOH_file is multiplied by 100 in the first
    # branch but used unscaled in the else-branch; one of the two is presumably
    # off by a factor of 100 - confirm the filename convention for RealSOH.
    if cap_df is not None and not cap_df.empty and cfg.REFINE_SOH_WITH_CAPACITY:
        # Several capacity checks can share a (CellID, SOH_stage) key; to_dict keeps the last.
        lookup=cap_df.set_index(["CellID","SOH_stage"])["NormCapacity"].to_dict()
        meta_df["SoH_cont"]=[
            100.0*lookup.get((cid,stage), fallback)
            for cid,stage,fallback in zip(meta_df.CellID,meta_df.SOH_stage,meta_df.RealSOH_file)
        ]
    else: meta_df["SoH_cont"]=meta_df["RealSOH_file"]
    y_soc=meta_df["SOC"].values; y_soh=meta_df["SoH_cont"].values
    if cfg.SAVE_FEATURE_TABLE:
        pd.concat([meta_df.reset_index(drop=True),
                   pd.DataFrame(X,columns=feat_names)],axis=1).to_parquet(cfg.MODEL_DIR/"training_features.parquet",index=False)
    return meta_df,X,(X_shape,feat_names),y_soc,y_soh

def cell_split_mask(meta_df:pd.DataFrame):
    """Return a boolean Series that is True for rows belonging to held-out cells.

    Whole cells (not individual spectra) are drawn at random, seeded by
    ``cfg.RANDOM_STATE``; at least one cell is always held out.
    """
    unique_cells = meta_df.CellID.unique()
    n_holdout = max(1, int(len(unique_cells) * cfg.TEST_FRAC))
    rng = np.random.default_rng(cfg.RANDOM_STATE)
    held_out = rng.choice(unique_cells, size=n_holdout, replace=False)
    return meta_df.CellID.isin(held_out)

def _fit_soh_gpr(X_in,y_soh,mask_test):
    """Fit the SoH GPR on a capped random subset of the *training* rows.

    Returns ``(model, r2_on_held_out_rows)``.
    """
    kernel=RBF(length_scale=np.ones(X_in.shape[1])*3.0,length_scale_bounds=(1e-1,1e4))+\
           WhiteKernel(noise_level=1e-2,noise_level_bounds=(1e-6,1e-1))
    model=GaussianProcessRegressor(kernel=kernel,alpha=0.0,normalize_y=True,
                                   random_state=cfg.RANDOM_STATE,n_restarts_optimizer=3)
    train_rows=np.flatnonzero(~mask_test)
    pick=np.random.default_rng(cfg.RANDOM_STATE).choice(
        train_rows,size=min(cfg.MAX_GPR_TRAIN_SAMPLES,train_rows.size),replace=False)
    model.fit(X_in[pick],y_soh[pick])
    return model,r2_score(y_soh[mask_test],model.predict(X_in[mask_test]))

def train_models(meta_df,X_raw,shape_bundle,y_soc,y_soh):
    """Train the SoC classifier and SoH regressors, save and return the bundle.

    Parameters
    ----------
    meta_df : DataFrame
        Per-spectrum metadata (needs ``CellID`` for the cell-level split).
    X_raw : ndarray
        Feature matrix, one row per spectrum.
    shape_bundle : tuple
        ``(X_shape_or_None, feature_names)`` as returned by ``build_dataset``.
    y_soc, y_soh : ndarray
        SoC class labels and continuous SoH targets.

    Returns
    -------
    dict
        The model bundle, also written to
        ``cfg.MODEL_DIR / "eis_soc_soh_phys_models.joblib"``.

    Raises
    ------
    ValueError
        If the cell-level split leaves no training rows.

    Notes
    -----
    Every supervised model is fitted on training cells only.  Previously both
    GPRs drew their training subsample from all rows, held-out cells included,
    so their R2 was measured on data they had seen and the GPR-vs-HGB choice
    was biased.  Scalers, PCA and the Mahalanobis statistics are still fitted
    on all rows.
    """
    X_shape,feat_names=shape_bundle
    mask_test=np.asarray(cell_split_mask(meta_df),dtype=bool)
    if mask_test.all():
        raise ValueError("Cell-level split left no training rows; spectra from at least two cells are required")
    # --- SoC pipeline -------------------------------------------
    soc_scaler=StandardScaler(); X_soc_s=soc_scaler.fit_transform(X_raw)
    soc_pca=None; X_soc_in=X_soc_s
    if cfg.USE_PCA_SOC:
        soc_pca=PCA(n_components=min(cfg.PCA_SOC_COMPONENTS,X_soc_s.shape[1]-1),random_state=cfg.RANDOM_STATE)
        X_soc_in=soc_pca.fit_transform(X_soc_s)
    soc_model=RandomForestClassifier(n_estimators=600,min_samples_leaf=2,class_weight='balanced',
                                     n_jobs=-1,random_state=cfg.RANDOM_STATE)
    soc_model.fit(X_soc_in[~mask_test],y_soc[~mask_test])
    preds=soc_model.predict(X_soc_in[mask_test])
    soc_acc=accuracy_score(y_soc[mask_test],preds)
    soc_f1=f1_score(y_soc[mask_test],preds,average='macro')
    if cfg.VERBOSE:
        print(f"[SoC] Acc={soc_acc:.3f} F1={soc_f1:.3f}")
    # --- SoH raw pipeline ---------------------------------------
    soh_scaler=StandardScaler(); X_soh_s=soh_scaler.fit_transform(X_raw)
    soh_pca=None; X_soh_in=X_soh_s
    if cfg.USE_PCA_SOH:
        soh_pca=PCA(n_components=min(cfg.PCA_SOH_COMPONENTS,X_soh_s.shape[1]-1),random_state=cfg.RANDOM_STATE)
        X_soh_in=soh_pca.fit_transform(X_soh_s)
    gpr,r2_gpr=_fit_soh_gpr(X_soh_in,y_soh,mask_test)
    # Alt model
    hgb=HistGradientBoostingRegressor(learning_rate=0.05,max_iter=500,l2_regularization=1e-3,
                                      random_state=cfg.RANDOM_STATE)
    hgb.fit(X_soh_in[~mask_test],y_soh[~mask_test])
    r2_hgb=r2_score(y_soh[mask_test],hgb.predict(X_soh_in[mask_test]))
    # Keep whichever regressor scores better on the held-out cells (ties go to the GPR).
    best_model,model_name=(gpr,"gpr_raw") if r2_gpr>=r2_hgb else (hgb,"hgb_raw")
    # shape model
    shape_scaler=shape_pca=shape_model=None; r2_shape=None
    if cfg.INCLUDE_NORMALIZED_SHAPE_MODEL and X_shape is not None:
        if X_shape.shape[0]!=X_raw.shape[0]:
            # Rows would not line up with y_soh / the split mask; skip rather than mis-train.
            if cfg.VERBOSE: print("[Shape] Row count differs from raw features; shape model skipped")
        else:
            shape_scaler=StandardScaler(); Xs=shape_scaler.fit_transform(X_shape)
            Xs_in=Xs
            if cfg.USE_PCA_SOH:
                shape_pca=PCA(n_components=min(cfg.PCA_SOH_COMPONENTS,Xs.shape[1]-1),random_state=cfg.RANDOM_STATE)
                Xs_in=shape_pca.fit_transform(Xs)
            shape_model,r2_shape=_fit_soh_gpr(Xs_in,y_soh,mask_test)
    # Mahalanobis stats
    cov=np.cov(X_soh_s.T); cov_inv=np.linalg.pinv(cov); center=X_soh_s.mean(axis=0)
    bundle={
        "soc_scaler":soc_scaler,"soc_pca":soc_pca,"soc_model":soc_model,
        "soh_scaler":soh_scaler,"soh_pca":soh_pca,"soh_model":best_model,"soh_model_name":model_name,
        "shape_scaler":shape_scaler,"shape_pca":shape_pca,"shape_model":shape_model,
        "freq_grid":CANON_FREQ,"feature_version":cfg.FEATURE_VERSION,"feature_manifest":feat_names,
        "train_mahal":{"center":center.tolist(),"cov_inv":cov_inv.tolist()},
        # Held-out scores, kept for later inspection (extra key; older readers ignore it).
        "metrics":{"soc_accuracy":float(soc_acc),"soc_f1_macro":float(soc_f1),
                   "r2_gpr":float(r2_gpr),"r2_hgb":float(r2_hgb),
                   "r2_shape":None if r2_shape is None else float(r2_shape)}
    }
    joblib.dump(bundle,cfg.MODEL_DIR/"eis_soc_soh_phys_models.joblib")
    return bundle

# =========================
# 8. LOAD (with legacy shim)
# =========================
def load_bundle():
    """Load the saved model bundle, patching legacy layouts to the current keys.

    Legacy bundles (no ``"soc_scaler"`` key) shared one scaler/PCA between the
    SoC and SoH pipelines; those are aliased to the per-task keys.  Keys such
    bundles may lack are filled with defaults so inference does not fail with
    ``KeyError``.

    Emits a ``UserWarning`` when the bundle's feature version differs from
    ``cfg.FEATURE_VERSION``, since the feature layout may then not match what
    the models were trained on.
    """
    path=cfg.MODEL_DIR/"eis_soc_soh_phys_models.joblib"
    # SECURITY: joblib.load unpickles arbitrary objects - only load bundles
    # from a trusted location.
    b=joblib.load(path)
    if "soc_scaler" not in b:   # legacy
        b["soc_scaler"]=b["scaler"]; b["soh_scaler"]=b["scaler"]
        b["soc_pca"]=b.get("pca"); b["soh_pca"]=b.get("pca")
        # NOTE(review): assumes legacy bundles were trained on the current
        # canonical grid - confirm against the legacy training code.
        b.setdefault("freq_grid",CANON_FREQ)
        b.setdefault("feature_version",None)
    saved=b.get("feature_version")
    if saved is not None and saved!=cfg.FEATURE_VERSION:
        warnings.warn(
            f"Bundle feature_version={saved} differs from code FEATURE_VERSION={cfg.FEATURE_VERSION}; "
            "features may not match the trained models (set FORCE_RETRAIN to rebuild).",
            stacklevel=2,
        )
    return b


# =========================
# 8½.  INFERENCE FEATURISATION  (missing helper)
# =========================
def featurize_any(file_path: Path, bundle):
    """
    • Reads one test spectrum (.mat, .csv, .xls or .xlsx)
    • Resamples both channels onto the frequency grid stored in the bundle
    • Builds the raw feature vector and, if the bundle carries a shape model
      and the config enables it, the shape-normalised vector as well
    • Returns: (raw_vec, shape_vec_or_None, parsed_metadata_or_None)
    """
    grid = bundle["freq_grid"]
    meta = parse_eis_metadata(file_path.stem)

    freq, re_raw, im_raw = load_any_inference(file_path)
    re_i = _interp_channel(freq, re_raw, grid)
    im_i = _interp_channel(freq, im_raw, grid)

    # Temperature: from the file name when parsable, else the configured
    # override, else the sentinel -1.
    if meta is not None:
        temp = meta["Temp"]
    elif cfg.TEST_TEMPERATURE_OVERRIDE is not None:
        temp = cfg.TEST_TEMPERATURE_OVERRIDE
    else:
        temp = -1

    vec = build_feature_vector(re_i, im_i, temp, grid)

    wants_shape = cfg.INCLUDE_NORMALIZED_SHAPE_MODEL and bundle.get("shape_model") is not None
    if not wants_shape:
        return vec, None, meta

    re_norm, im_norm = build_shape_normalized(re_i, im_i)
    return vec, build_feature_vector(re_norm, im_norm, temp, grid), meta



# =========================
# 9. PROJECTION PLOT
# =========================
def build_projection(soh,cpp,lower,exp=None,n=160):
    """Build the illustrative SoH-vs-remaining-cycles curve.

    The curve starts at *soh* and reaches *lower* after ``(soh-lower)*cpp``
    cycles, following ``(1 - c/total) ** exp``; *exp* defaults to
    ``cfg.PLOT_EXPONENT``.  Returns ``(cycles, soh_curve)`` with *n* points,
    or a single point at cycle 0 when *soh* is already at/below *lower* or
    *cpp* is not positive.
    """
    if soh<=lower or cpp<=0:
        return np.array([0.0]), np.array([soh])
    horizon = (soh - lower) * cpp
    cycles = np.linspace(0, horizon, n)
    power = cfg.PLOT_EXPONENT if exp is None else exp
    remaining = 1 - cycles / horizon
    return cycles, lower + (soh - lower) * remaining ** power

def plot_projection(name,soh,soh_std,cyc_t,cyc_l,cpp,ood,out):
    """Draw the RUL projection for one spectrum and save it to *out*.

    Does nothing (and writes no file) when *cyc_l* is not positive.
    *ood* is accepted for interface compatibility but not used.
    """
    if cyc_l<=0:
        return
    decision = cfg.DECISION_SOH_PERCENT
    floor = cfg.ILLUSTRATIVE_MIN_SOH
    cycles, soh_curve = build_projection(soh, cpp, floor)

    plt.figure(figsize=(6.4,4))
    plt.plot(cycles, soh_curve, lw=2, label="Projected SoH")
    # Horizontal guides: decision threshold and illustrative lower limit.
    plt.axhline(decision, color="orange", ls="--")
    plt.axhline(floor, color="red", ls=":")
    # Current estimate with its uncertainty label.
    plt.scatter([0], [soh], c="green", s=55)
    plt.text(0, soh+0.7, f"±{soh_std:.2f}", color="green", fontsize=8)
    if cyc_t>0:
        plt.axvline(cyc_t, color="orange", ls="-.")
        plt.scatter([cyc_t], [decision], c="orange", s=45)
    plt.scatter([cycles[-1]], [floor], c="red", s=50)
    plt.xlabel("Remaining Cycles")
    plt.ylabel("SoH (%)")
    plt.title(f"RUL Projection – {name}")
    plt.grid(alpha=0.35)
    plt.tight_layout()
    plt.savefig(out, dpi=140)
    plt.close()

# =========================
# 10. INFERENCE (single file)
# =========================
def mahalanobis_distance(x,center,cov_inv):
    """Return ``sqrt((x-center) @ cov_inv @ (x-center))`` as a Python float."""
    offset = x - center
    squared = offset @ cov_inv @ offset.T
    return float(np.sqrt(squared))

def predict_file(fp:Path,bundle,cpp_map,global_cpp):
    """Run SoC/SoH inference and the remaining-cycle estimate for one spectrum.

    Parameters
    ----------
    fp : Path
        Test spectrum (.mat/.csv/.xls/.xlsx).
    bundle : dict
        Model bundle from ``train_models`` / ``load_bundle``.
    cpp_map : dict
        Per-cell cycles-per-percent estimates.
    global_cpp : float
        Fallback cycles-per-percent.

    Returns
    -------
    ``(result_dict, cycles_to_target, cycles_to_lower)`` - the dict is
    JSON-serialisable.

    The reported SoH standard deviation is capped at 5 percentage points; for
    regressors that do not provide one, the cap itself is reported.
    """
    SOH_STD_CAP=5.0
    vec,norm_vec,meta=featurize_any(fp,bundle)
    row=vec.reshape(1,-1)
    # SoC
    soc_pca,soc_model=bundle.get("soc_pca"),bundle["soc_model"]
    X_soc=bundle["soc_scaler"].transform(row)
    if soc_pca is not None: X_soc=soc_pca.transform(X_soc)
    probs=soc_model.predict_proba(X_soc)[0]; soc=int(soc_model.classes_[probs.argmax()])
    # SoH
    s_pca,s_model=bundle.get("soh_pca"),bundle["soh_model"]
    Xs=bundle["soh_scaler"].transform(row)
    if s_pca is not None: Xs=s_pca.transform(Xs)
    if isinstance(s_model,GaussianProcessRegressor):
        mu,sigma=s_model.predict(Xs,return_std=True); soh,sd=float(mu[0]),float(sigma[0])
    else:
        # No predictive std available.  The old expression
        # bundle["train_mahal"]["center"][0]*0+5.0 also evaluated to 5.0 but
        # raised KeyError on bundles without "train_mahal" and gave NaN when
        # that centre value was NaN.
        soh=float(s_model.predict(Xs)[0]); sd=SOH_STD_CAP
    sd=min(sd,SOH_STD_CAP)
    # ensemble: average with the shape-normalised GPR when available
    sh_mdl=bundle.get("shape_model")
    if cfg.ENSEMBLE_SOH and sh_mdl is not None and norm_vec is not None:
        sh_pca=bundle.get("shape_pca")
        Xn=bundle["shape_scaler"].transform(norm_vec.reshape(1,-1))
        if sh_pca is not None: Xn=sh_pca.transform(Xn)
        mu2=float(sh_mdl.predict(Xn)[0]); soh=0.5*(soh+mu2)
    cpp=get_cpp(meta,cpp_map,global_cpp)
    # Remaining cycles = SoH headroom above each threshold x cycles-per-percent, floored at 0.
    cyc_t=max((soh-cfg.DECISION_SOH_PERCENT)*cpp,0.0); cyc_l=max((soh-cfg.ILLUSTRATIVE_MIN_SOH)*cpp,0.0)
    return {"file":str(fp),"parsed_metadata":meta,"predicted_SoC":soc,
            "SoC_probabilities":{int(c):float(p) for c,p in zip(soc_model.classes_,probs)},
            "predicted_SoH_percent":soh,"SoH_std_estimate":sd,"cycles_per_percent_used":cpp,
            "cycles_to_target":cyc_t,"cycles_to_lower":cyc_l,
            "decision_threshold_percent":cfg.DECISION_SOH_PERCENT,
            "lower_threshold_percent":cfg.ILLUSTRATIVE_MIN_SOH,
            "feature_version":bundle.get("feature_version"),
            "soh_model_chosen":bundle.get("soh_model_name","raw")}, cyc_t, cyc_l

# =========================
# 11. MAIN
# =========================
def main(argv=None):
    """Train or load the model bundle, predict one test file, save plot + JSON.

    Parameters
    ----------
    argv : list[str] or None
        Command-line style arguments; only ``--test <path>`` is recognised and
        unknown arguments are ignored.  ``None`` is treated as an empty list.

    Raises
    ------
    FileNotFoundError
        If the test file is missing, if ``CAP_DIR`` is missing while capacity
        refinement is enabled, or if ``EIS_DIR`` is missing when training is
        required.
    """
    p=argparse.ArgumentParser(add_help=False)
    p.add_argument("--test",dest="test_file"); args,_=p.parse_known_args([] if argv is None else argv)
    if args.test_file:
        cfg.EIS_TEST_FILE = Path(args.test_file)

    bundle_path = cfg.MODEL_DIR / "eis_soc_soh_phys_models.joblib"
    need_training = cfg.FORCE_RETRAIN or not bundle_path.exists()

    # ---------- sanity checks (explicit raises: asserts vanish under -O) ----
    test_fp = cfg.EIS_TEST_FILE
    if not test_fp.exists():
        # Checked first so a typo does not cost a full training run.
        raise FileNotFoundError(f"Test file not found: {test_fp}")
    if need_training and not cfg.EIS_DIR.exists():
        # The training spectra are only needed when a bundle must be built.
        raise FileNotFoundError(f"EIS_DIR missing: {cfg.EIS_DIR}")
    if cfg.REFINE_SOH_WITH_CAPACITY and not cfg.CAP_DIR.exists():
        raise FileNotFoundError(f"CAP_DIR missing: {cfg.CAP_DIR}")

    if cfg.VERBOSE:
        print("Configuration:\n", json.dumps(to_jsonable(asdict(cfg)), indent=2))

    # ---------- capacity ⇒ cycles-per-percent map -------------------
    cap_df = load_capacity_info(cfg.CAP_DIR)
    cpp_map, global_cpp = build_cpp_map(cap_df) if not cap_df.empty else ({}, cfg.CPP_FALLBACK)

    # ---------- train or load model bundle --------------------------
    # Output folder must exist before training saves the bundle or the plot is written.
    cfg.MODEL_DIR.mkdir(parents=True, exist_ok=True)
    if not need_training:
        bundle = load_bundle()
        if cfg.VERBOSE: print(f"[LOAD] Using bundle → {bundle_path}")
    else:
        if cfg.VERBOSE: print("[TRAIN] Building dataset & training models …")
        meta_df, X_raw, shape_bundle, y_soc, y_soh = build_dataset(cfg.EIS_DIR, cap_df)
        bundle = train_models(meta_df, X_raw, shape_bundle, y_soc, y_soh)

    # ---------- single-file inference -------------------------------
    result, cyc_target, cyc_lower = predict_file(test_fp, bundle, cpp_map, global_cpp)

    # ---------- save outputs (plot + JSON) --------------------------
    out_plot = cfg.MODEL_DIR / f"{test_fp.stem}_projection.png"
    plot_projection(
        test_fp.stem,
        result["predicted_SoH_percent"],
        result["SoH_std_estimate"],
        result["cycles_to_target"],
        result["cycles_to_lower"],
        result["cycles_per_percent_used"],
        False,
        out_plot
    )

    out_json = cfg.MODEL_DIR / f"{test_fp.stem}_prediction.json"
    with out_json.open("w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    print(json.dumps(result, indent=2))
    print(f"[PLOT]  {out_plot}")
    print(f"[JSON]  {out_json}\nDone.")

# =========================
# 12. RUN (works in notebooks & terminal)
# =========================
# Runs the whole pipeline as soon as this cell/script executes.
main([])          # pass [] so Jupyter’s hidden “-f …” flag is ignored


Configuration:
 {
  "EIS_DIR": "C:\\Users\\tgondal0\\OneDrive - Edith Cowan University\\00 - Megallan Power\\NMC Batteries Warwick Station\\NMC\\DIB_Data\\.matfiles\\EIS_Test",
  "CAP_DIR": "C:\\Users\\tgondal0\\OneDrive - Edith Cowan University\\00 - Megallan Power\\NMC Batteries Warwick Station\\NMC\\DIB_Data\\.matfiles\\Capacity_Check",
  "MODEL_DIR": "models_eis_phase2_phys",
  "EIS_TEST_FILE": "C:\\Users\\tgondal0\\OneDrive - Edith Cowan University\\00 - Megallan Power\\NMC Batteries Warwick Station\\NMC\\TestFile\\Mazda-Battery-Cell5.xlsx",
  "F_MIN": 0.01,
  "F_MAX": 10000.0,
  "N_FREQ": 60,
  "TEST_FRAC": 0.2,
  "RANDOM_STATE": 42,
  "USE_PCA_SOC": true,
  "USE_PCA_SOH": false,
  "PCA_SOC_COMPONENTS": 25,
  "PCA_SOH_COMPONENTS": 30,
  "INCLUDE_RAW_RE_IM": true,
  "INCLUDE_BASICS": true,
  "INCLUDE_F_FEATS": true,
  "INCLUDE_PHYSICAL": true,
  "INCLUDE_DRT": true,
  "INCLUDE_BAND_STATS": true,
  "INCLUDE_DIFF_SLOPES": true,
  "DRT_POINTS": 60,
  "DRT_TAU_MIN": 0.0001,
  "DRT_TAU

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[LOAD] Using bundle → models_eis_phase2_phys\eis_soc_soh_phys_models.joblib
{
  "file": "C:\\Users\\tgondal0\\OneDrive - Edith Cowan University\\00 - Megallan Power\\NMC Batteries Warwick Station\\NMC\\TestFile\\Mazda-Battery-Cell5.xlsx",
  "parsed_metadata": null,
  "predicted_SoC": 5,
  "SoC_probabilities": {
    "5": 0.4346845238095238,
    "20": 0.17007440476190472,
    "50": 0.14090327380952378,
    "70": 0.1471875,
    "95": 0.10715029761904762
  },
  "predicted_SoH_percent": 90.34750000003794,
  "SoH_std_estimate": 5.0,
  "cycles_per_percent_used": 20.0,
  "cycles_to_target": 806.9500000007588,
  "cycles_to_lower": 1006.9500000007588,
  "decision_threshold_percent": 50.0,
  "lower_threshold_percent": 40.0,
  "feature_version": 9,
  "soh_model_chosen": "gpr_raw"
}
[PLOT]  models_eis_phase2_phys\Mazda-Battery-Cell5_projection.png
[JSON]  models_eis_phase2_phys\Mazda-Battery-Cell5_prediction.json
Done.


In [5]:
# ════════════════════════════════════════════════════════════════════════
# Gradio demo – upload ONE EIS file → projection plot + predicted SoC
# ════════════════════════════════════════════════════════════════════════
import gradio as gr
import tempfile, shutil
from pathlib import Path
from PIL import Image
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# ---------- 1.  one-time model / CPP setup -----------------------------
# Capacity table → per-cell and global cycles-per-percent; falls back to the
# configured default when no capacity data is available.
cap_df   = load_capacity_info(cfg.CAP_DIR)
cpp_map, global_cpp = build_cpp_map(cap_df) if not cap_df.empty else ({}, cfg.CPP_FALLBACK)

# Reuse the saved bundle when present (and retraining is not forced);
# otherwise build the dataset and train once, which also saves the bundle.
bundle_path = cfg.MODEL_DIR / "eis_soc_soh_phys_models.joblib"
if bundle_path.exists() and not cfg.FORCE_RETRAIN:
    bundle = load_bundle()
    print(f"[GRADIO] Loaded bundle → {bundle_path}")
else:
    print("[GRADIO] Training bundle – first-run only …")
    meta_df, X_raw, shape_bundle, y_soc, y_soh = build_dataset(cfg.EIS_DIR, cap_df)
    bundle = train_models(meta_df, X_raw, shape_bundle, y_soc, y_soh)

# ---------- 2.  inference wrapper for Gradio ---------------------------
def run_inference(uploaded_file):
    """Gradio callback → returns (PIL.Image or None, int).

    Parameters
    ----------
    uploaded_file
        Value delivered by the ``gr.File`` input: either an object with a
        ``.name`` attribute holding the temp-file path, or the path itself.

    Returns
    -------
    tuple
        ``(projection_image, predicted_soc)``.  The image is ``None`` when no
        projection is drawn (``plot_projection`` skips plotting when there are
        no remaining cycles to the lower threshold).

    Raises
    ------
    ValueError
        If nothing was uploaded.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded")
    # Use Gradio's own temp copy directly.  The previous extra copy into the
    # shared temp root kept the same basename anyway, could collide between
    # uploads and was never cleaned up.
    src_path = Path(getattr(uploaded_file, "name", uploaded_file))

    result, *_ = predict_file(src_path, bundle, cpp_map, global_cpp)

    # regenerate a fresh projection plot (saved under MODEL_DIR)
    cfg.MODEL_DIR.mkdir(parents=True, exist_ok=True)
    plot_path = cfg.MODEL_DIR / f"{src_path.stem}_projection.png"
    if plot_path.exists():
        # Remove any plot left by an earlier upload with the same name so a
        # skipped projection cannot show a stale image.
        plot_path.unlink()
    plot_projection(
        src_path.stem,
        result["predicted_SoH_percent"],
        result["SoH_std_estimate"],
        result["cycles_to_target"],
        result["cycles_to_lower"],
        result["cycles_per_percent_used"],
        False,
        plot_path
    )

    image = None
    if plot_path.exists():
        # Image.open is lazy; copy the pixels and close the file handle.
        with Image.open(plot_path) as handle:
            image = handle.copy()
    return image, int(result["predicted_SoC"])

# ---------- 3.  build & launch the UI ----------------------------------
# One file input; two outputs matching run_inference's (image, SoC) return value.
demo = gr.Interface(
    fn=run_inference,
    inputs=gr.File(label="Upload EIS test file"),
    outputs=[
        gr.Image(type="pil", label="RUL projection"),
        gr.Number(label="Predicted SoC (%)")
    ],
    title="EIS RUL / SoC predictor",
    description="Upload a single EIS spectrum. The model returns the projected RUL chart and the most-likely SoC class."
)

# Start the local web server for the demo.
demo.launch()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[GRADIO] Loaded bundle → models_eis_phase2_phys\eis_soc_soh_phys_models.joblib
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




# Updated