# Real‑Time Alertness Score (0‑100) Pipeline
Trains on **Wake (100)** vs **Deep (0)**, then checks whether REM & Light fall in between.


In [1]:

import os, pickle, warnings
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from scipy.signal import butter, filtfilt, welch
import mne

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")


In [2]:

DATA_ROOT = Path('sample_data')  # <-- make sure this folder exists
FS = 125
EPOCH_SEC = 30
MIN_GOOD_RATIO = 0.6

TRAIN_POS, TRAIN_NEG = 'Wake', 'Deep'


In [3]:

def bandpass(x, fs=FS, low=0.5, high=45, order=4):
    nyq = 0.5*fs
    b,a = butter(order, [low/nyq, high/nyq], btype='band')
    return filtfilt(b,a,x)

def epoch_features(epoch, fs=FS):
    bands={'delta':(0.5,4),'theta':(4,8),'alpha':(8,12),'beta':(12,30),'gamma':(30,45)}
    fdict={}
    for ch in range(epoch.shape[0]):
        sig=epoch[ch]
        fdict[f'ch{ch}_mean']=sig.mean()
        fdict[f'ch{ch}_std']=sig.std()
        f,P=welch(sig, fs=fs, nperseg=4*fs)
        total=np.trapz(P,f)
        for band,(lo,hi) in bands.items():
            mask=(f>=lo)&(f<hi)
            power=np.trapz(P[mask],f[mask])
            fdict[f'ch{ch}_{band}_rel']=power/total if total else 0
    fdict['theta_alpha_ratio']=(
        fdict['ch0_theta_rel']/fdict['ch0_alpha_rel']
        if fdict['ch0_alpha_rel']>0 else 0)
    return fdict


In [4]:

def build_epochs(sample_dir:Path):
    edf=next(sample_dir.glob('*.edf'))
    stage_csv=next(sample_dir.glob('*[0-9].csv'))
    sqc_csv=next(sample_dir.glob('*SQC.csv'))

    raw=mne.io.read_raw_edf(edf, preload=True, verbose=False)
    data=bandpass(raw.get_data()[:4], FS)

    df_stage=pd.read_csv(stage_csv)
    df_sqc=pd.read_csv(sqc_csv)
    start_ts=df_stage['Timestamp'].min()

    out=[]
    for _,row in df_stage.iterrows():
        ts=row['Timestamp']; lbl=row['Sleep stage']
        st=int((ts-start_ts)*FS); en=st+EPOCH_SEC*FS
        if en>data.shape[1]: break
        mask=(df_sqc['Timestamp']>=ts)&(df_sqc['Timestamp']<ts+EPOCH_SEC)
        good_ratio=(df_sqc.loc[mask,'Signal quality']=='Good').mean()
        if good_ratio<MIN_GOOD_RATIO: continue
        feats=epoch_features(data[:,st:en])
        feats.update({'good_ratio':good_ratio,'stage':lbl,'record':sample_dir.name})
        out.append(feats)
    return pd.DataFrame(out)


In [5]:

sample_dirs=[p for p in DATA_ROOT.iterdir() if p.is_dir()]
if not sample_dirs:
    raise FileNotFoundError("No sample_data directory found or it's empty.")
all_epochs=pd.concat([build_epochs(p) for p in sample_dirs], ignore_index=True)
print("Epoch dataframe:", all_epochs.shape)


Epoch dataframe: (16088, 32)


In [6]:

train=all_epochs[all_epochs['stage'].isin([TRAIN_POS,TRAIN_NEG])].copy()
train['y']=(train['stage']==TRAIN_POS).astype(int)
X=train.drop(columns=['stage','record','y'])
y=train['y']; groups=train['record']
gkf=GroupKFold(n_splits=5)
tr_idx,te_idx=next(gkf.split(X,y,groups))
X_tr,X_te=X.iloc[tr_idx],X.iloc[te_idx]
y_tr,y_te=y.iloc[tr_idx],y.iloc[te_idx]


In [7]:

pipe_lr=Pipeline([('scaler',StandardScaler()),
                  ('clf',LogisticRegression(max_iter=1000))]).fit(X_tr,y_tr)

auc_tr=roc_auc_score(y_tr,pipe_lr.predict_proba(X_tr)[:,1])
auc_te=roc_auc_score(y_te,pipe_lr.predict_proba(X_te)[:,1])
print(f"LogReg AUC train={auc_tr:.3f} test={auc_te:.3f}")


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:

def score(model,X): return (model.predict_proba(X)[:,1]*100).clip(0,100)
all_epochs['score']=score(pipe_lr, all_epochs.drop(columns=['stage','record']))


In [None]:

import matplotlib.pyplot as plt, seaborn as sns
plt.figure(figsize=(6,4))
sns.boxplot(x='stage',y='score',data=all_epochs,
            order=['Deep','REM','Light','Wake'])
plt.title('Alertness score by stage')
plt.ylabel('Score (0‑100)')
plt.show()


In [None]:

with open('alertness_model.pkl','wb') as f: pickle.dump(pipe_lr,f)
print("Model saved.")
