In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import random
import os
import torch
import torch.nn as nn
from models.rnn import CRNN_Model
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
from neurokit2 import ecg
import neurokit2 as nk
import numpy as np
import stumpy
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

SAMPLING_RATE = 1000


In [3]:
%load_ext jupyternotify

  from pkg_resources import resource_filename


<IPython.core.display.Javascript object>

In [11]:
class ECGDataset(Dataset):
    def __init__(self, data_folder, class_folders, files_per_class=200, mp_window = 1000):
        self.samples = []
        self.leads = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
        self.mp_window = mp_window
        for folder, label in class_folders.items():
            files = glob.glob(os.path.join(data_folder, folder, '*.parquet.gzip'))
            # enforce exact files_per_class per class (downsample or upsample with replacement)
            if len(files) >= files_per_class:
                files = random.sample(files, files_per_class)
            else:
                files = random.choices(files, k=files_per_class)

            for f in files:
                try:
                    df = pd.read_parquet(f, engine='fastparquet')
                except Exception as e:
                    print(f"Failed to read {f}: {e}")
                    continue

                # ensure required lead columns exist
                if not set(self.leads).issubset(df.columns):
                    print(f"Missing leads in {f}, skipping")
                    continue

                # convert lead columns to numeric, coerce non-numeric to NaN, then fill and cast
                df_leads = df[self.leads].apply(pd.to_numeric, errors='coerce').fillna(0).astype(np.float32)

                # shape -> (12, time)
                signal = df_leads.values.T

                matrix_profiles = []
                for i in range(signal.shape[0]):
                    mp = stumpy.stump(signal[i].astype(np.float64), m=self.mp_window)[:, 0].astype(np.float32)
                    pad_width = signal.shape[1] - len(mp)
                    padded_mp = np.pad(mp, (0, pad_width), 'constant', constant_values=0)
                    padded_mp[np.isinf(padded_mp)] = 1e9
                    matrix_profiles.append(padded_mp)
                matrix_profiles_np = np.array(matrix_profiles, dtype=np.float32)
                combined_signal = np.concatenate((signal, matrix_profiles_np), axis=0)
                # self.samples.append((torch.tensor(combined_signal, dtype=torch.float32), label, os.path.basename(f)))
                self.samples.append((torch.tensor(matrix_profiles_np, dtype=torch.float32), label, os.path.basename(f)))
                # self.samples.append((torch.tensor(signal, dtype=torch.float32), label, os.path.basename(f)))
    def process_file(self, f, label):
        try:
            df = pd.read_parquet(f, engine='fastparquet')
        except Exception as e:
            print(f"Failed to read {f}: {e}")
            return None
        
        if not set(self.leads).issubset(df.columns):
            print(f"Missing leads in {f}, skipping")
            return None

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        signal, label, ecg_id = self.samples[idx]
        return signal, label, ecg_id

# Usage example
class_folders = {
    'arritmia': 0,
    'block': 1,
    'fibrilation': 2,
    'normal': 3
}
data_folder = 'data'
dataset = ECGDataset(data_folder, class_folders, files_per_class=1970)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Inspect one batch
for signals, labels, ecg_ids in dataloader:
    print('Signals shape:', signals.shape)  # (batch, 12, time)
    print('Labels:', labels)
    print('ECG IDs:', ecg_ids)
    break

Signals shape: torch.Size([8, 12, 10000])
Labels: tensor([1, 2, 1, 0, 0, 3, 0, 1])
ECG IDs: ('329963.parquet.gzip', '439396.parquet.gzip', '350990.parquet.gzip', '404857.parquet.gzip', '541222.parquet.gzip', '295076.parquet.gzip', '339988.parquet.gzip', '523886.parquet.gzip')


In [7]:
indices = list(range(len(dataset)))
labels_arr = [dataset.samples[i][1] for i in indices]
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=labels_arr, random_state=42)

serializable = {
    'samples': [(sig.cpu().numpy(), int(label), str(ecg_id)) for sig, label, ecg_id in dataset.samples],
    'train_idx': train_idx,
    'val_idx': val_idx,
    'leads': dataset.leads,
    'mp_window': dataset.mp_window
}
torch.save(serializable, 'loader12MP_CRNN.pth')
print(f"Saved {len(serializable['samples'])} samples + split indices to loader12MP_CRNN.pth")


Saved 40 samples + split indices to loader12MP_CRNN.pth


In [8]:
class PreloadedECGDataset(Dataset):
    def __init__(self, samples):
        # Convert numpy arrays back to tensors
        self.samples = [(torch.tensor(sig, dtype=torch.float32), label, ecg_id) for sig, label, ecg_id in samples]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

# --- Load the pre-processed data ---
print("Loading pre-processed data from loader12MP_CRNN.pth...")
loaded_data = torch.load('loader12MP_CRNN.pth', weights_only=False)

# Extract the components
all_samples = loaded_data['samples']
train_idx = loaded_data['train_idx']
val_idx = loaded_data['val_idx']

print(f"Loaded {len(all_samples)} total samples.")
print(f"Training samples: {len(train_idx)}, Validation samples: {len(val_idx)}")

# --- Create Datasets and DataLoaders ---
# Create a full dataset object from the loaded samples
full_dataset = PreloadedECGDataset(all_samples)

# Create subsets for training and validation using the saved indices
train_dataset = Subset(full_dataset, train_idx)
val_dataset = Subset(full_dataset, val_idx)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print("\nDataLoaders created successfully.")

# Optional: Inspect a batch to verify
signals, labels, ids = next(iter(train_loader))
print(f"Sample batch shape: {signals.shape}")
print(f"Sample batch labels: {labels}")

Loading pre-processed data from loader12MP_CRNN.pth...
Loaded 40 total samples.
Training samples: 32, Validation samples: 8

DataLoaders created successfully.
Sample batch shape: torch.Size([8, 12, 10000])
Sample batch labels: tensor([3, 3, 2, 2, 1, 1, 0, 1])


In [9]:
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0, mode='min', checkpoint_path='best_model.pth'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.checkpoint_path = checkpoint_path
        self.counter = 0
        self.best_score = np.inf if mode == 'min' else -np.inf
        self.early_stop = False

    def __call__(self, current_score, model):
        is_better = False
        if self.mode == 'min':
            is_better = current_score < (self.best_score - self.min_delta)
        else:
            is_better = current_score > (self.best_score + self.min_delta)

        if is_better:
            self.best_score = current_score
            self.counter = 0
            print(f"Mejora detectada. Guardando modelo en {self.checkpoint_path}")
            torch.save(model.state_dict(), self.checkpoint_path)
        else:
            self.counter += 1
            print(f"Sin mejora. Contador de paciencia: {self.counter} / {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
                print("--- EARLY STOPPING ACTIVADO ---")

ECG eliminados por peso (2kb):

- block: 8846, 314864
- normal: 74424

In [13]:
%%notify
indices = list(range(len(dataset)))
labels_arr = [dataset.samples[i][1] for i in indices]
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=labels_arr, random_state=42)

train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CRNN_Model(
    n_channels_cnn=12,
    rnn_hidden_size=128, 
    rnn_num_layers=2,    
    num_classes=4,
    bidirectional=True
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

early_stopper = EarlyStopping(patience=10, mode='min', checkpoint_path='crnn_stumpy2.pth')

num_epochs = 100

print(f"--- Iniciando entrenamiento de CRNN en {device} ---")

for epoch in range(num_epochs):
    
    model.train()
    total_train_loss = 0.0
    total_train = 0
    correct_train = 0
    
    for signals, labels, ids in train_loader:
        signals = signals.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        logits = model(signals) 
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item() * signals.size(0)
        total_train += signals.size(0)
        preds = logits.argmax(dim=1)
        correct_train += (preds == labels).sum().item()
        
    train_acc = correct_train / total_train if total_train else 0.0
    avg_train_loss = total_train_loss / total_train if total_train else 0.0

    model.eval()
    total_val_loss = 0.0
    total_val = 0
    correct_val = 0
    
    with torch.inference_mode():
        for signals, labels, ids in val_loader:
            signals = signals.to(device)
            labels = labels.to(device)
            logits = model(signals)
            
            loss = criterion(logits, labels)
            total_val_loss += loss.item() * signals.size(0)
            
            preds = logits.argmax(dim=1)
            total_val += signals.size(0)
            correct_val += (preds == labels).sum().item()
            
    val_acc = correct_val / total_val if total_val else 0.0
    avg_val_loss = total_val_loss / total_val if total_val else 0.0

    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} - Train Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.4f}')
    early_stopper(avg_val_loss, model)
    
    if early_stopper.early_stop:
        print("Deteniendo el entrenamiento anticipadamente.")
        break

print("--- Entrenamiento Finalizado ---")

print(f"Cargando el mejor modelo desde {early_stopper.checkpoint_path} (Mejor Val Loss: {early_stopper.best_score:.6f})")
model.load_state_dict(torch.load(early_stopper.checkpoint_path))

--- Iniciando entrenamiento de CRNN en cuda ---
Epoch 1/100 | Train Loss: 1.2504 - Train Acc: 0.3828 | Val Loss: 1.0820 - Val Acc: 0.4511
Mejora detectada. Guardando modelo en crnn_stumpy2.pth
Epoch 2/100 | Train Loss: 1.1076 - Train Acc: 0.4561 | Val Loss: 1.0540 - Val Acc: 0.4778
Mejora detectada. Guardando modelo en crnn_stumpy2.pth
Epoch 3/100 | Train Loss: 1.0362 - Train Acc: 0.4998 | Val Loss: 1.0191 - Val Acc: 0.4860
Mejora detectada. Guardando modelo en crnn_stumpy2.pth
Epoch 4/100 | Train Loss: 1.0070 - Train Acc: 0.5208 | Val Loss: 0.9682 - Val Acc: 0.5381
Mejora detectada. Guardando modelo en crnn_stumpy2.pth
Epoch 5/100 | Train Loss: 0.9760 - Train Acc: 0.5412 | Val Loss: 0.9698 - Val Acc: 0.5406
Sin mejora. Contador de paciencia: 1 / 10
Epoch 6/100 | Train Loss: 0.9712 - Train Acc: 0.5327 | Val Loss: 0.9499 - Val Acc: 0.5577
Mejora detectada. Guardando modelo en crnn_stumpy2.pth
Epoch 7/100 | Train Loss: 0.9625 - Train Acc: 0.5539 | Val Loss: 0.9403 - Val Acc: 0.5647
Mejor

<All keys matched successfully>

<IPython.core.display.Javascript object>

Epoch 16/100 | Train Loss: 0.4743 - Train Acc: 0.8200 | Val Loss: 0.4341 - Val Acc: 0.8293
Mejora detectada. Guardando modelo en crnn_best_model.pth



Epoch 25/100 | Train Loss: 0.4435 - Train Acc: 0.8307 | Val Loss: 0.4710 - Val Acc: 0.8236
Mejora detectada. Guardando modelo en crnn_stumpy.pth



Epoch 51/100 | Train Loss: 0.5656 - Train Acc: 0.7762 | Val Loss: 0.6319 - Val Acc: 0.7456
Mejora detectada. Guardando modelo en crnn_stumpy2.pth

In [17]:
import pandas as pd
import glob
import random
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import shap  # Necesitarás instalarlo: pip install shap
from models.rnn import CRNN_Model

# --- PARÁMETROS ---
N_CHANNELS = 12  # <-- ¡Asegúrate que coincida con el modelo guardado!
NUM_CLASSES = 4
RNN_HIDDEN = 128
RNN_LAYERS = 2
MODEL_PATH = 'crnn_stumpy2.pth' # <-- El path de tu notebook
DATA_FOLDER = 'data'
LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
mp = ['mp_' + i for i in LEADS]
# LEADS = LEADS + mp
# print(LEADS)

CLASS_FOLDERS = {
    'arritmia': 0,
    'block': 1,
    'fibrilation': 2,
    'normal': 3
}

# --- 1. Cargar Datos para SHAP ---
print("Cargando datos...")
# Usar pocos archivos por clase para SHAP, es más rápido
# Necesitamos datos de fondo (entrenamiento) y datos de prueba (validación)
dataset = ECGDataset(DATA_FOLDER, CLASS_FOLDERS, files_per_class=50) 
indices = list(range(len(dataset)))
labels_arr = [dataset.samples[i][1] for i in indices]
train_idx, val_idx = train_test_split(indices, test_size=0.5, stratify=labels_arr, random_state=42)

train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

# SHAP necesita lotes de datos
# Un lote de fondo (background) y un lote de prueba (test)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False) # Lote más pequeño para explicar

# Tomar un lote de fondo (para el 'baseline' de SHAP)
background_signals, _, _ = next(iter(train_loader))
# Tomar un lote de prueba (los que queremos explicar)
test_signals, test_labels, _ = next(iter(val_loader))

print(f"Background signals shape: {background_signals.shape}")
print(f"Test signals shape: {test_signals.shape}")

# --- 2. Cargar Modelo ---
print("Cargando modelo...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CRNN_Model(
    n_channels_cnn=N_CHANNELS,
    rnn_hidden_size=RNN_HIDDEN,
    rnn_num_layers=RNN_LAYERS,
    num_classes=NUM_CLASSES,
    bidirectional=True
).to(device)

try:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except RuntimeError as e:
    print(f"--- ¡ERROR AL CARGAR EL MODELO! ---")
    print(e)
    print("\nEsto suele pasar si 'N_CHANNELS' no coincide con el modelo guardado.")
    print(f"Estás intentando cargar {N_CHANNELS} canales.")
    print("Verifica el 'n_channels_cnn' en tu notebook (era 13) vs los leads de 'ECGDataset' (eran 12).")
    print("Ajusta 'N_CHANNELS' en este script para que coincida con el modelo .pth guardado.")
    exit() # Salir si no se puede cargar

model.eval()
print("Modelo cargado exitosamente.")

Cargando datos...
Background signals shape: torch.Size([8, 12, 10000])
Test signals shape: torch.Size([8, 12, 10000])
Cargando modelo...
Modelo cargado exitosamente.


In [18]:
# --- 3. Calcular SHAP Values ---
print("Calculando SHAP values...")

background_signals = background_signals.to(device)
test_signals = test_signals.to(device)

# Desactivar CuDNN fuerza a usar la implementación nativa de PyTorch (más lenta pero flexible)
torch.backends.cudnn.enabled = False 

try:
    # Aquí puedes dejar el modelo en eval() o train(), usualmente eval() funciona sin CuDNN
    model.eval() 
    
    explainer = shap.GradientExplainer(model, background_signals)
    shap_values = explainer.shap_values(test_signals)
    
finally:
    # ¡Muy importante reactivarlo al terminar!
    torch.backends.cudnn.enabled = True 

print("Cálculo de SHAP finalizado.")

# shap_values es una lista (una por clase) de arrays
# Cada array tiene la forma (batch_size, 12, 10000)
print(f"SHAP values tiene {len(shap_values)} elementos (uno por clase)")
print(f"El shape de los SHAP para la clase 0 es: {shap_values[0].shape}")


# --- 4. Analizar Importancia por Lead ---
print("\n--- Importancia Absoluta Media por Lead ---")

# Para obtener la importancia general de cada lead, promediamos el valor absoluto
# de SHAP a través de todas las clases, muestras y tiempo.
# Convertimos la lista de (N_samples, N_leads, N_time) a (N_classes, N_samples, N_leads, N_time)
shap_values_np = np.array(shap_values)

# (N_classes, N_samples, N_leads, N_time) -> (N_leads)
# Promediamos sobre clases, muestras y tiempo
mean_abs_shap = np.mean(np.abs(shap_values_np), axis=(0, 1, 3))

lead_importance = sorted(zip(mp, mean_abs_shap), key=lambda x: x[1], reverse=True)
for lead, importance in lead_importance:
    print(f"{lead}: {importance:.6f}")


Calculando SHAP values...
Cálculo de SHAP finalizado.
SHAP values tiene 8 elementos (uno por clase)
El shape de los SHAP para la clase 0 es: (12, 10000, 4)

--- Importancia Absoluta Media por Lead ---
mp_V2: 0.000274
mp_aVR: 0.000271
mp_aVF: 0.000269
mp_II: 0.000254
mp_I: 0.000246
mp_V1: 0.000244
mp_V4: 0.000238
mp_V3: 0.000232
mp_aVL: 0.000221
mp_III: 0.000220
mp_V6: 0.000143
mp_V5: 0.000100


--- Importancia Absoluta Media por Lead ---
aVL: 0.000084
V2: 0.000084
V4: 0.000077
V3: 0.000071
V6: 0.000070
aVF: 0.000066
aVR: 0.000066
II: 0.000065
V1: 0.000064
V5: 0.000062
III: 0.000060
I: 0.000057

Cargando datos...
Background signals shape: torch.Size([8, 24, 10000])
Test signals shape: torch.Size([8, 24, 10000])
Cargando modelo...
Modelo cargado exitosamente.
Calculando SHAP values (esto puede tardar)...
Cálculo de SHAP finalizado.
SHAP values tiene 8 elementos (uno por clase)
El shape de los SHAP para la clase 0 es: (24, 10000, 4)

#### --- Importancia Absoluta Media por Lead ---
- mp_aVR: 0.000084
- mp_V2: 0.000082
- V2: 0.000081
- mp_V6: 0.000080
- mp_aVF: 0.000080
- V6: 0.000076
- mp_V4: 0.000074
- aVR: 0.000074
- mp_V1: 0.000072
- mp_aVL: 0.000070
- V4: 0.000069
- mp_II: 0.000069
- mp_V3: 0.000066
- mp_III: 0.000065
- mp_V5: 0.000063
- aVF: 0.000060
- V3: 0.000060
- mp_I: 0.000060
- aVL: 0.000059
- II: 0.000058
- V1: 0.000057
- III: 0.000057
- I: 0.000054
- V5: 0.000048

Calculando SHAP values...
Cálculo de SHAP finalizado.
SHAP values tiene 8 elementos (uno por clase)
El shape de los SHAP para la clase 0 es: (12, 10000, 4)

### --- Importancia Absoluta Media por Lead ---
- mp_V2: 0.000274
- mp_aVR: 0.000271
- mp_aVF: 0.000269
- mp_II: 0.000254
- mp_I: 0.000246
- mp_V1: 0.000244
- mp_V4: 0.000238
mp_V3: 0.000232
mp_aVL: 0.000221
mp_III: 0.000220
mp_V6: 0.000143
mp_V5: 0.000100