In [None]:
import pandas as pd
import glob
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from joblib import Parallel, delayed
import multiprocessing
import neurokit2 as nk # Asumo que esto está instalado por tus nbs anteriores

# --- CONFIGURATION ---
DATA_FOLDER = 'data'  # root folder holding one sub-folder per class; adjust to your actual path
SAMPLES_PER_CLASS = 2000  # files drawn per class so every class starts from the same base count
MAX_WORKERS = -1
PRE_DISPATCH = '2*n_jobs'
SAMPLING_RATE = 1000  # forwarded as `sampling_rate` to every NeuroKit2 call
# Column names of the 12 leads: six limb leads followed by V1..V6.
LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF'] + [f'V{n}' for n in range(1, 7)]

# Master mapping from class folder name to integer label.
# Keeping normal at 0 makes the binary normal/abnormal logic trivial.
CLASS_MAP = dict(
    normal=0,
    arritmia=1,
    block=2,
    fibrilation=3,
)

In [None]:

# --- FEATURE EXTRACTION ---
def _mean_peak_amplitude(clean, peak_indices):
    """Mean of `clean` at the non-NaN positions in `peak_indices` (NaN if there are none)."""
    idx = np.asarray(peak_indices, dtype=float)
    # Entries that are NaN are skipped rather than counted as amplitude 0,
    # which would drag the mean toward zero.
    idx = idx[~np.isnan(idx)].astype(int)
    if idx.size == 0:
        return np.nan
    return float(np.mean(np.asarray(clean)[idx]))


def toFeature(signal: pd.DataFrame, time = False):
    """Turn one multi-lead ECG recording into a flat feature vector.

    For every lead in ``LEADS`` the signal is cleaned, R-peaks are detected and
    the waves are delineated; the mean cleaned-signal amplitude at the R, P, Q
    and S peaks is appended (4 values per lead, in that order). When ``time``
    is true, four HRV time-domain features computed on lead "II" are appended
    at the end.

    Args:
        signal: DataFrame with one column per entry of ``LEADS``.
        time: If true, append HRV_MeanNN, HRV_SDNN, HRV_RMSSD and HRV_pNN50
            computed from lead "II".

    Returns:
        1-D ``np.ndarray`` of length ``4 * len(LEADS)``, plus 4 when ``time``
        is true. Values that could not be computed are NaN.

    Raises:
        Whatever the NeuroKit2 cleaning / peak-detection / HRV calls raise;
        only the per-lead delineation step is guarded.
    """
    time_features = ["HRV_MeanNN", "HRV_SDNN", "HRV_RMSSD", "HRV_pNN50"]
    wave_keys = ("ECG_P_Peaks", "ECG_Q_Peaks", "ECG_S_Peaks")
    F = []
    for lead in LEADS:
        clean = nk.ecg_clean(signal[lead], sampling_rate=SAMPLING_RATE)
        _, rpeaks = nk.ecg_peaks(clean, sampling_rate=SAMPLING_RATE)
        rpeak_indices = rpeaks['ECG_R_Peaks']
        # Skip the lead when fewer than two R-peaks were found. This must COUNT
        # the peaks: summing the indices (sample positions) is almost always >= 2.
        if len(rpeak_indices) < 2:
            F += [np.nan] * 4
            continue
        try:
            _, waves_peak = nk.ecg_delineate(clean, rpeaks, sampling_rate=SAMPLING_RATE, method="peak")
            lead_means = [_mean_peak_amplitude(clean, rpeak_indices)]
            # A wave type missing from the delineation result yields NaN.
            lead_means += [_mean_peak_amplitude(clean, waves_peak.get(key, [])) for key in wave_keys]
        except Exception:
            # Delineation failed for this lead: mark all four features missing.
            lead_means = [np.nan] * 4
        F += lead_means

    # HRV time-domain features from lead II; only computed when requested.
    if time:
        clean2 = nk.ecg_clean(signal["II"], sampling_rate=SAMPLING_RATE)
        _, rpeaks = nk.ecg_peaks(clean2, sampling_rate=SAMPLING_RATE)
        valid_rpeaks = [r for r in rpeaks['ECG_R_Peaks'] if not np.isnan(r)]
        if len(valid_rpeaks) >= 2:
            t = nk.hrv_time(rpeaks, sampling_rate=SAMPLING_RATE)
            F.extend(t[time_features].values.flatten().tolist())
        else:
            # Keep the vector length fixed so rows can be stacked later.
            F.extend([np.nan] * len(time_features))
    return np.array(F)

def _process_file(item):
    path, label = item
    try:
        df = pd.read_parquet(path, engine='fastparquet')
        if not set(LEADS).issubset(df.columns): return None
        df_leads = df[LEADS].apply(pd.to_numeric, errors='coerce').fillna(0).astype(np.float32)
        feat = toFeature(df_leads, time=True) 
        
        return feat, label
    except Exception:
        return None


In [None]:

# --- 1. LOAD AND PREPARE THE MASTER DATASET ---
print("1. Generando lista de archivos balanceada...")

# Seed every source of randomness used in this cell so the sampled files, the
# split and the resampling are identical on each Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

file_items = []
for folder, label in CLASS_MAP.items():
    folder_path = os.path.join(DATA_FOLDER, folder)
    # glob returns files in arbitrary order; sort so the seeded sample is stable.
    paths = sorted(glob.glob(os.path.join(folder_path, '*.parquet.gzip')))

    # No data for this class (e.g. when dry-running the notebook).
    if not paths:
        print(f"Warning: No data for {folder}, skipping/mocking")
        continue

    # Each file is drawn AT MOST ONCE here. Duplicating files before the
    # train/test split would put copies of the same recording on both sides
    # and inflate the test score; small classes are upsampled further down,
    # inside the training split only.
    n_take = min(len(paths), SAMPLES_PER_CLASS)
    if n_take < SAMPLES_PER_CLASS:
        print(f"Warning: only {n_take} files for {folder} (< {SAMPLES_PER_CLASS}); "
              "upsampling will happen inside the training split")
    file_items.extend((p, label) for p in random.sample(paths, n_take))

random.shuffle(file_items)


# Parallel feature extraction, leaving one core free.
# NOTE(review): MAX_WORKERS / PRE_DISPATCH from the config cell are not used in this cell.
cpu_count = multiprocessing.cpu_count()
workers = max(1, cpu_count - 1)
results = Parallel(n_jobs=workers, backend='loky', verbose=1)(
    delayed(_process_file)(item) for item in file_items
)

# Drop files that could not be processed.
valid_results = [r for r in results if r is not None]
if not valid_results:
    raise RuntimeError(
        f"No features could be extracted: {len(file_items)} files were queued from "
        f"'{DATA_FOLDER}' and none was processed successfully."
    )
X_raw = np.vstack([r[0] for r in valid_results])
y_all = np.array([r[1] for r in valid_results])

# --- 2. SACRED SPLIT (the same test set for both strategies) ---
# Split row indices first so that everything fitted afterwards sees only the
# training rows.
all_idx = np.arange(len(y_all))
train_idx, test_idx = train_test_split(
    all_idx, test_size=0.2, random_state=RANDOM_SEED, stratify=y_all
)

# The imputer learns its column means from the training rows only, then fills
# every row with them.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_raw[train_idx])
X_imputed = imputer.transform(X_raw)

print(f"Dataset Maestro: X={X_imputed.shape}, y={y_all.shape}")

# Balance the classes inside the training split by resampling minority-class
# rows (with replacement) up to the size of the largest class.
rng = np.random.default_rng(RANDOM_SEED)
train_labels = y_all[train_idx]
class_ids, class_counts = np.unique(train_labels, return_counts=True)
target_count = class_counts.max()
balanced_parts = []
for class_id, count in zip(class_ids, class_counts):
    class_rows = train_idx[train_labels == class_id]
    if count < target_count:
        extra_rows = rng.choice(class_rows, size=target_count - count, replace=True)
        class_rows = np.concatenate([class_rows, extra_rows])
    balanced_parts.append(class_rows)
train_idx_balanced = rng.permutation(np.concatenate(balanced_parts))

# Sanity check: no row may sit on both sides of the split.
assert np.intersect1d(train_idx_balanced, test_idx).size == 0, "train/test overlap"

X_train, y_train = X_imputed[train_idx_balanced], y_all[train_idx_balanced]
X_test, y_test = X_imputed[test_idx], y_all[test_idx]


In [None]:

# --- STRATEGY 1: FLAT MULTICLASS RANDOM FOREST ---
print("\n--- Estrategia 1: Multiclase Directa ---")

# One forest trained directly on the original multiclass labels.
rf_flat = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score it on the shared test split.
y_pred_flat = rf_flat.predict(X_test)
acc_flat = accuracy_score(y_true=y_test, y_pred=y_pred_flat)
print(f"Accuracy (Plano): {acc_flat:.4f}")

In [None]:

# --- STRATEGY 2: HIERARCHICAL (NORMAL vs ABNORMAL -> CLASSIFY THE ABNORMAL) ---
print("\n--- Estrategia 2: Jerárquica (Segmentada) ---")

# Step 2.1: stage-1 (binary) target. 0 = normal, 1 = abnormal (any non-zero label).
mask_anormal_train = y_train != 0
y_train_binary = mask_anormal_train.astype(int)

# With one normal class against several abnormal ones the binary target is
# skewed toward "abnormal" (roughly 25% / 75% if the four classes are balanced),
# so class_weight='balanced' keeps the forest from always answering "abnormal".
rf_binary = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train_binary)

# Step 2.2: stage-2 model, trained only on the abnormal training rows so it
# learns to tell the pathologies apart.
X_train_sub, y_train_sub = X_train[mask_anormal_train], y_train[mask_anormal_train]

rf_subclass = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train_sub, y_train_sub)

# Step 2.3: hierarchical inference on X_test.
# Stage 1 decides normal vs abnormal for every test row.
y_pred_binary_test = rf_binary.predict(X_test)
mask_pred_anormal = y_pred_binary_test == 1

# Final predictions start as all zeros, i.e. "normal"; rows that stage 1 called
# normal therefore need no further assignment.
y_pred_hierarchical = np.zeros_like(y_pred_binary_test)

# Rows that stage 1 flagged as abnormal are handed to the subclass model.
if mask_pred_anormal.any():
    subclass_predictions = rf_subclass.predict(X_test[mask_pred_anormal])
    y_pred_hierarchical[mask_pred_anormal] = subclass_predictions

# Evaluation
acc_hier = accuracy_score(y_true=y_test, y_pred=y_pred_hierarchical)
print(f"Accuracy (Jerárquico): {acc_hier:.4f}")

In [None]:

# --- FINAL COMPARISON ---
print("\n=== RESULTADOS FINALES ===")
print(f"Estrategia 1 (Plano):      {acc_flat:.4f}")
print(f"Estrategia 2 (Jerárquico): {acc_hier:.4f}")

# Pin the label order explicitly. Without `labels`, classification_report
# infers the classes from y_test / predictions and raises when their number
# differs from len(target_names) — which happens as soon as one class folder
# was skipped or a class never shows up in the test split.
class_names = list(CLASS_MAP.keys())
class_ids = list(CLASS_MAP.values())

print("\nDetalle Estrategia 1:")
print(classification_report(
    y_test, y_pred_flat,
    labels=class_ids, target_names=class_names, zero_division=0,
))

print("\nDetalle Estrategia 2:")
print(classification_report(
    y_test, y_pred_hierarchical,
    labels=class_ids, target_names=class_names, zero_division=0,
))

# Confusion matrix for the hierarchical strategy: shows whether errors come
# from the normal/abnormal decision or from mixing up pathologies.
# Rows are true classes, columns predicted classes, both in CLASS_MAP order.
print("\nMatriz de Confusión (Jerárquica):")
print(confusion_matrix(y_test, y_pred_hierarchical, labels=class_ids))