In [38]:
import numpy as np
import os
import re
import time
import io
import gc
import warnings
from scipy.signal import welch
from scipy.stats import skew, kurtosis
import antropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import pyedflib

warnings.filterwarnings("ignore", category=RuntimeWarning)
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def get_drive_service():
    """Authenticate against Google Drive and return a Drive v3 client.

    Cached user credentials are read from ``credentials/token.json``. When
    they are missing or not valid they are either refreshed (expired and a
    refresh token is present) or re-created through the local-server OAuth
    flow that reads ``credentials/credentials.json``. In both cases the
    resulting token is written back to ``credentials/token.json``.

    Returns:
        The object produced by ``build('drive', 'v3', ...)``, or ``None``
        when building the client raises.

    Raises:
        FileNotFoundError: a new OAuth flow is required but
            ``credentials/credentials.json`` does not exist.
    """
    creds_folder = 'credentials'
    token_path = os.path.join(creds_folder, 'token.json')
    credentials_path = os.path.join(creds_folder, 'credentials.json')
    os.makedirs(creds_folder, exist_ok=True)

    # Start from the cached token when there is one.
    creds = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )

    if not creds or not creds.valid:
        refreshable = creds and creds.expired and creds.refresh_token
        if refreshable:
            creds.refresh(Request())
        elif not os.path.exists(credentials_path):
            raise FileNotFoundError(
                f"ERRO CRÍTICO: O arquivo 'credentials.json' não foi encontrado dentro da pasta '{creds_folder}'."
            )
        else:
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
            creds = flow.run_local_server(port=0)

        # Persist the refreshed / newly obtained token for the next run.
        with open(token_path, 'w') as token_file:
            token_file.write(creds.to_json())

    try:
        drive_client = build('drive', 'v3', credentials=creds)
        print("Serviço do Google Drive conectado com sucesso.")
        return drive_client
    except Exception as e:
        print(f"Erro ao construir o serviço do Drive: {e}")
        return None


def find_folder_id(service, folder_name, parent_id='root'):
    """Return the id of the first folder called *folder_name* under *parent_id*.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        folder_name: Exact folder name to look for.
        parent_id: Id of the parent folder (``'root'`` by default).

    Returns:
        The ``id`` of the first matching folder, or ``None`` when the
        response contains no files.
    """
    # The name is embedded in a single-quoted query literal, so backslashes
    # and single quotes must be escaped or names such as "Bob's data" would
    # produce a malformed query.
    safe_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"name='{safe_name}' and mimeType='application/vnd.google-apps.folder' "
        f"and '{parent_id}' in parents"
    )
    results = service.files().list(q=query, fields="files(id, name)").execute()
    items = results.get('files', [])
    return items[0]['id'] if items else None


def find_folder_id_by_path(service, path_components):
    """Resolve a chain of nested folder names, starting at the Drive root.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        path_components: Folder names ordered from the outermost to the
            innermost folder.

    Returns:
        The id of the innermost folder, ``'root'`` when *path_components* is
        empty, or ``None`` (after printing a message) as soon as one
        component cannot be found.
    """
    current_parent_id = 'root'
    for folder_name in path_components:
        # Escape backslashes and single quotes: the name sits inside a
        # single-quoted literal of the Drive query string.
        safe_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
        query = (
            f"name='{safe_name}' and mimeType='application/vnd.google-apps.folder' "
            f"and '{current_parent_id}' in parents"
        )
        results = service.files().list(q=query, fields="files(id)").execute()
        items = results.get('files', [])
        if not items:
            print(f"Pasta '{folder_name}' não encontrada em '{current_parent_id}'.")
            return None
        # Descend into the first match and keep resolving from there.
        current_parent_id = items[0]['id']
    print("Caminho do dataset encontrado no Drive!")
    return current_parent_id


def get_files_from_drive_folder(service, folder_id):
    """Map file name -> file id for every direct child of *folder_id*.

    The listing is paginated: each response may carry a ``nextPageToken``,
    which is passed back as ``pageToken`` until no further token is returned.
    Reading only the first response would silently drop files in large
    folders.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        folder_id: Id of the folder whose children are listed.

    Returns:
        dict mapping each file ``name`` to its ``id``. When several children
        share a name, the last one listed wins.
    """
    query = f"'{folder_id}' in parents"
    name_to_id = {}
    page_token = None
    while True:
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        for file in results.get('files', []):
            name_to_id[file['name']] = file['id']
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return name_to_id


def download_file_locally(service, file_id, local_filename):
    """Download the Drive file *file_id* into *local_filename*.

    The content is fetched chunk by chunk through ``MediaIoBaseDownload``;
    a progress percentage is printed (overwriting the same console line)
    whenever a chunk reports a status, and a final message is printed once
    the file has been written.
    """
    media_request = service.files().get_media(fileId=file_id)
    with io.FileIO(local_filename, 'wb') as out_file:
        downloader = MediaIoBaseDownload(out_file, media_request)
        while True:
            progress, finished = downloader.next_chunk()
            if progress:
                percent = int(progress.progress() * 100)
                print(f"Download {percent}% concluído...", end="\r")
            if finished:
                break
    print(f"Arquivo salvo em: {local_filename}")

def parse_summary_file(file_path):
    """Parse a ``*-summary.txt`` file into per-recording seizure intervals.

    The text is split on ``File Name:`` markers. Whatever precedes the first
    marker (the file preamble) is ignored so it does not show up as a bogus
    recording. Inside each block every ``Seizure [n] Start Time: X seconds``
    line is paired with the next ``Seizure [n] End Time: Y seconds`` line of
    the same block, even if other lines sit in between.

    Args:
        file_path: Path of the summary text file. Undecodable bytes are
            ignored while reading.

    Returns:
        dict mapping each recording name (first line of its block) to a list
        of ``(start_seconds, end_seconds)`` integer tuples; the list is empty
        for recordings without seizures.
    """
    # Compiled once instead of once per block.
    start_time_pattern = re.compile(r"Seizure\s*\d*\s*Start Time:\s*(\d+)\s*seconds")
    end_time_pattern = re.compile(r"Seizure\s*\d*\s*End Time:\s*(\d+)\s*seconds")

    with open(file_path, 'r', errors='ignore') as f:
        content = f.read()

    seizure_info = {}
    # Element 0 of the split is the text before the first "File Name:".
    file_blocks = re.split(r'File Name:\s*', content)[1:]

    for block in file_blocks:
        if not block.strip():
            continue

        lines = block.strip().split('\n')
        file_name = lines[0].strip()
        intervals = []
        seizure_info[file_name] = intervals

        pending_start = None
        for line in lines[1:]:
            start_match = start_time_pattern.search(line)
            if start_match:
                pending_start = int(start_match.group(1))
                continue
            end_match = end_time_pattern.search(line)
            # An end time is only meaningful after an unmatched start time.
            if end_match and pending_start is not None:
                intervals.append((pending_start, int(end_match.group(1))))
                pending_start = None
    return seizure_info


def extract_single_feature_vector(eeg_window, fs=256):
    """Build a fixed-length (46) feature vector for one single-channel window.

    The first 15 entries are, in order: total Welch PSD power; the relative
    power of five frequency bands (0.5-4, 4-8, 8-13, 13-30, 30-80 Hz, both
    edges inclusive); the beta/alpha and (delta+theta)/(alpha+beta) ratios;
    permutation, spectral and sample entropy; mean absolute value, standard
    deviation, skewness and kurtosis. NaN and infinite values are replaced
    by 0 and the vector is zero-padded up to 46 entries.

    Args:
        eeg_window: 1-D sequence of samples of a single channel.
        fs: Sampling frequency passed to the spectral estimators.

    Returns:
        numpy array with 46 values.
    """
    target_len = 46
    band_edges = ((0.5, 4), (4, 8), (8, 13), (13, 30), (30, 80))

    # A single Welch segment spanning the whole window.
    freqs, psd = welch(eeg_window, fs=fs, nperseg=len(eeg_window))
    total_power = np.sum(psd)

    delta, theta, alpha, beta, gamma = (
        np.sum(psd[(freqs >= low) & (freqs <= high)]) for low, high in band_edges
    )

    if total_power > 0:
        band_powers = [power / total_power for power in (delta, theta, alpha, beta, gamma)]
    else:
        band_powers = [0, 0, 0, 0, 0]

    beta_over_alpha = beta / alpha if alpha > 0 else 0
    fast_power = alpha + beta
    slow_over_fast = (delta + theta) / fast_power if fast_power > 0 else 0

    entropies = [
        antropy.perm_entropy(eeg_window, normalize=True),
        antropy.spectral_entropy(eeg_window, sf=fs, method='welch', normalize=True),
        antropy.sample_entropy(eeg_window),
    ]
    stats = [
        np.mean(np.abs(eeg_window)),
        np.std(eeg_window),
        skew(eeg_window),
        kurtosis(eeg_window),
    ]

    raw_features = [total_power, *band_powers, beta_over_alpha, slow_over_fast, *entropies, *stats]
    features = np.nan_to_num(raw_features, nan=0.0, posinf=0.0, neginf=0.0)

    missing = target_len - len(features)
    if missing > 0:
        features = np.concatenate((features, np.zeros(missing)))
    return np.array(features[:target_len])

class HDC:
    """Small hyperdimensional-computing classifier over bipolar hypervectors.

    Each feature has a random {-1, +1} identity hypervector and each
    quantization level has a random {-1, +1} level hypervector. A sample is
    encoded by multiplying every feature hypervector with the level
    hypervector of that feature's quantized value, summing the products and
    taking the sign. Classification picks the class prototype with the
    highest cosine similarity.

    Attributes:
        D: hypervector dimensionality.
        num_features, num_levels, num_classes: sizes given at construction.
        level_vectors: array (num_levels, D) with entries in {-1, +1}.
        feature_vectors: array (num_features, D) with entries in {-1, +1}.
        class_prototypes: array (num_classes, D), one prototype per class.
    """

    def __init__(self, dimensions, num_features, num_levels, num_classes=2, seed=None):
        # A private RandomState(seed) yields the very same vectors as
        # ``np.random.seed(seed)`` followed by ``np.random.choice`` but leaves
        # the process-wide RNG untouched. Without a seed the global RNG is
        # used, as before.
        rng = np.random.RandomState(seed) if seed is not None else np.random
        self.D = dimensions
        self.num_features = num_features
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.level_vectors = rng.choice([-1, 1], size=(num_levels, self.D))
        self.feature_vectors = rng.choice([-1, 1], size=(num_features, self.D))
        self.class_prototypes = np.zeros((self.num_classes, self.D))

    def _quantize(self, data, num_levels):
        """Min-max scale *data* to integer levels ``0 .. num_levels - 1``.

        The range is taken from *data* itself, so two calls with different
        data use different scales. Constant input maps to level 0 everywhere.
        """
        min_val, max_val = np.min(data), np.max(data)
        if max_val == min_val:
            return np.zeros_like(data, dtype=int)

        return np.round((data - min_val) / (max_val - min_val) * (num_levels - 1)).astype(int)

    def encode(self, x_data):
        """Encode a ``(num_samples, num_features)`` matrix into hypervectors.

        Every column is quantized independently over the rows of *x_data*.

        Returns:
            float array ``(num_samples, D)`` with entries in {-1, 0, +1}.

        Raises:
            ValueError: *x_data* has more columns than the model has
                feature hypervectors.
        """
        num_samples, num_features = x_data.shape
        if num_features > self.num_features:
            raise ValueError(
                f"x_data has {num_features} features but the model was built for {self.num_features}."
            )
        encoded_data = np.zeros((num_samples, self.D))
        if num_samples == 0 or num_features == 0:
            return encoded_data

        x_quantized = np.array(
            [self._quantize(x_data[:, i], self.num_levels) for i in range(num_features)]
        ).T
        feature_hvs = self.feature_vectors[:num_features]
        for i in range(num_samples):
            # Bind each feature with its level, then bundle over features.
            bound = feature_hvs * self.level_vectors[x_quantized[i]]
            # sign() of an all-zero bundle is already all zeros.
            encoded_data[i] = np.sign(bound.sum(axis=0))
        return encoded_data

    def predict(self, x_encoded):
        """Return, per row, the index of the most cosine-similar prototype."""
        return np.argmax(cosine_similarity(x_encoded, self.class_prototypes), axis=1)

    def train_standard(self, x_encoded, y_train):
        """Set each class prototype to the sum of that class's hypervectors."""
        labels = np.asarray(y_train)
        self.class_prototypes = np.array(
            [np.sum(x_encoded[labels == i], axis=0) for i in range(self.num_classes)]
        )

    def train_multipass(self, x_encoded, y_train, epochs, lr, initial_training=True, subtract_wrong=False):
        """Iteratively correct the prototypes on misclassified samples.

        Per epoch the training set is predicted once; every misclassified
        sample is added (scaled by *lr*) to the prototype of its true class
        and, when *subtract_wrong* is set, subtracted from the prototype of
        the predicted class. Stops early when nothing is misclassified.

        Args:
            x_encoded: encoded training samples, ``(n, D)``.
            y_train: class indices; float-valued labels such as ``1.0`` are
                accepted and converted to integers.
            epochs: maximum number of passes.
            lr: update step.
            initial_training: run ``train_standard`` first.
            subtract_wrong: also penalise the wrongly predicted class.
        """
        labels = np.asarray(y_train).astype(int)
        if initial_training:
            self.train_standard(x_encoded, labels)
        # Fractional updates need a floating-point accumulator.
        self.class_prototypes = np.asarray(self.class_prototypes, dtype=float)
        for _ in range(epochs):
            y_pred = self.predict(x_encoded)
            wrong = np.flatnonzero(y_pred != labels)
            if wrong.size == 0:
                break
            for i in wrong:
                self.class_prototypes[labels[i]] += lr * x_encoded[i]
                if subtract_wrong:
                    self.class_prototypes[y_pred[i]] -= lr * x_encoded[i]

    def train_multicentroid(self, x_encoded, y_train, threshold):
        """Cluster samples into per-class centroids, then merge them per class.

        A sample joins the most similar existing centroid of its own class
        when that similarity is at least *threshold*; otherwise it starts a
        new centroid. *x_encoded* is never modified.

        NOTE(review): the centroids of a class are finally summed into a
        single prototype, and each centroid is itself a sum of samples of
        that class, so the stored prototype equals the plain per-class sum
        regardless of *threshold* - confirm this is intended.
        """
        prototypes, proto_labels = [], []
        for i in range(len(y_train)):
            sample_hv, correct_label = x_encoded[i], y_train[i]
            best_sim, best_proto_idx = -1, -1
            if prototypes:
                similarities = cosine_similarity(sample_hv.reshape(1, -1), np.array(prototypes))[0]
                for j, label in enumerate(proto_labels):
                    if label == correct_label and similarities[j] > best_sim:
                        best_sim, best_proto_idx = similarities[j], j
            # best_proto_idx stays -1 when no centroid of this class exists
            # yet; never fall through to prototypes[-1] in that case.
            if best_proto_idx < 0 or best_sim < threshold:
                # Copy: a view would let the later in-place "+=" overwrite
                # the caller's x_encoded row.
                prototypes.append(np.array(sample_hv))
                proto_labels.append(correct_label)
            else:
                prototypes[best_proto_idx] += sample_hv

        final_prototypes = np.zeros((self.num_classes, self.D))
        for label in range(self.num_classes):
            indices = [i for i, l in enumerate(proto_labels) if l == label]
            if indices:
                final_prototypes[label] = np.sum(np.array(prototypes)[indices], axis=0)
        self.class_prototypes = final_prototypes

def post_process_predictions(predictions, window_size=5, merge_gap=30, fs=1.0):
    """Smooth a binary prediction sequence and merge nearby positive runs.

    Step 1 (smoothing): position ``i`` is forced to 0 when the mean of the
    original predictions in the window centred on ``i`` (``window_size // 2``
    neighbours on each side, clipped at the borders) is below 0.5.
    Positions are never switched from 0 to 1 by this step.

    Step 2 (merging): consecutive runs of 1s in the smoothed sequence are
    joined (the gap is filled with 1s) when the gap, converted to seconds as
    ``gap_length / fs``, is shorter than *merge_gap*.

    Args:
        predictions: 1-D sequence of 0/1 predictions in temporal order.
        window_size: smoothing window length in samples.
        merge_gap: maximum gap in seconds to bridge; ``<= 0`` disables
            merging.
        fs: number of predictions per second.

    Returns:
        numpy array with the same length as *predictions*; empty input
        yields an empty array.
    """
    smoothed = np.copy(predictions)
    n = len(predictions)
    if n == 0:
        # Nothing to smooth or merge.
        return smoothed

    half = window_size // 2
    for i in range(n):
        lo = max(0, i - half)
        hi = min(n, i + half + 1)
        if np.mean(predictions[lo:hi]) < 0.5:
            smoothed[i] = 0
    if merge_gap <= 0:
        return smoothed

    merged = np.copy(smoothed)
    # Zero-pad both ends so runs touching a border still produce a rising
    # (+1) and a falling (-1) edge; ends are exclusive indices.
    is_positive = (smoothed == 1).astype(int)
    edges = np.diff(np.concatenate(([0], is_positive, [0])))
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)

    for prev_end, next_start in zip(run_ends[:-1], run_starts[1:]):
        gap_duration = (next_start - prev_end) / fs
        if gap_duration < merge_gap:
            merged[prev_end:next_start] = 1
    return merged


Checagem da quantidade de arquivos com crise dentro dos 24 pacientes

In [39]:
# Count, for patients chb01..chb24, how many recordings list at least one
# seizure in the patient's summary file and how many seizure events are
# listed in total.
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

service = get_drive_service()
main_folder_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)
if main_folder_id is None:
    # Without the dataset root every per-patient lookup below would be
    # issued against an invalid parent id; stop with one clear error instead.
    raise RuntimeError("Pasta do dataset não encontrada no Google Drive.")

pacientes = [f"chb{str(i).zfill(2)}" for i in range(1, 25)]
total_arquivos_com_crise = 0
total_eventos = 0
resumo = {}  # patient -> (recordings with seizures, seizure events)

for paciente in pacientes:
    # Best effort per patient: a failure is reported and the loop goes on.
    try:
        patient_folder_id = find_folder_id(service, paciente, parent_id=main_folder_id)
        if not patient_folder_id:
            print(f"{paciente}: pasta não encontrada no Drive.")
            continue

        drive_files = get_files_from_drive_folder(service, patient_folder_id)
        summary_filename = f"{paciente}-summary.txt"
        if summary_filename not in drive_files:
            print(f"{paciente}: summary não encontrado.")
            continue

        local_summary_path = f"./temp_{paciente}_summary.txt"
        try:
            download_file_locally(service, drive_files[summary_filename], local_summary_path)
            seizure_times = parse_summary_file(local_summary_path)
        finally:
            # Remove the temporary copy even when download or parsing fails.
            if os.path.exists(local_summary_path):
                os.remove(local_summary_path)

        arquivos_com_crise = sum(1 for pares in seizure_times.values() if pares)
        eventos = sum(len(pares) for pares in seizure_times.values())

        resumo[paciente] = (arquivos_com_crise, eventos)
        total_arquivos_com_crise += arquivos_com_crise
        total_eventos += eventos

        print(f"{paciente}: {arquivos_com_crise} arquivos com crise | {eventos} crises")
    except Exception as e:
        print(f"Erro em {paciente}: {e}")

print("\n=== RESUMO FINAL ===")
for p in pacientes:
    # Patients that failed or were skipped are reported as (0, 0).
    a, e = resumo.get(p, (0, 0))
    print(f"{p}: {a} arquivos com crise | {e} crises")

print(f"\nTOTAL de arquivos com crise: {total_arquivos_com_crise}")
print(f"TOTAL de crises (eventos):   {total_eventos}")


Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!
Arquivo salvo em: ./temp_chb01_summary.txt
chb01: 7 arquivos com crise | 7 crises
Arquivo salvo em: ./temp_chb02_summary.txt
chb02: 3 arquivos com crise | 3 crises
Arquivo salvo em: ./temp_chb03_summary.txt
chb03: 7 arquivos com crise | 7 crises
Arquivo salvo em: ./temp_chb04_summary.txt
chb04: 3 arquivos com crise | 4 crises
Arquivo salvo em: ./temp_chb05_summary.txt
chb05: 5 arquivos com crise | 5 crises
Arquivo salvo em: ./temp_chb06_summary.txt
chb06: 7 arquivos com crise | 10 crises
Arquivo salvo em: ./temp_chb07_summary.txt
chb07: 3 arquivos com crise | 3 crises
Arquivo salvo em: ./temp_chb08_summary.txt
chb08: 5 arquivos com crise | 5 crises
Arquivo salvo em: ./temp_chb09_summary.txt
chb09: 3 arquivos com crise | 4 crises
Arquivo salvo em: ./temp_chb10_summary.txt
chb10: 7 arquivos com crise | 7 crises
Arquivo salvo em: ./temp_chb11_summary.txt
chb11: 3 arquivos com crise | 3 crises
Arquivo sa

HDC - PADRÃO EM 5 PACIENTES

In [40]:
# Experiment: standard (single-pass) HDC on patients chb01..chb05.
# Pipeline: download every recording that lists a seizure, cut it into
# overlapping windows, extract channel-averaged features, undersample the
# non-seizure class, train/evaluate the HDC model and print the metrics.
PATIENTS = [f"chb{str(i).zfill(2)}" for i in range(1, 6)]
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

DIMENSIONS = 10000  # hypervector dimensionality
NUM_LEVELS = 100    # quantization levels per feature

WINDOW_SECONDS = 2
MAX_CHANNELS = 20   # upper bound on the number of signals read per file
BATCH_SIZE = 5      # files handled between two gc.collect() calls
SEED = 42

SMOOTHING_WINDOW_SIZE = 5
MERGE_SEIZURES_THRESHOLD = 30

start_time = time.time()
try:
    service = get_drive_service()
    root_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)

    all_features, all_labels = [], []
    fs_signal = -1  # overwritten by every file that is opened successfully

    for patient in PATIENTS:
        print(f"\n{'='*15} {patient} {'='*15}")
        patient_id = find_folder_id(service, patient, parent_id=root_id)
        files_map = get_files_from_drive_folder(service, patient_id)

        summary_path = f"./temp_{patient}_summary.txt"
        download_file_locally(service, files_map[f"{patient}-summary.txt"], summary_path)
        seiz = parse_summary_file(summary_path)
        os.remove(summary_path)

        # Only recordings with at least one annotated seizure are used.
        edf_with_seiz = sorted([f for f, ivals in seiz.items() if len(ivals) > 0])
        print(f"{patient}: Processando {len(edf_with_seiz)} arquivos com crise...")

        for k in range(0, len(edf_with_seiz), BATCH_SIZE):
            batch = edf_with_seiz[k:k+BATCH_SIZE]
            print(f"\n>>> Lote {k//BATCH_SIZE + 1}:")

            for edf_name in batch:
                local_path = f"./temp_{edf_name}"
                print(f"  Processando: {edf_name}")
                try:
                    download_file_locally(service, files_map[edf_name], local_path)
                    with pyedflib.EdfReader(local_path) as r:
                        fs_signal = r.getSampleFrequency(0)
                        n_ch = min(int(MAX_CHANNELS), r.signals_in_file)
                        signals = np.array([r.readSignal(c) for c in range(n_ch)], dtype=np.float32)

                        win_samples = int(fs_signal * WINDOW_SECONDS)
                        step = win_samples // 2  # 50 % overlap between windows

                        for j in range(0, signals.shape[1] - win_samples + 1, step):
                            window = signals[:, j : j + win_samples]
                            start_time_sec, end_time_sec = j / fs_signal, (j + win_samples) / fs_signal
                            # Positive when the window overlaps any annotated interval.
                            is_seiz = any(max(start_time_sec, s_start) < min(end_time_sec, s_end) for s_start, s_end in seiz.get(edf_name, []))
                            # One vector per window: per-channel features averaged over channels.
                            feats = np.mean([extract_single_feature_vector(window[c, :], fs_signal) for c in range(n_ch)], axis=0)
                            all_features.append(feats)
                            all_labels.append(1 if is_seiz else 0)
                except Exception as e:
                    # Best effort: a broken file is reported and skipped.
                    print(f"  [ERRO] Falha ao processar {edf_name}: {e}")
                finally:
                    if os.path.exists(local_path):
                        os.remove(local_path)
            gc.collect()

    print("\n" + "="*40 + "\n" + "FASE DE TREINAMENTO E AVALIAÇÃO".center(40) + "\n" + "="*40)

    X = np.array(all_features)
    y = np.array(all_labels)
    # NOTE(review): the scaler is fit on all windows before the train/test
    # split, so test statistics influence the scaling - confirm intended.
    X_scaled = StandardScaler().fit_transform(X)
    # Same sanitising step the multi-pass and multi-centroid cells apply.
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    X_pos, X_neg = X_scaled[y == 1], X_scaled[y == 0]
    if len(X_pos) == 0:
        raise ValueError("Nenhuma amostra de crise foi encontrada.")

    # Undersample the non-seizure class down to the number of seizure windows.
    X_neg_balanced = resample(X_neg, replace=False, n_samples=len(X_pos), random_state=SEED)
    X_balanced = np.vstack([X_pos, X_neg_balanced])
    # Integer labels, consistent with the other experiment cells.
    y_balanced = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_pos))]).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, stratify=y_balanced, random_state=SEED)

    NUM_FEATURES = X_train.shape[1]

    # encoder and model are separate HDC instances built with the same seed;
    # the model only holds the class prototypes.
    encoder = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    X_train_hd = encoder.encode(X_train)
    X_test_hd = encoder.encode(X_test)

    model = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    model.train_standard(X_train_hd, y_train)
    y_pred_raw = model.predict(X_test_hd)

    # Window rate (windows per second) derived from the last file read.
    win_samples_final = int(fs_signal * WINDOW_SECONDS)
    step_final = win_samples_final // 2
    fs_windows = fs_signal / step_final
    # NOTE(review): train_test_split is called without shuffle=False, so the
    # test windows are presumably not in temporal order when this temporal
    # smoothing/merging runs - confirm this is meaningful.
    y_pred_pp = post_process_predictions(y_pred_raw, window_size=SMOOTHING_WINDOW_SIZE,
                                       merge_gap=MERGE_SEIZURES_THRESHOLD, fs=fs_windows)

    cm = confusion_matrix(y_test, y_pred_pp, labels=[0,1])
    print("\n=== RESULTADO FINAL (HDC Padrão) ===")
    print(f"F1: {f1_score(y_test, y_pred_pp, zero_division=0):.4f} | Precisão: {precision_score(y_test, y_pred_pp, zero_division=0):.4f} | Sensibilidade: {recall_score(y_test, y_pred_pp, zero_division=0):.4f}")
    print("Matriz de Confusão:\n", cm)

except Exception as e:
    # Top-level boundary of the cell: report the failure, still print timing.
    print("\nERRO GERAL:", e)
finally:
    end_time = time.time()
    print(f"\nTempo total de execução: {(end_time-start_time)/60:.2f} min", flush=True)

Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!

Arquivo salvo em: ./temp_chb01_summary.txt
chb01: Processando 7 arquivos com crise...

>>> Lote 1:
  Processando: chb01_03.edf
Arquivo salvo em: ./temp_chb01_03.edf
  Processando: chb01_04.edf
Arquivo salvo em: ./temp_chb01_04.edf
  Processando: chb01_15.edf
Arquivo salvo em: ./temp_chb01_15.edf
  Processando: chb01_16.edf
Arquivo salvo em: ./temp_chb01_16.edf
  Processando: chb01_18.edf
Arquivo salvo em: ./temp_chb01_18.edf

>>> Lote 2:
  Processando: chb01_21.edf
Arquivo salvo em: ./temp_chb01_21.edf
  Processando: chb01_26.edf
Arquivo salvo em: ./temp_chb01_26.edf

Arquivo salvo em: ./temp_chb02_summary.txt
chb02: Processando 3 arquivos com crise...

>>> Lote 1:
  Processando: chb02_16+.edf
Arquivo salvo em: ./temp_chb02_16+.edf
  Processando: chb02_16.edf
Arquivo salvo em: ./temp_chb02_16.edf
  Processando: chb02_19.edf
Arquivo salvo em: ./temp_chb02_19.edf

Arquivo salvo em: ./temp_chb03_summar

HDC - MULTI-PASS EM 5 PACIENTES

In [41]:
# Experiment: HDC with multi-pass retraining on patients chb01..chb05.
# Same data pipeline as the standard-HDC cell; only the training call differs.
PATIENTS = [f"chb{str(i).zfill(2)}" for i in range(1, 6)] 
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

# Hypervector size, quantization levels and maximum number of retraining passes.
DIMENSIONS = 10000; NUM_LEVELS = 100; EPOCHS_MULTIPASS = 12
LEARNING_RATE_MULTIPASS = 0.1

# Window length in seconds, cap on signals read per file, files per
# gc.collect() batch, and the seed shared by resampling, splitting and HDC.
WINDOW_SECONDS = 2; MAX_CHANNELS = 20; BATCH_SIZE = 5; SEED = 42

# Arguments forwarded to post_process_predictions (window_size / merge_gap).
SMOOTHING_WINDOW_SIZE = 5; MERGE_SEIZURES_THRESHOLD = 30

start_time = time.time()
try:
    service = get_drive_service()
    root_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)

    all_features, all_labels = [], []
    # Placeholder; overwritten by every file that is opened successfully.
    fs_signal = -1

    for patient in PATIENTS:
        print(f"\n{'='*15} {patient} {'='*15}")
        patient_id = find_folder_id(service, patient, parent_id=root_id)
        files_map = get_files_from_drive_folder(service, patient_id)

        # The summary file is downloaded, parsed and deleted right away.
        summary_path = f"./temp_{patient}_summary.txt"
        download_file_locally(service, files_map[f"{patient}-summary.txt"], summary_path)
        seiz = parse_summary_file(summary_path)
        os.remove(summary_path)

        # Only recordings with at least one annotated seizure are used.
        edf_with_seiz = sorted([f for f, ivals in seiz.items() if len(ivals) > 0])
        print(f"{patient}: Processando {len(edf_with_seiz)} arquivos com crise...")

        for k in range(0, len(edf_with_seiz), BATCH_SIZE):
            batch = edf_with_seiz[k:k+BATCH_SIZE]
            print(f"\n>>> Lote {k//BATCH_SIZE + 1}:")

            for edf_name in batch:
                local_path = f"./temp_{edf_name}"
                print(f"  Processando: {edf_name}")
                try:
                    download_file_locally(service, files_map[edf_name], local_path)
                    with pyedflib.EdfReader(local_path) as r:
                        # Sampling frequency reported for signal 0 is used for the whole file.
                        fs_signal = r.getSampleFrequency(0)
                        n_ch = min(int(MAX_CHANNELS), r.signals_in_file)
                        signals = np.array([r.readSignal(c) for c in range(n_ch)], dtype=np.float32)
                        # Windows advance by half their length (50 % overlap).
                        win_samples = int(fs_signal * WINDOW_SECONDS); step = win_samples // 2
                        for j in range(0, signals.shape[1] - win_samples + 1, step):
                            window = signals[:, j : j + win_samples]
                            start_time_sec, end_time_sec = j / fs_signal, (j + win_samples) / fs_signal
                            # Positive when the window overlaps any annotated interval.
                            is_seiz = any(max(start_time_sec, s_start) < min(end_time_sec, s_end) for s_start, s_end in seiz.get(edf_name, []))
                            # One vector per window: per-channel features averaged over channels.
                            feats = np.mean([extract_single_feature_vector(window[c, :], fs_signal) for c in range(n_ch)], axis=0)
                            all_features.append(feats); all_labels.append(1 if is_seiz else 0)
                # Best effort: a failing file is reported and skipped.
                except Exception as e: print(f"  [ERRO] Falha ao processar {edf_name}: {e}")
                finally:
                    # The temporary EDF copy is removed whether or not processing succeeded.
                    if os.path.exists(local_path): os.remove(local_path)
            gc.collect()
    
    print("\n" + "="*40 + "\n" + "FASE DE TREINAMENTO E AVALIAÇÃO".center(40) + "\n" + "="*40)
    
    X = np.array(all_features); y = np.array(all_labels)
    # NOTE(review): the scaler is fit on all windows before the train/test
    # split below, so test statistics influence the scaling - confirm intended.
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    X_pos, X_neg = X_scaled[y == 1], X_scaled[y == 0]
    if len(X_pos) == 0: raise ValueError("Nenhuma amostra de crise foi encontrada.")
        
    # Undersample the non-seizure class (without replacement) down to the
    # number of seizure windows.
    X_neg_balanced = resample(X_neg, replace=False, n_samples=len(X_pos), random_state=SEED)
    X_balanced = np.vstack([X_pos, X_neg_balanced])
    
    # Labels follow the stacking order above: positives first, then negatives.
    y_balanced = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_pos))]).astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, stratify=y_balanced, random_state=SEED)

    NUM_FEATURES = X_train.shape[1]
    
    # encoder and model are separate HDC instances built with the same seed;
    # train and test sets are encoded (and therefore quantized) separately.
    encoder = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    X_train_hd = encoder.encode(X_train)
    X_test_hd = encoder.encode(X_test)

    model = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    # initial_training keeps its default (True), so train_standard runs first.
    model.train_multipass(X_train_hd, y_train, epochs=EPOCHS_MULTIPASS, lr=LEARNING_RATE_MULTIPASS, subtract_wrong=True)
    y_pred_raw = model.predict(X_test_hd)

    # Window rate (windows per second) derived from the last file read.
    win_samples_final = int(fs_signal * WINDOW_SECONDS)
    step_final = win_samples_final // 2
    fs_windows = fs_signal / step_final 
    # NOTE(review): train_test_split is called without shuffle=False, so the
    # test windows are presumably not in temporal order when this temporal
    # smoothing/merging runs - confirm this is meaningful.
    y_pred_pp = post_process_predictions(y_pred_raw, window_size=SMOOTHING_WINDOW_SIZE,
                                       merge_gap=MERGE_SEIZURES_THRESHOLD, fs=fs_windows)

    cm = confusion_matrix(y_test, y_pred_pp, labels=[0,1])
    print("\n=== RESULTADO FINAL (HDC Multi-Pass) ===")
    print(f"F1: {f1_score(y_test, y_pred_pp, zero_division=0):.4f} | Precisão: {precision_score(y_test, y_pred_pp, zero_division=0):.4f} | Sensibilidade: {recall_score(y_test, y_pred_pp, zero_division=0):.4f}")
    print("Matriz de Confusão:\n", cm)

# Any error is re-raised unchanged; the finally clause still reports the elapsed time.
except Exception as e:
    raise e
finally:
    end_time = time.time()
    print(f"\nTempo total de execução: {(end_time-start_time)/60:.2f} min", flush=True)

Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!

Arquivo salvo em: ./temp_chb01_summary.txt
chb01: Processando 7 arquivos com crise...

>>> Lote 1:
  Processando: chb01_03.edf
Arquivo salvo em: ./temp_chb01_03.edf
  Processando: chb01_04.edf
Arquivo salvo em: ./temp_chb01_04.edf
  Processando: chb01_15.edf
Arquivo salvo em: ./temp_chb01_15.edf
  Processando: chb01_16.edf
Arquivo salvo em: ./temp_chb01_16.edf
  Processando: chb01_18.edf
Arquivo salvo em: ./temp_chb01_18.edf

>>> Lote 2:
  Processando: chb01_21.edf
Arquivo salvo em: ./temp_chb01_21.edf
  Processando: chb01_26.edf
Arquivo salvo em: ./temp_chb01_26.edf

Arquivo salvo em: ./temp_chb02_summary.txt
chb02: Processando 3 arquivos com crise...

>>> Lote 1:
  Processando: chb02_16+.edf
Arquivo salvo em: ./temp_chb02_16+.edf
  Processando: chb02_16.edf
Arquivo salvo em: ./temp_chb02_16.edf
  Processando: chb02_19.edf
Arquivo salvo em: ./temp_chb02_19.edf

Arquivo salvo em: ./temp_chb03_summar

HDC - MULTI-CENTROID EM 5 PACIENTES

In [42]:
PATIENTS = [f"chb{str(i).zfill(2)}" for i in range(1, 6)]
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

DIMENSIONS = 10000; NUM_LEVELS = 100; THRESHOLD_MULTICENTROID = 0.25

WINDOW_SECONDS = 2; MAX_CHANNELS = 20; BATCH_SIZE = 5; SEED = 42

SMOOTHING_WINDOW_SIZE = 5; MERGE_SEIZURES_THRESHOLD = 30

start_time = time.time()
try:
    service = get_drive_service()
    root_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)

    all_features, all_labels = [], []
    fs_signal = -1

    for patient in PATIENTS:
        print(f"\n{'='*15} {patient} {'='*15}")
        patient_id = find_folder_id(service, patient, parent_id=root_id)
        files_map = get_files_from_drive_folder(service, patient_id)

        summary_path = f"./temp_{patient}_summary.txt"
        download_file_locally(service, files_map[f"{patient}-summary.txt"], summary_path)
        seiz = parse_summary_file(summary_path)
        os.remove(summary_path)

        edf_with_seiz = sorted([f for f, ivals in seiz.items() if len(ivals) > 0])
        print(f"{patient}: Processando {len(edf_with_seiz)} arquivos com crise...")

        for k in range(0, len(edf_with_seiz), BATCH_SIZE):
            batch = edf_with_seiz[k:k+BATCH_SIZE]
            print(f"\n>>> Lote {k//BATCH_SIZE + 1}:")

            for edf_name in batch:
                local_path = f"./temp_{edf_name}"
                print(f"  Processando: {edf_name}")
                try:
                    download_file_locally(service, files_map[edf_name], local_path)
                    with pyedflib.EdfReader(local_path) as r:
                        fs_signal = r.getSampleFrequency(0)
                        n_ch = min(int(MAX_CHANNELS), r.signals_in_file)
                        signals = np.array([r.readSignal(c) for c in range(n_ch)], dtype=np.float32)

                        win_samples = int(fs_signal * WINDOW_SECONDS)
                        step = win_samples // 2
                        
                        for j in range(0, signals.shape[1] - win_samples + 1, step):
                            window = signals[:, j : j + win_samples]
                            start_time_sec, end_time_sec = j / fs_signal, (j + win_samples) / fs_signal
                            is_seiz = any(max(start_time_sec, s_start) < min(end_time_sec, s_end) for s_start, s_end in seiz.get(edf_name, []))
                            feats = np.mean([extract_single_feature_vector(window[c, :], fs_signal) for c in range(n_ch)], axis=0)
                            all_features.append(feats)
                            all_labels.append(1 if is_seiz else 0)
                except Exception as e: print(f"  [ERRO] Falha ao processar {edf_name}: {e}")
                finally:
                    if os.path.exists(local_path): os.remove(local_path)
            gc.collect()
    
    print("\n" + "="*40 + "\n" + "FASE DE TREINAMENTO E AVALIAÇÃO".center(40) + "\n" + "="*40)
    
    X = np.array(all_features); y = np.array(all_labels)
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    X_pos, X_neg = X_scaled[y == 1], X_scaled[y == 0]
    if len(X_pos) == 0: raise ValueError("Nenhuma amostra de crise foi encontrada.")
        
    X_neg_balanced = resample(X_neg, replace=False, n_samples=len(X_pos), random_state=SEED)
    X_balanced = np.vstack([X_pos, X_neg_balanced])
    
    y_balanced = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_pos))]).astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, stratify=y_balanced, random_state=SEED)

    NUM_FEATURES = X_train.shape[1]
    
    encoder = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    X_train_hd = encoder.encode(X_train)
    X_test_hd = encoder.encode(X_test)

    model = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    model.train_multicentroid(X_train_hd, y_train, threshold=THRESHOLD_MULTICENTROID)
    y_pred_raw = model.predict(X_test_hd)

    win_samples_final = int(fs_signal * WINDOW_SECONDS)
    step_final = win_samples_final // 2
    fs_windows = fs_signal / step_final 
    y_pred_pp = post_process_predictions(y_pred_raw, window_size=SMOOTHING_WINDOW_SIZE,
                                       merge_gap=MERGE_SEIZURES_THRESHOLD, fs=fs_windows)

    cm = confusion_matrix(y_test, y_pred_pp, labels=[0,1])
    print("\n=== RESULTADO FINAL (HDC Multi-Centroid) ===")
    print(f"F1: {f1_score(y_test, y_pred_pp, zero_division=0):.4f} | Precisão: {precision_score(y_test, y_pred_pp, zero_division=0):.4f} | Sensibilidade: {recall_score(y_test, y_pred_pp, zero_division=0):.4f}")
    print("Matriz de Confusão:\n", cm)

except Exception as e:
    raise e
finally:
    end_time = time.time()
    print(f"\nTempo total de execução: {(time.time()-start_time)/60:.2f} min", flush=True)

Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!

Arquivo salvo em: ./temp_chb01_summary.txt
chb01: Processando 7 arquivos com crise...

>>> Lote 1:
  Processando: chb01_03.edf
Arquivo salvo em: ./temp_chb01_03.edf
  Processando: chb01_04.edf
Arquivo salvo em: ./temp_chb01_04.edf
  Processando: chb01_15.edf
Arquivo salvo em: ./temp_chb01_15.edf
  Processando: chb01_16.edf
Arquivo salvo em: ./temp_chb01_16.edf
  Processando: chb01_18.edf
Arquivo salvo em: ./temp_chb01_18.edf

>>> Lote 2:
  Processando: chb01_21.edf
Arquivo salvo em: ./temp_chb01_21.edf
  Processando: chb01_26.edf
Arquivo salvo em: ./temp_chb01_26.edf

Arquivo salvo em: ./temp_chb02_summary.txt
chb02: Processando 3 arquivos com crise...

>>> Lote 1:
  Processando: chb02_16+.edf
Arquivo salvo em: ./temp_chb02_16+.edf
  Processando: chb02_16.edf
Arquivo salvo em: ./temp_chb02_16.edf
  Processando: chb02_19.edf
Arquivo salvo em: ./temp_chb02_19.edf

Arquivo salvo em: ./temp_chb03_summar

HDC - MP + MC EM 5 PACIENTES

In [None]:
# Experiment cell: seizure detection with the HDC classifier (multi-centroid
# training followed by multi-pass refinement) over patients chb01..chb05.
#
# Pipeline: download each EDF that contains a seizure -> cut it into
# half-overlapping windows -> average the per-channel feature vectors ->
# standardise -> balance classes -> train/test split -> train HDC -> report.
PATIENTS = [f"chb{str(i).zfill(2)}" for i in range(1, 6)]
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

# HDC hyper-parameters (passed to the HDC class defined elsewhere in the notebook).
DIMENSIONS = 10000
NUM_LEVELS = 100
EPOCHS_MULTIPASS = 12
LEARNING_RATE_MULTIPASS = 0.1
THRESHOLD_MULTICENTROID = 0.25

# Windowing and download settings.
WINDOW_SECONDS = 2
MAX_CHANNELS = 20
BATCH_SIZE = 5
SEED = 42

# Settings forwarded to post_process_predictions.
SMOOTHING_WINDOW_SIZE = 5
MERGE_SEIZURES_THRESHOLD = 30


start_time = time.time()
try:
    service = get_drive_service()
    root_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)

    all_features, all_labels = [], []
    # Sentinel; overwritten with the sampling frequency of the last EDF opened.
    fs_signal = -1

    for patient in PATIENTS:
        print(f"\n{'='*15} {patient} {'='*15}")
        patient_id = find_folder_id(service, patient, parent_id=root_id)
        files_map = get_files_from_drive_folder(service, patient_id)

        summary_path = f"./temp_{patient}_summary.txt"
        download_file_locally(service, files_map[f"{patient}-summary.txt"], summary_path)
        seiz = parse_summary_file(summary_path)
        os.remove(summary_path)

        # Only recordings that contain at least one annotated seizure are used.
        edf_with_seiz = sorted(name for name, ivals in seiz.items() if len(ivals) > 0)
        print(f"{patient}: Processando {len(edf_with_seiz)} arquivos com crise...")

        for k in range(0, len(edf_with_seiz), BATCH_SIZE):
            batch = edf_with_seiz[k:k + BATCH_SIZE]
            print(f"\n>>> Lote {k//BATCH_SIZE + 1}:")

            for edf_name in batch:
                local_path = f"./temp_{edf_name}"
                print(f"  Processando: {edf_name}")
                try:
                    download_file_locally(service, files_map[edf_name], local_path)
                    with pyedflib.EdfReader(local_path) as r:
                        fs_signal = r.getSampleFrequency(0)
                        n_ch = min(int(MAX_CHANNELS), r.signals_in_file)
                        signals = np.array([r.readSignal(c) for c in range(n_ch)], dtype=np.float32)
                    # The reader is closed here: everything needed is already in memory.

                    win_samples = int(fs_signal * WINDOW_SECONDS)
                    step = win_samples // 2  # consecutive windows overlap by half
                    intervals = seiz.get(edf_name, [])  # loop-invariant, looked up once

                    for j in range(0, signals.shape[1] - win_samples + 1, step):
                        window = signals[:, j:j + win_samples]
                        start_time_sec = j / fs_signal
                        end_time_sec = (j + win_samples) / fs_signal
                        # Positive when the window overlaps any seizure interval.
                        is_seiz = any(
                            max(start_time_sec, s_start) < min(end_time_sec, s_end)
                            for s_start, s_end in intervals
                        )
                        # One feature vector per window: mean over the channels.
                        feats = np.mean(
                            [extract_single_feature_vector(window[c, :], fs_signal) for c in range(n_ch)],
                            axis=0,
                        )
                        all_features.append(feats)
                        all_labels.append(1 if is_seiz else 0)
                except Exception as e:
                    # Best effort: a failing recording is reported and skipped.
                    print(f"  [ERRO] Falha ao processar {edf_name}: {e}")
                finally:
                    if os.path.exists(local_path):
                        os.remove(local_path)
            gc.collect()

    print("\n" + "="*40 + "\n" + "FASE DE TREINAMENTO E AVALIAÇÃO".center(40) + "\n" + "="*40)

    X = np.array(all_features)
    y = np.array(all_labels)

    # Checked before scaling so that an empty/seizure-free dataset fails with
    # this message instead of an unrelated error raised inside scikit-learn.
    if not np.any(y == 1):
        raise ValueError("Nenhuma amostra de crise foi encontrada.")

    # StandardScaler tolerates NaN but rejects infinities, so +/-inf is mapped
    # to NaN first; the NaNs that survive scaling are zeroed afterwards.
    X = np.where(np.isfinite(X), X, np.nan)
    # NOTE(review): the scaler is fitted on all windows before the train/test
    # split, so test-set statistics leak into training - confirm this is intended.
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    X_pos, X_neg = X_scaled[y == 1], X_scaled[y == 0]
    if len(X_neg) == 0:
        raise ValueError("Nenhuma amostra sem crise foi encontrada.")

    # Undersample to the smaller class; sampling without replacement cannot
    # draw more rows than the class holds.
    n_per_class = min(len(X_pos), len(X_neg))
    if len(X_pos) > n_per_class:
        X_pos = resample(X_pos, replace=False, n_samples=n_per_class, random_state=SEED)
    X_neg_balanced = resample(X_neg, replace=False, n_samples=n_per_class, random_state=SEED)
    X_balanced = np.vstack([X_pos, X_neg_balanced])
    y_balanced = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_neg_balanced))]).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced, y_balanced, test_size=0.3, stratify=y_balanced, random_state=SEED
    )

    NUM_FEATURES = X_train.shape[1]

    encoder = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    X_train_hd = encoder.encode(X_train)
    X_test_hd = encoder.encode(X_test)

    model = HDC(DIMENSIONS, NUM_FEATURES, NUM_LEVELS, seed=SEED)
    model.train_multicentroid(X_train_hd, y_train, threshold=THRESHOLD_MULTICENTROID)
    model.train_multipass(
        X_train_hd, y_train,
        epochs=EPOCHS_MULTIPASS, lr=LEARNING_RATE_MULTIPASS,
        subtract_wrong=True, initial_training=False,
    )
    y_pred_raw = model.predict(X_test_hd)

    # Rate at which windows are produced (one window every `step` samples),
    # derived from the sampling frequency of the last EDF that was read.
    win_samples_final = int(fs_signal * WINDOW_SECONDS)
    step_final = win_samples_final // 2
    fs_windows = fs_signal / step_final
    # NOTE(review): X_test comes from a shuffled split, so neighbouring
    # predictions are presumably not neighbouring in time - confirm that
    # post-processing over this order is meaningful.
    y_pred_pp = post_process_predictions(y_pred_raw, window_size=SMOOTHING_WINDOW_SIZE,
                                         merge_gap=MERGE_SEIZURES_THRESHOLD, fs=fs_windows)

    cm = confusion_matrix(y_test, y_pred_pp, labels=[0,1])
    print("\n=== RESULTADO FINAL (HDC MC + MP) ===")
    print(f"F1: {f1_score(y_test, y_pred_pp, zero_division=0):.4f} | Precisão: {precision_score(y_test, y_pred_pp, zero_division=0):.4f} | Sensibilidade: {recall_score(y_test, y_pred_pp, zero_division=0):.4f}")
    print("Matriz de Confusão:\n", cm)

finally:
    # Always report the elapsed time; any exception still propagates unchanged.
    print(f"\nTempo total de execução: {(time.time()-start_time)/60:.2f} min", flush=True)

Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!

Arquivo salvo em: ./temp_chb01_summary.txt
chb01: Processando 7 arquivos com crise...

>>> Lote 1:
  Processando: chb01_03.edf
Arquivo salvo em: ./temp_chb01_03.edf
  Processando: chb01_04.edf
Arquivo salvo em: ./temp_chb01_04.edf
  Processando: chb01_15.edf
Arquivo salvo em: ./temp_chb01_15.edf
  Processando: chb01_16.edf
Arquivo salvo em: ./temp_chb01_16.edf
  Processando: chb01_18.edf
Arquivo salvo em: ./temp_chb01_18.edf

>>> Lote 2:
  Processando: chb01_21.edf
Arquivo salvo em: ./temp_chb01_21.edf
  Processando: chb01_26.edf
Arquivo salvo em: ./temp_chb01_26.edf

Arquivo salvo em: ./temp_chb02_summary.txt
chb02: Processando 3 arquivos com crise...

>>> Lote 1:
  Processando: chb02_16+.edf
Arquivo salvo em: ./temp_chb02_16+.edf
  Processando: chb02_16.edf
Arquivo salvo em: ./temp_chb02_16.edf
  Processando: chb02_19.edf
Arquivo salvo em: ./temp_chb02_19.edf

Arquivo salvo em: ./temp_chb03_summar

SVM, RANDOM FOREST, LOGISTIC REGRESSION, KNN, DECISION TREE, GRADIENT BOOSTING E NAIVE BAYES EM 5 PACIENTES

In [44]:
# Experiment cell: seizure detection with classical scikit-learn classifiers
# (SVM, Random Forest, Logistic Regression, KNN, Decision Tree, Gradient
# Boosting, Naive Bayes) over patients chb01..chb05.
#
# Pipeline: download each EDF that contains a seizure -> cut it into
# half-overlapping windows -> average the per-channel feature vectors ->
# standardise -> balance classes -> train/test split -> fit and report each model.
PATIENTS = [f"chb{str(i).zfill(2)}" for i in range(1, 6)]
DRIVE_PATH_COMPONENTS = ['TCC EPILEPSIA DATA', 'chb-mit-scalp-eeg-database-1.0.0']

# Windowing and download settings.
WINDOW_SECONDS = 2
MAX_CHANNELS = 20
BATCH_SIZE = 5
SEED = 42

# Settings forwarded to post_process_predictions.
SMOOTHING_WINDOW_SIZE = 5
MERGE_SEIZURES_THRESHOLD = 30

start_time = time.time()
try:
    service = get_drive_service()
    root_id = find_folder_id_by_path(service, DRIVE_PATH_COMPONENTS)

    all_features, all_labels = [], []
    # Sentinel; overwritten with the sampling frequency of the last EDF opened.
    fs_signal = -1

    for patient in PATIENTS:
        print(f"\n{'='*15} {patient} {'='*15}")
        patient_id = find_folder_id(service, patient, parent_id=root_id)
        files_map = get_files_from_drive_folder(service, patient_id)

        summary_path = f"./temp_{patient}_summary.txt"
        download_file_locally(service, files_map[f"{patient}-summary.txt"], summary_path)
        seiz = parse_summary_file(summary_path)
        os.remove(summary_path)

        # Only recordings that contain at least one annotated seizure are used.
        edf_with_seiz = sorted(name for name, ivals in seiz.items() if len(ivals) > 0)
        print(f"{patient}: Processando {len(edf_with_seiz)} arquivos com crise...")

        for k in range(0, len(edf_with_seiz), BATCH_SIZE):
            batch = edf_with_seiz[k:k + BATCH_SIZE]
            print(f"\n>>> Lote {k//BATCH_SIZE + 1}:")

            for edf_name in batch:
                local_path = f"./temp_{edf_name}"
                print(f"  Processando: {edf_name}")
                try:
                    download_file_locally(service, files_map[edf_name], local_path)
                    with pyedflib.EdfReader(local_path) as r:
                        fs_signal = r.getSampleFrequency(0)
                        n_ch = min(int(MAX_CHANNELS), r.signals_in_file)
                        signals = np.array([r.readSignal(c) for c in range(n_ch)], dtype=np.float32)
                    # The reader is closed here: everything needed is already in memory.

                    win_samples = int(fs_signal * WINDOW_SECONDS)
                    step = win_samples // 2  # consecutive windows overlap by half
                    intervals = seiz.get(edf_name, [])  # loop-invariant, looked up once

                    for j in range(0, signals.shape[1] - win_samples + 1, step):
                        window = signals[:, j:j + win_samples]
                        start_time_sec = j / fs_signal
                        end_time_sec = (j + win_samples) / fs_signal
                        # Positive when the window overlaps any seizure interval.
                        is_seiz = any(
                            max(start_time_sec, s_start) < min(end_time_sec, s_end)
                            for s_start, s_end in intervals
                        )
                        # One feature vector per window: mean over the channels.
                        feats = np.mean(
                            [extract_single_feature_vector(window[c, :], fs_signal) for c in range(n_ch)],
                            axis=0,
                        )
                        all_features.append(feats)
                        all_labels.append(1 if is_seiz else 0)
                except Exception as e:
                    # Best effort: a failing recording is reported and skipped.
                    print(f"  [ERRO] Falha ao processar {edf_name}: {e}")
                finally:
                    if os.path.exists(local_path):
                        os.remove(local_path)
            gc.collect()

    print("\n" + "="*40 + "\n" + "FASE DE TREINAMENTO E AVALIAÇÃO".center(40) + "\n" + "="*40)

    X = np.array(all_features)
    y = np.array(all_labels)

    # Checked before scaling so that an empty/seizure-free dataset fails with
    # this message instead of an unrelated error raised inside scikit-learn.
    if not np.any(y == 1):
        raise ValueError("Nenhuma amostra de crise foi encontrada nos dados processados.")

    # StandardScaler tolerates NaN but rejects infinities, so +/-inf is mapped
    # to NaN first; the NaNs that survive scaling are zeroed afterwards.
    X = np.where(np.isfinite(X), X, np.nan)
    # NOTE(review): the scaler is fitted on all windows before the train/test
    # split, so test-set statistics leak into training - confirm this is intended.
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    X_pos, X_neg = X_scaled[y == 1], X_scaled[y == 0]
    if len(X_neg) == 0:
        raise ValueError("Nenhuma amostra sem crise foi encontrada nos dados processados.")

    # Undersample to the smaller class; sampling without replacement cannot
    # draw more rows than the class holds.
    n_per_class = min(len(X_pos), len(X_neg))
    if len(X_pos) > n_per_class:
        X_pos = resample(X_pos, replace=False, n_samples=n_per_class, random_state=SEED)
    X_neg_balanced = resample(X_neg, replace=False, n_samples=n_per_class, random_state=SEED)
    X_balanced = np.vstack([X_pos, X_neg_balanced])
    y_balanced = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_neg_balanced))]).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced, y_balanced, test_size=0.3, stratify=y_balanced, random_state=SEED
    )

    models = {
        "SVM": SVC(kernel="rbf", C=1.0, gamma="scale", random_state=SEED),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1),
        "LogisticRegression": LogisticRegression(random_state=SEED, max_iter=1000, n_jobs=-1),
        "KNN": KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
        "DecisionTree": DecisionTreeClassifier(random_state=SEED),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=SEED),
        "NaiveBayes": GaussianNB(),
    }

    # Rate at which windows are produced (one window every `step` samples),
    # derived from the sampling frequency of the last EDF that was read.
    win_samples_final = int(fs_signal * WINDOW_SECONDS)
    step_final = win_samples_final // 2
    fs_windows = fs_signal / step_final
    print(f"\nFrequência das janelas calculada: {fs_windows:.2f} Hz.")

    for name, clf in models.items():
        print(f"\n--- Treinando e Avaliando: {name} ---")
        clf.fit(X_train, y_train)
        y_pred_raw = clf.predict(X_test)

        # NOTE(review): X_test comes from a shuffled split, so neighbouring
        # predictions are presumably not neighbouring in time - confirm that
        # post-processing over this order is meaningful.
        y_pred_pp = post_process_predictions(y_pred_raw, window_size=SMOOTHING_WINDOW_SIZE,
                                             merge_gap=MERGE_SEIZURES_THRESHOLD, fs=fs_windows)

        cm = confusion_matrix(y_test, y_pred_pp, labels=[0,1])
        print(f"F1: {f1_score(y_test, y_pred_pp, zero_division=0):.4f} | Precisão: {precision_score(y_test, y_pred_pp, zero_division=0):.4f} | Sensibilidade: {recall_score(y_test, y_pred_pp, zero_division=0):.4f}")
        print("Matriz de Confusão:\n", cm)

finally:
    # Always report the elapsed time; any exception still propagates unchanged.
    print(f"\nTempo total de execução: {(time.time()-start_time)/60:.2f} min", flush=True)

Serviço do Google Drive conectado com sucesso.
Caminho do dataset encontrado no Drive!

Arquivo salvo em: ./temp_chb01_summary.txt
chb01: Processando 7 arquivos com crise...

>>> Lote 1:
  Processando: chb01_03.edf
Arquivo salvo em: ./temp_chb01_03.edf
  Processando: chb01_04.edf
Arquivo salvo em: ./temp_chb01_04.edf
  Processando: chb01_15.edf
Arquivo salvo em: ./temp_chb01_15.edf
  Processando: chb01_16.edf
Arquivo salvo em: ./temp_chb01_16.edf
  Processando: chb01_18.edf
Arquivo salvo em: ./temp_chb01_18.edf

>>> Lote 2:
  Processando: chb01_21.edf
Arquivo salvo em: ./temp_chb01_21.edf
  Processando: chb01_26.edf
Arquivo salvo em: ./temp_chb01_26.edf

Arquivo salvo em: ./temp_chb02_summary.txt
chb02: Processando 3 arquivos com crise...

>>> Lote 1:
  Processando: chb02_16+.edf
Arquivo salvo em: ./temp_chb02_16+.edf
  Processando: chb02_16.edf
Arquivo salvo em: ./temp_chb02_16.edf
  Processando: chb02_19.edf
Arquivo salvo em: ./temp_chb02_19.edf

Arquivo salvo em: ./temp_chb03_summar