In [5]:
import os

import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

In [6]:
class EventDataset(Dataset):
    """Map-style dataset yielding one ``(marker, features)`` pair per event.

    The Parquet file is read once and split into one frame per ``event_id``
    (in order of first appearance, so indices are stable across runs). Each
    item is:

    * ``marker``: 0-dim float32 tensor, 1.0 when the event's first ``marker``
      value equals ``'Stimulus/P'`` and 0.0 otherwise.
    * ``features``: float32 tensor with one row per row of the event and one
      column per feature column (every column not listed in
      ``_NON_FEATURE_COLUMNS``, in file order).

    Args:
        path: Location of the Parquet file, passed to ``pl.read_parquet``.
    """

    # Identifier / label / bookkeeping columns that are not model inputs.
    _NON_FEATURE_COLUMNS = ('event_id', 'time', 'marker', '__null_dask_index__')
    # Marker value encoded as the positive class.
    _POSITIVE_MARKER = 'Stimulus/P'

    def __init__(self, path):
        dataframe = pl.read_parquet(path)
        print(dataframe.columns)

        # Select features by exclusion so a missing bookkeeping column
        # (e.g. '__null_dask_index__') does not raise.
        self.__feature_columns = [
            name for name in dataframe.columns
            if name not in self._NON_FEATURE_COLUMNS
        ]

        # Partition once up front instead of filtering the whole frame on
        # every __getitem__ call; maintain_order keeps indexing deterministic.
        partitions = dataframe.partition_by('event_id', maintain_order=True)
        self.__events = [frame for frame in partitions if frame.height > 0]

        # Cache labels so class statistics never need the feature tensors.
        self.__labels = [
            frame['marker'][0] == self._POSITIVE_MARKER
            for frame in self.__events
        ]
        self.__len = len(self.__events)

    def __len__(self):
        """Return the number of distinct events."""
        return self.__len

    def __getitem__(self, idx):
        """Return ``(marker, event_tensor)`` for the event at position ``idx``."""
        event_rows = self.__events[idx]
        event_tensor = torch.tensor(
            event_rows.select(self.__feature_columns).to_numpy(),
            dtype=torch.float32,
        )
        marker = torch.tensor(self.__labels[idx], dtype=torch.float32)
        return marker, event_tensor

    def class_balance(self):
        """Return ``(positives, negatives)`` event counts from cached labels."""
        positives = sum(self.__labels)
        negatives = self.__len - positives
        return positives, negatives

In [7]:
# Location of the combined Parquet file holding every event.
# NOTE: despite its name, this variable points at a single file, not a directory.
# Set the BRAINLABYRINTH_PARQUET environment variable to run the notebook on
# another machine without editing it; the original path remains the default.
parquet_directory = os.environ.get(
    'BRAINLABYRINTH_PARQUET',
    '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet',
)

def collate_fn(batch, max_length=2000):
    """Collate ``(marker, features)`` pairs into fixed-length batch tensors.

    Every feature tensor is truncated to ``max_length`` rows or zero-padded at
    the end up to ``max_length`` rows, so all batches share one shape.

    Args:
        batch: Non-empty sequence of ``(marker, tensor)`` pairs. Each
            ``tensor`` is 2-D, indexed ``[row, feature]``; the feature count is
            taken from the first tensor and assumed equal for all of them.
            ``marker`` may be a 0-dim tensor or a plain number.
        max_length: Number of rows every sequence is padded/truncated to.
            Defaults to 2000; use ``functools.partial`` to pass another value
            through a ``DataLoader``.

    Returns:
        ``(markers, padded_tensors)`` where ``markers`` is float32 of shape
        ``(batch,)`` and ``padded_tensors`` is float32 of shape
        ``(batch, max_length, num_features)``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    markers, tensors = zip(*batch)
    num_features = tensors[0].size(1)

    # Zero-initialised, so any rows past a sequence's end are already padding.
    padded_tensors = torch.zeros(
        (len(tensors), max_length, num_features), dtype=torch.float32
    )
    for i, tensor in enumerate(tensors):
        length = min(tensor.size(0), max_length)
        padded_tensors[i, :length] = tensor[:length]

    # Stack explicitly; works for 0-dim tensors and plain Python numbers alike.
    markers = torch.stack(
        [torch.as_tensor(marker, dtype=torch.float32) for marker in markers]
    )

    return markers, padded_tensors

# Load every event from the configured Parquet file. The bare trailing
# expression makes the notebook display the (positives, negatives) counts.
dataset = EventDataset(path=parquet_directory)
dataset.class_balance()

['event_id', 'marker', 'time', 'Fp1', 'Fpz', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC5', 'FC1', 'FC2', 'FC6', 'M1', 'T7', 'C3', 'Cz', 'C4', 'T8', 'M2', 'CP5', 'CP1', 'CP2', 'CP6', 'P7', 'P3', 'Pz', 'P4', 'P8', 'POz', 'O1', 'O2', 'EOG', 'AF7', 'AF3', 'AF4', 'AF8', 'F5', 'F1', 'F2', 'F6', 'FC3', 'FCz', 'FC4', 'C5', 'C1', 'C2', 'C6', 'CP3', 'CP4', 'P5', 'P1', 'P2', 'P6', 'PO5', 'PO3', 'PO4', 'PO6', 'FT7', 'FT8', 'TP7', 'TP8', 'PO7', 'PO8', 'Oz', '__null_dask_index__']


Counting class values..:   0%|          | 0/2914 [00:00<?, ?it/s]

(896, 2018)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class RNNClassifier(nn.Module):
    """LSTM sequence classifier with a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output logits per sequence.

    ``forward`` takes a batch-first tensor ``(batch, seq_len, input_size)`` and
    returns raw logits of shape ``(batch, output_size)``; no sigmoid is applied.

    NOTE(review): the head reads ``out[:, -1, :]``. If inputs are zero-padded
    at the end, that step reflects the padding rather than the last real
    sample — confirm this is intended for variable-length events.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # With no state passed, nn.LSTM starts from zero h0/c0. This avoids
        # hand-built float32 CPU tensors that break non-float32 models.
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

# Hyperparameters
input_size = 64  # Number of features
hidden_size = 256  # Number of features in the hidden state
num_layers = 1  # Number of recurrent layers
output_size = 1  # Number of output classes (binary classification)
learning_rate = 3e-3
num_epochs = 10_000
batch_size = 128
split_seed = 42  # Fixes the train/val/test partition across re-runs
checkpoint_path = 'best_model.pt'  # Where the best-validation weights are written

# Check if a GPU is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer
model = RNNClassifier(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Load the dataset
dataset = EventDataset(parquet_directory)

# Split the dataset into training, validation, and test sets (70 / 15 / 15).
# A seeded generator keeps the held-out test set identical on every re-run;
# without it each execution reshuffles and leaks test events into training.
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# The subsets keep their own reference to the data; drop the extra name.
del dataset

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize TensorBoard writer
writer = SummaryWriter()

# Initialize variables to track the best validation loss
best_val_loss = float('inf')

print("Training start")
# Training loop
for epoch in tqdm(range(num_epochs)):
    model.train()
    for i, (markers, features) in enumerate(train_loader):
        features = features.to(device)  # Move features to the device
        markers = markers.unsqueeze(-1).to(device)  # (batch,) -> (batch, 1) to match the logits

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, markers)
        loss.backward()
        optimizer.step()

        # Log the loss to TensorBoard (global step = batches seen so far)
        writer.add_scalar('Loss/train', loss.item(), epoch * len(train_loader) + i)

    # Validation step
    model.eval()
    val_loss = 0.0
    all_markers = []
    all_predictions = []
    with torch.no_grad():
        for markers, features in val_loader:
            features = features.to(device)
            markers = markers.unsqueeze(-1).to(device)

            outputs = model(features)
            loss = criterion(outputs, markers)
            # Weight each batch by its size so a short final batch does not
            # skew the epoch average.
            val_loss += loss.item() * markers.size(0)

            # Collect markers and predicted probabilities for metrics calculation
            all_markers.extend(markers.cpu().numpy().flatten())
            all_predictions.extend(torch.sigmoid(outputs).cpu().numpy().flatten())

    val_loss /= len(val_dataset)
    writer.add_scalar('Loss/val', val_loss, epoch)

    # Calculate validation metrics from a single thresholding pass at 0.5
    binary_predictions = [1 if p > 0.5 else 0 for p in all_predictions]
    accuracy = accuracy_score(all_markers, binary_predictions)
    precision = precision_score(all_markers, binary_predictions, zero_division=0)
    recall = recall_score(all_markers, binary_predictions, zero_division=0)
    f1 = f1_score(all_markers, binary_predictions, zero_division=0)

    # Log validation metrics to TensorBoard
    writer.add_scalar('Metrics/val_accuracy', accuracy, epoch)
    writer.add_scalar('Metrics/val_precision', precision, epoch)
    writer.add_scalar('Metrics/val_recall', recall, epoch)
    writer.add_scalar('Metrics/val_f1', f1, epoch)

    # Save the model if the current validation loss is the best seen so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), checkpoint_path)

['event_id', 'marker', 'time', 'Fp1', 'Fpz', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC5', 'FC1', 'FC2', 'FC6', 'M1', 'T7', 'C3', 'Cz', 'C4', 'T8', 'M2', 'CP5', 'CP1', 'CP2', 'CP6', 'P7', 'P3', 'Pz', 'P4', 'P8', 'POz', 'O1', 'O2', 'EOG', 'AF7', 'AF3', 'AF4', 'AF8', 'F5', 'F1', 'F2', 'F6', 'FC3', 'FCz', 'FC4', 'C5', 'C1', 'C2', 'C6', 'CP3', 'CP4', 'P5', 'P1', 'P2', 'P6', 'PO5', 'PO3', 'PO4', 'PO6', 'FT7', 'FT8', 'TP7', 'TP8', 'PO7', 'PO8', 'Oz', '__null_dask_index__']
Training start


TypeError: 'float' object cannot be interpreted as an integer

In [18]:
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming model, criterion, test_loader, device, writer, and epoch are already defined
# (they come from the training cell; `epoch` is the last epoch index reached
# and is used as the TensorBoard step for the test scalars).

model.eval()
test_loss = 0.0
num_test_samples = 0
all_test_markers = []
all_test_predictions = []
with torch.no_grad():
    # Wrap the loader itself so tqdm knows the total number of batches.
    for markers, features in tqdm(test_loader):
        features = features.to(device)
        markers = markers.unsqueeze(-1).to(device)  # (batch,) -> (batch, 1)

        outputs = model(features)
        loss = criterion(outputs, markers)
        # Weight by batch size so a short final batch does not skew the mean.
        test_loss += loss.item() * markers.size(0)
        num_test_samples += markers.size(0)

        # Collect markers and predicted probabilities for metrics calculation
        all_test_markers.extend(markers.cpu().numpy().flatten())
        all_test_predictions.extend(torch.sigmoid(outputs).cpu().numpy().flatten())

test_loss /= num_test_samples
writer.add_scalar('Loss/test', test_loss, epoch)

# Calculate test metrics from a single thresholding pass at 0.5;
# ROC AUC is threshold-free and uses the raw probabilities.
test_binary_predictions = [1 if p > 0.5 else 0 for p in all_test_predictions]
test_accuracy = accuracy_score(all_test_markers, test_binary_predictions)
test_precision = precision_score(all_test_markers, test_binary_predictions, zero_division=0)
test_recall = recall_score(all_test_markers, test_binary_predictions, zero_division=0)
test_f1 = f1_score(all_test_markers, test_binary_predictions, zero_division=0)
test_roc_auc = roc_auc_score(all_test_markers, all_test_predictions)

# Log test metrics to TensorBoard
writer.add_scalar('Metrics/test_accuracy', test_accuracy, epoch)
writer.add_scalar('Metrics/test_precision', test_precision, epoch)
writer.add_scalar('Metrics/test_recall', test_recall, epoch)
writer.add_scalar('Metrics/test_f1', test_f1, epoch)
writer.add_scalar('Metrics/test_roc_auc', test_roc_auc, epoch)

# Close the TensorBoard writer
writer.close()

28it [00:01, 26.14it/s]


In [20]:
# Report the held-out test metrics, one `name=value` pair per line, framed by
# a blank line above and below. The `=` specifier prints the expression text
# followed by the repr of its value.
metric_lines = [
    f"{test_accuracy=}",
    f"{test_precision=}",
    f"{test_recall=}",
    f"{test_f1=}",
    f"{test_roc_auc=}",
]
print("\n" + "\n".join(metric_lines) + "\n")


test_accuracy=0.58675799086758
test_precision=0.3173076923076923
test_recall=0.23076923076923078
test_f1=0.26720647773279355
test_roc_auc=np.float64(0.5285053929121726)



In [14]:
from sklearn.metrics import f1_score
import numpy as np
# Sweep decision thresholds in [0.0, 1.0) with step 0.01 and keep the one
# that gives the highest F1 score.
# NOTE(review): the threshold is tuned on the test predictions, so the
# reported best F1 is optimistic; tuning on validation predictions and then
# applying the chosen threshold to the test set would avoid this.
best_threshold = 0.0
best_f1 = 0.0
thresholds = np.arange(0.0, 1.0, 0.01)

# Convert once to an array so the comparison below is an explicit element-wise
# operation rather than relying on NumPy's reflected comparison with a list.
test_scores = np.asarray(all_test_predictions)

for threshold in tqdm(thresholds):
    binary_predictions = (test_scores > threshold).astype(int)
    current_f1 = f1_score(all_test_markers, binary_predictions)

    # Strict '>' keeps the lowest threshold among ties.
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

print(f"{best_threshold=}")
print(f"{best_f1=}")

  0%|          | 0/100 [00:00<?, ?it/s]

best_threshold=np.float64(0.0)
best_f1=0.49225473321858865


In [17]:
from sklearn.metrics import recall_score
import numpy as np
# Sweep decision thresholds in [0.1, 1.0) with step 0.01 and keep the one
# that gives the highest recall; ties keep the lowest threshold.
best_threshold, best_recall = 0.1, 0.0
thresholds = np.arange(0.1, 1.0, 0.01)

# Materialise the scores as an array once instead of on every comparison.
test_scores = np.asarray(all_test_predictions)

for threshold in tqdm(thresholds):
    binary_predictions = (test_scores > threshold).astype(int)
    current_recall = recall_score(all_test_markers, binary_predictions)

    if current_recall <= best_recall:
        continue
    best_threshold, best_recall = threshold, current_recall

print(f"{best_threshold=}")
print(f"{best_recall=}")

  0%|          | 0/90 [00:00<?, ?it/s]

best_threshold=np.float64(0.1)
best_recall=0.48951048951048953
