In [1]:
import dask.dataframe as dd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
class EventDataset(Dataset):
    """Per-event dataset backed by a Dask dataframe read from Parquet.

    Every distinct ``event_id`` in the data is one sample. The ids are
    partitioned 60/20/20 into train/val/test using a fixed ``random_state``,
    and the instance exposes only the partition selected by ``split``.

    Args:
        path: Location handed to ``dd.read_parquet``.
        split: Partition to expose: ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        ValueError: If ``split`` is not one of the three supported names.
    """

    def __init__(self, path, split='train'):
        # Validate first so a typo does not cost a full Parquet scan.
        if split not in ('train', 'val', 'test'):
            raise ValueError("split must be 'train', 'val', or 'test'")

        self.__dataframe = dd.read_parquet(path)

        # Keep the ids as a plain NumPy array. train_test_split on a pandas
        # Series returns Series that retain their original (shuffled) labels,
        # and `series[idx]` is then a *label* lookup rather than a positional
        # one, which raises KeyError or silently returns the wrong event.
        event_ids = self.__dataframe['event_id'].unique().compute().to_numpy()

        # Split the dataset
        train_ids, test_ids = train_test_split(event_ids, test_size=0.2, random_state=69)
        train_ids, val_ids = train_test_split(train_ids, test_size=0.25, random_state=69)  # 0.25 x 0.8 = 0.2

        self.__event_ids = {'train': train_ids, 'val': val_ids, 'test': test_ids}[split]
        self.__len = len(self.__event_ids)

    def __len__(self):
        """Return the number of events in the selected split."""
        return self.__len

    def __getitem__(self, idx):
        """Return ``(marker, event_tensor)`` for the event at position ``idx``.

        ``event_tensor`` is a float32 tensor built from every column except
        ``event_id``, ``time`` and ``marker``, one row per dataframe row of
        the event. ``marker`` is a float32 scalar tensor: 1.0 when the
        event's first row has marker ``'Stimulus/P'``, otherwise 0.0.
        """
        event_id = self.__event_ids[idx]
        # NOTE: this filters the whole lazy dataframe and materialises the
        # result on every access, so each item costs one pass over the data.
        event_rows = self.__dataframe[self.__dataframe['event_id'] == event_id].compute()
        # Convert the event rows to a tensor
        features = event_rows.drop(columns=['event_id', 'time', 'marker'])
        event_tensor = torch.tensor(features.values, dtype=torch.float32)
        marker = torch.tensor(event_rows.marker.values[0] == 'Stimulus/P', dtype=torch.float32)
        return marker, event_tensor

    def class_balance(self):
        """Count the events of each class in the selected split.

        Loads every event through ``__getitem__``, so it is as expensive as
        one full pass over the split.

        Returns:
            Tuple ``(positives, negatives)``.
        """
        positives = 0
        for idx in tqdm(range(self.__len), desc="Counting class values.."):
            marker, _ = self[idx]
            if marker != 0.:
                positives += 1
        return positives, self.__len - positives

In [None]:
# Location of the combined Parquet data that backs every split
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet'

# Build one dataset per split, in train / val / test order
train_dataset, val_dataset, test_dataset = (
    EventDataset(parquet_directory, split=split_name)
    for split_name in ('train', 'val', 'test')
)

# Report how many events ended up in each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

def collate_fn(batch, max_length=2000):
    """Collate ``(marker, tensor)`` samples into fixed-length batch tensors.

    Each sample tensor is 2-D, ``(time_steps, num_features)``. Sequences
    shorter than ``max_length`` are zero-padded at the end; longer ones are
    truncated to their first ``max_length`` rows.

    Args:
        batch: Non-empty sequence of ``(marker, tensor)`` pairs. All tensors
            must share the same number of features (taken from the first).
        max_length: Number of time steps every sequence is padded or
            truncated to. Defaults to 2000.

    Returns:
        Tuple ``(markers, padded_tensors)``: a float32 tensor of shape
        ``(batch,)`` and a float32 tensor of shape
        ``(batch, max_length, num_features)``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    markers, tensors = zip(*batch)
    num_features = tensors[0].size(1)

    # Pre-allocate zeros so positions past each sequence's end stay padded.
    padded_tensors = torch.zeros((len(tensors), max_length, num_features), dtype=torch.float32)

    for i, tensor in enumerate(tensors):
        # Clamping the length covers both the pad and the truncate case.
        length = min(tensor.size(0), max_length)
        padded_tensors[i, :length, :] = tensor[:length]

    # float() accepts both 0-d tensors and plain Python numbers.
    markers = torch.tensor([float(m) for m in markers], dtype=torch.float32)

    return markers, padded_tensors

# One loader per split; only the training loader reshuffles between epochs
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
class RNNClassifier(nn.Module):
    """LSTM sequence classifier that scores the final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output logits per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw logits of shape ``(batch, output_size)``.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, input_size)``.
        """
        # nn.LSTM starts from zero hidden/cell states when none are passed,
        # created to match the input's device and dtype. That avoids the
        # CPU allocation + copy of hand-built states and keeps the module
        # usable after `.double()` / `.half()`.
        out, _ = self.lstm(x)
        # NOTE(review): only the last time step is classified; if callers
        # zero-pad sequences at the end, this step may be padding. Confirm.
        return self.fc(out[:, -1, :])

# Hyperparameters
input_size = 64  # Number of features per time step; must match the dataset's feature columns
hidden_size = 128  # Number of features in the hidden state
num_layers = 1  # Number of recurrent layers
output_size = 1  # One logit per sequence (binary classification)
learning_rate = 3e-3
num_epochs = 10
batch_size = 4

# Check if a GPU is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer
model = RNNClassifier(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training split (the EventDataset default) and the DataLoader that feeds
# the loop below. `dataloader` was previously used without being defined.
dataset = EventDataset(parquet_directory)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# class_balance() walks the whole split, so show its result instead of
# discarding it.
positives, negatives = dataset.class_balance()
print(f"Class balance (train): {positives} positive / {negatives} negative")

# Initialize TensorBoard writer
writer = SummaryWriter()
try:
    # Training loop
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        model.train()
        running_loss = 0.0

        batches = tqdm(dataloader, desc=f"Epoch {epoch + 1}", leave=False)
        for i, (marker, features) in enumerate(batches):
            features = features.to(device)  # Move features to the device
            marker = marker.unsqueeze(-1).to(device)  # (batch,) -> (batch, 1) to match the logits

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, marker)
            loss.backward()
            optimizer.step()

            # Log the per-batch loss to TensorBoard against a global step
            batch_loss = loss.item()
            running_loss += batch_loss
            writer.add_scalar('Loss/train', batch_loss, epoch * len(dataloader) + i)

        # Mean loss over the epoch; max() guards against an empty loader.
        epoch_loss = running_loss / max(len(dataloader), 1)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    print("Training complete.")
    torch.save(model.state_dict(), "model.torch")
finally:
    # Close the TensorBoard writer even if training is interrupted
    writer.close()

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Score the held-out test split with the trained model.
# NOTE(review): the original loop iterated `sampled_dataloader`, which is
# not defined anywhere in this notebook; `test_loader` is assumed to be the
# intended source. Confirm.
model.eval()
marker_batches = []
output_batches = []

with torch.no_grad():
    for markers, features in test_loader:
        features = features.to(device)

        # reshape(-1) instead of squeeze(): a batch holding a single sample
        # must stay 1-D so it can be concatenated with the others.
        probabilities = torch.sigmoid(model(features)).reshape(-1)

        marker_batches.append(markers.reshape(-1).cpu())
        output_batches.append(probabilities.cpu())

# Flat 1-D arrays: true labels and predicted positive-class probabilities
all_markers = torch.cat(marker_batches).numpy()
all_outputs = torch.cat(output_batches).numpy()

In [None]:
# Binarise the predicted probabilities at a fixed decision threshold of 0.33
threshold = 0.33
binary_predictions = [int(output > threshold) for output in all_outputs]

# ROC AUC is computed on the raw scores; the other metrics use the
# thresholded predictions
roc_auc = roc_auc_score(all_markers, all_outputs)
accuracy = accuracy_score(all_markers, binary_predictions)
precision = precision_score(all_markers, binary_predictions)
recall = recall_score(all_markers, binary_predictions)

print(f'ROC AUC: {roc_auc:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

In [None]:
from sklearn.metrics import f1_score

# Sweep decision thresholds and keep the one with the highest F1 score.
# Flatten to 1-D NumPy arrays first: a plain list cannot be compared
# element-wise against a scalar, and the labels may have been collected as
# one-element arrays.
scores = np.asarray(all_outputs, dtype=float).ravel()
labels = np.asarray(all_markers).ravel()

best_threshold = 0.0
best_f1 = 0.0
thresholds = np.arange(0.0, 1.0, 0.01)

# Loop-local names keep the `threshold` and `binary_predictions` values
# from the fixed-threshold cell intact.
for candidate in tqdm(thresholds):
    candidate_predictions = (scores > candidate).astype(int)
    current_f1 = f1_score(labels, candidate_predictions)

    # Strict '>' keeps the lowest threshold among ties.
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = float(candidate)

print(f"{best_threshold=}")
print(f"{best_f1=}")