## Model training

In [1]:
import torch
import numpy as np

# Directory holding the pre-computed label and spike tensors.
# NOTE(review): absolute, machine-specific path -- consider a path relative to the repository.
TENSOR_DIR = "C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/tensors"

# weights_only=True restricts unpickling to tensors and primitive containers,
# so a tampered file cannot execute arbitrary code; it should also silence the
# warning recent PyTorch versions emit when the argument is left unset.
# No map_location is given, so tensors are restored to the device they were saved from.
labels_tensor = torch.load(f"{TENSOR_DIR}/labels_tensor.pt", weights_only=True)
spikes_tensor = torch.load(f"{TENSOR_DIR}/spike_data_tensor.pt", weights_only=True)

# Number of samples per integer class label (index i holds the count of label i).
label_distribution = torch.bincount(labels_tensor)
print(f'Original Labels distribution: {label_distribution}')

Original Labels distribution: tensor([1294,   95, 1046,   53,   40], device='cuda:0')


  labels_tensor = torch.load("C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/tensors/labels_tensor.pt")
  spikes_tensor = torch.load("C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/tensors/spike_data_tensor.pt")


In [2]:
import torch
import numpy as np

class CCMKDataset(torch.utils.data.Dataset):
    """Balanced binary dataset: background (label 0) versus one target class.

    Construction performs three steps:

    1. Keep only the samples labelled ``0`` or ``target_label``; every other
       label is dropped.
    2. Relabel ``target_label`` as ``1`` so the task becomes binary.
    3. Randomly down-sample both classes, without replacement, to the size of
       the smaller class.

    After construction the stored samples are ordered with all label-0 samples
    first, followed by all label-1 samples.

    Args:
        spikes_tensor: Sample tensor, indexed along its first dimension by the
            same positions as ``labels_tensor``.
        labels_tensor: Integer class labels (presumably 1-D -- TODO confirm).
        target_label: Original label treated as the positive class.
        seed: Optional seed for the down-sampling step. ``None`` (default)
            keeps the previous behaviour of drawing from NumPy's global RNG;
            an integer makes the selection reproducible across runs.

    The tensors passed in are not modified: boolean-mask indexing produces
    copies, and relabelling is applied to those copies only.
    """

    def __init__(self, spikes_tensor, labels_tensor, target_label=2, seed=None):
        self.target_label = target_label

        # Step 1: keep background (0) and target samples only.
        valid_mask = (labels_tensor == 0) | (labels_tensor == target_label)
        self.spikes_tensor = spikes_tensor[valid_mask]
        self.labels_tensor = labels_tensor[valid_mask]

        # Step 2: the target class becomes the positive class (1).
        self.labels_tensor[self.labels_tensor == target_label] = 1

        # Step 3: down-sample both classes to the size of the smaller one.
        label_0_indices = torch.where(self.labels_tensor == 0)[0]
        label_1_indices = torch.where(self.labels_tensor == 1)[0]
        num_samples = min(len(label_1_indices), len(label_0_indices))

        # A dedicated Generator is used only when a seed is supplied, so that
        # unseeded calls keep drawing from the legacy global NumPy RNG.
        rng = np.random if seed is None else np.random.default_rng(seed)
        selected_label_0_indices = torch.tensor(
            rng.choice(label_0_indices.cpu().numpy(), size=num_samples, replace=False)
        )
        selected_label_1_indices = torch.tensor(
            rng.choice(label_1_indices.cpu().numpy(), size=num_samples, replace=False)
        )

        # Label-0 picks first, then label-1 picks.
        balanced_indices = torch.cat([selected_label_0_indices, selected_label_1_indices])

        print(f"Number of downsampled label 0 samples: {len(selected_label_0_indices)}")
        print(f"Number of target label 1 samples: {len(selected_label_1_indices)}")
        print(f"Balanced dataset length: {len(balanced_indices)}")

        # Apply the balanced indices to spikes and labels
        self.spikes_tensor = self.spikes_tensor[balanced_indices]
        self.labels_tensor = self.labels_tensor[balanced_indices]

        # Sanity check: both classes should now have the same count
        print(f"Filtered Labels distribution after processing: {torch.bincount(self.labels_tensor.int())}")

    def __len__(self):
        """Return the number of samples kept after filtering and balancing."""
        return len(self.labels_tensor)

    def __getitem__(self, idx):
        """Return the ``(spikes, label)`` pair stored at position ``idx``."""
        return self.spikes_tensor[idx], self.labels_tensor[idx]

# Build the balanced background-vs-target dataset (label 2 is the positive class)
dataset = CCMKDataset(spikes_tensor, labels_tensor, target_label=2)


Number of downsampled label 0 samples: 1046
Number of target label 1 samples: 1046
Balanced dataset length: 2092
Filtered Labels distribution after processing: tensor([1046, 1046], device='cuda:0')


In [3]:
from torch.utils.data import Dataset, DataLoader, Subset, random_split


# Stratified 80/10/10 split: each class is split separately and the pieces are
# then concatenated, so train/val/test keep the dataset's class balance.

# --- class 0 (background) ---
label_0_indices = torch.nonzero(dataset.labels_tensor == 0, as_tuple=True)[0]
dataset_0 = Subset(dataset, label_0_indices)
train_size_0 = int(0.8 * len(dataset_0))
val_size_0 = int(0.1 * len(dataset_0))
test_size_0 = len(dataset_0) - (train_size_0 + val_size_0)  # remainder after truncation
train_dataset_0, val_dataset_0, test_dataset_0 = random_split(
    dataset_0,
    [train_size_0, val_size_0, test_size_0],
    generator=torch.Generator().manual_seed(42),
)

# --- class 1 (target) ---
label_1_indices = torch.nonzero(dataset.labels_tensor == 1, as_tuple=True)[0]
dataset_1 = Subset(dataset, label_1_indices)
train_size_1 = int(0.8 * len(dataset_1))
val_size_1 = int(0.1 * len(dataset_1))
test_size_1 = len(dataset_1) - (train_size_1 + val_size_1)  # remainder after truncation
train_dataset_1, val_dataset_1, test_dataset_1 = random_split(
    dataset_1,
    [train_size_1, val_size_1, test_size_1],
    generator=torch.Generator().manual_seed(42),
)

# Merge the per-class pieces of each split
train_dataset = torch.utils.data.ConcatDataset([train_dataset_0, train_dataset_1])
val_dataset = torch.utils.data.ConcatDataset([val_dataset_0, val_dataset_1])
test_dataset = torch.utils.data.ConcatDataset([test_dataset_0, test_dataset_1])

# Only the training loader shuffles; evaluation loaders keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [4]:
import os
import torch
import pickle

# Directory to store cached data (relative to the kernel's working directory)
cache_dir = 'dataloader_cache'
# exist_ok=True makes creation idempotent and removes the check-then-create race
os.makedirs(cache_dir, exist_ok=True)

def cache_loader_to_gpu(loader, cache_name, device='cuda', cache_root=None):
    """Materialise every batch of ``loader`` on ``device``, backed by a disk cache.

    If ``cache_name`` already exists in the cache directory, the pickled
    batches are loaded from disk and ``loader`` is not consulted at all, so a
    stale cache file silently wins over a changed dataset. Otherwise ``loader``
    is iterated once, each batch is moved to ``device``, and a CPU copy of all
    batches is pickled for later runs.

    Args:
        loader: Iterable yielding batches whose first two items are the input
            and target tensors.
        cache_name: File name of the cache inside the cache directory.
        device: Device the returned tensors are moved to. Defaults to
            ``'cuda'``; pass ``'cpu'`` on machines without a GPU.
        cache_root: Directory holding the cache file. ``None`` (default) uses
            the module-level ``cache_dir`` as it is bound at call time.

    Returns:
        list[tuple[Tensor, Tensor]]: ``(inputs, targets)`` batches on ``device``.
        The batch composition and order are frozen at caching time.
    """
    # Resolve the global lazily so existing two-argument calls behave as before
    root = cache_dir if cache_root is None else cache_root
    cache_path = os.path.join(root, cache_name)

    # Cache hit: reuse what is on disk
    if os.path.exists(cache_path):
        print(f"Loading cached data for {cache_name} from disk")
        # SECURITY: pickle.load can execute arbitrary code -- only load cache
        # files that were written by this notebook.
        with open(cache_path, 'rb') as f:
            cached_data = pickle.load(f)
        return [(inputs.to(device), targets.to(device)) for inputs, targets in cached_data]

    # Cache miss: run through the loader once, moving each batch immediately
    print(f"Caching data for {cache_name}")
    data_list = [(batch[0].to(device), batch[1].to(device)) for batch in loader]

    # Make sure the target directory exists (an empty root means the working directory)
    if root:
        os.makedirs(root, exist_ok=True)

    # Persist CPU copies so the cache can be read on machines without this device
    with open(cache_path, 'wb') as f:
        pickle.dump([(inputs.cpu(), targets.cpu()) for inputs, targets in data_list], f)

    return data_list

# Cache (or reload) the train, validation and test batches, in that order
train_loader_cache, val_loader_cache, test_loader_cache = [
    cache_loader_to_gpu(split_loader, cache_file)
    for split_loader, cache_file in (
        (train_loader, 'train_loader_gpu.pkl'),
        (val_loader, 'val_loader_gpu.pkl'),
        (test_loader, 'test_loader_gpu.pkl'),
    )
]


Caching data for train_loader_gpu.pkl
Caching data for val_loader_gpu.pkl
Caching data for test_loader_gpu.pkl


In [5]:
import os

# Report where the relative cache directory actually lives on disk.
# os.path.abspath resolves the name against the current working directory,
# so the printed location depends on where the notebook kernel was started.
cache_dir = 'dataloader_cache'
absolute_cache_path = os.path.abspath(cache_dir)
print(f"Cache directory is located at: {absolute_cache_path}")

Cache directory is located at: c:\Users\nisar\github-classroom\uwa-computer-science\project-12-prototype-bio-acoustic-detection-system-soundsentinel\angela19sep\dataloader_cache


To use the cached data in a later session, reload the pickled batches from disk and move them to the GPU:

In [1]:
import pickle
import os
import torch

# Directory where the cached data is stored.
# NOTE(review): the cell that wrote the cache reported a location under
# `.../angela19sep/dataloader_cache`; this path has no `angela19sep` component --
# confirm the cache files were copied here.
cache_dir = 'C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/dataloader_cache'


def load_cached_data(cache_name):
    """Unpickle ``cache_name`` from ``cache_dir``.

    Returns the unpickled object, or ``None`` (after printing a notice) when
    the file does not exist. Only use this on trusted files: ``pickle.load``
    can execute arbitrary code.
    """
    cache_path = os.path.join(cache_dir, cache_name)

    # Guard clause: nothing to load, tell the user and bail out
    if not os.path.exists(cache_path):
        print(f"Cache file {cache_path} not found.")
        return None

    print(f"Loading cached data from {cache_path}")
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)

# Load the cached batches of every split (each entry is None if its file is missing)
train_loader_cache, val_loader_cache, test_loader_cache = [
    load_cached_data(cache_file)
    for cache_file in (
        'train_loader_gpu.pkl',
        'val_loader_gpu.pkl',
        'test_loader_gpu.pkl',
    )
]

# Ensure the cached data is on the GPU if necessary
def move_data_to_gpu(cached_loader, device='cuda'):
    """Return a new list with every ``(inputs, targets)`` batch moved to ``device``.

    Args:
        cached_loader: Iterable of ``(inputs, targets)`` tensor pairs.
        device: Target device. Defaults to ``'cuda'``; pass ``'cpu'`` on
            machines without a GPU.

    Returns:
        list[tuple[Tensor, Tensor]]: The batches on ``device``, in input order.

    Raises:
        TypeError: If ``cached_loader`` is ``None``, i.e. there is no cached
            data to move (for example because the cache file was not found).
    """
    if cached_loader is None:
        # Fail with an actionable message instead of "'NoneType' object is not iterable"
        raise TypeError(
            "cached_loader is None: no cached data to move "
            "(was the cache file found?)"
        )
    return [(inputs.to(device), targets.to(device)) for inputs, targets in cached_loader]

# Put every split's batches on the GPU, rebinding the same three names
train_loader_cache, val_loader_cache, test_loader_cache = [
    move_data_to_gpu(cached_batches)
    for cached_batches in (train_loader_cache, val_loader_cache, test_loader_cache)
]


Loading cached data from C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/dataloader_cache\train_loader_gpu.pkl
Loading cached data from C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/dataloader_cache\val_loader_gpu.pkl
Loading cached data from C:/Users/nisar/github-classroom/uwa-computer-science/project-12-prototype-bio-acoustic-detection-system-soundsentinel/dataloader_cache\test_loader_gpu.pkl
