This notebook is part of Ville Tonteri's master's thesis.

"Underwater sound anomaly detection"

In [3]:
# Import all necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import math
import os
import torch
import torchaudio
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import DataLoader
import torchaudio.transforms as transforms

In [4]:
class SoundDataset(Dataset):
    """Map-style dataset of audio clips listed in an Excel annotation sheet.

    Every item is loaded with torchaudio, moved to ``device``, resampled to
    ``target_sample_rate`` when its own rate differs, averaged over its first
    dimension when that dimension is larger than one, cut or zero-padded on the
    right to exactly ``num_samples`` samples, and finally passed through
    ``transformation``.

    Args:
        annotations_file: Path of the Excel sheet read with ``pd.read_excel``.
            The ``'Filename'`` and ``'Type'`` columns are used.
        audio_dir: Directory the ``'Filename'`` entries are joined onto.
        transformation: Callable applied to the fixed-length signal. If it has
            a ``.to`` method (e.g. a ``torch.nn.Module``) it is moved to
            ``device``; other callables are used unchanged.
        target_sample_rate: Sample rate every clip is converted to.
        num_samples: Length (in samples) every clip is cut or padded to.
        device: Device the signals, the resampler and the transformation are
            placed on.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_excel(annotations_file)
        self.audio_dir = audio_dir
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.device = device
        # Signals are moved to `device` in __getitem__, so a module-based
        # transform must live on the same device; otherwise applying it fails
        # with a device-mismatch error as soon as `device` is not the CPU.
        # Plain callables without a `.to` method are kept as they are.
        if hasattr(transformation, "to"):
            transformation = transformation.to(device)
        self.transformation = transformation

    def __len__(self):
        """Return the number of rows in the annotation sheet."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load, normalise and transform the clip at row ``index``.

        Returns:
            A ``(signal, label)`` pair: the output of ``transformation`` and the
            ``'Type'`` value of the row.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        # Resample first so that `num_samples` is counted at the target rate.
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        # Cut, then pad: afterwards the last dimension is exactly `num_samples`.
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples of a longer signal."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad a shorter signal on the right up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to the target rate; no-op if equal."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average over the first dimension when it holds more than one entry."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Join ``audio_dir`` with the ``'Filename'`` value of row ``index``."""
        filename = self.annotations.iloc[index]['Filename']
        path = os.path.join(self.audio_dir, filename)
        return path

    def _get_audio_sample_label(self, index):
        """Return the ``'Type'`` value of row ``index``."""
        return self.annotations.iloc[index]['Type']


In [None]:
# Define neural network model

class ClassificationModel:
    """Placeholder for the classification network; it defines no layers or state yet."""

    def __init__(self) -> None:
        """Create an empty model object (nothing is initialised)."""

In [6]:
# NOTE: machine-specific absolute paths; adjust them to wherever the
# shipsEar_AUDIOS folder lives before running this cell elsewhere.
ANNOTATIONS_FILE = r'C:\Users\elliw\OneDrive\Documents\Aalto\Masters Thesis\shipsEar_AUDIOS\shipsEar.xlsx'
AUDIO_DIR = r'C:\Users\elliw\OneDrive\Documents\Aalto\Masters Thesis\shipsEar_AUDIOS'
SAMPLE_RATE = 16000
NUM_SAMPLES = 16000  # one second of audio at SAMPLE_RATE

# Pick the device first so the transform can be placed on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define transformations
# SoundDataset moves every signal to `device` before applying the transform, so
# the MelSpectrogram module has to be on that device as well; left on the CPU
# it fails with a device-mismatch error whenever CUDA is available.
transformation = transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=64,
    n_fft=1024,
    hop_length=512
).to(device)

# Initialize dataset
dataset = SoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, transformation, SAMPLE_RATE, NUM_SAMPLES, device)

# Create dataloader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = ClassificationModel()

for inputs, targets in dataloader:
    # `targets` is printed as-is: it is whatever the default collate function
    # makes of the batch's 'Type' values, which need not be a tensor.
    print(inputs[0].shape, targets)
    # Your training code here



AttributeError: 'tuple' object has no attribute 'shape'