In [30]:

import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import os
import pandas as pd
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch



In [31]:
labels_df = pd.read_excel('./data/NEW_IRB300012145_Patient_ID_deidentified.xlsx')
Smokeing_status = labels_df.iloc[:, 4].to_list()

Smokeing_status[149]

'never'

In [32]:
import torchaudio.transforms as T

class AmplitudeNormalization:
    def __call__(self, waveform):
        # Normalize the waveform to be within [-1, 1]
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak
        return waveform

# To use it:
# waveform, sample_rate = torchaudio.load('path/to/audio.wav')
# waveform = AmplitudeNormalization()(waveform)


class PadTrimAudio:
    def __init__(self, max_len):
        self.max_len = max_len

    def __call__(self, waveform):
        if waveform.size(1) > self.max_len:
            # Trim the waveform if longer than max_len
            waveform = waveform[:, :self.max_len]
        elif waveform.size(1) < self.max_len:
            # Pad with zeros if shorter than max_len
            padding_size = self.max_len - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding_size), "constant", 0)
        return waveform

from sklearn.preprocessing import StandardScaler
import numpy as np

class FeatureNormalization:
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, features):
        # Fit the scaler on the training set features
        self.scaler.fit(features)

    def transform(self, features):
        # Apply normalization to features
        return self.scaler.transform(features)

In [50]:
class SoundDataset(Dataset):
    def __init__(self, data_dir, labels_df, transform=None, max_len=1000000):
        self.data_dir = data_dir
        self.labels_df = labels_df
        self.transform = transform
        self.max_len = max_len
        self.orderlist = ['RLP.wav', 'RUP.wav', 'RUA Hum.wav', 'LUA Hum.wav', 'RUA.wav', 'RMP.wav', 'LMP.wav', 'LUA.wav', 'LLP.wav', 'LUP.wav']

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        patient_id = str(idx+1).zfill(3)
        audio_dir = os.path.join(self.data_dir, str(patient_id), 'breath Eko')
        audio_files = [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')]

        # Concatenate audio files
        waveform_list = [torch.zeros(1, self.max_len) for _ in range(len(self.orderlist))]
        for audio_file in audio_files:
            file_name = os.path.basename(audio_file)
            index = self.orderlist.index(file_name)
            waveform, sample_rate = torchaudio.load(audio_file)
            if self.transform:
                waveform = self.transform(waveform)
            waveform = PadTrimAudio(self.max_len)(waveform)
            waveform_list[index] = waveform


        # Concatenate all waveforms along the time dimension
        waveform = torch.cat(waveform_list, dim=0)

        label = self.labels_df[idx]

        return waveform, label

transform = AmplitudeNormalization()
data_dir = './data/Patients'
sound_dataset = SoundDataset(data_dir, Smokeing_status, transform=transform)

dataloader = DataLoader(sound_dataset, batch_size=4, shuffle=True)

In [51]:
for batch in dataloader:
    waveforms, labels = batch
    print(waveforms.shape)
    print(labels)

torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'former')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'former')
torch.Size([4, 10, 1000000])
('former', 'current', 'never', 'former')
torch.Size([4, 10, 1000000])
('former', 'never', 'never', 'never')
torch.Size([4, 10, 1000000])
('former', 'never', 'former', 'former')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'former')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'former')
torch.Size([4, 10, 1000000])
('former', 'never', 'former', 'never')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'former')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'current')
torch.Size([4, 10, 1000000])
('never', 'never', 'never', 'never')
torch.Size([4, 10, 1000000])
('never', 'never', 'former', 'never')
torch.Size([4, 10, 1000000])
('former', 'never', 'former', 'never')
torch.Size([4, 10, 1000000])
('former', 'never', 'former', 'never')
torch.Size([4, 10, 1000000])
('never', 'current', 'for

RuntimeError: stack expects each tensor to be equal size, but got [10, 1000000] at entry 0 and [13, 1000000] at entry 2

In [63]:
import torchaudio.transforms as T
class FixedSizeMelSpectrogram(T.MelSpectrogram):
    def __init__(self, sample_rate=22050, n_mels=64, max_pad_length=900, n_fft=2048, hop_length=512, **kwargs):
        super().__init__(sample_rate=sample_rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length, **kwargs)
        self.max_pad_length = max_pad_length

    def forward(self, waveform):
        # Convert stereo to mono if necessary
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Generate the Mel Spectrogram
        mel_spec = super().forward(waveform)

        # Ensure the Mel Spectrogram is of fixed size
        if mel_spec.size(2) < self.max_pad_length:
            # Padding
            padded_mel_spec = torch.nn.functional.pad(mel_spec, (0, self.max_pad_length - mel_spec.size(2)))
        else:
            # Truncating
            padded_mel_spec = mel_spec[:, :, :self.max_pad_length]

        return padded_mel_spec

label_to_int = {
    'current': 0,
    'former': 1,
    'never': 2
}


class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(115200, 128)  # Adjust size based on your MelSpectrogram output
        self.fc2 = nn.Linear(128, 10)  # Assuming 10 classes

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class UrbanSound8KDataset(Dataset):
    def __init__(self, soundata_dataset, transform=None, target_length=1000000):  # target_length depends on your data
        self.clips = soundata_dataset.load_clips()
        self.transform = transform
        self.target_length = target_length

    def __len__(self):
        return len(self.clips)

    def __getitem__(self, idx):
        clip_key = list(self.clips.keys())[idx]
        clip = self.clips[clip_key]
        waveform, sr = torchaudio.load(clip.audio_path)
        if self.transform:
            waveform = self.transform(waveform)

        label_string = clip.tags.labels[0]
        label = label_to_int[label_string]

        return waveform, label



In [73]:

# Usage of the transform
transform = FixedSizeMelSpectrogram(sample_rate=22050, n_mels=64, max_pad_length=900)

# Create PyTorch datasets
train_dataset = UrbanSound8KDataset(dataset, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

def train(model, device, train_loader, optimizer, criterion, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(data)  # Add channel dimension
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Compute accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        avg_loss = total_loss / len(train_loader)
        accuracy = 100 * correct / total
        print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

train(model, device, train_loader, optimizer, criterion)


Epoch 1, Loss: 12.5799, Accuracy: 33.50%
Epoch 2, Loss: 1.5315, Accuracy: 51.82%
Epoch 3, Loss: 1.4019, Accuracy: 61.37%
Epoch 4, Loss: 1.1271, Accuracy: 65.60%
Epoch 5, Loss: 1.2163, Accuracy: 69.84%
Epoch 6, Loss: 1.0536, Accuracy: 70.11%
Epoch 7, Loss: 0.9006, Accuracy: 77.40%
Epoch 8, Loss: 0.7792, Accuracy: 80.08%
Epoch 9, Loss: 0.8212, Accuracy: 82.48%
Epoch 10, Loss: 0.7381, Accuracy: 83.67%
Epoch 11, Loss: 0.4484, Accuracy: 88.31%
Epoch 12, Loss: 0.3376, Accuracy: 91.30%
Epoch 13, Loss: 0.7144, Accuracy: 88.99%
Epoch 14, Loss: 0.7038, Accuracy: 87.44%
Epoch 15, Loss: 0.7641, Accuracy: 86.21%
Epoch 16, Loss: 0.3453, Accuracy: 93.19%
Epoch 17, Loss: 0.2770, Accuracy: 93.84%
Epoch 18, Loss: 0.1458, Accuracy: 95.98%
Epoch 19, Loss: 0.8000, Accuracy: 92.24%
Epoch 20, Loss: 0.3702, Accuracy: 94.32%
Epoch 21, Loss: 0.3860, Accuracy: 95.68%
Epoch 22, Loss: 0.3826, Accuracy: 94.31%
Epoch 23, Loss: 0.0996, Accuracy: 97.62%
Epoch 24, Loss: 0.4093, Accuracy: 96.06%
Epoch 25, Loss: 0.2236, 