In [66]:
import torch
import torchaudio
import polars as pl
import matplotlib.pyplot as plt
import os
import numpy as np
import tqdm
from torchaudio import transforms
from IPython.display import Audio
import math, random

import helpers.input_processor as ip

In [67]:
#load data from dataframe
data_dir = "data/raw_training/training_data/"
target_label = 'murmur_in_recording'
df = (
    ip.loadTrainingData(data_dir)
    .filter(pl.col(target_label) != 'Unknown')
    .pipe(ip.encodeData)
    .select([
        pl.col('audio_file').apply(lambda x: os.path.join(data_dir, x)),
        pl.col(target_label)
    ])
)

loading data from save file:  cache/ingested_data.json


In [68]:
# #balance the data so that there is an equal number of murmur positive and murmur negative samples
# #do this by duplicating random rows of whichever group (pos or neg) is smaller
# neg_df = df.filter(pl.col(target_label)==0.0)
# pos_df = df.filter(pl.col(target_label)==1.0)
# numNeg = neg_df.height
# numPos = pos_df.height

# while numNeg != numPos:
#     if numNeg < numPos:
#         df.vstack(neg_df.sample(n=min(numPos-numNeg, neg_df.height), shuffle=True), in_place=True)
#     else: 
#         df.vstack(pos_df.sample(n=min(numNeg-numPos, pos_df.height), shuffle=True), in_place=True)
#     numNeg = df.filter(pl.col(target_label)==0.0).height
#     numPos = df.filter(pl.col(target_label)==1.0).height

# #reshuffle rows
# df = df.sample(frac=1.0, shuffle=True)

# #check number of positive and negative samples
# numNeg = df.filter(pl.col(target_label)==0.0).height
# numPos = df.filter(pl.col(target_label)==1.0).height
# print('Total Samples:       ', df.height)
# print('Positive Samples:    ', numPos)
# print('Negative Samples:    ', numNeg)
# print('Percent Positive Samples:    ', numPos/(numPos+numNeg))


# method 2:

#balance the data so that there is an equal number of murmur positive and murmur negative samples
#do this by duplicating random rows of whichever group (pos or neg) is smaller
neg_df = df.filter(pl.col(target_label)==0.0)
pos_df = df.filter(pl.col(target_label)==1.0)
numNeg = neg_df.height
numPos = pos_df.height

if numNeg < numPos:
    df = neg_df.vstack(pos_df.sample(n=numNeg))
elif numPos < numNeg:
    df = pos_df.vstack(neg_df.sample(n=numPos))
else:
    df = neg_df.vstack(pos_df)

#reshuffle rows
df = df.sample(frac=1.0, shuffle=True)

#check number of positive and negative samples
numNeg = df.filter(pl.col(target_label)==0.0).height
numPos = df.filter(pl.col(target_label)==1.0).height
print('Total Samples:       ', df.height)
print('Positive Samples:    ', numPos)
print('Negative Samples:    ', numNeg)
print('Percent Positive Samples:    ', numPos/(numPos+numNeg))

Total Samples:        998
Positive Samples:     499
Negative Samples:     499
Percent Positive Samples:     0.5


In [69]:
# x = path to audio file
# samp_rate = sample rate of the signal
# duration = length of time (in seconds) to which the signal is resized
# do_augmentation = whether to perform audio and image augmentation on the signal
# n_freq_masks = number of frequency masks
# n_time_masks = number of time masks
# remaining keyword argument are passed to transforms.MelSpectrogram()
def preprocessAudio(x, samp_rate, duration, do_augmentation=True, n_freq_masks=1, n_time_masks=1, n_mels=128, n_fft=1024, hop_len=None):
    # read and load audio file in .wav format
    sig, sr = torchaudio.load(x)

    # Check that audio is mono (has 1 audio channel)
    num_channels = sig.shape[0]
    if num_channels != 1:
        raise Exception('The provided audio file \'%s\' has %s channels, when 1 was expected' % (x, num_channels))
    
    # resize sample, either by padding it with silence or truncating it
    num_rows, sig_len = sig.shape
    max_len = sr * duration
    if (sig_len > max_len):
        # Truncate the signal to the given length
        sig = sig[:,:max_len]
    elif (sig_len < max_len):
        # Pad with zeroes at the beginning and end of the signal
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len
        pad_begin = torch.zeros((num_rows, pad_begin_len))
        pad_end = torch.zeros((num_rows, pad_end_len))
        sig = torch.cat((pad_begin, sig, pad_end), 1)

    # Audio Augmentation    --------\
    if do_augmentation == True:
        # time shift signal to the left or right by a random percent of its original length (max 99%)
        _, sig_len = sig.shape
        max_shift = 0.99
        sig = sig.roll(int(random.random() * max_shift * sig_len))
    #-------------------------------/

    # get Mel spectrogram
    top_db = 80
    melSpec = torchaudio.transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    melSpec = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(melSpec)

    # Image Augmentation    --------\
    if do_augmentation == True:
        # Apply time and frequency mask
        max_mask_pct=0.1
        n_steps = melSpec.shape[2]
        mask_value = melSpec.mean()
        for i in range(n_freq_masks):
            melSpec = torchaudio.transforms.FrequencyMasking(max_mask_pct * n_mels)(melSpec, mask_value)
        for i in range(n_time_masks):
            melSpec = torchaudio.transforms.TimeMasking(max_mask_pct * n_steps)(melSpec, mask_value)
    #-------------------------------/
    
    return melSpec

In [70]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class AudioDataset(Dataset):
    #augmentData = whether to perform data augmentation
    def __init__(self, audioPaths, labels, augmentData=True):
        self.audioPaths = audioPaths
        self.labels = labels
        self.augmentData = augmentData
        self.sr = 4000
        self.sample_duration = 25
        self.n_mels = 128

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        spec = preprocessAudio(self.audioPaths[idx], self.sr, self.sample_duration, do_augmentation=self.augmentData, n_mels=self.n_mels)
        label = self.labels[idx]
        return spec, label

In [71]:
def splitDataframe(df, split_ratio=0.8):
    total_size = df.height
    train_size = round(split_ratio * total_size)
    test_size = total_size - train_size

    df = df.sample(frac=1.0, shuffle=True)
    trainSet = df.head(train_size)
    testSet = df.tail(test_size)
    return trainSet, testSet

In [72]:
train_df, test_df = splitDataframe(df)
classes = df.get_column(target_label).unique().to_list()

augmentTrainData = True
augmentTestData = True

train_ds = AudioDataset(
    audioPaths=train_df.get_column('audio_file').to_list(),
    labels=train_df.get_column(target_label).to_list(),
    augmentData=augmentTrainData
)
test_ds = AudioDataset(
    audioPaths=test_df.get_column('audio_file').to_list(),
    labels=test_df.get_column(target_label).to_list(),
    augmentData=augmentTestData
)

_________________________________________________________________________________________________________
tutorial code
_________________________________________________________________________________________________________

In [74]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters 
num_epochs = 25
batch_size = 32
learning_rate = 0.001

# Dataloaders
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, prefetch_factor=4)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=4, prefetch_factor=4)

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # numClasses = 2
        # self.conv1 = nn.Conv2d(1, 2*1, 5)   # first param = 1 since iput image has 1 channel
        # self.pool = nn.MaxPool2d(2, 2)
        # self.conv2 = nn.Conv2d(2*1, 16, 5)
        # self.fc1 = nn.Linear(16 * 29 * 46, 120)
        # self.fc2 = nn.Linear(120, 84)
        # self.fc3 = nn.Linear(84, numClasses)
        numClasses = 2
        self.conv1 = nn.Conv2d(1, 32, 5)   # first param = 1 since iput image has 1 channel
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(64 * 29 * 46, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, numClasses)


    def forward(self, x):
        # -> n, 3, 32, 32
        x = self.pool(F.relu(self.conv1(x)))  # -> n, 6, 14, 14
        x = self.pool(F.relu(self.conv2(x)))  # -> n, 16, 5, 5
        x = x.view(-1, 64 * 29 * 46)            # -> n, 400
        x = F.relu(self.fc1(x))               # -> n, 120
        x = F.relu(self.fc2(x))               # -> n, 84
        x = self.fc3(x)                       # -> n, 10
        return x


model = ConvNet().to(device)

In [79]:
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    running_loss = 0.0
    epoch_bar = tqdm.tqdm(train_loader)
    epoch_bar.set_description("Epoch %s" % epoch)
    for i, (images, labels) in enumerate(epoch_bar):
        epoch_bar.set_postfix({"loss": running_loss})

        # origin shape: [4, 3, 32, 32] = 4, 3, 1024
        # input_layer: 3 input channels, 6 output channels, 5 kernel size
        images = images.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.LongTensor).to(device)

        # Forward pass
        # outputs = model(images.unsqueeze(1))
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss = loss.item()

print('Finished Training')
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)

Epoch 0: 100%|██████████| 25/25 [00:01<00:00, 21.67it/s, loss=0.538]
Epoch 1: 100%|██████████| 25/25 [00:01<00:00, 21.58it/s, loss=0.413]
Epoch 2: 100%|██████████| 25/25 [00:01<00:00, 20.32it/s, loss=0.417]
Epoch 3: 100%|██████████| 25/25 [00:01<00:00, 21.60it/s, loss=0.518]
Epoch 4: 100%|██████████| 25/25 [00:01<00:00, 21.80it/s, loss=0.493]
Epoch 5: 100%|██████████| 25/25 [00:01<00:00, 21.45it/s, loss=0.463]
Epoch 6: 100%|██████████| 25/25 [00:01<00:00, 21.05it/s, loss=0.53] 
Epoch 7: 100%|██████████| 25/25 [00:01<00:00, 19.82it/s, loss=0.582]
Epoch 8: 100%|██████████| 25/25 [00:01<00:00, 20.38it/s, loss=0.398]
Epoch 9: 100%|██████████| 25/25 [00:01<00:00, 21.64it/s, loss=0.677]
Epoch 10: 100%|██████████| 25/25 [00:01<00:00, 21.74it/s, loss=0.426]
Epoch 11: 100%|██████████| 25/25 [00:01<00:00, 21.19it/s, loss=0.604]
Epoch 12: 100%|██████████| 25/25 [00:01<00:00, 21.72it/s, loss=0.576]
Epoch 13: 100%|██████████| 25/25 [00:01<00:00, 21.54it/s, loss=0.378]
Epoch 14: 100%|██████████| 25/

Finished Training


In [87]:
# Test the model
net = ConvNet().to(device)
net.load_state_dict(torch.load(PATH))

correct = 0
total = 0
with torch.no_grad():
    progressBar = tqdm.tqdm(test_loader)
    progressBar.set_description('Testing Model')
    for i, (images, labels) in enumerate(progressBar):
        images = images.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.LongTensor).to(device)

        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the test images: {100 * correct / total} %')
        

Testing Model: 100%|██████████| 7/7 [00:00<00:00, 13.42it/s]

Accuracy of the network on the test images: 74.0 %



