In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset

#import torchaudio
import librosa #use librosa instead of torchaudio since we're on Win...

import pandas as pd
import numpy as np
import os

In [2]:
#check if Cuda is avaliable
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
print(device)

cpu


In [3]:
torch.cuda.empty_cache()

In [4]:
# Import the dataset
data_csv = pd.read_csv(os.getcwd()+"\\dataset\\UrbanSound8K\\metadata\\UrbanSound8K.csv") 
data_csv.head()


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [5]:
#Formatting dataset
class US8K(Dataset):
    
    def __init__(self, csv_path, file_path, folder_list):
        
        self.files_name = []
        self.labels = []
        self.folders = []
        
        #read metadata from .csv
        csv_metadata = pd.read_csv(csv_path)
        
        #loop through the metadata and save them into the lists
        for i in range(0, len(csv_metadata)):
            if csv_metadata.iloc[i, 5] in folder_list:
                self.files_name.append(csv_metadata.iloc[i,0])
                self.labels.append(csv_metadata.iloc[i,6])
                self.folders.append(csv_metadata.iloc[i,5])
            
        self.filePath = file_path
        self.folderList = folder_list
            
        
    def __getitem__(self, index):
        #format audio file path
        path = self.filePath + "fold" + str(self.folders[index]) + "\\" + self.files_name[index]
        #load audio data(sampling rate = 22.5K; downmixed to mono)
        audio_data = librosa.core.load(path, sr=8000, mono=True)[0]
        #reshape audio data [1,n_frames]
        audio_data = np.reshape(audio_data,(audio_data.size,1))
        #convert np array to tensor
        audio_data = torch.from_numpy(audio_data)
        
         #downsample the audio to ~8kHz
        tempData = torch.zeros([160000, 1]) #tempData accounts for audio clips that are too short
        if audio_data.numel() < 160000:
            
            tempData[:audio_data.numel()] = audio_data[:]
        else:
            tempData[:] = audio_data[:160000]

        audio_data = tempData
        soundFormatted = torch.zeros([32000, 1])
        soundFormatted[:32000] = audio_data[::5] #take every fifth sample of soundData
        soundFormatted = soundFormatted.permute(1, 0)
        

        
        return soundFormatted, self.labels[index]
    
    def __len__(self):
        return len(self.files_name)
        

In [6]:
csv_path = os.getcwd() + '\\dataset\\UrbanSound8K\\metadata\\UrbanSound8K.csv'
file_path = os.getcwd() + '\\dataset\\UrbanSound8K\\audio\\'

train_set = US8K(csv_path, file_path, range(1,8))
validation_set = US8K(csv_path, file_path, range(8,10))
test_set = US8K(csv_path, file_path, [10])

print("Train set size: " + str(len(train_set)) + ' files')
print("Validation set size: " + str(len(validation_set)) + ' files')
print("Test set size: " + str(len(test_set)) + ' files')

Train set size: 6273 files
Validation set size: 1622 files
Test set size: 837 files


In [7]:
kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu

train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size = 128, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = True, **kwargs)

# Construct the network

In [8]:
class CNN(nn.Module):
    #M5 CNN: m5 denotes 5 weighted layers
    
    def __init__(self):
        super(CNN,self).__init__()
        #first layer
        #a convolutional layer with receptive field 882 and 128 filters, with stride 4 
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=1, # input height
                      out_channels=128, # n_filters
                      kernel_size= 80, # filter size; 40ms=> kernel size = 0.04 * sampling rate = 882
                      stride = 4), #hop size
            nn.BatchNorm1d(128), # batch normalization
            nn.ReLU(), #activation func
            nn.MaxPool1d(4), # max pooling
        )  
        
        #second layer
        #a convolutional layer with receptive field 3 and 128 filters
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=128, # input height
                      out_channels=128, # n_filters
                      kernel_size= 3), # filter size
            nn.BatchNorm1d(128), # batch normalization
            nn.ReLU(), #activation func
            nn.MaxPool1d(4), # max pooling
        )
        
        #third layer
        #a convolutional layer with receptive field 3 and 256 filters
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=128, # input height
                      out_channels=256, # n_filters
                      kernel_size= 3), # filter size
            nn.BatchNorm1d(256), # batch normalization
            nn.ReLU(), #activation func
            nn.MaxPool1d(4), # max pooling
        )
        
        #fourth layer
        #a convolutional layer with receptive field 3 and 512 filters
        self.conv4 = nn.Sequential(
            nn.Conv1d(in_channels=256, # input height
                      out_channels=512, # n_filters
                      kernel_size= 3), # filter size
            nn.BatchNorm1d(512), # batch normalization
            nn.ReLU(), #activation func
            nn.MaxPool1d(4), # max pooling
        )
        
        self.avgPool = nn.AvgPool1d(30)
        self.fc1 = nn.Linear(512, 10)#fully connected layer
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)
model = CNN()
model.to(device)
print(model)        

CNN(
  (conv1): Sequential(
    (0): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    

In [9]:
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)

In [14]:
def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        #clear gradients for this training step
        optimizer.zero_grad()
        #move to gpu if available
        data = data.to(device)
        #target is the ground truth label
        target = target.to(device)
        #set requires_grad to True for training
        data = data.requires_grad_() 
        #computation
        output = model(data)
        #original output dimensions are batchSizex1x10
        output = output.permute(1, 0, 2) 
        #the loss functions expects a batchSizex10 input
        loss = F.nll_loss(output[0], target) 
        #backpropagation, compute gradients
        loss.backward()
        # apply gradients
        optimizer.step()
        #print training stats
        if batch_idx % log_interval == 0: 
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))
            
def validation(model, epoch):
    model.eval()
    correct = 0
    for data, target in validation_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        output = output.permute(1, 0, 2)
        # get the index of the max log-probability
        pred = output.max(2)[1]
        correct += pred.eq(target).cpu().sum().item()
    print('\Validation set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(validation_loader.dataset),
        100. * correct / len(validation_loader.dataset)))
    
def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] 
        correct += pred.eq(target).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [11]:
log_interval = 20
for epoch in range(1, 41):
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
    scheduler.step()
    train(model, epoch)
    test(model, epoch)




Test set: Accuracy: 159/837 (19%)


Test set: Accuracy: 143/837 (17%)


Test set: Accuracy: 140/837 (17%)


Test set: Accuracy: 192/837 (23%)


Test set: Accuracy: 244/837 (29%)


Test set: Accuracy: 292/837 (35%)


Test set: Accuracy: 173/837 (21%)


Test set: Accuracy: 300/837 (36%)


Test set: Accuracy: 317/837 (38%)


Test set: Accuracy: 213/837 (25%)


Test set: Accuracy: 331/837 (40%)


Test set: Accuracy: 269/837 (32%)


Test set: Accuracy: 294/837 (35%)


Test set: Accuracy: 360/837 (43%)


Test set: Accuracy: 256/837 (31%)


Test set: Accuracy: 333/837 (40%)


Test set: Accuracy: 298/837 (36%)


Test set: Accuracy: 370/837 (44%)


Test set: Accuracy: 262/837 (31%)


Test set: Accuracy: 428/837 (51%)


Test set: Accuracy: 449/837 (54%)


Test set: Accuracy: 449/837 (54%)


Test set: Accuracy: 462/837 (55%)


Test set: Accuracy: 471/837 (56%)


Test set: Accuracy: 458/837 (55%)


Test set: Accuracy: 466/837 (56%)


Test set: Accuracy: 457/837 (55%)


Test set: Accuracy: 441/837