# Stage 1: Musical Instrument Classifier 





### Installing / Importing Packages, Mounting Google Drive

In [None]:
# Install pinned releases of torchaudio and torchvision for this notebook.
!pip install torchaudio==0.6.0
!pip install torchvision==0.7.0



In [None]:
# import all libraries we need here before starting 

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim 
import matplotlib.pyplot as plt
import torch.utils.data as Data
import torch.nn as nn
import torchaudio
import numpy as np
import math
import os 
import shutil
use_cuda = True

### Data Processing

In [None]:
# Mount Google Drive at /content/drive; the dataset archives and saved models are read from there.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
%%capture
# Extract the training archive from Drive into /root/ (%%capture hides the unzip output).
!unzip '/content/drive/My Drive/APS 360 Project/IRMAS-Training-Small.zip' -d '/root/'

In [None]:
# clean folders

# classes = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']
classes = ['gac','pia','tru','vio']
train_dir = "/root/IRMAS-Small"
# Delete every entry in the dataset root whose name is not one of the selected classes.
for file in os.listdir(train_dir):
  if file not in classes:
    _unwanted_path = os.path.join(train_dir, file)
    # os.remove cannot delete a directory, so unwanted class folders are removed
    # recursively; plain files (and symlinks) are still removed with os.remove.
    if os.path.isdir(_unwanted_path) and not os.path.islink(_unwanted_path):
      shutil.rmtree(_unwanted_path)
    else:
      os.remove(_unwanted_path)
    print("Removed {}".format(file))

In [None]:
Normalized_Max = 0.5  # peak magnitude that audio_loader rescales every waveform to

def normalize_waveform(waveform, norm_max=0.5):
  """Rescale each row of `waveform` so that its peak magnitude equals `norm_max`.

  Args:
    waveform: 2-D tensor; the peak is taken over dim 1 independently for each row
      (presumably a (channels, samples) tensor from torchaudio.load -- confirm).
    norm_max: target peak magnitude after scaling.

  Returns:
    A float tensor with the same shape as `waveform`. Rows that are entirely
    zero are returned as zeros instead of NaN.
  """
  waveform = waveform.float()
  max_magnitudes = waveform.abs().max(dim=1, keepdim=True)[0]
  # A silent row has a peak of 0 and dividing by it would produce NaN for the
  # whole row; dividing by 1 instead leaves such rows at zero.
  max_magnitudes[max_magnitudes == 0] = 1.0
  normalized_waveforms = waveform.div(max_magnitudes) * norm_max
  return normalized_waveforms

def audio_loader(file_path):
  """Load the audio file at `file_path` and return its peak-normalized waveform.

  The second value returned by torchaudio.load is discarded; the waveform is
  rescaled by normalize_waveform with the module-level Normalized_Max as target.
  """
  loaded = torchaudio.load(file_path)
  return normalize_waveform(loaded[0], Normalized_Max)

# Dataset over the .wav files under /root/IRMAS-Small; each file is read with audio_loader.
audioFolder = torchvision.datasets.DatasetFolder("/root/IRMAS-Small", loader=audio_loader, extensions='wav')

In [None]:
VALIDATION_PERCENTAGE = 0.2  # fraction of the examples held out for validation
TEST_PERCENTAGE = 0.2        # fraction of the examples held out for testing

def get_data_indices(data_size):
  """Split the indices 0..data_size-1 into shuffled train/validation/test arrays.

  The global NumPy RNG is re-seeded with 1 on every call, so the same
  `data_size` always produces the same split.

  Returns:
    (training_indices, val_indices, test_indices). The validation and test sizes
    are the floored percentages above; training receives the remainder.
  """
  np.random.seed(1)
  shuffled = np.arange(data_size)
  np.random.shuffle(shuffled)

  n_val = math.floor(data_size * VALIDATION_PERCENTAGE)
  n_test = math.floor(data_size * TEST_PERCENTAGE)
  n_train = data_size - n_val - n_test

  # Consecutive slices of the shuffled order: train, then validation, then test.
  val_end = n_train + n_val
  return shuffled[:n_train], shuffled[n_train:val_end], shuffled[val_end:]

def get_data_loaders(folder, batch_size=64):
  """Build training, validation and test DataLoaders over `folder`.

  The dataset is partitioned with get_data_indices(len(folder)) and each
  partition is wrapped in a Subset. No shuffle argument is passed to the
  loaders, so batches follow the order of the (already shuffled) index arrays.

  Returns:
    (train_loader, validation_loader, test_loader)
  """
  index_splits = get_data_indices(len(folder))

  # One Subset + DataLoader per split, in train / validation / test order.
  loaders = [
      Data.DataLoader(torch.utils.data.Subset(folder, split), batch_size=batch_size)
      for split in index_splits
  ]
  return tuple(loaders)

In [None]:
# With a batch size of 1, len(loader) equals the number of examples in that split.
train_loader, val_loader, test_loader = get_data_loaders(audioFolder, 1)

# Output the size of each dataset.
print("# of training examples: ", len(train_loader))
print("# of validation examples: ", len(val_loader))
print("# of test examples: ", len(test_loader))

# of training examples:  1732
# of validation examples:  576
# of test examples:  576


In [None]:
# Look at the first training batch only, to check the shape of one waveform.
for i, data in enumerate(train_loader, 0):
  inputs, labels = data
  print(inputs[0].shape)
  
  break

torch.Size([2, 132299])


In [None]:
# `labels` is left over from the loop in the previous cell (batch size 1, so .item() is valid).
print(labels.item())

0


### Training the Random Forest Baseline Model

In [None]:
# Flatten every training waveform into a 1-D NumPy vector for the scikit-learn baseline.
# train_loader was built with batch size 1, so inputs[0] is the only example in each batch.
train_inputs = []
train_labels = []
for i,data in enumerate(train_loader,0):
    inputs,labels = data
    train_inputs.append(torch.reshape(inputs[0],(-1,)).numpy())
    train_labels.append(labels.item())


In [None]:
# Convert the collected Python lists to NumPy arrays for model.fit.
train_inputs = np.array(train_inputs)
train_labels = np.array(train_labels)

In [None]:
# Same flattening as for the training split, applied to the validation split.
val_inputs = []
val_labels = []
for i,data in enumerate(val_loader,0):
    inputs,labels = data
    val_inputs.append(torch.reshape(inputs[0],(-1,)).numpy())
    val_labels.append(labels.item())


In [None]:
# Convert the collected Python lists to NumPy arrays for model.predict.
val_inputs = np.array(val_inputs)
val_labels = np.array(val_labels)

In [None]:
#baseline model training here
# Random Forest
# NOTE(review): no random_state is passed to the classifier, so results may vary
# between runs -- confirm whether that is intended.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500)

# Fit the model to our training data
model.fit(train_inputs, train_labels)

# Make predictions
val_predicted = model.predict(val_inputs)


In [None]:
# Baseline accuracy: fraction of validation examples whose predicted class equals the label.
correct = 0
for i in range(len(val_predicted)):
    if val_predicted[i] == val_labels[i]:
        correct +=1 
print("accuracy of baseline model: {0}".format(correct/len(val_predicted)))

accuracy of baseline model: 0.1849366144668158


### Convolutional Network Architecture

In [None]:
# Start with a simple CNN; an AlexNet-like architecture is a possible later improvement.
# Each input has 132,299 samples per channel, so the network must downscale the
# signal heavily while still extracting the relevant features.

num_classes = len(classes)

class MyNet(nn.Module):
    """First CNN variant: three Conv1d + ReLU + max-pool stages and two linear layers.

    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis together with the time axis; the
    in_channels of conv2 (10) and conv3 (2) only line up under that reading -- confirm.
    Expected per-sample length for a 132,299-sample input, per the original notes:
    132,289 -> 66,144 -> 66,136 -> 33,068 -> 33,064 -> 8,266.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=20, kernel_size=11)  # audio files have 2 input channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=5, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(8266, 500)
        self.fc2 = nn.Linear(500, num_classes)

    def forward(self, x):
        """Return one row of `num_classes` scores per 8266-value feature vector."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        stage3 = self.pool2(F.relu(self.conv3(stage2)))
        flat = stage3.view(-1, 8266)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)
print('Convolutional Neural Network Architecture Selected')

In [None]:
class MyNet2(nn.Module):
    """CNN variant with a 100-unit hidden layer between the conv stack and the output.

    Three Conv1d + ReLU + max-pool stages feed an 8266 -> 100 -> num_classes classifier.
    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis as well as the time axis; the
    in_channels of conv2 (10) and conv3 (2) assume that -- confirm.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=20, kernel_size=11)  # audio files have 2 input channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=5, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(8266, 100)
        self.fc2 = nn.Linear(100, num_classes)

    def forward(self, x):
        """Return class scores; the original notes give lengths 132,299 -> 66,144 -> 33,068 -> 8,266."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        stage3 = self.pool2(F.relu(self.conv3(stage2)))
        flat = stage3.view(-1, 8266)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
class MyNet3(nn.Module):
    """CNN variant with a single linear layer (8266 -> num_classes) after the conv stack.

    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis as well as the time axis; the
    in_channels of conv2 (10) and conv3 (2) assume that -- confirm.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=20, kernel_size=11)  # audio files have 2 input channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=5, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(8266, num_classes)

    def forward(self, x):
        """Return class scores; the original notes give lengths 132,299 -> 66,144 -> 33,068 -> 8,266."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        stage3 = self.pool2(F.relu(self.conv3(stage2)))
        return self.fc1(stage3.view(-1, 8266))

In [None]:
class MyNet4(nn.Module):
    """Same layers as MyNet3, with leaky ReLU instead of ReLU after each convolution.

    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis as well as the time axis; the
    in_channels of conv2 (10) and conv3 (2) assume that -- confirm.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=20, kernel_size=11)  # audio files have 2 input channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=5, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(8266, num_classes)

    def forward(self, x):
        """Return class scores; the original notes give lengths 132,299 -> 66,144 -> 33,068 -> 8,266."""
        stage1 = self.pool(F.leaky_relu(self.conv1(x)))
        stage2 = self.pool(F.leaky_relu(self.conv2(stage1)))
        stage3 = self.pool2(F.leaky_relu(self.conv3(stage2)))
        return self.fc1(stage3.view(-1, 8266))

In [None]:
class MyNet5(nn.Module):
    """Leaky-ReLU CNN with batch normalisation after every pooling step.

    Three Conv1d + leaky ReLU + max-pool + BatchNorm1d stages feed a single
    8266 -> num_classes linear layer.

    The in_channels of conv2 and conv3 are 10 and 2, matching the BatchNorm1d
    sizes declared in this class (10, 2, 1) and the sibling MyNet6. The earlier
    values (20 and 5) did not match the activations that reach those layers, so
    the forward pass could not run.
    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis as well as the time axis
    (20 -> 10, 5 -> 2, 4 -> 1) -- confirm.
    """

    def __init__(self):
        super(MyNet5, self).__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(2, 20, 11) # input channel is 2 for audio files
        self.pool = nn.MaxPool2d(2, 2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(10, 5, 9)  # 10 channels arrive here (see bn1)
        self.conv3 = nn.Conv1d(2, 4, 5)   # 2 channels arrive here (see bn2)
        self.pool2 = nn.MaxPool2d(4, 4)
        self.fc1 = nn.Linear(8266, num_classes)
        self.bn1 = nn.BatchNorm1d(10)
        self.bn2 = nn.BatchNorm1d(2)
        self.bn3 = nn.BatchNorm1d(1)

    def forward(self, x):
        """Return class scores; the original notes give lengths 132,299 -> 66,144 -> 33,068 -> 8,266."""
        x = self.bn1(self.pool(F.leaky_relu(self.conv1(x))))
        x = self.bn2(self.pool(F.leaky_relu(self.conv2(x))))
        x = self.bn3(self.pool2(F.leaky_relu(self.conv3(x))))
        x = x.view(-1, 8266)
        x = self.fc1(x)
        return x

In [None]:
class MyNet6(nn.Module):
    """ReLU CNN with batch normalisation after every pooling step.

    Three Conv1d + ReLU + max-pool + BatchNorm1d stages feed a single
    8266 -> num_classes linear layer.
    NOTE(review): the pools are nn.MaxPool2d applied to the 3-D Conv1d output, so
    they presumably shrink the channel axis as well as the time axis; the
    BatchNorm1d sizes (10, 2, 1) and conv in_channels assume that -- confirm.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=20, kernel_size=11)  # audio files have 2 input channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by the first two stages
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=5, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(8266, num_classes)
        self.bn1 = nn.BatchNorm1d(10)
        self.bn2 = nn.BatchNorm1d(2)
        self.bn3 = nn.BatchNorm1d(1)

    def forward(self, x):
        """Return class scores; the original notes give lengths 132,299 -> 66,144 -> 33,068 -> 8,266."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool, self.bn2),
            (self.conv3, self.pool2, self.bn3),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        return self.fc1(x.view(-1, 8266))

In [None]:
class MyNet7(nn.Module):
    """Four Conv1d -> ReLU -> MaxPool1d(4) -> BatchNorm1d stages, dropout (p=0.2), one linear layer.

    The activations are flattened to rows of 1030 values before fc1.
    `self.pool` is defined but not used by forward.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=16, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=8, kernel_size=5)
        self.conv4 = nn.Conv1d(in_channels=8, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(1030, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(8)
        self.bn4 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.2)

    def forward(self, x):
        """Run the four conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, norm in stages:
            x = norm(self.pool2(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 1030))

In [None]:
class MyNet8(nn.Module):
    """Same layers as MyNet7 with a dropout probability of 0.3.

    Four Conv1d -> ReLU -> MaxPool1d(4) -> BatchNorm1d stages, dropout, then a
    1030 -> num_classes linear layer. `self.pool` is defined but not used by forward.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=16, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=8, kernel_size=5)
        self.conv4 = nn.Conv1d(in_channels=8, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(1030, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(8)
        self.bn4 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the four conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, norm in stages:
            x = norm(self.pool2(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 1030))

In [None]:
class MyNet9(nn.Module):
    """Five-stage 1-D CNN: two stages pooled by 2, three pooled by 4, dropout (p=0.3), one linear layer.

    Every stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d. The activations
    are flattened to rows of 1030 values before fc1.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=16, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=8, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(1030, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(16)
        self.bn4 = nn.BatchNorm1d(8)
        self.bn5 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the five conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 1030))

In [None]:
class MyNet10(nn.Module):
    """Five-stage 1-D CNN: first stage pooled by 2, the other four by 4, dropout (p=0.3), one linear layer.

    Every stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d. The activations
    are flattened to rows of 514 values before fc1.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=16, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=8, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(514, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(16)
        self.bn4 = nn.BatchNorm1d(8)
        self.bn5 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the five conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool2, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 514))

In [None]:
class MyNet11(nn.Module):
    """Wider version of MyNet10's layout (channels 2 -> 32 -> 64 -> 32 -> 16 -> 2).

    Every stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d; the first stage
    pools by 2 and the remaining four by 4. Dropout (p=0.3) precedes the
    514 -> num_classes linear layer.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=32, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=16, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(514, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(32)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(32)
        self.bn4 = nn.BatchNorm1d(16)
        self.bn5 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the five conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool2, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 514))

In [None]:
class MyNet12(nn.Module):
    """Widest five-stage variant (channels 2 -> 64 -> 64 -> 64 -> 32 -> 2).

    Every stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d; the first stage
    pools by 2 and the remaining four by 4. Dropout (p=0.3) precedes the
    514 -> num_classes linear layer.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=64, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=32, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(514, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(32)
        self.bn5 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the five conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool2, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 514))

In [None]:
class MyNet13(nn.Module):
    """Six-stage 1-D CNN (channels 2 -> 64 -> 64 -> 64 -> 32 -> 16 -> 2) with one linear layer.

    Every stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d; the first two
    stages pool by 2 and the remaining four by 4. Dropout (p=0.3) precedes the
    254 -> num_classes linear layer.
    """

    def __init__(self):
        super().__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=64, kernel_size=11)  # audio files have 2 input channels
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=5)
        self.conv6 = nn.Conv1d(in_channels=16, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(254, num_classes)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(32)
        self.bn5 = nn.BatchNorm1d(16)
        self.bn6 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        """Run the six conv stages, apply dropout, and return class scores."""
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
            (self.conv6, self.pool2, self.bn6),
        )
        for conv, pool, norm in stages:
            x = norm(pool(F.relu(conv(x))))
        x = self.drop(x)
        return self.fc1(x.view(-1, 254))

### Training Code

In [None]:
def get_accuracy(model, loader):
    """Return the fraction of examples in `loader` that `model` classifies correctly.

    The model is put in evaluation mode (dropout disabled, batch-norm using its
    running statistics) and gradient tracking is turned off for the duration of
    the call. The model's previous train/eval mode is restored before returning.

    Args:
        model: network returning one row of class scores per example.
        loader: iterable of (inputs, labels) batches.

    Returns:
        correct / total as a float, or 0.0 if the loader yields no examples.
    """
    on_gpu = use_cuda and torch.cuda.is_available()
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, labels in loader:
                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                output = model(inputs)
                #select index with maximum prediction score
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += inputs.shape[0]
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)
    return correct / total if total else 0.0

def train(model, train_loader=None, valid_loader=None, batch_size=64, num_epochs=5, learning_rate=1e-4, checkpoint=False, checkpoint_name=None, checkpoint_bestonly=False): 
    """Train `model` with Adam and cross-entropy loss, then plot loss and accuracy curves.

    Args:
        model: network to train; moved to GPU by the caller if desired.
        train_loader, valid_loader: loaders to use. If either is None, both are
            rebuilt from `audioFolder` with `batch_size`.
        batch_size: batch size for the rebuilt loaders; also part of checkpoint file names.
        num_epochs: number of passes over the training loader.
        learning_rate: Adam learning rate; also part of checkpoint file names.
        checkpoint: when True, save the model's state_dict to the Drive folder.
        checkpoint_name: optional prefix for the checkpoint file name.
        checkpoint_bestonly: when True, save only when validation accuracy
            improves on the best seen so far (same file overwritten); otherwise
            save after every epoch.

    Each epoch runs the optimisation steps in train mode, then measures training
    accuracy, validation loss and validation accuracy in eval mode with gradients
    disabled. The model's original train/eval mode is restored after the last epoch.
    """
    torch.manual_seed(1)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_valacc = 0.0
    on_gpu = use_cuda and torch.cuda.is_available()
    was_training = model.training

    if train_loader is not None and valid_loader is not None:
        # Use the caller's validation loader (the loops below read `val_loader`).
        val_loader = valid_loader
    else:
        train_loader, val_loader, _ = get_data_loaders(audioFolder, batch_size) 

    epoch_plot, losses, val_losses, train_acc, val_acc = [], [], [], [], []
    for epoch in range(num_epochs):
        # Optimisation pass: dropout active, batch-norm uses batch statistics.
        model.train()
        total_train_loss = 0
        num_train_batch = 0
        for inputs, labels in train_loader:

            if on_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            pred = model(inputs)
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # NOTE(review): the criterion's default reduction already averages over
            # the batch, so this divides by the batch size a second time; kept so
            # logged values stay comparable with earlier runs.
            total_train_loss = total_train_loss + (float(loss.item()) /len(inputs))
            num_train_batch += 1
        total_train_loss = total_train_loss / num_train_batch #/ 21502
        losses.append(float(total_train_loss))

        # Measurement pass: eval mode and no gradients, so measuring does not
        # change batch-norm statistics and dropout does not distort the metrics.
        model.eval()
        with torch.no_grad():
            train_acc.append(get_accuracy(model,train_loader))

            # make validation predictions and calculate loss
            total_val_loss = 0
            num_val_batch = 0
            for inputs, labels in val_loader:

                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                pred = model(inputs)
                val_loss = criterion(pred,labels)

                total_val_loss = total_val_loss + (float(val_loss.item()) /len(inputs))
                num_val_batch += 1
            total_val_loss = total_val_loss / num_val_batch #/ 4608
            val_losses.append(float(total_val_loss))
            val_acc.append(get_accuracy(model,val_loader))

        epoch_plot.append(epoch)
        print('Epoch:{}, Loss:{:.4f}, Val_Loss:{:.4f}, Train_acc:{:.4f}, Val_acc:{:.4f}'.format(
            epoch+1,
            float(total_train_loss),
            float(total_val_loss),
            float(train_acc[epoch]),
            float(val_acc[epoch])))

        # Save the current model (checkpoint) to a file
        if checkpoint:
            if (checkpoint_bestonly and val_acc[-1] > best_valacc):
                best_valacc = val_acc[-1]
                if checkpoint_name is not None:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_best".format(checkpoint_name,batch_size,learning_rate,epoch)
                else:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}_best".format(batch_size,learning_rate,epoch)
                torch.save(model.state_dict(), model_path)
            elif not checkpoint_bestonly:
                if checkpoint_name is not None:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_epoch={}".format(checkpoint_name,batch_size,learning_rate,epoch)
                else:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}_epoch={}".format(batch_size,learning_rate,epoch)
                torch.save(model.state_dict(), model_path)

    # Leave the model in the mode the caller passed it in.
    model.train(was_training)

    # plotting (the x axis holds 0-based epoch numbers)
    plt.title("Training Curve")
    plt.plot(epoch_plot, losses, label="Train")
    plt.plot(epoch_plot, val_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(epoch_plot, train_acc, label="Train")
    plt.plot(epoch_plot, val_acc, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

    print("Final Training Accuracy: {}".format(train_acc[-1]))
    print("Final Validation Accuracy: {}".format(val_acc[-1]))
    print ("Maximum validation accuracy for this model is:", max(val_acc),
           "at epoch", epoch_plot[val_acc.index(max(val_acc))],"\n")
    

In [None]:
# Train a fresh MyNet8. No loaders are passed, so train() builds them from
# audioFolder with batch_size=32; only the best-validation checkpoint is saved.
torch.cuda.empty_cache()
model = MyNet8()
if use_cuda and torch.cuda.is_available():
    model = model.cuda()
train(model, learning_rate=0.0003, num_epochs=30, batch_size=32, checkpoint=True, checkpoint_name='MyNet8', checkpoint_bestonly=True)

### Check Test Accuracy

In [None]:
# Load the best saved MyNet13 weights and report accuracy on the test split.
# NOTE(review): .cuda() is called unconditionally here, unlike the use_cuda checks
# used elsewhere, so this cell needs a GPU runtime.
stage_1_model = MyNet13()
saved_model = '/content/drive/My Drive/APS 360 Project/saved_models/MyNet13_batch_size=16_lr=0.0003_best_0.7257valacc'
stage_1_model.load_state_dict(torch.load(saved_model))
train_loader, val_loader, test_loader = get_data_loaders(audioFolder, 16) 
print(get_accuracy(stage_1_model.eval().cuda(),test_loader))

# Stage 2: Multi-Instrument Identification

### Data Processing

In [None]:
# Combine audio files
import os

input_dir_path = "/root/IRMAS-Small"
combined_dir_path = "/root/IRMAS-Combine"

# Make a directory in /root/ (no error if a previous run already created it)
os.makedirs(combined_dir_path, exist_ok=True)

# Sorted so that pair folders are always named in alphabetical order
# (first class + second class), whatever order os.listdir happens to return.
class_directories = sorted(os.listdir(input_dir_path))
file_count = len(os.listdir(os.path.join(input_dir_path, class_directories[0])))

# create files with combined and normalized audio
for i in range(len(class_directories)):
  for j in range(i+1, len(class_directories)):
    dir_name = class_directories[i] + '+' + class_directories[j]
    dir_path = os.path.join(combined_dir_path, dir_name)
    # Start from an empty folder so files from a previous run cannot linger.
    if os.path.isdir(dir_path):
      shutil.rmtree(dir_path)
    os.mkdir(dir_path)
    # Sorted file lists make the pairing of clips reproducible between runs.
    class1_names = sorted(os.listdir(os.path.join(input_dir_path, class_directories[i])))
    class2_names = sorted(os.listdir(os.path.join(input_dir_path, class_directories[j])))
    # Never index past the shorter of the two classes.
    pair_count = min(file_count, len(class1_names), len(class2_names))
    for file_index in range(pair_count):
      wave1, sample_rate = torchaudio.load(os.path.join(input_dir_path, class_directories[i], class1_names[file_index]))
      wave2, sample_rate = torchaudio.load(os.path.join(input_dir_path, class_directories[j], class2_names[file_index]))
      # Bring both clips to the same peak level before mixing, then re-normalize the mix.
      wave1 = normalize_waveform(wave1, 0.5)
      wave2 = normalize_waveform(wave2, 0.5)
      combined_wave = wave1 + wave2
      combined_wave = normalize_waveform(combined_wave, 0.5)
      file_name = dir_name + str(file_index) + ".wav"
      torchaudio.save(os.path.join(dir_path, file_name), combined_wave, sample_rate=sample_rate)


In [None]:
from zipfile import ZipFile

# Create zip file for combined audio files and folders
folder_paths = [os.path.join(combined_dir_path, folder_name) for folder_name in os.listdir(combined_dir_path)]
# writing files to a zipfile
# The handle is deliberately not named `zip`: at notebook top level that would
# replace the builtin zip() for every cell executed afterwards.
with ZipFile('/root/IRMAS-Combine.zip','w') as combined_archive:
  # writing each file one by one, stored as IRMAS-Combine/<folder>/<file>
  for folder in folder_paths:
    for file_name in os.listdir(folder):
      combined_archive.write(os.path.join(folder, file_name), os.path.join("IRMAS-Combine", os.path.basename(folder), file_name))

In [None]:
# Copy the saved zip file from /root to our shared Drive folder, where the unzip cell below reads it.
!cp '/root/IRMAS-Combine.zip' '/content/drive/My Drive/APS 360 Project/'

In [None]:
# Unzip combined dataset to root directory
%%capture
!unzip '/content/drive/My Drive/APS 360 Project/IRMAS-Combine.zip' -d '/root/'

In [None]:
# Multi-hot target per two-instrument class; columns are ordered gac, pia, tru, vio.
# Row index = dataset class index (presumably the sorted pair folders -- confirm).
one_hot_class_targets = np.array([
    (1, 1, 0, 0),
    (1, 0, 1, 0),
    (1, 0, 0, 1),
    (0, 1, 1, 0),
    (0, 1, 0, 1),
    (0, 0, 1, 1),
])

def get_one_hot_targets(class_index):
  """Return the row of one_hot_class_targets for the given class index."""
  target_row = one_hot_class_targets[class_index]
  return target_row

def audio_loader(file_path):
  """Load the audio file at `file_path` and return its waveform unchanged.

  NOTE: this redefinition replaces the earlier, normalizing audio_loader for all
  code executed after this cell.
  """
  return torchaudio.load(file_path)[0]

# Turn each class index into its multi-hot row via get_one_hot_targets.
target_transform = transforms.Compose([
                                transforms.Lambda(get_one_hot_targets)
                               ])

# Dataset over the .wav files under /root/IRMAS-Combine, with multi-hot targets.
combined_audio_folder = torchvision.datasets.DatasetFolder("/root/IRMAS-Combine", loader=audio_loader, target_transform=target_transform, extensions='wav')

FileNotFoundError: ignored

### Baseline Model

In [None]:
# With a batch size of 1, len(loader) equals the number of examples in that split.
train_loader, val_loader, test_loader = get_data_loaders(combined_audio_folder, 1)

# Output the size of each dataset.
print("# of training examples: ", len(train_loader))
print("# of validation examples: ", len(val_loader))
print("# of test examples: ", len(test_loader))

In [None]:
# The labels are multi-hot vectors, so each one is converted to a single integer
# by joining its entries into a bit string and parsing it as a base-2 number.

train_inputs = []
train_labels = []
for i,data in enumerate(train_loader,0):
    inputs,labels = data
    train_inputs.append(torch.reshape(inputs[0],(-1,)).numpy())
    res = int("".join(str(x) for x in np.array(torch.reshape(labels[0],(-1,)).numpy())), 2)  
    train_labels.append(res)

In [None]:
# Convert the collected Python lists to NumPy arrays for model.fit.
train_inputs = np.array(train_inputs)
train_labels = np.array(train_labels)

In [None]:
# Same flattening and bit-string label encoding as the training split, for validation.
val_inputs = []
val_labels = []
for i,data in enumerate(val_loader,0):
    inputs,labels = data
    val_inputs.append(torch.reshape(inputs[0],(-1,)).numpy())
    res = int("".join(str(x) for x in np.array(torch.reshape(labels[0],(-1,)).numpy())), 2)  
    val_labels.append(res)

In [None]:
# Convert the collected Python lists to NumPy arrays for model.predict.
val_inputs = np.array(val_inputs)
val_labels = np.array(val_labels)

In [None]:
#baseline model training here
# Random Forest
# Each integer-encoded instrument pair is treated as one class.
# NOTE(review): no random_state is passed to the classifier, so results may vary
# between runs -- confirm whether that is intended.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)

# Fit the model to our training data
model.fit(train_inputs, train_labels)

# Make predictions
val_predicted = model.predict(val_inputs)


In [None]:
def accuracy_int_bit(value1,value2, num_bits=4):
  """Return the fraction of the low `num_bits` bits on which two integer labels agree.

  Every bit position is compared, so identical labels score 1.0 (the earlier
  loop looked at only three of the four bits while still dividing by four).

  Args:
    value1, value2: integer-encoded multi-hot labels.
    num_bits: number of bit positions in a label; defaults to 4.

  Returns:
    A float between 0.0 (all bits differ) and 1.0 (all bits equal).
  """
  mask = (1 << num_bits) - 1
  # XOR leaves a 1 in every position where the two labels disagree.
  differing_bits = (int(value1) ^ int(value2)) & mask
  mismatches = bin(differing_bits).count("1")
  return (num_bits - mismatches) / num_bits


In [None]:
# Exact-match accuracy (whole label correct) and partial accuracy (mean of
# accuracy_int_bit over the validation examples).
correct = 0
partial_correct =  0
for i in range(len(val_predicted)):
    if val_predicted[i] == val_labels[i]:
        correct +=1 
    partial_correct += accuracy_int_bit(val_predicted[i],val_labels[i])
print("accuracy of baseline model: {0}".format(correct/len(val_predicted)))
print("Partial accuracy of baseline model: {0}".format(partial_correct/len(val_predicted)))

### Transfer Learning Architecture

In [None]:
class TransferModel(nn.Module): # This model will output an embedding state with shape [batch_num, 514]
  def __init__(self):
    super(TransferModel, self).__init__()
    self.name = "transferNet"
    self.conv1 = nn.Conv1d(2, 64, 11) # input channel is 2 for audio files
    self.conv2 = nn.Conv1d(64, 64, 9)
    self.conv3 = nn.Conv1d(64, 64, 7)
    self.conv4 = nn.Conv1d(64, 32, 5)
    self.conv5 = nn.Conv1d(32, 16, 5)
    self.conv6 = nn.Conv1d(16, 2, 5)
    self.pool = nn.MaxPool1d(2, 2) 
    self.pool2 = nn.MaxPool1d(4, 4)
    self.bn1 = nn.BatchNorm1d(64)
    self.bn2 = nn.BatchNorm1d(64)
    self.bn3 = nn.BatchNorm1d(64)
    self.bn4 = nn.BatchNorm1d(32)
    self.bn5 = nn.BatchNorm1d(16)
    self.bn6 = nn.BatchNorm1d(2)

  def forward(self, x):
    x = self.bn1(self.pool(F.relu(self.conv1(x)))) 
    x = self.bn2(self.pool(F.relu(self.conv2(x)))) 
    x = self.bn3(self.pool2(F.relu(self.conv3(x)))) 
    x = self.bn4(self.pool2(F.relu(self.conv4(x))))
    x = self.bn5(self.pool2(F.relu(self.conv5(x))))
    x = self.bn6(self.pool2(F.relu(self.conv6(x))))
    x = x.view(-1, 254)
    return x

def LoadFeatureModel(state_dict_path, transfered_model):
  """Copy the trained MyNet13 feature layers into `transfered_model` and freeze it.

  Args:
    state_dict_path: path to a saved MyNet13 state_dict (the file is stored in
      the shared google drive).
    transfered_model: module exposing conv1..conv6 and bn1..bn6 attributes,
      e.g. a TransferModel instance. It is modified in place.

  Returns:
    The same `transfered_model`, now holding the MyNet13 layers, with
    requires_grad disabled on all of its parameters.
  """
  # Load onto the CPU so a checkpoint written on a GPU machine can be opened
  # without CUDA; the state is copied into a freshly built (CPU) MyNet13 anyway.
  MyNet13_best_state = torch.load(state_dict_path, map_location=torch.device('cpu'))
  MyNet13_model = MyNet13()
  MyNet13_model.load_state_dict(MyNet13_best_state)

  # Copy features from MyNet13 to transfered_model (the module objects are shared)
  layer_names = ('conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'conv6',
                 'bn1', 'bn2', 'bn3', 'bn4', 'bn5', 'bn6')
  for layer_name in layer_names:
    setattr(transfered_model, layer_name, getattr(MyNet13_model, layer_name))

  # Disable gradient for transfered_model
  for param in transfered_model.parameters():
      param.requires_grad = False

  return transfered_model

def LoadFeature(transfered_model, original_folder, batch_size=64): # Output the feature dataset
  """Run every batch of the dataset through `transfered_model` once and cache the results.

  Args:
    transfered_model: frozen feature extractor (see LoadFeatureModel).
    original_folder: dataset location handed to get_data_loaders().
    batch_size: batch size handed to get_data_loaders().

  Returns:
    (train, val, test): three lists of [features, labels] pairs, one entry per
    batch, usable wherever a loader of (features, labels) is expected. The
    tensors stay on the device the model ran on.
  """
  # Honour the arguments; the folder and batch size used to be hard-coded here.
  train_loader, val_loader, test_loader = get_data_loaders(original_folder, batch_size=batch_size)

  on_gpu = use_cuda and torch.cuda.is_available()
  if on_gpu:
    transfered_model = transfered_model.cuda()

  # Inference mode: batch-norm must use its stored running statistics instead
  # of per-batch statistics, and must not update them while extracting.
  transfered_model.eval()

  def extract(loader):
    # Collect [features, labels] for every batch of one loader.
    batches = []
    with torch.no_grad():  # no autograd bookkeeping for cached features
      for inputs, labels in loader:
        if on_gpu:
          inputs = inputs.cuda()
          labels = labels.cuda()
        batches.append([transfered_model(inputs), labels])
    return batches

  feature_train_loader = extract(train_loader)
  feature_val_loader = extract(val_loader)
  feature_test_loader = extract(test_loader)

  return feature_train_loader, feature_val_loader, feature_test_loader

In [None]:
class predictionNet(nn.Module):
    """Fully connected head that maps a 254-value embedding to 4 outputs.

    Dropout is applied to the input, followed by three linear layers
    (254 -> 100 -> 50 -> 4) applied back to back with no activation in between.
    """
    def __init__(self):
        super(predictionNet, self).__init__()
        self.name = "prediction_net"
        self.fc1 = nn.Linear(in_features=254, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=50)
        self.fc3 = nn.Linear(in_features=50, out_features=4)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        hidden = self.fc1(self.dropout(x))
        return self.fc3(self.fc2(hidden))

In [None]:
def get_accuracy_transfer_learning(prediction_model, loader):
    """
    Per-label accuracy of the prediction head over a loader of features.

    Every output is thresholded at 0 and compared with its label on its own,
    so the score is (matching label entries) / (total label entries); a sample
    does not need all of its outputs right to contribute.

    Args:
        prediction_model: module mapping a batch of features to one output per label.
        loader: iterable of (features, labels) batches.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the loader yields no labels.
    """
    on_gpu = use_cuda and torch.cuda.is_available()
    correct = 0
    total = 0

    # Evaluate in inference mode (dropout off) and put the model back afterwards,
    # because the training loop calls this in the middle of training.
    was_training = prediction_model.training
    prediction_model.eval()
    try:
        with torch.no_grad():
            for features, labels in loader:
                if on_gpu:
                    features = features.cuda()
                    labels = labels.cuda()

                outputs = prediction_model(features)
                one_hot_outputs = (outputs >= 0).int()

                correct += (one_hot_outputs == labels).sum().item()
                total += labels.numel()
    finally:
        prediction_model.train(was_training)

    return correct / total if total else 0.0

def transfer_train(prediction_model, train_loader=None, val_loader=None, batch_size=64, num_epochs=5,
            learning_rate=1e-4, checkpoint=False, checkpoint_name=None, checkpoint_bestonly=False,
            accuracy=get_accuracy_transfer_learning):
    """
    Train the prediction head on pre-computed features and plot the learning curves.

    Args:
        prediction_model: module to train; optimised in place with Adam.
        train_loader, val_loader: iterables of (features, labels) batches. If
            either is None, both are rebuilt with get_data_loaders().
        batch_size: used when the loaders are rebuilt and in checkpoint file names.
        num_epochs: number of passes over train_loader.
        learning_rate: Adam learning rate (also part of checkpoint file names).
        checkpoint: if True, state_dicts are written under the saved_models folder.
        checkpoint_name: optional prefix for checkpoint file names.
        checkpoint_bestonly: if True, only the state with the highest validation
            accuracy is kept and it is saved once after the last epoch;
            otherwise a checkpoint is saved after every epoch.
        accuracy: callable(model, loader) -> float used for train and validation accuracy.

    Returns:
        None. Prints per-epoch metrics and shows the loss and accuracy plots.
    """
    # Local imports: the import cell at the top of the notebook has neither module.
    import copy
    import time

    torch.manual_seed(1)
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(prediction_model.parameters(), lr=learning_rate)
    best_valacc = 0.0
    best_epoch = None
    best_model_state = None

    if train_loader is None or val_loader is None:
        train_loader, val_loader, _ = get_data_loaders(combined_audio_folder, batch_size)

    on_gpu = use_cuda and torch.cuda.is_available()
    epoch_plot, losses, val_losses, train_acc, val_acc = [], [], [], [], []

    for epoch in range(num_epochs):
        total_train_loss = 0
        num_train_batch = 0
        start_time = time.time()
        prediction_model.train()
        for features, labels in train_loader:

            if on_gpu:
                features = features.cuda()
                labels = labels.cuda()

            outputs = prediction_model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_train_loss += loss.item()
            num_train_batch += 1
        total_train_loss = total_train_loss / num_train_batch
        losses.append(total_train_loss)

        # Everything below is measurement only: switch off dropout and autograd
        # so train accuracy is taken under the same conditions as validation.
        prediction_model.eval()
        with torch.no_grad():
            train_acc.append(accuracy(prediction_model, train_loader))

            # make validation predictions and calculate loss
            total_val_loss = 0
            num_val_batch = 0
            for features, labels in val_loader:

                if on_gpu:
                    features = features.cuda()
                    labels = labels.cuda()

                outputs = prediction_model(features)
                val_loss = criterion(outputs, labels)

                total_val_loss += val_loss.item()
                num_val_batch += 1
            total_val_loss = total_val_loss / num_val_batch
            val_losses.append(float(total_val_loss))
            val_acc.append(accuracy(prediction_model, val_loader))

        epoch_plot.append(epoch+1)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print('Epoch:{}, Loss:{:.4f}, Val_Loss:{:.4f}, Train_acc:{:.4f}, Val_acc:{:.4f}, Total time elapsed: {:.2f} seconds'.format(
            epoch+1,
            float(total_train_loss),
            float(total_val_loss),
            float(train_acc[epoch]),
            float(val_acc[epoch]),
            elapsed_time))

        # Save the current model (checkpoint) to a file
        if checkpoint:
            if checkpoint_bestonly:
                if val_acc[-1] > best_valacc:
                    best_valacc = val_acc[-1]
                    best_epoch = epoch+1
                    # state_dict() returns references to the live parameters, so take
                    # a copy; otherwise later epochs overwrite the "best" weights.
                    best_model_state = copy.deepcopy(prediction_model.state_dict())
            else:
                if checkpoint_name is not None:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_epoch={}".format(checkpoint_name,batch_size,learning_rate,epoch)
                else:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}_epoch={}".format(batch_size,learning_rate,epoch)
                torch.save(prediction_model.state_dict(), model_path)

    # Write the best state only if one was recorded; without best-only
    # checkpointing there is nothing to save here.
    if best_model_state is not None:
        if checkpoint_name is not None:
            model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_epoch={}_best".format(checkpoint_name,batch_size,learning_rate,best_epoch)
        else:
            model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}__epoch={}_best".format(batch_size,learning_rate,best_epoch)
        torch.save(best_model_state, model_path)

    # plotting (the x axis holds epoch numbers)
    plt.title("Training Curve")
    plt.plot(epoch_plot, losses, label="Train")
    plt.plot(epoch_plot, val_losses, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(epoch_plot, train_acc, label="Train")
    plt.plot(epoch_plot, val_acc, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

    print("Final Training Accuracy: {}".format(train_acc[-1]))
    print("Final Validation Accuracy: {}".format(val_acc[-1]))
    print ("Maximum validation accuracy for this model is:", max(val_acc),
           "at epoch", epoch_plot[val_acc.index(max(val_acc))],"\n")
    

In [None]:
# Build the frozen feature extractor from the saved MyNet13 checkpoint
transfer_model = LoadFeatureModel('/content/drive/My Drive/APS 360 Project/saved_models/MyNet13_batch_size=16_lr=0.0003_best_0.7257valacc', TransferModel())

# Get features vectors for training, validation and testing
feature_train_loader, feature_val_loader, feature_test_loader = LoadFeature(transfer_model, combined_audio_folder, batch_size=64)

In [None]:
# Train a fresh prediction head on the cached feature batches, keeping only
# the state with the best validation accuracy as the checkpoint.
torch.cuda.empty_cache()
prediction_model = predictionNet()
if use_cuda and torch.cuda.is_available():
  prediction_model = prediction_model.cuda()
transfer_train(prediction_model, train_loader=feature_train_loader, val_loader=feature_val_loader, learning_rate=0.0001, num_epochs=150, batch_size=64,
               checkpoint=True, checkpoint_name="Transfer_learning", checkpoint_bestonly=True)

In [None]:
# Get test accuracy for transfer learning model
best_model = predictionNet()

# NOTE(review): the file name hard-codes epoch=8; it has to match the best
# epoch reported by the training run that wrote the checkpoint — confirm.
state = torch.load("/content/drive/My Drive/APS 360 Project/saved_models/Transfer_learning_batch_size=64_lr=0.0001_epoch=8_best")
best_model.load_state_dict(state)

if use_cuda and torch.cuda.is_available():
  best_model = best_model.cuda()

# NOTE(review): get_accuracy_2 is not defined in this part of the notebook —
# confirm it takes (feature model, prediction head, loader).
get_accuracy_2(transfer_model, best_model, feature_test_loader)

### Convolutional Network Architecture

In [None]:
class multiNet(nn.Module):
    """Six-stage 1-D convolutional network that returns four single-value outputs.

    Each stage is Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d. After dropout the
    activations are flattened to rows of 254 values and passed through fc1.
    forward() returns a 4-tuple of tensors of shape [-1, 1].
    """
    def __init__(self):
        super(multiNet, self).__init__()
        self.name = "net"
        self.conv1 = nn.Conv1d(2, 64, 11) # input channel is 2 for audio files
        self.conv2 = nn.Conv1d(64, 64, 9)
        self.conv3 = nn.Conv1d(64, 64, 7)
        self.conv4 = nn.Conv1d(64, 32, 5)
        self.conv5 = nn.Conv1d(32, 16, 5)
        self.conv6 = nn.Conv1d(16, 2, 5)
        self.fc1 = nn.Linear(254, 1)
        self.pool = nn.MaxPool1d(2, 2)
        self.pool2 = nn.MaxPool1d(4, 4)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(32)
        self.bn5 = nn.BatchNorm1d(16)
        self.bn6 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        x = self.bn1(self.pool(F.relu(self.conv1(x))))
        x = self.bn2(self.pool(F.relu(self.conv2(x))))
        x = self.bn3(self.pool2(F.relu(self.conv3(x))))
        x = self.bn4(self.pool2(F.relu(self.conv4(x))))
        x = self.bn5(self.pool2(F.relu(self.conv5(x))))
        x = self.bn6(self.pool2(F.relu(self.conv6(x))))
        x = self.drop(x)
        # Flatten the 2-channel activations into rows of 254 values. Without
        # this, fc1 (in_features=254) is applied to the last dimension of a
        # 3-D tensor and fails with a shape mismatch.
        x = x.view(-1, 254)
        # NOTE(review): all four outputs go through the same fc1, so they are
        # identical; presumably four separate heads were intended — confirm.
        x0 = self.fc1(x)
        x1 = self.fc1(x)
        x2 = self.fc1(x)
        x3 = self.fc1(x)
        return x0,x1,x2,x3

### Training Code

In [None]:
class Stage2_test(nn.Module):
    """
    Sample architecture used to exercise the training code. It is based on
    MyNet10, because the larger models ran into CUDA out of memory errors too
    frequently.

    Five Conv1d -> ReLU -> MaxPool1d -> BatchNorm1d stages, dropout, a flatten
    to rows of 514 values, then fc1. forward() returns a 4-tuple; every entry
    is produced by the same fc1 layer from the same input.
    """
    def __init__(self):
        super(Stage2_test, self).__init__()
        self.name = "net"
        # Convolution stack; the first layer takes 2 input channels (audio files).
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=16, kernel_size=11)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=7)
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=5)
        self.conv5 = nn.Conv1d(in_channels=8, out_channels=2, kernel_size=5)
        self.fc1 = nn.Linear(in_features=514, out_features=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(16)
        self.bn4 = nn.BatchNorm1d(8)
        self.bn5 = nn.BatchNorm1d(2)
        self.drop = nn.Dropout(p=0.3)

    def forward(self, x):
        stages = (
            (self.conv1, self.pool, self.bn1),
            (self.conv2, self.pool2, self.bn2),
            (self.conv3, self.pool2, self.bn3),
            (self.conv4, self.pool2, self.bn4),
            (self.conv5, self.pool2, self.bn5),
        )
        for conv, pool, bn in stages:
            x = bn(pool(F.relu(conv(x))))
        flat = self.drop(x).view(-1, 514)
        # Four outputs, each computed by applying fc1 to the same flattened input.
        return tuple(self.fc1(flat) for _ in range(4))

In [None]:
def get_accuracy_multilabel(model, loader):
    """
    Exact-match accuracy: a sample is counted as correct only if every one of
    its outputs, thresholded at 0, equals the corresponding label.

    Args:
        model: module mapping a batch of inputs to a 2-D tensor of outputs.
        loader: iterable of (inputs, labels) batches.

    Returns:
        Fraction of samples with all outputs correct; 0.0 for an empty loader.
    """
    on_gpu = use_cuda and torch.cuda.is_available()
    correct = 0
    total = 0

    # Measure in inference mode without autograd, then restore the previous
    # mode: the training loop calls this between epochs and keeps training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in loader:
                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = model(inputs)
                predicted = (outputs > 0).long()
                # A row is correct only when all of its entries match.
                correct += (predicted == labels).all(dim=1).sum().item()
                total += inputs.shape[0]
    finally:
        model.train(was_training)

    return correct / total if total else 0.0

def get_part_accuracy_multilabel(model,loader):
    """
    "Part marks" accuracy: every binary output is scored on its own, so a
    correct output counts even if other outputs of the same data sample are
    incorrect.

    Args:
        model: module mapping a batch of inputs to a 2-D tensor of outputs.
        loader: iterable of (inputs, labels) batches.

    Returns:
        (matching label entries) / (total label entries); 0.0 for an empty loader.
    """
    on_gpu = use_cuda and torch.cuda.is_available()
    correct = 0
    total = 0

    # Measure in inference mode without autograd, then restore the previous
    # mode: the training loop calls this between epochs and keeps training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in loader:
                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = model(inputs)
                one_hot_outputs = (outputs >= 0).int()
                correct += (one_hot_outputs == labels).sum().item()
                total += labels.numel()
    finally:
        model.train(was_training)

    return correct / total if total else 0.0


def train_multilabel(model, train_loader=None, valid_loader=None, batch_size=64, num_epochs=5,
            learning_rate=1e-4, checkpoint=False, checkpoint_name=None, checkpoint_bestonly=False):
    """
    Train a multi-label model with MultiLabelSoftMarginLoss and plot its learning curves.

    Args:
        model: module to train; optimised in place with Adam.
        train_loader, valid_loader: iterables of (inputs, labels) batches. If
            either is None, both are rebuilt with get_data_loaders().
        batch_size: used when the loaders are rebuilt and in checkpoint file names.
        num_epochs: number of passes over train_loader.
        learning_rate: Adam learning rate (also part of checkpoint file names).
        checkpoint: if True, state_dicts are written under the saved_models folder.
        checkpoint_name: optional prefix for checkpoint file names.
        checkpoint_bestonly: if True, a checkpoint is written only when the
            validation part-accuracy improves; otherwise after every epoch.

    Returns:
        None. Prints per-epoch metrics and shows loss / accuracy plots.
    """
    torch.manual_seed(1)
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_valacc = 0.0

    if train_loader is not None and valid_loader is not None:
        # The loops below read `val_loader`; bind the caller's loader to it
        # (it used to be left undefined when loaders were passed in).
        val_loader = valid_loader
    else:
        train_loader, val_loader, _ = get_data_loaders(combined_audio_folder, batch_size)

    on_gpu = use_cuda and torch.cuda.is_available()
    epoch_plot, losses, val_losses, train_acc, train_acc_part, val_acc, val_acc_part = [], [], [], [], [], [], []
    for epoch in range(num_epochs):
        total_train_loss = 0
        num_train_batch = 0
        model.train()
        for inputs, labels in train_loader:

            if on_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            pred = model(inputs)
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # The criterion already averages over the batch, so the value is
            # accumulated as is instead of being divided by the batch size again.
            total_train_loss += loss.item()
            num_train_batch += 1
        total_train_loss = total_train_loss / num_train_batch
        losses.append(float(total_train_loss))

        # Everything below is measurement only: dropout and batch-norm updates
        # off, no autograd graphs kept in memory.
        model.eval()
        with torch.no_grad():
            train_acc.append(get_accuracy_multilabel(model,train_loader))
            train_acc_part.append(get_part_accuracy_multilabel(model,train_loader))

            # make validation predictions and calculate loss
            total_val_loss = 0
            num_val_batch = 0
            for inputs, labels in val_loader:

                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                pred = model(inputs)
                val_loss = criterion(pred,labels)

                total_val_loss += val_loss.item()
                num_val_batch += 1
            total_val_loss = total_val_loss / num_val_batch
            val_losses.append(float(total_val_loss))
            val_acc.append(get_accuracy_multilabel(model,val_loader))
            val_acc_part.append(get_part_accuracy_multilabel(model,val_loader))

        # 1-based, so the summary below reports the same epoch numbers as the log line
        epoch_plot.append(epoch+1)
        print('Epoch:{}, Loss:{:.4f}, Val_Loss:{:.4f}, Train_acc:{:.4f}, Train_acc_part:{:.4f}, Val_acc:{:.4f}, Val_acc_part:{:.4f}'.format(
            epoch+1,
            float(total_train_loss),
            float(total_val_loss),
            float(train_acc[epoch]),
            float(train_acc_part[epoch]),
            float(val_acc[epoch]),
            float(val_acc_part[epoch])))

        # Save the current model (checkpoint) to a file
        if checkpoint:
            if checkpoint_bestonly:
                if val_acc_part[-1] > best_valacc:
                    best_valacc = val_acc_part[-1]
                    if checkpoint_name is not None:
                        model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_best".format(checkpoint_name,batch_size,learning_rate)
                    else:
                        model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}_best".format(batch_size,learning_rate)
                    torch.save(model.state_dict(), model_path)
            else:
                if checkpoint_name is not None:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/{}_batch_size={}_lr={}_epoch={}".format(checkpoint_name,batch_size,learning_rate,epoch)
                else:
                    model_path = "/content/drive/My Drive/APS 360 Project/saved_models/batch_size={}_lr={}_epoch={}".format(batch_size,learning_rate,epoch)
                torch.save(model.state_dict(), model_path)

    # plotting (the x axis holds epoch numbers)
    plt.title("Training Curve")
    plt.plot(epoch_plot, losses, label="Train")
    plt.plot(epoch_plot, val_losses, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(epoch_plot, train_acc, label="Train")
    plt.plot(epoch_plot, val_acc, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(epoch_plot, train_acc_part, label="Train")
    plt.plot(epoch_plot, val_acc_part, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Part Accuracy")
    plt.legend(loc='best')
    plt.show()

    print("Final Training Accuracy: {}".format(train_acc[-1]))
    print("Final Training (part) Accuracy: {}".format(train_acc_part[-1]))
    print("Final Validation Accuracy: {}".format(val_acc[-1]))
    print("Final Validation (part) Accuracy: {}".format(val_acc_part[-1]))
    print ("Maximum validation accuracy for this model is:", max(val_acc),
           "at epoch", epoch_plot[val_acc.index(max(val_acc))],"\n")
    print ("Maximum validation (part) accuracy for this model is:", max(val_acc_part),
           "at epoch", epoch_plot[val_acc_part.index(max(val_acc_part))],"\n")

In [None]:
### multi-output training using BCE loss for each output
# (kept for reference only; this variant is disabled)
# torch.cuda.empty_cache()
# model2 = Stage2_test()
# if use_cuda and torch.cuda.is_available():
#     model2 = model2.cuda()
# train_2(model2, learning_rate=0.0003, num_epochs=30, batch_size=32, checkpoint=True, checkpoint_name='Stage2_test', checkpoint_bestonly=True)

### multi-label soft margin loss for single vector output
# No loaders are passed, so train_multilabel builds them itself with batch_size=32.
torch.cuda.empty_cache()
model2 = MyNet10()
if use_cuda and torch.cuda.is_available():
    model2 = model2.cuda()
train_multilabel(model2, learning_rate=0.0001, num_epochs=50, batch_size=32, checkpoint=True, checkpoint_name='Stage2_MyNet10', checkpoint_bestonly=True)

# Overall Results

In [None]:
from sklearn.metrics import confusion_matrix

# Loaders with batch size 16 for the two datasets; test_loader1 is used for the
# stage 1 results and test_loader2 for the stage 2 results below.
train_loader1, val_loader1, test_loader1 = get_data_loaders(audioFolder, 16) 
train_loader2, val_loader2, test_loader2 = get_data_loaders(combined_audio_folder, 16) 

### Stage 1 

In [None]:
# load stage 1 model
# load in best model and check test accuracy
stage_1_model = MyNet13()
saved_model = '/content/drive/My Drive/APS 360 Project/saved_models/MyNet13_batch_size=16_lr=0.0003_best_0.7257valacc'
stage_1_model.load_state_dict(torch.load(saved_model,map_location=torch.device('cpu')))

# Inference mode; move to the GPU only when one is available, matching the
# guard used for the input batches below.
on_gpu = use_cuda and torch.cuda.is_available()
stage_1_model = stage_1_model.eval()
if on_gpu:
    stage_1_model = stage_1_model.cuda()

# do test set predictions for stage 1
print("Overall Test Accuracy (Stage 1):",get_accuracy(stage_1_model,test_loader1))
print("Confusion Matrix:")
all_outputs = []
all_labels = []
with torch.no_grad():  # predictions only, no autograd graphs needed
    for inputs, labels in test_loader1:
        if on_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = stage_1_model(inputs)
        # index of the largest output per sample is the predicted class
        pred = outputs.max(1, keepdim=True)[1].view_as(labels)
        all_outputs.extend(pred.tolist())
        all_labels.extend(labels.tolist())
print(confusion_matrix(all_labels,all_outputs))

### Stage 2 

In [None]:
# load stage 2 model (non transfer learning)
stage_2_model = MyNet13()
saved_model = '/content/drive/My Drive/APS 360 Project/saved_models/Stage2_MyNet13_batch_size=16_lr=0.0003_best_0.6905valaccpart_0.2243valacc'
stage_2_model.load_state_dict(torch.load(saved_model,map_location=torch.device('cpu')))

# Inference mode; move to the GPU only when one is available, matching the
# guard used for the input batches below.
on_gpu = use_cuda and torch.cuda.is_available()
stage_2_model = stage_2_model.eval()
if on_gpu:
    stage_2_model = stage_2_model.cuda()

# do test set predictions for stage 2 (non transfer learning)
# print(get_part_accuracy_multilabel_class(stage_2_model.eval().cuda(),test_loader2,0))
print("Overall Test Accuracy (Stage 2)",get_part_accuracy_multilabel(stage_2_model,test_loader2))
all_outputs = []
all_labels = []
# decision threshold for each output
t = torch.Tensor([0])
if on_gpu:
    t = t.cuda()
with torch.no_grad():  # predictions only, no autograd graphs needed
    for inputs, labels in test_loader2:
        if on_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = stage_2_model(inputs)
        pred = (outputs >= t).int()
        all_outputs.extend(pred.tolist())
        all_labels.extend(labels.tolist())

def multi_hot_to_num(label):
    """Map a multi-hot label list to a class index for the confusion matrix.

    The six labels with exactly two entries set map to 0..5 in the order
    listed below; any other label maps to 6.
    """
    pair_patterns = [
        [1,1,0,0],
        [1,0,1,0],
        [1,0,0,1],
        [0,1,1,0],
        [0,1,0,1],
        [0,0,1,1],
    ]
    for class_index, pattern in enumerate(pair_patterns):
        if label == pattern:
            return class_index
    # not a two-entry label
    return 6
# Collapse each multi-hot vector to a single class index (6 = anything that is
# not a two-entry label) so a standard confusion matrix can be printed.
converted_labels = [multi_hot_to_num(label) for label in all_labels]
converted_outputs = [multi_hot_to_num(output) for output in all_outputs]
print(confusion_matrix(converted_labels,converted_outputs))

### Stage 2: Transfer Learning

In [None]:
# load stage 2 transfer learning model

# do test set predictions for stage 2 transfer learning

**Demo on real data**

In [None]:
# Number of samples in every slice handed to the models.
regulated_length = 132299

def slice_audio(waveform):
  """Cut a waveform into consecutive, non-overlapping slices along dimension 1.

  Each slice holds regulated_length samples; a trailing remainder shorter
  than that is dropped. Returns the slices as a list (empty when the
  waveform is shorter than one slice).
  """
  num_full_slices = waveform.shape[1] // regulated_length
  return [
      waveform.narrow(1, index * regulated_length, regulated_length)
      for index in range(num_full_slices)
  ]

In [None]:
# Load the trumpet + violin demo recording (waveform and its sample rate).
# NOTE(review): unlike audio_loader, this does not apply normalize_waveform —
# confirm the models are meant to see un-normalized audio here.
real_tru_vio, real_tru_vio_sample_rate = torchaudio.load('/content/real_tru_vio.mp3')

In [None]:
# Load the trumpet-only demo recording (waveform and its sample rate).
# NOTE(review): unlike audio_loader, this does not apply normalize_waveform —
# confirm the models are meant to see un-normalized audio here.
real_tru, real_tru_sample_rate = torchaudio.load('/content/real_trumpet.mp3')

In [None]:
# Split the trumpet + violin recording into fixed-length slices
sliced_tru_vio = slice_audio(real_tru_vio)

In [None]:
# Split the trumpet recording into fixed-length slices
sliced_tru = slice_audio(real_tru)

In [None]:
# Taking the best stage 2 model
num_classes = 4

best_multi_class_model = MyNet13()

# Restore the saved stage 2 weights into the freshly built MyNet13
state = torch.load("/content/drive/My Drive/APS 360 Project/saved_models/Stage2_MyNet13_batch_size=16_lr=0.0003_best_0.6905valaccpart_0.2243valacc")
best_multi_class_model.load_state_dict(state)

if use_cuda and torch.cuda.is_available():
  best_multi_class_model = best_multi_class_model.cuda()

In [31]:
# Output index -> instrument name, used when printing predictions.
# NOTE(review): this name shadows Python's builtin map() for the rest of the notebook.
map = {0: 'acoustic guitar', 1: 'piano', 2: 'trumpet', 3: 'violin'}

In [None]:
def get_multi_class_real_data_accuracy(loader, label, best_model, transfer_model=None):
  """Print multi-label predictions and per-label accuracy for sliced real audio.

  Args:
    loader: sequence of waveform slices (e.g. the list from slice_audio).
    label: multi-hot tensor with the true instruments; every slice is scored
      against this same label.
    best_model: module producing one output per instrument.
    transfer_model: optional feature extractor run before best_model.

  Returns:
    None. Prints the predicted instruments of each slice, then the overall
    per-label accuracy and how often each instrument was predicted.
  """
  best_model.eval()
  if transfer_model is not None:
    transfer_model.eval()

  # A recording shorter than one slice yields no samples; report that rather
  # than dividing by zero in the accuracy below.
  if len(loader) == 0:
    print("No samples to evaluate.")
    return

  on_gpu = use_cuda and torch.cuda.is_available()
  predictions = torch.tensor([0, 0, 0, 0])
  if on_gpu:
    label = label.cuda()
    predictions = predictions.cuda()

  correct = 0
  with torch.no_grad():  # predictions only, no autograd graphs needed
    for n, sample in enumerate(loader, start=1):
      if on_gpu:
        sample = sample.cuda()
      features = sample.unsqueeze(0)  # add the batch dimension
      if transfer_model is not None:
        features = transfer_model(features)
      output = best_model(features).squeeze()
      one_hot_output = (output >= 0).int()
      result = [map[i] for i in range(len(one_hot_output)) if one_hot_output[i] > 0]
      predictions = torch.add(predictions, one_hot_output)
      correct += (one_hot_output == label).sum().item()
      print("Prediction for sample {}: {} \n".format(n, result))

  print("Accuracy: {} | Number of predictions for each class: {}".format(correct / (len(loader) * label.numel()), predictions))

In [43]:
def get_single_class_real_data_accuracy(loader, label, best_model):
  """Print single-label predictions and accuracy for sliced real audio.

  Args:
    loader: sequence of waveform slices (e.g. the list from slice_audio).
    label: integer index of the true instrument, shared by every slice.
    best_model: module producing one score per instrument.

  Returns:
    None. Prints the predicted instrument of each slice, then the accuracy in
    percent and how often each instrument was predicted.
  """
  best_model.eval()

  # A recording shorter than one slice yields no samples; report that rather
  # than dividing by zero in the accuracy below.
  if len(loader) == 0:
    print("No samples to evaluate.")
    return

  on_gpu = use_cuda and torch.cuda.is_available()
  predictions = torch.tensor([0, 0, 0, 0])
  correct = 0
  with torch.no_grad():  # predictions only, no autograd graphs needed
    for n, sample in enumerate(loader, start=1):
      if on_gpu:
        sample = sample.cuda()
      output = best_model(sample.unsqueeze(0)).squeeze()
      # the instrument with the largest score is the prediction
      pred_index = output.max(0, keepdim=True)[1].item()
      result = [map[pred_index]]
      predictions[pred_index] += 1
      correct += int(pred_index == label)
      print("Prediction for sample {}: {} \n".format(n, result))
  print("Accuracy: {}% | Number of predictions for each class: {}".format((correct / len(loader) * 100), predictions))

In [None]:
# Score the trumpet + violin slices; the label marks indices 2 and 3 (trumpet, violin)
get_multi_class_real_data_accuracy(sliced_tru_vio, torch.tensor([0, 0, 1, 1]), best_multi_class_model)

Prediction for sample 1: ['acoustic guitar', 'piano', 'trumpet'] 

Prediction for sample 2: ['piano', 'trumpet'] 

Prediction for sample 3: ['piano', 'trumpet'] 

Prediction for sample 4: ['acoustic guitar', 'violin'] 

Prediction for sample 5: ['trumpet', 'violin'] 

Prediction for sample 6: ['trumpet', 'violin'] 

Prediction for sample 7: ['acoustic guitar', 'piano'] 

Prediction for sample 8: ['piano', 'trumpet'] 

Prediction for sample 9: ['violin'] 

Prediction for sample 10: ['piano', 'violin'] 

Prediction for sample 11: ['piano', 'trumpet', 'violin'] 

Prediction for sample 12: ['piano', 'trumpet', 'violin'] 

Prediction for sample 13: ['trumpet', 'violin'] 

Prediction for sample 14: ['piano', 'trumpet', 'violin'] 

Prediction for sample 15: ['piano', 'trumpet'] 

Prediction for sample 16: ['trumpet', 'violin'] 

Prediction for sample 17: ['acoustic guitar', 'piano', 'trumpet'] 

Prediction for sample 18: ['piano', 'violin'] 

Prediction for sample 19: ['piano', 'violin'] 

Pr

In [None]:
# Restore the saved stage 1 (single-label) MyNet13 weights for the demo
num_classes = 4
best_single_class_model = MyNet13()
state = torch.load("/content/drive/My Drive/APS 360 Project/saved_models/MyNet13_batch_size=16_lr=0.0003_best_0.7257valacc")
best_single_class_model.load_state_dict(state)

if use_cuda and torch.cuda.is_available():
  best_single_class_model = best_single_class_model.cuda()

In [44]:
# Score the first 25 trumpet slices against class index 2 (trumpet)
get_single_class_real_data_accuracy(sliced_tru[:25], 2, best_single_class_model)

Prediction for sample 1: ['trumpet'] 

Prediction for sample 2: ['trumpet'] 

Prediction for sample 3: ['trumpet'] 

Prediction for sample 4: ['trumpet'] 

Prediction for sample 5: ['trumpet'] 

Prediction for sample 6: ['trumpet'] 

Prediction for sample 7: ['trumpet'] 

Prediction for sample 8: ['trumpet'] 

Prediction for sample 9: ['trumpet'] 

Prediction for sample 10: ['trumpet'] 

Prediction for sample 11: ['trumpet'] 

Prediction for sample 12: ['trumpet'] 

Prediction for sample 13: ['trumpet'] 

Prediction for sample 14: ['trumpet'] 

Prediction for sample 15: ['trumpet'] 

Prediction for sample 16: ['trumpet'] 

Prediction for sample 17: ['trumpet'] 

Prediction for sample 18: ['trumpet'] 

Prediction for sample 19: ['trumpet'] 

Prediction for sample 20: ['trumpet'] 

Prediction for sample 21: ['trumpet'] 

Prediction for sample 22: ['trumpet'] 

Prediction for sample 23: ['trumpet'] 

Prediction for sample 24: ['trumpet'] 

Prediction for sample 25: ['trumpet'] 

Accuracy: