In [1]:
%pip install resampy
# %pip install torchvggish

Collecting resampy
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.2
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import json
import os

import torch
import torchvision
from torchvision import transforms
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
from torch.autograd import Variable
from fastprogress.fastprogress import format_time, master_bar, progress_bar
from sklearn.metrics import f1_score, jaccard_score

from matplotlib import pyplot as plt

from sklearn import preprocessing
import soundfile as sf

import librosa
# from torchvggish import vggish, vggish_input

In [103]:
# from google.colab import drive
# drive.mount('/content/drive')

In [104]:
tqdm.pandas()

In [105]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [63]:
class InstrumentDataset(Dataset):
  def __init__(self, csv_file, json_file, root_dir, spec_type):
    self.audio_frame = pd.read_csv(csv_file)
    with open(json_file, 'r') as f:
      self.instrument_classes = json.load(f)
    
    self.json_file = json_file
    self.csv_file = csv_file
    self.root_dir = root_dir

    # Add chroma_cqt if desired
    if spec_type not in ["mel_spectrogram", "chroma_stft", "cqt"]:
      raise Exception("Not valid spectrogram type")
    else:
      self.spec_type = spec_type

    # List of unique sample_keys (aka audio file names)
    self.unique_audio_files = self.audio_frame.sample_key.unique()

    # Dataframe specifying the sample_key of the audio file and the instrument labels
    self.audio_file_labels = self.audio_frame.groupby('sample_key')['instrument'].apply(list).reset_index(name='labels')

#     self.audio_file = self.audio_file_labels.iloc[:5000,:].copy()
    self.audio_file = self.audio_file_labels.copy()

    num_data = self.audio_file.shape

    os_join = np.vectorize(os.path.join)

    self.audio_file[spec_type] = np.full(num_data[0], [0])
    self.audio_file[spec_type] = os_join(np.full(num_data[0], self.root_dir), np.full(num_data[0], spec_type), np.full(num_data[0], spec_type),
                                          self.audio_file.sample_key.str[:3], 
                                          self.audio_file.sample_key.str[:] + np.full(num_data[0], '_' + spec_type + '.npy'))

    # Matrix of instrument labels ordered by audio file number (increasing sample_key value)
    # self.label_matrix = self.audio_file_labels.labels.tolist()
    self.label_matrix = self.audio_file.labels.tolist()

    binarizer = preprocessing.MultiLabelBinarizer()
    
    self.binary_label_matrix = binarizer.fit_transform(self.label_matrix)
    self.label_df = pd.DataFrame(self.binary_label_matrix,columns=[instrument for instrument in self.instrument_classes.keys()])

    self.audio_file = pd.concat([self.audio_file, self.label_df], axis=1)

  def get_instrument_class_dict(self):
    return self.instrument_classes

  def __len__(self):
    return len(self.audio_file.index)

  def __getitem__(self, idx):
    # Allow for slicing
    if torch.is_tensor(idx):
      idx = idx.tolist()

    if type(idx) is int:
      idx = [idx]

    # Get the instruments types on hot encoded
    instrument_types = np.array(self.audio_file.iloc[idx, 3:]).astype(float)
    # Get the spectrograms as a numpy array from the npy files
    specs = np.array(self.audio_file.iloc[idx, 2])

    spec_transforms = transforms.Compose([
        transforms.Normalize([-47.3835], [18.5056]),
    ])
    
    spec_array = []
    
    for file_idx, file_name in enumerate(specs):
      # 3 channel expanded
      spec = np.load(file_name).astype(float)
#       plt.imshow(spec)
#       plt.show()
      spec = np.expand_dims(spec,-1)
#       spec = np.repeat(spec, 3, -1)
      # 1 channel
      # spec = np.load(file_name).astype(float)
      # spec = 2*(spec - np.min(spec))/np.ptp(spec) - 1
#       spec = spec_transforms(spec)
#       print((spec - np.min(spec))/np.ptp(spec))
#       plt.imshow((spec - np.min(spec))/np.ptp(spec))
#       plt.show()
      spec_array.append(spec)
    specs = np.asarray(spec_array)
    specs = specs.transpose((0,-1,1,2))
    specs = torch.from_numpy(specs)
    specs = spec_transforms(specs)
    specs = specs.squeeze(0)

    specs = np.stack(specs)
#     print(specs.shape)

    instrument_types = torch.from_numpy(instrument_types)

    
    sample = {'specs': specs, 
              'instrument(s)': instrument_types, 
              'sample_key': self.audio_file.iloc[idx, 0].tolist(), 
              'spec_type': self.spec_type}

    return sample

In [24]:
class VGGishInstrumentDataset(Dataset):
  def __init__(self, csv_file, json_file, root_dir, spec_type):
    self.audio_frame = pd.read_csv(csv_file)
    with open(json_file, 'r') as f:
      self.instrument_classes = json.load(f)
    
    self.json_file = json_file
    self.csv_file = csv_file
    self.root_dir = root_dir

    # Add chroma_cqt if desired
    if spec_type not in ["vgg", "audio", "mel_spectrogram"]:
      raise Exception("Not valid spectrogram type")
    else:
      self.spec_type = spec_type

    # List of unique sample_keys (aka audio file names)
    self.unique_audio_files = self.audio_frame.sample_key.unique()

    # Dataframe specifying the sample_key of the audio file and the instrument labels
    self.audio_file_labels = self.audio_frame.groupby('sample_key')['instrument'].apply(list).reset_index(name='labels')

#     self.audio_file = self.audio_file_labels.iloc[:500,:].copy()
    self.audio_file = self.audio_file_labels.copy()

    num_data = self.audio_file.shape

    os_join = np.vectorize(os.path.join)

    self.audio_file[spec_type] = np.full(num_data[0], [0])
#     self.audio_file[spec_type] = os_join(np.full(num_data[0], self.root_dir), np.full(num_data[0], 'audio'), np.full(num_data[0], 'audio'),
#                                           self.audio_file.sample_key.str[:3], 
#                                           self.audio_file.sample_key.str[:] + np.full(num_data[0], '.ogg'))
    self.audio_file[spec_type] = os_join(np.full(num_data[0], self.root_dir), np.full(num_data[0], self.spec_type), np.full(num_data[0], self.spec_type),
                                          self.audio_file.sample_key.str[:3], 
                                          self.audio_file.sample_key.str[:] + np.full(num_data[0], '_' + spec_type + '.npy'))

    # Matrix of instrument labels ordered by audio file number (increasing sample_key value)
    # self.label_matrix = self.audio_file_labels.labels.tolist()
    self.label_matrix = self.audio_file.labels.tolist()

    binarizer = preprocessing.MultiLabelBinarizer()
    
    self.binary_label_matrix = binarizer.fit_transform(self.label_matrix)
    self.label_df = pd.DataFrame(self.binary_label_matrix,columns=[instrument for instrument in self.instrument_classes.keys()])

    self.audio_file = pd.concat([self.audio_file, self.label_df], axis=1)

  def get_instrument_class_dict(self):
    return self.instrument_classes

  def __len__(self):
    return len(self.audio_file.index)

  def __getitem__(self, idx):
    # Allow for slicing
    if torch.is_tensor(idx):
      idx = idx.tolist()

    if type(idx) is int:
      idx = [idx]

    # Get the instruments types on hot encoded
    instrument_types = np.array(self.audio_file.iloc[idx, 3:]).astype(float)
    # Get the spectrograms as a numpy array from the npy files
    audios = np.array(self.audio_file.iloc[idx, 2])

    spec_transforms = transforms.Compose([
        transforms.Normalize([-47.3835], [18.5056]),
    ])
    
    spec_array = []
    
    for file_idx, file_name in enumerate(audios):
      spec = np.load(file_name, allow_pickle=True).astype(float)
      spec = np.expand_dims(spec,-1)
      spec_array.append(spec)
    specs = np.stack(spec_array)
    specs = torch.from_numpy(specs)
    specs = specs.squeeze()
    specs = specs.double()

#     audios = np.expand_dims(audios, axis=-1)
#     audios = torch.from_numpy(audios)

#     audios = np.stack(audios)
#     specs = np.squeeze(specs)
    # specs = np.repeat(specs[:,:,:,np.newaxis], 3, -1)
    # print(specs.shape)

    instrument_types = torch.from_numpy(instrument_types)
    
    fs = 22050
    
    sample = {'specs': specs, 
              'instrument(s)': instrument_types, 
              'sample_key': self.audio_file.iloc[idx, 0].tolist(), 
              'spec_type': self.spec_type,
              'fs': fs}

    return sample

In [64]:
# Dataset definition
mel_spec_dataset = InstrumentDataset(csv_file='/kaggle/input/spec-data/openmic-2018-aggregated-labels.csv',
                                       json_file='/kaggle/input/spec-data/class-map.json',
                                       root_dir='/kaggle/input/spec-data',
                                       spec_type='mel_spectrogram')

In [4]:
# VGGish Dataset definition
vgg_dataset = VGGishInstrumentDataset(csv_file='/kaggle/input/spec-data/openmic-2018-aggregated-labels.csv',
                                       json_file='/kaggle/input/spec-data/class-map.json',
                                       root_dir='/kaggle/input/spec-data',
                                       spec_type='vgg')

In [110]:
def mean_std(dataloader):
  # running_sum = 0
  # for i, sample in enumerate(progress_bar(dataloader)):
  #   specs = sample['specs']

  #   specs = specs[:,0,:,:]

  #   running_sum += specs.sum()
  running_sum = 0
  running_squared_sum = 0
  running_n = 0
  for batch in dataloader:
    batch = batch['specs']
    running_sum += batch.sum()
    running_squared_sum += batch.square().sum()
    running_n += batch.shape[0] * batch.shape[1] * batch.shape[2]
    print(running_sum)
    print(running_squared_sum)
  print(running_n)
  mean = running_sum / running_n
  std = np.sqrt(running_squared_sum / running_n - np.square(running_sum / running_n))

  return mean, std

In [111]:
# loader = DataLoader(mel_spec_dataset, batch_size=1000, num_workers=0)
# mel_mean, mel_std = mean_std(loader)
# print(mel_mean)
# print(mel_std)

In [65]:
data = mel_spec_dataset[5:7]['specs']

print(data)
print(data.shape)
# plt.imshow(data.transpose(1,2,0))
# plt.show()


[[[[-0.36767249  0.447959    0.61176076 ...  0.81777948  0.80595874
     0.92247752]
   [-0.34065364  0.62695887  0.78907196 ...  1.01451047  1.07192553
     1.2770999 ]
   [-0.04682366  0.95625108  1.4307696  ...  1.85251694  1.89726691
     1.7921467 ]
   ...
   [-1.76252053 -1.39438873 -1.00261542 ... -0.21400279 -0.53991765
    -0.86752118]
   [-1.76252053 -1.76252053 -1.50584148 ... -1.09042668 -1.27280391
    -1.5497471 ]
   [-1.76252053 -1.76252053 -1.76252053 ... -1.76252053 -1.76252053
    -1.76252053]]]


 [[[-0.35247439  0.8397323   1.00691142 ...  0.3398836   0.21492143
     0.81777948]
   [-0.3744272   1.11751983  1.53293463 ...  1.58021761  1.51858086
     1.60048175]
   [ 0.01228007  1.36406682  1.78074812 ...  1.99478806  1.95890366
     1.83056413]
   ...
   [-1.76252053 -1.76252053 -1.76252053 ... -1.76252053 -1.76252053
    -1.76252053]
   [-1.76252053 -1.76252053 -1.76252053 ... -1.76252053 -1.76252053
    -1.76252053]
   [-1.76252053 -1.76252053 -1.76252053 ... -1.

In [5]:
vgg_data = vgg_dataset[5:7]['specs']
print(vgg_data)
fs = vgg_dataset[5:6]['fs']
print(fs)

print(vgg_data.shape)
# plt.imshow(data.transpose(1,2,0))
# plt.show()

tensor([[[[-4.6052e+00, -4.6052e+00, -4.6052e+00,  ..., -4.6052e+00,
           -4.6051e+00, -4.6051e+00],
          [-3.8980e+00, -3.8998e+00, -3.9142e+00,  ..., -3.7212e+00,
           -3.9750e+00, -4.1558e+00],
          [-6.1428e-02, -8.2499e-02,  7.0901e-02,  ..., -9.1753e-01,
           -1.0298e+00, -1.0156e+00],
          ...,
          [ 9.7568e-01,  7.9385e-01,  8.2498e-01,  ..., -5.8668e-01,
           -7.1539e-01, -1.4541e+00],
          [ 3.8418e-01,  4.2612e-01,  2.7493e-01,  ..., -7.9187e-01,
           -7.2015e-01, -9.8585e-01],
          [ 9.0007e-01,  7.5159e-01,  8.8080e-01,  ..., -1.0713e+00,
           -1.2385e+00, -1.3461e+00]],

         [[ 3.3946e-02,  4.4372e-01,  9.8933e-01,  ...,  5.0405e-01,
            6.9974e-02, -4.8244e-02],
          [ 9.2487e-01,  6.3326e-01,  6.7975e-01,  ..., -3.3901e-02,
            7.5498e-02, -9.6056e-02],
          [ 8.6667e-01,  8.8299e-01,  6.0556e-01,  ...,  4.4148e-01,
           -4.1301e-01, -3.9728e-01],
          ...,
     

In [114]:
# torch.hub.help('harritaylor/torchvggish', 'vggish')
# urls = {
#             'vggish': "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth",
#             'pca': "vggish_pca_params-970ea276.pth"
#         }
# vggish_model = vggish.VGGish(urls, preprocess=False)
# print(vggish_model)
# vggish_model.eval()
# embedding = vggish_model.forward(vgg_data)
# print(embedding)
# print(embedding.shape)

In [115]:
torch.hub.help('harritaylor/torchvggish', 'vggish')

Using cache found in /root/.cache/torch/hub/harritaylor_torchvggish_master


In [66]:
# Data splits
batch_size = 16
mel_train_size = int(0.8*len(mel_spec_dataset))
mel_test_size = len(mel_spec_dataset) - mel_train_size

mel_train_set, mel_test_set = torch.utils.data.random_split(mel_spec_dataset, [mel_train_size, mel_test_size])
mel_val_size = int(0.25*mel_test_size)
mel_test_size = mel_test_size - mel_val_size
print(mel_train_size)

mel_val_set, mel_test_set = torch.utils.data.random_split(mel_test_set, [mel_val_size, mel_test_size])

mel_train_loader = DataLoader(mel_train_set, batch_size=batch_size, shuffle=True, num_workers=8)
mel_val_loader = DataLoader(mel_val_set, batch_size=batch_size, shuffle=True, num_workers=8)
mel_test_loader = DataLoader(mel_test_set, batch_size=batch_size, shuffle=True, num_workers=8)

16000


In [6]:
# Data splits
batch_size = 16
vgg_train_size = int(0.8*len(vgg_dataset))
vgg_test_size = len(vgg_dataset) - vgg_train_size

vgg_train_set, vgg_test_set = torch.utils.data.random_split(vgg_dataset, [vgg_train_size, vgg_test_size])
vgg_val_size = int(0.25*vgg_test_size)
vgg_test_size = vgg_test_size - vgg_val_size
print(vgg_train_size)

vgg_val_set, vgg_test_set = torch.utils.data.random_split(vgg_test_set, [vgg_val_size, vgg_test_size])

vgg_train_loader = DataLoader(vgg_train_set, batch_size=batch_size, shuffle=True, num_workers=8)
vgg_val_loader = DataLoader(vgg_val_set, batch_size=batch_size, shuffle=True, num_workers=8)
vgg_test_loader = DataLoader(vgg_test_set, batch_size=batch_size, shuffle=True, num_workers=8)

16000


  cpuset_checked))


In [102]:
# Wide Network built from scratch
# class MelSpecNetwork(nn.Module):
#   def __init__(self):
#     super(MelSpecNetwork, self).__init__()
    
#     self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
#     self.relu1 = nn.ReLU()
#     self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
#     self.relu2 = nn.ReLU()
#     self.conv3 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
#     self.relu3 = nn.ReLU()
#     self.conv4 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
#     self.relu4 = nn.ReLU()
#     # 1st Arg: num channels (16) * width of spec (128) * length of spec (431)
#     # 2nd Arg: num instrument classes (20)
#     self.fc1 = nn.Linear(16*128*431, 20)
#     self.sigmoid = nn.Sigmoid()

#   def forward(self, input):
#     output = self.conv1(input)
#     output = self.relu1(output)
#     output = self.conv2(output)
#     output = self.relu2(output)
#     # num channels (16) * width of spec (128) * length of spec (431)
#     output = output.view(-1, 16*128*431)
#     output = self.fc1(output)
#     output = self.sigmoid(output)

#     return output

# Resnet50 Arch
# class MelSpecNetwork(nn.Module):
#   def __init__(self):
#     super(MelSpecNetwork, self).__init__()

#     self.resnet = models.resnet50(pretrained=True)
#     default_in_ftrs = self.resnet.fc.in_features
#     for param in self.resnet.parameters():
#         param.requires_grad = False
    
#     # Don't freeze last layer of resnet
# #     for param in self.resnet.layer4.parameters():
# #         param.requires_grad = True

#     # Replace fully connected layer to fit 20 instrument classes
#     self.resnet.fc = nn.Linear(default_in_ftrs, 20)
    

#   def forward(self, input):
#     output = self.resnet(input)

#     return output


# VGGish Arch
class MelSpecNetwork(nn.Module):
  def __init__(self):
    super(MelSpecNetwork, self).__init__()
    
    urls = {
            'vggish': "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth",
            'pca': "vggish_pca_params-970ea276.pth"
        }
    model = torch.hub.load('harritaylor/torchvggish', 'vggish', preprocess=False, postprocess=False)
    self.vggish_model = torch.hub.load('harritaylor/torchvggish', 'vggish', preprocess=False, postprocess=False)
    
#     for param in self.vggish_model.parameters():
#         param.requires_grad = False
#     for layer_index, layer in enumerate(self.vggish_model.children()):
#         print(layer_index)
#         if layer_index == 13:
#             for param in layer.parameters():
#                 param.requires_grad = True
#     for layer_index, layer in enumerate(self.vggish_model.features.children()):
#         print(layer_index)
#         if layer_index == 11 or layer_index == 13:
#             for param in layer.parameters():
#                 param.requires_grad = True

#     for layer_index, layer in enumerate(self.vggish_model.embeddings.children()):
#         if layer_index == 2 or layer_index == 4:
#             for param in layer.parameters():
#                 param.requires_grad = True


#     for param in self.vggish_model.embeddings.parameters():
#         param.requires_grad = True
    
    self.classify = nn.Sequential(
            nn.Linear(512, 20),
        )   

  def forward(self, input):
    bs, num_frames, _, _ = input.size()
    input = input.view(bs*num_frames, 1, input.size(2), input.size(3))
    vggish_logits = self.vggish_model(input) # [bs*num_frames, 128]
    vggish_logits = vggish_logits.reshape(bs, vggish_logits.size(1) * num_frames)
    
    output = self.classify(vggish_logits)
    
    output = self.classify(vggish_logits)

    return output

In [103]:
mel_spec_net = MelSpecNetwork()
print(mel_spec_net)
print(sum(param.numel() for param in mel_spec_net.parameters() if param.requires_grad))

Using cache found in /root/.cache/torch/hub/harritaylor_torchvggish_master


MelSpecNetwork(
  (vggish_model): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, cei

In [104]:
mel_spec_net.eval()
mel_spec_net.double()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
mel_spec_net.to(device)
mel_spec_net(next(iter(mel_train_loader))["specs"].to(device))

tensor([[-7.4161e-03, -4.0460e-02,  8.8254e-03, -4.1167e-02,  2.8135e-02,
         -1.7568e-02,  2.1229e-02, -2.3866e-03, -3.2883e-02,  2.7282e-02,
          5.4677e-03, -4.7778e-02, -1.2149e-02, -2.7916e-02,  4.3513e-02,
         -5.0747e-02, -2.1087e-02, -8.9301e-02,  1.6601e-02,  2.4027e-03],
        [-1.8697e-02, -1.5853e-02, -2.3463e-03, -3.0159e-02,  1.8379e-02,
         -4.4711e-02,  2.4290e-02,  1.7742e-03, -2.0696e-02,  3.9279e-02,
         -3.3661e-03, -3.9283e-02, -2.4907e-03, -6.7596e-03,  5.7836e-02,
         -6.9187e-02, -1.5523e-02, -9.1903e-02,  2.7232e-02, -3.3894e-02],
        [-1.1220e-02, -3.1385e-02,  1.3983e-03, -2.6067e-02,  4.0275e-02,
         -1.7953e-02,  1.9516e-02,  6.0581e-03, -2.1701e-02,  2.4186e-02,
         -4.2735e-04, -4.2169e-02, -1.9722e-02, -1.8266e-02,  5.3457e-02,
         -6.6389e-02, -1.4152e-02, -9.2399e-02,  1.0715e-02, -1.4644e-02],
        [ 1.0809e-03, -3.7004e-02,  1.9590e-03, -3.6919e-02,  3.5853e-02,
         -2.6803e-02,  2.8064e-02, 

In [9]:
mel_spec_net.eval().double()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
mel_spec_net.to(device)
print(next(mel_spec_net.parameters()).is_cuda)
data = next(iter(vgg_train_loader))["specs"]
data = data.to(device)
print(data.device)

result = mel_spec_net(data)
print(result.shape)

True
cuda:0
torch.Size([16, 20])


In [79]:
class EarlyStopping():
  def __init__(self, spec_type, version, patience=7, verbose=True, delta=0):
    self.spec_type = spec_type
    self.version = version
    self.patience = patience
    self.verbose = verbose
    self.counter = 0
    self.best_score = None
    self.early_stop = False
    self.val_loss_min = np.Inf
    self.delta = delta

  def __call__(self, val_loss, epoch, model, optimizer):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, epoch, model, optimizer)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, epoch, model, optimizer)
            self.counter = 0

  def save_checkpoint(self, val_loss, epoch, model, optimizer):
        """Saves model when validation loss decrease."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        whole_state = {'epoch': epoch + 1, 'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'val_loss':val_loss}
        torch.save(whole_state, os.path.join('/kaggle/working', self.spec_type + 
                                             '_v' + str(self.version) + '.pt'))
        self.val_loss_min = val_loss

In [105]:
class MelSpecTrainer():
  def __init__(self, model, num_epochs, train_loader, val_loader, spec_type, version, lr=1e-6, weight_decay=0.0001):
    # Define your execution device
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("The model will be running on", self.device, "device")
    self.model = model
    self.num_epochs = num_epochs
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.spec_type = spec_type
    self.version = version
    # Convert model parameters and buffers to CPU or Cuda
    self.model.to(self.device)

    # Model parameters should be doubles to match dataset sample datatype
    self.model.double()

    self.loss_fn = nn.BCEWithLogitsLoss()
#     self.optimizer = torch.optim.Adam(
#         [
#         {"params": self.model.resnet.layer1.parameters(), "lr": 1e-5},
#         {"params": self.model.resnet.layer2.parameters(), "lr": 1e-5},
#         {"params": self.model.resnet.layer3.parameters(), "lr": 1e-5},
#         {"params": self.model.resnet.layer4.parameters(), "lr": 1e-5},
#         {"params": self.model.resnet.fc.parameters(), "lr":lr}]
#         , lr=lr)
#     self.optimizer = torch.optim.Adam(
#         [
#         {"params": self.model.vggish_model.features.parameters(), "lr": 1e-5},
#         {"params": self.model.vggish_model.embeddings.parameters(), "lr": 1e-5},
#         {"params": self.model.classify.parameters(), "lr":lr}]
#         , lr=lr, weight_decay=weight_decay)
    self.optimizer = torch.optim.Adam(
        [
        {"params": self.model.vggish_model.parameters(), "lr": 1e-5},
        {"params": self.model.classify.parameters(), "lr":lr}]
        , lr=lr, weight_decay=weight_decay)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=6, gamma=0.1)

    self.early_stop = EarlyStopping(self.spec_type, self.version, patience=20)

  def continue_train(self, state_dict_name, extra_epochs=0):
    state_dict = torch.load('/kaggle/working/' + state_dict_name, map_location=self.device)
    model_sd = state_dict['model']
    optimizer_sd = state_dict['optimizer']
    epochs_done = state_dict['epoch']
    min_val_loss = state_dict['val_loss']
    print(epochs_done)
    self.model.load_state_dict(model_sd)
    self.optimizer.load_state_dict(optimizer_sd)
    self.num_epochs = self.num_epochs - epochs_done + extra_epochs
    self.early_stop(min_val_loss, epochs_done - 1, self.model, self.optimizer)

  def train(self):
    print(self.device)
    self.model.train()
    pbar = master_bar(range(self.num_epochs))
    headers = ['Train_Loss', 'Val_Loss', 'F1-Macro', 'F1-Micro', 'JS', 'Time']
    pbar.write(headers, table=True)
    train_size = len(self.train_loader.dataset)
    for epoch in pbar:  # loop over the dataset multiple times
      running_train_loss = 0.0
      for i, sample in enumerate(progress_bar(self.train_loader, parent=pbar)):
        specs = sample['specs']
        labels = sample['instrument(s)']
        
        num_rows = specs.shape[0]

        # Labels are 1 x num of instrument classes
        # Need to remove the 1 dimension to get a batch size x num instrument classes result
        labels = labels.squeeze(dim=1)

        # Optionally print label
        # print(list(type_map.keys())[list(type_map.values()).index(labels)])

        # get the inputs
        specs = Variable(specs.to(self.device))
        labels = Variable(labels.to(self.device))

        # zero the parameter gradients
        self.optimizer.zero_grad()
        # predict classes using images from the training set
        outputs = self.model(specs)
        
        # compute the loss based on model output and real labels
        loss = self.loss_fn(outputs, labels)
        running_train_loss += loss * num_rows
        # backpropagate the loss
        loss.backward()
        # adjust parameters based on the calculated gradients
        self.optimizer.step()
      self.scheduler.step()
      running_train_loss = running_train_loss / train_size
      print(epoch)
      print(running_train_loss)
      overall_val_loss, pred_dict = self.predict(pbar)
      y_true, y_pred = pred_dict['y_true'], pred_dict['y_pred']

      str_stats = []
      stats = [running_train_loss,
                overall_val_loss,
                f1_score(y_true, y_pred, average="macro"),
                f1_score(y_true, y_pred, average="micro"),
                jaccard_score(y_true, y_pred, average="samples")]

      for stat in stats:
          str_stats.append(
              'NA' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.4f}'
          )

      pbar.write(str_stats, table=True)
      val_loss = self.predict(pbar)
      self.early_stop(overall_val_loss, epoch, self.model, self.optimizer)
      if self.early_stop.early_stop:
          print("Early stopping")
          break

  def predict(self, pbar=None, threshold=0.5):
    """
    Evaluate the model on a validation set
    :param device: str (defaults to 'cpu')
    :param pbar: fast_progress progress bar (defaults to None)
    :returns: overall_val_loss (float), accuracies (dict{'acc': value}, preds (dict)
    """
    val_size = len(self.val_loader.dataset)
    # Size is 28 for the 28 emotion classes in GoEmotions dataset
    running_val_loss = 0.0
    self.model.to(self.device)
    self.model.eval()

    preds_dict = {
            'y_true': np.zeros([val_size, 20]),
            'y_pred': np.zeros([val_size, 20])
        }

    with torch.no_grad():
      index_dict = 0
      for i, sample in enumerate(progress_bar(self.val_loader, parent=pbar, leave=(pbar is not None))):
        specs = sample['specs']
        labels = sample['instrument(s)']
        labels = labels.squeeze(dim=1)

        num_rows = specs.shape[0]

        specs = Variable(specs.to(self.device))
        labels = Variable(labels.to(self.device))

        y_prob = self.model(specs)
        running_val_loss += self.loss_fn(y_prob, labels) * num_rows

        y_pred = (torch.sigmoid(y_prob) > threshold).float().cpu().numpy()

        current_index = index_dict
        preds_dict['y_true'][current_index: current_index + num_rows, :] = labels.cpu().detach().numpy()
        preds_dict['y_pred'][current_index: current_index + num_rows, :] = y_pred
        index_dict += num_rows

    running_val_loss = running_val_loss / val_size

    return running_val_loss, preds_dict

  def saveModel(self, model, epoch_num, version):
    path = '/kaggle/working/' + self.spec_type + "_v" + str(version) + "_epoch" + str(epoch_num) + '.pth'
    torch.save(model.state_dict(), path)

In [106]:
num_epochs = 30
mel_spec_trainer = MelSpecTrainer(mel_spec_net, num_epochs, mel_train_loader, mel_val_loader, 'mel_spectrogram', 4, lr=1e-3)

The model will be running on cuda:0 device


In [73]:
num_epochs = 30
mel_spec_trainer = MelSpecTrainer(mel_spec_net, num_epochs, vgg_train_loader, vgg_val_loader, 'vgg', 4, lr=1e-2)

The model will be running on cuda:0 device


In [None]:
mel_spec_trainer.train()

cuda:0


Train_Loss,Val_Loss,F1-Macro,F1-Micro,JS,Time
0.3232,0.3134,0.0418,0.0575,0.034,
0.3072,0.3042,0.0541,0.0747,0.0325,
0.2971,0.2891,0.1355,0.1505,0.0846,
0.2877,0.2843,0.1332,0.145,0.0801,
0.2817,0.2801,0.1876,0.2082,0.1104,
0.2767,0.2762,0.191,0.2112,0.1179,


0
tensor(0.3232, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


Validation loss decreased (inf --> 0.313394).  Saving model ...
1
tensor(0.3072, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


Validation loss decreased (0.313394 --> 0.304201).  Saving model ...
2
tensor(0.2971, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


Validation loss decreased (0.304201 --> 0.289070).  Saving model ...
3
tensor(0.2877, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


Validation loss decreased (0.289070 --> 0.284341).  Saving model ...
4
tensor(0.2817, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


Validation loss decreased (0.284341 --> 0.280085).  Saving model ...
5
tensor(0.2767, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


  cpuset_checked))


In [None]:
mel_spec_trainer.continue_train('mel_spectrogram_v4.pt', 10)

In [None]:
mel_spec_trainer.train()

In [None]:
class MelSpecTester():
  def __init__(self, model, state_dict_name):
    self.model = model
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_dict = torch.load('/kaggle/working/' + state_dict_name, map_location=self.device)
    self.model_sd = self.state_dict['model']
    self.model.load_state_dict(self.model_sd)

  def test_accuracy(self, test_loader, threshold):
      """
      Evaluate the model on a validation set
      :param device: str (defaults to 'cpu')
      :param pbar: fast_progress progress bar (defaults to None)
      :returns: overall_val_loss (float), accuracies (dict{'acc': value}, preds (dict)
      """
      num_correct = 0
      num_incorrect = 0
      test_size = len(test_loader.dataset)
      self.model.to(self.device)
      self.model.eval()

      preds_dict = {
              'y_true': np.zeros([test_size, 20]),
              'y_pred': np.zeros([test_size, 20])
          }

      with torch.no_grad():
        index_dict = 0
        for i, sample in enumerate(progress_bar(test_loader)):
          specs = sample['specs']
          labels = sample['instrument(s)']
          labels = labels.squeeze(dim=1)

          num_rows = specs.shape[0]

          specs = Variable(specs.to(self.device))
          labels = Variable(labels.to(self.device))

          y_prob = self.model(specs)


          y_pred = (torch.sigmoid(y_prob) > threshold).float().cpu().numpy()
          labels = labels.cpu().detach().numpy()

          current_index = index_dict
          preds_dict['y_true'][current_index: current_index + num_rows, :] = labels
          preds_dict['y_pred'][current_index: current_index + num_rows, :] = y_pred
          index_dict += num_rows
          batch_correct = np.count_nonzero((y_pred == labels).astype(int)) # + torch.count_nonzero((predicted == labels_2).int())
          num_correct += batch_correct
          num_incorrect += specs.shape[0] * 20 - batch_correct
            
      y_true, y_pred = preds_dict['y_true'], preds_dict['y_pred']

      str_stats = []
      stats = [f1_score(y_true, y_pred, average="macro"),
                f1_score(y_true, y_pred, average="micro"),
                jaccard_score(y_true, y_pred, average="samples")]

      for stat in stats:
          str_stats.append(
              'NA' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.4f}'
          )
      print(str_stats)
      
      print("Correctly Labeled: " + str(num_correct))
      print("Incorrectly Labeled: " + str(num_incorrect))
      return preds_dict


In [None]:
mel_spec_tester = MelSpecTester(mel_spec_net, 'mel_spectrogram_v4.pt')
mel_spec_tester.test_accuracy(mel_test_loader, 0.5)

In [None]:
from IPython.display import FileLink
FileLink(r'/kaggle/working/vgg_v4.pt')