# Audio Classification with Mel-Spectrogram filterbanks

For this tutorial we will use the audio samples from Google Speech Commands Dataset v0.01 https://www.kaggle.com/c/tensorflow-speech-recognition-challenge/data

The audio samples are one-second recordings of simple spoken commands. Twenty core command words were recorded, with most speakers saying each of them five times. 
The core words are
"Yes", "No", "Up", "Down", "Left", "Right", "On", "Off", "Stop", "Go", "Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", and "Nine". 

To help distinguish unrecognized words, there are also ten auxiliary words, which most speakers only said once. These include "Bed", "Bird", "Cat", "Dog", "Happy", "House", "Marvin", "Sheila", "Tree", and "Wow".

There are also long samples of background noise / silence.

For convenience, three lists of files are provided for the train/validation/test splits.

First let's download and extract the database

In [None]:
import io
import torch
import torch.nn as nn
from torchvision import models, transforms, datasets
import torch.utils.data as data
import numpy as np

import os
import os.path
import pickle
import hashlib
import librosa
from scipy.io import wavfile

In [None]:
!pip install torchsummary
#example https://github.com/GRAAL-Research/poutyne/blob/master/examples/mnist.ipynb
!pip install poutyne



In [None]:
!pip install gdown
!gdown "https://drive.google.com/uc?id=1HR28jRFwrveq5zxjkzri61jm9F4-M7p7"
!tar -zxf speech_commands_v0.01_with_splits.tar.gz


Downloading...
From: https://drive.google.com/uc?id=1HR28jRFwrveq5zxjkzri61jm9F4-M7p7
To: /content/speech_commands_v0.01_with_splits.tar.gz
1.49GB [00:10, 138MB/s]


# Define Class for computing Mel-Spectrogram

As seen in previous colab

In [None]:
#Helper Functions borrowed from torchaudio https://github.com/pytorch/audio/blob/master/torchaudio/transforms.py

class PadTrim(object):
    """Force the length axis of a 2-D tensor to exactly ``max_len`` entries.

    Inputs that are too short are padded at the end with ``fill_value``,
    inputs that are too long are cut after ``max_len`` entries, and inputs
    that already have the right length are returned untouched.

    Args:
        max_len (int): Target size of the length axis.
        fill_value (float): Constant written into the padded region.  Default: `0`
        channels_first (bool): `True` for (c x n) tensors, `False` for
            (n x c) tensors.  Default: `True`
    """

    def __init__(self, max_len, fill_value=0, channels_first=True):
        self.max_len = max_len
        self.fill_value = fill_value
        # channels_first=True  -> channels on dim 0, samples on dim 1
        # channels_first=False -> samples on dim 0, channels on dim 1
        self.ch_dim = int(not channels_first)
        self.len_dim = int(channels_first)

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): 2-D tensor laid out as selected by `channels_first`
        Returns:
            Tensor: (c x n) or (n x c) with the length axis equal to `max_len`
        """
        n_channels = tensor.size(self.ch_dim)
        assert n_channels < 128, \
            "Too many channels ({}) detected, see channels_first param.".format(n_channels)

        current_len = tensor.size(self.len_dim)
        if current_len > self.max_len:
            return tensor.narrow(self.len_dim, 0, self.max_len)
        if current_len == self.max_len:
            return tensor

        # F.pad takes (last_dim_left, last_dim_right, first_dim_left, first_dim_right);
        # only the right-hand side of the length axis is padded.
        padding = [0, 0, 0, 0]
        padding[3 - 2 * self.len_dim] = self.max_len - current_len
        with torch.no_grad():
            return torch.nn.functional.pad(tensor, padding, "constant", self.fill_value)

    def __repr__(self):
        return '{0}(max_len={1})'.format(self.__class__.__name__, self.max_len)




class MelScale(object):
    """This turns a normal STFT into a mel frequency STFT, using a conversion
       matrix.  This uses triangular filter banks.
    Args:
        n_mels (int): number of mel bins
        sr (int): sample rate of audio signal
        f_max (float, optional): maximum frequency. default: `sr` // 2
        f_min (float): minimum frequency. default: 0
        n_stft (int, optional): number of filter banks from stft. Calculated from first input
            if `None` is given.  See `n_fft` in `Spectrogram`.
    """
    def __init__(self, n_mels=128, sr=16000, f_max=None, f_min=0., n_stft=None):
        self.n_mels = n_mels
        self.sr = sr
        self.f_max = f_max if f_max is not None else sr // 2
        self.f_min = f_min
        # Built lazily on the first call when the number of STFT bins is not given.
        self.fb = self._create_fb_matrix(n_stft) if n_stft is not None else n_stft

    def __call__(self, spec_f):
        """Project a spectrogram onto the mel filter bank.
        Args:
            spec_f (Tensor): spectrogram of size (c, l, n_stft)
        Returns:
            Tensor: mel spectrogram of size (c, l, n_mels)
        """
        if self.fb is None:
            self.fb = self._create_fb_matrix(spec_f.size(2)).to(spec_f.device)
        else:
            # need to ensure same device for dot product
            self.fb = self.fb.to(spec_f.device)
        spec_m = torch.matmul(spec_f, self.fb)  # (c, l, n_fft) dot (n_fft, n_mels) -> (c, l, n_mels)
        return spec_m

    def _create_fb_matrix(self, n_stft):
        """ Create a frequency bin conversion matrix.
        Args:
            n_stft (int): number of filter banks from spectrogram
        Returns:
            Tensor: triangular filter bank of size (n_stft, n_mels)
        """

        # The bins of a one-sided STFT are evenly spaced from 0 Hz to the
        # Nyquist frequency, independently of the [f_min, f_max] band covered
        # by the mel filters.  Spreading them over [f_min, f_max] instead would
        # attach the filters to the wrong bins whenever the band is narrower
        # than [0, sr / 2].
        stft_freqs = torch.linspace(0., self.sr / 2., n_stft)
        # calculate mel freq bins (as plain floats, so linspace gets scalars)
        m_min = 0. if self.f_min == 0 else float(self._hertz_to_mel(self.f_min))
        m_max = float(self._hertz_to_mel(self.f_max))
        m_pts = torch.linspace(m_min, m_max, self.n_mels + 2)
        f_pts = self._mel_to_hertz(m_pts)
        # calculate the difference between each mel point and each stft freq point in hertz
        f_diff = f_pts[1:] - f_pts[:-1]  # (n_mels + 1)
        slopes = f_pts.unsqueeze(0) - stft_freqs.unsqueeze(1)  # (n_stft, n_mels + 2)
        # create overlapping triangles: rising edge, falling edge, floor at zero
        down_slopes = (-1. * slopes[:, :-2]) / f_diff[:-1]  # (n_stft, n_mels)
        up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_stft, n_mels)
        fb = torch.min(down_slopes, up_slopes).clamp(min=0.)
        return fb

    def _hertz_to_mel(self, f):
        """Convert a frequency in Hz to the mel scale (returns a 0-dim tensor)."""
        return 2595. * torch.log10(torch.tensor(1.) + (f / 700.))

    def _mel_to_hertz(self, mel):
        """Convert mel values back to Hz (inverse of `_hertz_to_mel`)."""
        return 700. * (10**(mel / 2595.) - 1.)
      
class MelSpectrogram(nn.Module):
    """Turn raw waveforms into log mel-filterbank energies.

    Uses a 20 ms Hamming window with a 10 ms hop and a 512-point FFT; the mel
    filters cover 20 Hz up to (sfr / 2 - 400) Hz.

    Args:
        n_mels (int): number of mel bands. Default: 40
        sfr (int): sample rate of the input audio in Hz. Default: 16000
    """
    def __init__(self, n_mels = 40, sfr=16000):
        super(MelSpectrogram, self).__init__()
        self.sfr = sfr
        self.window_stride=0.01
        self.window_size=0.02
        self.n_fft=512
        self.n_mels=n_mels

        self.win_length = int(self.sfr * self.window_size)
        self.hop_length = int(self.sfr * self.window_stride)
        self.lowfreq = 20
        self.highfreq = self.sfr/2 - 400
        self.window = torch.hamming_window(self.win_length)

        self.mel = MelScale(n_mels=self.n_mels, sr=self.sfr, f_max=self.highfreq, f_min=self.lowfreq)
        # Kept for compatibility; the normalisation call in forward() is disabled.
        self.norm = nn.InstanceNorm2d(1)

    def forward(self, x):
        """Compute the log mel spectrogram of a batch of waveforms.

        Defined as `forward` (not `__call__`) so that nn.Module hooks, and the
        tools built on them, keep working.

        Args:
            x (Tensor): waveforms of size (bs, 1, n_samples)
        Returns:
            Tensor: log mel energies of size (bs, 1, n_mels, n_frames)
        """
        x = x.squeeze(1)
        stft_args = dict(n_fft=self.n_fft, hop_length=self.hop_length,
                         win_length=self.win_length,
                         # the window is a plain attribute, so it does not follow
                         # the module across devices; align it with the input
                         window=self.window.to(x.device),
                         center=True,
                         normalized=False, onesided=True,
                         pad_mode='reflect')
        try:
            spec_c = torch.stft(x, return_complex=True, **stft_args)
        except TypeError:
            # Older PyTorch has no `return_complex`; it returns a real tensor
            # whose last dimension holds (real, imag).
            power = torch.stft(x, **stft_args).pow(2).sum(-1)
        else:
            power = spec_c.real.pow(2) + spec_c.imag.pow(2)
        x = self.mel(power.transpose(1,2)).transpose(1,2)
        x = torch.log(x+0.0001)  # small offset avoids log(0)
        x = x.unsqueeze(1)
        #x = self.norm(x)
        return x


    def plot_sample(self, fbank, index):
        """Plot one item of a batch produced by this module.

        Args:
            fbank (Tensor): batch of size (bs, 1, n_mels, n_frames)
            index (int): position of the item to plot inside the batch
        """
        # Imported here because neither name is imported at file level.
        import librosa.display
        import matplotlib.pyplot as plt

        mel_image = fbank[index,:,:,:].detach().cpu().view(self.n_mels,-1).numpy()
        librosa.display.specshow(mel_image,
                          y_axis='mel', x_axis='time',sr=self.sfr, fmax=self.highfreq, hop_length=self.hop_length)
        plt.title('Mel spectrogram')
        plt.colorbar(format='%+2.0f dB')
        plt.tight_layout()

# AudioReader for wave files

This reader provides data in the format X, Y = (raw_audio, target_class)

**add_silence_class** parameter: adds random chunks of background noise to the dataset as a "silence" class, with the same number of samples as the "yes" class


In [None]:
class AudioReader(data.Dataset):
    """Dataset yielding (waveform, target_class_id) pairs for the speech commands.

    Files are listed one per line as ``<class>/<file>.wav`` in `list_path` and
    are read from the ``audio/`` directory located next to that list.  Classes
    outside `target_class_names` are mapped to ``'unknown'``.

    Args:
        list_path (str): path of the text file listing the audio files
        transform (callable, optional): applied to the (1, n) waveform tensor.
            Defaults to ``PadTrim(16000)``; pass `None` to disable.
        add_silence_class (bool): append random background-noise chunks labelled
            ``'silence'``, as many as there are ``'yes'`` samples
        add_noise (bool): mix every returned waveform with a random
            background-noise chunk
    """

    # Marks "argument not supplied": `None` must remain usable to mean "no
    # transform", and a fresh PadTrim is built per dataset instead of sharing
    # one instance created at definition time.
    _DEFAULT_TRANSFORM = object()

    def __init__(self, list_path, transform=_DEFAULT_TRANSFORM, add_silence_class=False, add_noise=False):

        if transform is AudioReader._DEFAULT_TRANSFORM:
            transform = PadTrim(16000)

        self.list_path = list_path
        self.database_path = os.path.dirname(list_path) + '/audio/'
        self.add_noise = add_noise
        self.add_silence_class = add_silence_class
        self.transform = transform

        self.target_class_names = ['unknown','silence', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
        self.target_class = {name: i for i, name in enumerate(self.target_class_names)}
        self.target_class_idx_to_name = dict(enumerate(self.target_class_names))
        self.audio_class = {}
        self.audio_speaker = {}
        self.audios = []

        self.read_list_data()

        # One id per speaker: the position of that speaker's last file in the
        # list (unique per speaker, not contiguous).
        self.speaker_ids = {spk_id: i for i, spk_id in enumerate(self.audio_speaker.values())}

        self.num_silence = 0

        if self.add_noise or self.add_silence_class:
            self.background_noises_names = []
            self.background_noises = []
            for f in os.listdir(self.database_path + '/_background_noise_/'):
                if f.endswith(".wav"):
                    self.background_noises_names.append('_background_noise_/' + f)
                    print(self.background_noises_names[-1])
                    self.background_noises.append( self.load_audio(self.background_noises_names[-1]))

        if self.add_silence_class:  #Adding the same amount of samples as the "yes" class
            self.num_silence = sum(1 for i in self.audio_class.values() if i == 'yes')

        # Set on the first __getitem__ call, see there.
        self.seeded = False

    def read_list_data(self):
        """Fill `audios`, `audio_class` and `audio_speaker` from the list file."""
        with open(self.list_path, 'r') as stream:
            for line in stream:
                file_path = line.strip()
                if not file_path:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                file_class, file_name = file_path.split('/')
                # the part of the file name before the first '_' identifies the speaker
                identity = file_name.split('_')[0]
                self.audio_class[file_path] = file_class
                self.audio_speaker[file_path] = identity
                self.audios.append(file_path)

    def __len__(self):
        return len(self.audio_class) + self.num_silence

    def __getitem__(self, index):
        """Return ``(waveform, target_id)`` for position `index`.

        Indices past the listed files produce random 'silence' chunks.  The
        waveform is mean-removed and peak-normalised; with `add_noise` it is
        then mixed with 0-10 % of a normalised background-noise chunk.
        """
        if not self.seeded:
            # Seed numpy with the first index requested from this object.
            self.seeded = True
            np.random.seed(index)

        if index >= len(self.audios):
            audio = self.get_silence_chunk(16000)
            target_id = self.target_class['silence']
        else:
            audio_id = self.audios[index]
            audio = self.load_audio(audio_id)
            target = self.audio_class[audio_id]
            if target not in self.target_class_names:
                target = 'unknown'
            target_id = self.target_class[target]

        # Out-of-place arithmetic: the cached background noise must never be altered.
        audio = audio - audio.mean()
        audio = audio / (np.abs(audio).max() + 0.001)

        if self.add_noise:
            alpha = np.random.uniform(low=0.90, high=1.00)
            beta = 1.0 - alpha
            silence_chunk = self.get_silence_chunk(len(audio))
            silence_chunk = silence_chunk / (np.abs(silence_chunk).max() + 0.001)
            audio = audio * alpha + beta * silence_chunk

        sample = torch.FloatTensor(audio)
        if self.transform is not None:
            sample = self.transform(sample.view(1,-1))
        return sample, target_id

    def load_audio(self, audio_name):
        """Read `audio_name` (relative to `database_path`) and scale it by 1 / 2**15."""
        audio_path = self.database_path + audio_name
        _, audio = wavfile.read(audio_path)
        return audio / 2**15

    def get_silence_chunk(self, length):
        """Return a random `length`-sample excerpt of a random background noise.

        The excerpt is a copy, so callers may modify it freely without
        corrupting the cached recordings.

        Raises:
            ValueError: if the selected recording is not longer than `length` + 1 samples
        """
        i = np.random.randint(0, len(self.background_noises))
        silence = self.background_noises[i]
        max_start = silence.shape[0] - length -1
        if max_start <= 0:
            raise ValueError("background noise of {} samples is too short for a chunk of {}".format(
                silence.shape[0], length))
        random_start = np.random.randint(0, max_start)
        return silence[random_start:(random_start + length)].copy()

    def get_class_weights(self):
        """Return 'balanced' class weights as a float tensor.

        For every class id present, the weight is
        ``n_samples / (n_present_classes * class_count)``, in increasing class-id
        order.  One extra 'unknown' entry is counted so that class always exists.
        """
        class_ids = []
        for target in self.audio_class.values():
            if target not in self.target_class_names:
                target = 'unknown'
            class_ids.append(self.target_class[target])
        class_ids.extend([self.target_class['silence']] * self.num_silence)
        class_ids.append(self.target_class['unknown'])

        present, counts = np.unique(class_ids, return_counts=True)
        weights = len(class_ids) / (len(present) * counts.astype(np.float64))
        return torch.from_numpy(weights).float()

    def get_n_classes(self):
        """Number of target classes (including 'unknown' and 'silence')."""
        return len(self.target_class_names)
     
    


In [None]:
# Clean training data, with an extra "silence" class sampled from the background noise
train_loader = data.DataLoader(
    dataset=AudioReader('gcommands/training_list.txt', add_silence_class=True),
    batch_size=50,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Same training data, each waveform additionally mixed with background noise
train_loader_noise = data.DataLoader(
    dataset=AudioReader('gcommands/training_list.txt', add_silence_class=True, add_noise=True),
    batch_size=50,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Validation and test data are read in list order, without silence or noise
valid_loader = data.DataLoader(
    dataset=AudioReader('gcommands/validation_list.txt'),
    batch_size=50,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

test_loader = data.DataLoader(
    dataset=AudioReader('gcommands/testing_list.txt'),
    batch_size=50,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

_background_noise_/doing_the_dishes.wav
_background_noise_/white_noise.wav
_background_noise_/exercise_bike.wav
_background_noise_/running_tap.wav
_background_noise_/pink_noise.wav
_background_noise_/dude_miaowing.wav
_background_noise_/doing_the_dishes.wav
_background_noise_/white_noise.wav
_background_noise_/exercise_bike.wav
_background_noise_/running_tap.wav
_background_noise_/pink_noise.wav
_background_noise_/dude_miaowing.wav




In [None]:

# Fetch a single batch for inspection; fails loudly (StopIteration) if the
# loader is empty instead of leaving `batch` undefined.
batch = next(iter(train_loader))

In [None]:
# Split the batch fetched above into waveforms (X) and integer class ids (target)
X, target = batch
print(X.shape)
print(target)
# Last expression of the cell: displays the class-index -> class-name mapping
train_loader.dataset.target_class_idx_to_name

torch.Size([50, 1, 16000])
tensor([10,  0,  7,  0,  0,  6,  0,  3,  0,  0,  0, 10, 10,  1,  0,  0,  0,  0,
         2,  0,  0,  6,  0,  0,  0,  0,  2,  1,  0,  1,  6,  0,  0,  6, 11,  0,
         0,  0,  0,  0,  1,  0,  0,  5,  6,  0,  0, 10,  4,  9])


{0: 'unknown',
 1: 'silence',
 2: 'yes',
 3: 'no',
 4: 'up',
 5: 'down',
 6: 'left',
 7: 'right',
 8: 'on',
 9: 'off',
 10: 'stop',
 11: 'go'}

In [None]:
train_loader.dataset.get_class_weights()

tensor([0.1356, 2.3723, 2.3723, 2.3812, 2.3941, 2.3954, 2.3994, 2.3825, 2.3672,
        2.3994, 2.3408, 2.3710])

# LeNet Audio classification model

Raw audio is converted into mel spectrogram and treated as a 2D image

we use poutyne for training to reduce boilerplate code

In [None]:
import math
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    """LeNet-style CNN that classifies raw audio through its mel spectrogram.

    The waveform is converted to a normalised log mel spectrogram and then
    processed like a one-channel image.

    Args:
        num_classes (int): number of output classes. Default: 31
    """
    def __init__(self, num_classes=31):
        super(LeNet, self).__init__()
        self.mels = nn.Sequential(
            MelSpectrogram(),
            nn.InstanceNorm2d(1) # Normalization
        )

        self.features = nn.Sequential(
            nn.Conv2d(1, 20, kernel_size=5),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(20, 20, kernel_size=5),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(),
        )
        # 20 channels x 7 x 22: a (40, 101) spectrogram shrinks to
        # ((40-5+1)/2 - 5 + 1)/2 = 7 rows and ((101-5+1)//2 - 5 + 1)/2 = 22 columns
        self.classifier = nn.Sequential(
            nn.Linear(20*7*22, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, x, debug=False):
        """Classify a batch of waveforms.

        Args:
            x (Tensor): raw audio of size (bs, 1, 16000)
            debug (bool): print the tensor shape after each stage. Default: False
        Returns:
            Tensor: class scores of size (bs, num_classes)
        """
        bs = x.shape[0]
        if debug:
            print(x.shape)  # bs, 1, 16000

        ## Compute mel filterbanks from raw audio
        x = self.mels(x)
        if debug:
            print(x.shape)  # bs, 1, 40, 101

        ## Extract features
        x = self.features(x)
        if debug:
            print(x.shape)  # bs, 20, 7, 22

        ## Flatten features: (bs, c, height, width) -> (bs, c*height*width)
        x = x.view(bs, -1)  # bs, 20*7*22

        ## Classify
        x = self.classifier(x)
        if debug:
            print(x.shape)  # bs, num_classes
        return x

## Task description

1. Adapt the shape of the features to be classified using either x.view(?,?) or nn.Flatten() in the classifier definition

2. Compute the correct values for the first linear layer of the classifier


In [None]:
import torch.nn as nn
import torch.optim as optim
from poutyne.framework import Model
from torchsummary import summary
cuda_device = 0
device = torch.device("cuda:%d" % cuda_device if torch.cuda.is_available() else "cpu")

mymodel = LeNet(num_classes = train_loader.dataset.get_n_classes())
print(mymodel.to(device))
# torchsummary assumes "cuda" unless told otherwise; follow the selected device
summary(mymodel, input_size=(1, 16000), device=device.type)
learning_rate = 0.001

# Optimizer and loss function
#optimizer = optim.SGD(mymodel.parameters(), lr=learning_rate, weight_decay=0.001)
#optimizer = optim.Adam(mymodel.parameters(), lr=learning_rate)
optimizer = optim.Adam( filter(lambda p: p.requires_grad, mymodel.parameters()), lr=learning_rate )
#loss_function = nn.CrossEntropyLoss(weight=train_loader.dataset.get_class_weights())
loss_function = nn.CrossEntropyLoss()

model = Model(mymodel, optimizer, loss_function, batch_metrics=['accuracy'], epoch_metrics=['f1'])

# Send model on GPU
model.to(device)

model.fit_generator(train_loader, valid_loader, epochs=10)


# Test
# With one batch metric and one epoch metric, the second returned element holds
# both values, in the order (accuracy, f1).
test_loss, (test_acc, test_f1) = model.evaluate_generator(test_loader)
print('Test:\n\tLoss: {}\n\tAccuracy: {}\n\tF1: {}'.format(test_loss, test_acc, test_f1))

LeNet(
  (mels): Sequential(
    (0): MelSpectrogram(
      (norm): InstanceNorm2d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
    )
    (1): InstanceNorm2d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  )
  (features): Sequential(
    (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
    (3): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ReLU()
    (6): Dropout2d(p=0.5, inplace=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=3080, out_features=1000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1000, out_features=12, bias=True)
  )
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
    InstanceNorm2d-1           [-1, 1, 40, 101]     

## Check performance with data augmentation

Instead of training with train_loader, we will train with train_loader_noise and evaluate on clean data.

In [None]:
                      
mymodel = LeNet(num_classes = train_loader.dataset.get_n_classes())
learning_rate = 0.001

optimizer = optim.Adam( filter(lambda p: p.requires_grad, mymodel.parameters()), lr=learning_rate )
loss_function = nn.CrossEntropyLoss()

model = Model(mymodel, optimizer, loss_function, batch_metrics=['accuracy'], epoch_metrics=['f1'])

# Send model on GPU
model.to(device)

# Train on the noise-augmented loader, validate on clean data
model.fit_generator(train_loader_noise, valid_loader, epochs=10)


# Test
# With one batch metric and one epoch metric, the second returned element holds
# both values, in the order (accuracy, f1).
test_loss, (test_acc, test_f1) = model.evaluate_generator(test_loader)
print('Test:\n\tLoss: {}\n\tAccuracy: {}\n\tF1: {}'.format(test_loss, test_acc, test_f1))

Epoch 1/20 30.33s Step 1059/1059: loss: 0.887520, acc: 72.958374, fscore_micro: 0.729584, val_loss: 0.466242, val_acc: 85.672257, val_fscore_micro: 0.856723
Epoch 2/20 30.24s Step 1059/1059: loss: 0.511687, acc: 83.487573, fscore_micro: 0.834876, val_loss: 0.352559, val_acc: 89.644013, val_fscore_micro: 0.896440
Epoch 3/20 30.57s Step 1059/1059: loss: 0.416066, acc: 86.628390, fscore_micro: 0.866284, val_loss: 0.316556, val_acc: 90.688438, val_fscore_micro: 0.906884
Epoch 4/20 30.21s Step 1059/1059: loss: 0.366843, acc: 88.165748, fscore_micro: 0.881657, val_loss: 0.281563, val_acc: 91.644601, val_fscore_micro: 0.916446
Epoch 5/20 30.27s Step 1059/1059: loss: 0.333011, acc: 89.161064, fscore_micro: 0.891611, val_loss: 0.253941, val_acc: 92.233010, val_fscore_micro: 0.922330
Epoch 6/20 30.43s Step 1059/1059: loss: 0.302482, acc: 90.299917, fscore_micro: 0.902999, val_loss: 0.256760, val_acc: 92.174169, val_fscore_micro: 0.921742
Epoch 7/20 30.52s Step 1059/1059: loss: 0.282184, acc: 91.

# VGG Audio classification model

Let's borrow a successful image classification model and use it for audio mel-spectrograms "images" training the model from scratch

In [None]:
import torch.nn as nn
import torch.nn.functional as F

#https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py
class VGG(nn.Module):
    """VGG network applied to the mel spectrogram of raw audio.

    Args:
        vgg_name (str): key of `cfg` selecting the convolutional layout
        num_classes (int): number of output classes. Default: 12
    """

    def __init__(self, vgg_name, num_classes=12):
        super(VGG, self).__init__()
        # Front end: raw audio -> normalised log mel spectrogram
        front_end = [MelSpectrogram(), nn.InstanceNorm2d(1)]
        self.mels = nn.Sequential(*front_end)
        self.features = make_layers(cfg[vgg_name])
        # The flattened feature map holds 512 channels x 1 x 3 values
        head = [
            nn.Linear(1 * 3 * 512, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head)
        self._initialize_weights()

    def forward(self, x):
        """Return class scores for a batch of raw waveforms."""
        feats = self.features(self.mels(x))
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)

    def _initialize_weights(self):
        """Initialise conv, batch-norm and linear layers in module order."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                # normal init with variance 2 / (kernel area * output channels)
                fan = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                nn.init.normal_(module.weight, 0, math.sqrt(2. / fan))
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)


def make_layers(cfg, batch_norm=True):
    """Build the convolutional trunk described by `cfg`.

    Args:
        cfg (list): sequence where an int is the output channel count of a
            3x3 convolution (padding 1) and ``'M'`` is a 2x2 max-pooling
        batch_norm (bool): insert a BatchNorm2d after every convolution
    Returns:
        nn.Sequential: the stacked layers, expecting a one-channel input
    """
    modules = []
    channels = 1
    for spec in cfg:
        if spec == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        modules.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(spec))
        modules.append(nn.ReLU(inplace=True))
        # the next convolution consumes what this one produced
        channels = spec
    return nn.Sequential(*modules)


# Convolutional layouts consumed by make_layers(): an integer is the number of
# output channels of a 3x3 convolution, 'M' is a 2x2 max-pooling layer.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

In [None]:
#mymodel = LeNet(num_classes = train_loader.dataset.get_n_classes())
#mymodel = TDNN(num_classes = train_loader.dataset.get_n_classes())
mymodel = VGG('VGG11',num_classes = train_loader.dataset.get_n_classes())
#mymodel = MyVGG(num_classes = train_loader.dataset.get_n_classes())
print(mymodel.to(device))
# torchsummary assumes "cuda" unless told otherwise; follow the selected device
summary(mymodel, input_size=(1, 16000), device=device.type)
learning_rate = 0.001

optimizer = optim.Adam( filter(lambda p: p.requires_grad, mymodel.parameters()), lr=learning_rate )
loss_function = nn.CrossEntropyLoss()

model = Model(mymodel, optimizer, loss_function, batch_metrics=['accuracy'], epoch_metrics=['f1'])
model.to(device)
model.fit_generator(train_loader, valid_loader, epochs=10)

# Test
# With one batch metric and one epoch metric, the second returned element holds
# both values, in the order (accuracy, f1).
test_loss, (test_acc, test_f1) = model.evaluate_generator(test_loader)
print('Test:\n\tLoss: {}\n\tAccuracy: {}\n\tF1: {}'.format(test_loss, test_acc, test_f1))

VGG(
  (mels): Sequential(
    (0): MelSpectrogram(
      (norm): InstanceNorm2d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
    )
    (1): InstanceNorm2d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  )
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 

## VGG fine tuning

Use a pretrained VGG, freezing its parameters and replacing the last linear layer with a new one with the needed num_classes


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F

class MyVGG(nn.Module):
    """ImageNet-pretrained VGG11 fine-tuned on mel spectrograms.

    All pretrained parameters are frozen.  Only the two layers replaced here
    are trained: the first convolution (one input channel instead of three)
    and the last linear layer (`num_classes` outputs).

    Args:
        num_classes (int): number of output classes. Default: 12
    """
    def __init__(self, num_classes=12):
        super(MyVGG, self).__init__()
        if hasattr(models, 'VGG11_Weights'):
            # current torchvision: `pretrained=True` is deprecated in favour of `weights`
            self.origVGG = models.vgg11(weights=models.VGG11_Weights.IMAGENET1K_V1)
        else:
            # older torchvision without the weights enum
            self.origVGG = models.vgg11(pretrained=True)
        for param in self.origVGG.parameters():
            param.requires_grad = False

        # Layers created after the freeze keep requires_grad=True and are trained
        in_features = self.origVGG.classifier[6].in_features
        self.origVGG.classifier[6] = nn.Linear(in_features, num_classes)
        conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.origVGG.features[0] = conv1
        self.mels = nn.Sequential(
            MelSpectrogram(),
            nn.InstanceNorm2d(1) # Normalization
        )

    def forward(self, x):
        """Return class scores for a batch of raw waveforms."""
        x = self.mels(x)
        x = self.origVGG(x)
        return x


In [None]:
#mymodel = LeNet(num_classes = train_loader.dataset.get_n_classes())
#mymodel = TDNN(num_classes = train_loader.dataset.get_n_classes())
#mymodel = VGG('VGG11',num_classes = train_loader.dataset.get_n_classes())
mymodel = MyVGG(num_classes = train_loader.dataset.get_n_classes())
print(mymodel.to(device))
# torchsummary assumes "cuda" unless told otherwise; follow the selected device
summary(mymodel, input_size=(1, 16000), device=device.type)
learning_rate = 0.001

# Only the parameters left trainable (requires_grad=True) are optimised
optimizer = optim.Adam( filter(lambda p: p.requires_grad, mymodel.parameters()), lr=learning_rate )
loss_function = nn.CrossEntropyLoss()

model = Model(mymodel, optimizer, loss_function, batch_metrics=['accuracy'], epoch_metrics=['f1'])
model.to(device)
model.fit_generator(train_loader, valid_loader, epochs=10)

# Test
# With one batch metric and one epoch metric, the second returned element holds
# both values, in the order (accuracy, f1).
test_loss, (test_acc, test_f1) = model.evaluate_generator(test_loader)
print('Test:\n\tLoss: {}\n\tAccuracy: {}\n\tF1: {}'.format(test_loss, test_acc, test_f1))

MyVGG(
  (origVGG): VGG(
    (features): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): ReLU(inplace=True)
      (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): ReLU(inplace=True)
      (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (12): ReLU(inplace=True)
      (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (14): ReLU(inplace=True)
      (15): MaxPool2d(kernel_size=

# TDNN Classification model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TDNN(nn.Module):
    """Time-delay network: 1-D convolutions over time on the mel spectrogram.

    The 40 mel bands are used as input channels of the first convolution.

    Args:
        num_classes (int): number of output classes. Default: 12
        n_frames (int): number of spectrogram frames per input; 101 is what
            one second of 16 kHz audio yields with the 10 ms hop used here.
            Default: 101
    """
    def __init__(self, num_classes=12, n_frames=101):
        super(TDNN, self).__init__()
        self.tdnn = nn.Sequential(
            nn.Conv1d(40, 450, stride=1, dilation=1, kernel_size=3),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=1, kernel_size=4),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=3, kernel_size=3),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=3, kernel_size=3),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=3, kernel_size=3),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=3, kernel_size=3),
            nn.ReLU(True),
            nn.Conv1d(450, 450, stride=1, dilation=3, kernel_size=3),
            nn.ReLU(True),
            nn.MaxPool1d(3, stride=3),
        )
        # Derive the flattened size from the layers instead of hard-coding it:
        # 101 frames leave 22 time steps (450 * 22 = 9900 features), which the
        # previous fixed value of 9000 did not match.
        out_frames = n_frames
        for layer in self.tdnn:
            if isinstance(layer, nn.Conv1d):
                # unpadded, stride-1 convolution
                out_frames -= layer.dilation[0] * (layer.kernel_size[0] - 1)
            elif isinstance(layer, nn.MaxPool1d):
                out_frames = (out_frames - layer.kernel_size) // layer.stride + 1
        self.classifier = nn.Sequential(
            nn.Linear(450 * out_frames, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        self.mels = nn.Sequential(
            MelSpectrogram(),
            nn.InstanceNorm2d(1) # Normalization
        )


    def forward(self, x):
        """Return class scores for a batch of raw waveforms."""
        x = self.mels(x)
        # (bs, 1, 40, frames) -> (bs, 40, frames); out-of-place, so the tensor
        # produced by the normalisation layer is left untouched
        x = x.squeeze(1)
        x = self.tdnn(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x