# Prepare data

In [1]:
from os.path import join
from pathlib import Path

import torch
import torch.nn.functional as F
import pytorch_lightning as pl

from torch import nn
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from pytorch_lightning.metrics.functional import accuracy
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

from audio_loader.features.raw_audio import WindowedAudio
from audio_loader.features.log_mel import WindowedLogMel
from audio_loader.ground_truth.timit import TimitGroundTruth
from audio_loader.samplers.dynamic_sampler import DynamicSampler
from audio_loader.dl_frontends.pytorch.fill_ram import get_dataset_dynamic_size

# Dataloader lightning

In [2]:
class TimitMELDataModule(pl.LightningDataModule):
    
    def __init__(self, data_dir, batch_size):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
    
    def prepare_data(self):
        self.timit_gt = TimitGroundTruth(self.data_dir, with_silences=False)
        self.mel_feature_processor = WindowedLogMel(400, 160, 16000, 40, normalize=False)#, delta_orders=[2])
        self.mel_sampler = DynamicSampler([self.mel_feature_processor], self.timit_gt)
        self.original_train_dataset, self.collate_func = get_dataset_dynamic_size(self.mel_sampler, "train")
        self.test_dataset, self.collate_func = get_dataset_dynamic_size(self.mel_sampler, "test")
        
    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            self.val_nb_samples = round(len(self.original_train_dataset)/100)
            self.train_nb_samples = len(self.original_train_dataset) - self.val_nb_samples
            self.train_dataset, self.val_dataset = random_split(
                self.original_train_dataset,
                [self.train_nb_samples, self.val_nb_samples]
            )
        
        if stage == 'test' or stage is None:
            return
            

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True,
                          collate_fn=self.collate_func,
                          drop_last=True)

    def val_dataloader(self):
        #return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False,
        #                  collate_fn=self.collate_func,
        #                  drop_last=False)
        return DataLoader(self.test_dataset, self.batch_size , shuffle=False,
                          collate_fn=self.collate_func,
                          drop_last=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, self.batch_size , shuffle=False,
                          collate_fn=self.collate_func,
                          drop_last=False)

In [3]:
# the data
mel_timit = TimitMELDataModule(join(Path.home(), "data/kaggle_TIMIT"), 16)

# Model definition

In [4]:
class lit_mel_model(pl.LightningModule):
    def __init__(self, feature_size):
        """Init all parameters.
        
        feature_size: int
            size of the expected features for the forward step
        """
        super().__init__()
        self.feature_size = feature_size
        
        self.layer_1_grus = nn.GRU(
            feature_size, 550, 5,
            bidirectional=True,
            batch_first=True,
            dropout=0.2
        )
        
        self.bn_fwd = nn.BatchNorm1d(550)
        self.bn_bwd = nn.BatchNorm1d(550)
        self.layer_2_dense = torch.nn.Linear(1100, 128)
        self.bn_layer_2 = nn.BatchNorm1d(128)
        self.act_layer_2 = nn.LeakyReLU(0.1) # in pytorch kaldi it is softmax
        
        self.layer_3_dense = torch.nn.Linear(128, 58)
        
    def forward(self, x):
        """Forward of the model over the data."""
        batch_size = x.batch_sizes[0]
        # shape: (num_layers*directions, batch_size, hidden_size?)
        h_0 = torch.zeros(5*2, batch_size, 550, device="cuda")
        output, h_n = self.layer_1_grus(x, h_0)

        fwd_h = h_n.view(5, 2, batch_size, 550)[-1, 0]
        bwd_h = h_n.view(5, 2, batch_size, 550)[-1, 1]
    
        fwd_h = self.bn_fwd(fwd_h.view(batch_size, 550))
        bwd_h = self.bn_bwd(bwd_h.view(batch_size, 550))

        h = torch.cat((fwd_h, bwd_h), 1)
        dense1 = self.bn_layer_2(self.act_layer_2(self.layer_2_dense(h)))
        return self.layer_3_dense(dense1)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0004)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.to('cuda'))
        _, y = torch.stack(y).max(dim=1)
        
        loss = F.cross_entropy(y_hat, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.to('cuda'))
        _, target = torch.stack(y).max(dim=1)
        _, pred = y_hat.max(dim=1)

        loss = F.cross_entropy(y_hat, target)
        self.log('val_loss', loss)
        self.log('val_accuracy', accuracy(pred+1, target+1))
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.to('cuda'))
        _, target = torch.stack(y).max(dim=1)
        _, pred = y_hat.max(dim=1)
        
        loss = F.cross_entropy(y_hat, target)
        self.log('test_loss', loss)
        self.log('test_accuracy', accuracy(pred+1, target+1))
        return loss

In [5]:
# init model
model = lit_mel_model(40*1) # log MEl band
model.to('cuda')

lit_mel_model(
  (layer_1_grus): GRU(40, 550, num_layers=5, batch_first=True, dropout=0.2, bidirectional=True)
  (bn_fwd): BatchNorm1d(550, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_bwd): BatchNorm1d(550, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_2_dense): Linear(in_features=1100, out_features=128, bias=True)
  (bn_layer_2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act_layer_2): LeakyReLU(negative_slope=0.1)
  (layer_3_dense): Linear(in_features=128, out_features=58, bias=True)
)

# Train model

In [6]:
# trainer definition
trainer = pl.Trainer(
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=10, mode="min")
    ],
    checkpoint_callback=ModelCheckpoint(save_top_k=5, monitor="val_loss", mode="min"),
    progress_bar_refresh_rate=1000,
    gpus=1, auto_select_gpus=True,
    precision=16,
    max_epochs=100
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


In [7]:
trainer.fit(model, mel_timit)

mel_timit.setup("test")
trainer.test(model, mel_timit.test_dataloader())


  | Name          | Type        | Params
----------------------------------------------
0 | layer_1_grus  | GRU         | 23 M  
1 | bn_fwd        | BatchNorm1d | 1 K   
2 | bn_bwd        | BatchNorm1d | 1 K   
3 | layer_2_dense | Linear      | 140 K 
4 | bn_layer_2    | BatchNorm1d | 256   
5 | act_layer_2   | LeakyReLU   | 0     
6 | layer_3_dense | Linear      | 7 K   


Epoch 0:   0%|          | 0/13977 [00:00<?, ?it/s] 



Epoch 0:  72%|███████▏  | 10000/13977 [07:50<03:06, 21.27it/s, loss=1.300, v_num=105]
Validating: 0it [00:00, ?it/s][A
Epoch 0:  79%|███████▊  | 11000/13977 [08:09<02:12, 22.46it/s, loss=1.300, v_num=105]
Epoch 0:  86%|████████▌ | 12000/13977 [08:18<01:22, 24.09it/s, loss=1.300, v_num=105]
Epoch 0:  93%|█████████▎| 13000/13977 [08:33<00:38, 25.31it/s, loss=1.271, v_num=105]
Epoch 1:  72%|███████▏  | 10000/13977 [07:56<03:09, 20.99it/s, loss=1.230, v_num=105]
Validating: 0it [00:00, ?it/s][A
Epoch 1:  79%|███████▊  | 11000/13977 [08:13<02:13, 22.27it/s, loss=1.230, v_num=105]
Epoch 1:  86%|████████▌ | 12000/13977 [08:20<01:22, 23.95it/s, loss=1.230, v_num=105]
Epoch 1:  93%|█████████▎| 13000/13977 [08:34<00:38, 25.28it/s, loss=1.266, v_num=105]
Epoch 2:  72%|███████▏  | 10000/13977 [07:12<02:52, 23.12it/s, loss=1.079, v_num=105]
Validating: 0it [00:00, ?it/s][A
Epoch 2:  79%|███████▊  | 11000/13977 [07:30<02:01, 24.44it/s, loss=1.079, v_num=105]
Epoch 2:  86%|████████▌ | 12000/13977 



Epoch 4:   7%|▋         | 1000/13977 [01:08<14:52, 14.55it/s, loss=0.872, v_num=105]
Testing: 0it [00:00, ?it/s]



Testing:  80%|████████  | 3000/3742 [00:21<00:05, 136.36it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6699, device='cuda:0'),
 'test_loss': tensor(1.0169, device='cuda:0'),
 'train_loss': tensor(0.5585, device='cuda:0'),
 'val_accuracy': tensor(0.6622, device='cuda:0'),
 'val_loss': tensor(1.0352, device='cuda:0')}
--------------------------------------------------------------------------------
Testing:  80%|████████  | 3000/3742 [00:27<00:06, 109.80it/s]


[{'train_loss': 0.5585184097290039,
  'val_loss': 1.035223126411438,
  'val_accuracy': 0.6622452735900879,
  'test_loss': 1.016899824142456,
  'test_accuracy': 0.6698864102363586}]