<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Karen_Rosero/Fine_tuning_wav2vec_E3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sound detection and classification using transformers
## Fine-tuning notebook

## Projeto final da disclipina IA025
## Autora: Karen Rosero

# 1. Configurações iniciais

Instalo as bibliotecas faltantes

In [None]:
#!pip install transformers
#!pip install ipywidgets
#!pip install pytorch-lightning==1.5.10
#!pip install nvidia-ml-py3
#!pip install neptune-client
#!pip install lightning-bolts
#!pip install torchmetrics

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importo as bibliotecas necessárias

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2Config
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureEncoder, Wav2Vec2NoLayerNormConvLayer, Wav2Vec2LayerNormConvLayer
from torch import nn
from transformers.activations import ACT2FN
import ipywidgets 
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
import torchaudio
import torchtext
import pytorch_lightning as pl
import nvidia_smi
from pytorch_lightning.loggers.neptune import NeptuneLogger
from pytorch_lightning.loggers import NeptuneLogger
from IPython.display import display, HTML
from dataclasses import dataclass, field
from torch.utils.data import DataLoader
from typing import Any, Dict, List, Optional, Union
# usadas para as métricas
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from torchmetrics import Accuracy
from torchmetrics import F1Score
import torch.nn.functional as F
import numpy as np
import contextlib

GPU a ser utilizada

In [None]:
!nvidia-smi

Wed Jul  6 00:52:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
print(f"Pytorch Lightning Version: {pl.__version__}")
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")

Pytorch Lightning Version: 1.5.10
Device name: b'Tesla P100-PCIE-16GB'


In [None]:
neptune_logger = NeptuneLogger(
    api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjMWEyNTJlZS05ZDI5LTQzZjktYTkzNy00MDczMmZhODU3OWUifQ==",
    project='kgrosero/IA025-Project-wav2vec2')

Defino hiperparâmetros

In [None]:
version = "wav2vec2-sound_detection_train1" #@param {type: "string"}
lr = 1e-5#@param {type: "number"}
w_decay = 0#@param {type: "number"}
bs = 16#@param {type: "integer"}
accum_grads = 4#@param {type: "integer"}
patience = 30#@param {type: "integer"}
max_epochs = 300#@param {type: "integer"}
# warmup_steps = 1000#@param {type: "integer"}
hold_epochs = 20#@param {type: "integer"}
pretrained = "facebook/wav2vec2-base-960h"#@param {type: "string"}
wav2vec2_processor = "facebook/wav2vec2-base-960h"#@param {type: "string"}
freeze_finetune_updates = 0#@param {type: "integer"}
warmup_epochs = 40#@param {type: "integer"}
apply_mask=False#@param {type: "boolean"}
mask_time_length= 10#@param {type: "integer"}, era 1

# Define hyperparameters
hparams = {"version": version,
          "lr": lr,
          "w_decay": w_decay,
          "bs": bs,
          "patience": patience,
          "hold_epochs":hold_epochs,
          "accum_grads": accum_grads,
          "pretrained":pretrained,
          "wav2vec2_processor": wav2vec2_processor,
          "freeze_finetune_updates":freeze_finetune_updates,
          "warmup_epochs":warmup_epochs,
          "apply_mask":apply_mask, 
          "mask_time_length":mask_time_length, 
          "max_epochs": max_epochs}
hparams

{'accum_grads': 4,
 'apply_mask': False,
 'bs': 16,
 'freeze_finetune_updates': 0,
 'hold_epochs': 20,
 'lr': 1e-05,
 'mask_time_length': 10,
 'max_epochs': 300,
 'patience': 30,
 'pretrained': 'facebook/wav2vec2-base-960h',
 'version': 'wav2vec2-sound_detection_train1',
 'w_decay': 0,
 'warmup_epochs': 40,
 'wav2vec2_processor': 'facebook/wav2vec2-base-960h'}

# 2. Adaptação do modelo wav2vec2 para receber e processar 4 canais de áudio na entrada

Crio o processador original do wav2vec2

In [None]:
processor = Wav2Vec2Processor.from_pretrained(hparams["wav2vec2_processor"], return_attention_mask=True)

In [None]:
print(processor)

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='facebook/wav2vec2-base-960h', vocab_size=32, model_max_len=9223372036854775807, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})


O modelo original recebe um canal na entrada. Isso precisa ser mudado para 4 canais

In [None]:
class Wav2Vec2GroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 4 # define 4 canais na camada de entrada
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states
    
class Wav2Vec2_4ChannelFeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [
                Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:] # mudou para que receba todos os canais (4)
        #print("hidden_states", hidden_states.shape)

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)

                    return custom_forward

                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(conv_layer),
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states    

# Crio o novo modelo que herda os processos de Wav2Vec2, mas usa o extrator de features baseado em 4 canais
class Wav2Vec2_4ChannelModel(Wav2Vec2Model):
    def __init__(self, config: Wav2Vec2Config):
        super().__init__(config)

        # del self.feature_extractor
        self.feature_extractor = Wav2Vec2_4ChannelFeatureEncoder(config)    

In [None]:
model4c = Wav2Vec2_4ChannelModel.from_pretrained("facebook/wav2vec2-base-960h", 
                                                 conv_dim = (512, 512, 512,512,512,512),
                                                 conv_stride = (5, 2, 2,2,2,2),
                                                 conv_kernel = (10, 3, 3,3,3,2),
                                                 num_feat_extract_layers = 6,
                                                 ignore_mismatched_sizes=True)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['wav2vec2.feature_extractor.conv_layers.6.conv.weight', 'lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

Agora a primeira camada convolucional do modelo recebe 4 canais como entrada

In [None]:
print(model4c)

Wav2Vec2_4ChannelModel(
  (feature_extractor): Wav2Vec2_4ChannelFeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(4, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (2): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (3): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (4): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      

# 3. Dataloader da base de dados ANSYN (TUT Sound Events 2018 - Ambisonic, Anechoic and Synthetic Impulse Response Dataset)

In [None]:
audio_path = '/content/drive/MyDrive/Colab Notebooks/Karen/wav_separate_sounds/'

In [None]:
train_splits = ['ov1s1', 'ov1s2']
val_split = 'ov1s3'

X_train = []
X_val = []
X_test = []

for split in train_splits:
  for file_ in os.listdir(audio_path+split):
      if 'tra' in file_:
          X_train.append(file_)
      elif 'tst' in file_:
          X_test.append(file_)

for file_ in os.listdir(audio_path+val_split):
    if 'tra' in file_:
        X_val.append(file_)
    elif 'tst' in file_:
        X_test.append(file_)        

In [None]:
print('train:', len(X_train), 'val:', len(X_val), 'test:', len(X_test))

train: 7096 val: 3538 test: 2595


In [None]:
X_train[0]

'tra_049_10_ov1_s1_00.wav'

In [None]:
class ANSYN_Dataset_SE(torch.utils.data.Dataset):

    def __init__(self, filenames):    
        self.filenames = filenames
        # O path deve ser modificado dependendo da localização dos dados de áudio a carregar
        self.audio_paths1 = '/content/drive/MyDrive/Colab Notebooks/Karen/wav_separate_sounds/ov1s1/'
        self.audio_paths2 = '/content/drive/MyDrive/Colab Notebooks/Karen/wav_separate_sounds/ov1s2/'
        self.audio_paths3 = '/content/drive/MyDrive/Colab Notebooks/Karen/wav_separate_sounds/ov1s3/'

    def process_audio(self, signal, new_sr):
        # right pad if neccesary 
        length_signal = signal.shape[1]
        if length_signal < 53363:
            num_missing_samples = 53363 - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        elif length_signal > 53363:
            signal = signal[:,:53363]
        return signal        
        
    def normalize_layer(self, feats):  
        with torch.no_grad():
            feats = torch.nn.functional.layer_norm(feats, feats.shape)
        return feats

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        if "ov1_s1" in self.filenames[index]:
          feats, _ = torchaudio.load(self.audio_paths1 + self.filenames[index])
        elif "ov1_s2" in self.filenames[index]:
          feats, _ = torchaudio.load(self.audio_paths2 + self.filenames[index])
        elif "ov1_s3" in self.filenames[index]:
          feats, _ = torchaudio.load(self.audio_paths3 + self.filenames[index])          
        
        feats = self.process_audio(feats, 16000)
        feats = self.normalize_layer(feats)
        target = torch.tensor(int(self.filenames[index][-6:-4])).long()
        return {"input_values": feats, "target":target}

In [None]:
train_dataset = ANSYN_Dataset_SE(X_train)                           
val_dataset =  ANSYN_Dataset_SE(X_val)  
test_dataset = ANSYN_Dataset_SE(X_test)

In [None]:
print('Número de amostras de treinamento:', len(train_dataset))
print('Número de amostras de validação:', len(val_dataset))
print('Número de amostras de teste:', len(test_dataset))

Número de amostras de treinamento: 7096
Número de amostras de validação: 3538
Número de amostras de teste: 2595


In [None]:
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"Frequência de amostragem aceita pelo modelo: {target_sampling_rate}")
# Conferindo se os dados de entrada não geram erro no processor
inputs = processor(train_dataset[5]["input_values"], sampling_rate=target_sampling_rate, return_tensors="pt")
print(f'Input values dimensão: {inputs["input_values"].shape}')
print(inputs)

Frequência de amostragem aceita pelo modelo: 16000
Input values dimensão: torch.Size([1, 4, 53363])
{'input_values': tensor([[[-0.0532,  0.0684, -0.0759,  ...,  0.0004,  0.0004,  0.0004],
         [-0.0689, -0.0306,  0.0201,  ...,  0.0004,  0.0004,  0.0004],
         [ 0.0483, -0.0078, -0.1067,  ...,  0.0004,  0.0004,  0.0004],
         [ 0.0095, -0.0345, -0.0442,  ...,  0.0004,  0.0004,  0.0004]]]), 'attention_mask': tensor([[1, 1, 1, 1]], dtype=torch.int32)}


In [None]:
print('Dimensões de entrada do modelo:')
with torch.no_grad():
    outputs = model4c(**inputs)
last_hidden_states = outputs.last_hidden_state

print('Dimensões de saída do modelo: \n',last_hidden_states.shape)

Dimensões de entrada do modelo:
Dimensões de saída do modelo: 
 torch.Size([1, 333, 768])


In [None]:
@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["target"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        #print('batch', batch)
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=True,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )
        #print('labels_batch', labels_batch)


        labels = labels_batch["input_ids"]

        batch["target"] = labels

        return batch

In [None]:
data_collator = DataCollatorWithPadding(processor=processor,
                                        # max_length=188,
                                        padding=True)

In [None]:
batch_size = hparams["bs"]

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              collate_fn = data_collator,
                              shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                            collate_fn = data_collator,
                            shuffle=False, num_workers=4)

print('Número de minibatches de treinamento:', len(train_dataloader))
print('Número de minibatches de validação:', len(val_dataloader))

batch = next(iter(train_dataloader))
#print(batch)

x_train, y_train = batch['input_values'], batch['target']
print("\nDimensões dos dados de um minibatch - Audio:", x_train.size())
# print("\nDimensões dos dados de um minibatch:", padding_mask.size())
print("\nDimensões dos dados de um minibatch - Target:", y_train.size())
print("Valores mínimo e máximo entrada: ", torch.min(x_train), torch.max(x_train))
print("Valores mínimo e máximo saída: ", torch.min(y_train), torch.max(y_train))
print("Tipo dos dados dos áudios:         ", type(x_train))
print("Tipo das classes das classes:       ", type(y_train))

Número de minibatches de treinamento: 444
Número de minibatches de validação: 222

Dimensões dos dados de um minibatch - Audio: torch.Size([16, 4, 53363])

Dimensões dos dados de um minibatch - Target: torch.Size([16])
Valores mínimo e máximo entrada:  tensor(-52.5315) tensor(55.0558)
Valores mínimo e máximo saída:  tensor(1) tensor(10)
Tipo dos dados dos áudios:          <class 'torch.Tensor'>
Tipo das classes das classes:        <class 'torch.Tensor'>


Defino as métricas que serão usadas

In [None]:
n_classes = 11
f1 = F1Score(num_classes=n_classes, average='macro')
accuracy = Accuracy(num_classes=n_classes)

### Criando o  lightning module

In [None]:
class Wav2Vec2_sound_detection(pl.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()

        self.hparams.update(hparams)

        self.freeze_finetune_updates = hparams["freeze_finetune_updates"]
        
        #self.model = model4c
        self.model = Wav2Vec2_4ChannelModel.from_pretrained(hparams["pretrained"],
                                                 conv_dim = (512, 512, 512, 512, 512, 512),
                                                 conv_stride = (5, 2, 2, 2, 2, 2),
                                                 conv_kernel = (10, 3, 3, 3, 3, 2),
                                                 num_feat_extract_layers = 6,                                                   
                                                 apply_spec_augment=hparams["apply_mask"],
                                                 #mask_time_length=hparams["mask_time_length"],
                                                 ignore_mismatched_sizes=True)
        

        # self.model.feature_extractor._freeze_parameters()
        
        # freeze base-model
        # for param in self.model.parameters():
        #     param.requires_grad = False
        
        self.projector = nn.Linear(self.model.config.hidden_size, self.model.config.classifier_proj_size)
        n_classes = 11
        self.final_layer = nn.Linear(self.model.config.classifier_proj_size, n_classes)    



    
    def forward(self, samples):
        
        #print("input_values", samples["input_values"].shape)
        #print("attention_mask", samples["attention_mask"].shape)

        ft = self.freeze_finetune_updates <= self.trainer.global_step
       
        with torch.no_grad() if not ft else contextlib.ExitStack():
              hidden_states = self.model(**samples).last_hidden_state
        
        #print('hidden_states', hidden_states.shape)
        #print('hidden_states squeeze', hidden_states[:,0,:].shape)

        padding_mask = self.model._get_feature_vector_attention_mask(hidden_states.shape[1], samples["attention_mask"])
        # print("padding_mask", padding_mask.shape)

        hidden_states[~padding_mask] = 0.0
        
        pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        # print("pooled_output", pooled_output.shape)

        proj_pooled = self.projector(pooled_output)
        
        # print("proj_pooled", proj_pooled.shape)

        preds = self.final_layer(proj_pooled)
        
        return F.log_softmax(preds, dim=1)
    
    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations makes sure that all values before the output lengths idxs are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask


    def training_step(self, train_batch, batch_idx):
        
        y_value = train_batch.pop("target")
        log_softs = self.forward(train_batch)

        # print('train preds:', preds) 
        # print('train y_value:', y_value)

        # loss mse or rmspe compute
        # mse_loss = F.mse_loss(preds, y_value)
        loss = F.nll_loss(log_softs, y_value)

        # print('train mse', mse_loss)
        # print('#########################')

        # self.log('mse_loss_step', loss, on_step=True, prog_bar=True)
        self.log('loss_step', loss, on_step=True, prog_bar=True)
        
        return loss

    def training_epoch_end(self, outputs):
        loss = torch.stack([x['loss'] for x in outputs]).mean()       

        self.log("train_loss", loss, prog_bar=True)
  
    def validation_step(self, val_batch, batch_idx):
        
        y_value = val_batch.pop("target")

        # predict 
        log_softs = self.forward(val_batch)
        preds = torch.argmax(log_softs, dim=1) 
        
        # print('log_softs:', log_softs.shape) 
        # print('preds:', preds.shape) 
        # print('y_value:', y_value.shape)
        
        # print('val preds:', preds) 
        # print('val y_value:', y_value)       

        val_acc = accuracy(preds.cpu(), y_value.cpu())
        val_f1 = f1(preds.cpu(), y_value.cpu())
        val_loss = F.nll_loss(log_softs, y_value)

        # print('val mse', mse_loss)
        # print('val rmspe', rmspe_loss)
        # print('----------------------')

        self.log('val_acc', val_acc, prog_bar=True)
        self.log('val_f1', val_f1, prog_bar=True)
        self.log('val_loss', val_loss, prog_bar=True)

        return {"val_acc_step": val_acc, "val_f1_step": val_f1, "val_loss_step": val_loss}

    def validation_epoch_end(self, outputs):
        acc_mean = torch.stack([x['val_acc_step'] for x in outputs]).mean()
        f1_mean = torch.stack([x['val_f1_step'] for x in outputs]).mean()
        loss_mean = torch.stack([x['val_loss_step'] for x in outputs]).mean()

        self.log("val_acc", acc_mean, prog_bar=True)
        self.log("val_f1", f1_mean, prog_bar=True)
        self.log("val_loss", loss_mean, prog_bar=True)
  
    def test_step(self, test_batch, batch_idx):
        
        y_value = test_batch.pop("target")

        # predict 
        log_softs = self.forward(test_batch)
        preds = torch.argmax(log_softs, dim=1) 

        test_acc = accuracy(preds.cpu(), y_value.cpu())
        test_f1 = f1(preds.cpu(), y_value.cpu())
        test_loss = F.nll_loss(log_softs, y_value)
        
        self.log('test_acc', test_acc, prog_bar=True)
        self.log('test_f1', test_f1, prog_bar=True)
        self.log('test_loss', test_loss, prog_bar=True)

        return {"test_acc_step": test_acc, "test_f1_step": test_f1,  "test_loss_step": test_loss}

    def test_epoch_end(self, outputs):
        acc_mean = torch.stack([x['test_acc_step'] for x in outputs]).mean()
        f1_mean = torch.stack([x['test_f1_step'] for x in outputs]).mean()
        loss_mean = torch.stack([x['test_loss_step'] for x in outputs]).mean()

        self.log("test_acc", acc_mean, prog_bar=True)
        self.log("test_f1", f1_mean, prog_bar=True)
        self.log("test_loss", loss_mean, prog_bar=True)  
        
    def configure_optimizers(self):

        optimizer = torch.optim.Adam(self.parameters(),
                         lr=self.hparams["lr"],
                         betas=(0.9,0.98),
                         eps=1e-6,
                         weight_decay=self.hparams["w_decay"])
        
        scheduler = LinearWarmupCosineAnnealingLR(optimizer, 
                                                  eta_min=0, # final-lr
                                                  warmup_start_lr=self.hparams["lr"],
                                                  warmup_epochs=self.hparams["warmup_epochs"],
                                                  max_epochs=self.hparams["max_epochs"])
        
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [None]:
# Overfitting de 3 minibatches 
#model = Wav2Vec2_sound_detection(hparams)

#trainer = pl.Trainer(gpus=1,
#                     #logger=neptune_logger,
#                     max_epochs=100,
###                     check_val_every_n_epoch=10,
 #                    checkpoint_callback=False, # Disable checkpoint saving.
 #                    overfit_batches=3,
 #                    log_every_n_steps = 1)

#trainer.fit(model, train_dataloader, val_dataloader)
#del model, trainer # Para não ter estouro de mémoria da GPU
#gc.collect()
#torch.cuda.empty_cache()

In [None]:
#!mkdir '/content/drive/MyDrive/Colab Notebooks/Karen/Results/'

In [None]:
pl_model= Wav2Vec2_sound_detection(hparams=hparams)
checkpoint_path = '/content/drive/MyDrive/Colab Notebooks/Karen/Results/'
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = pl.callbacks.ModelCheckpoint(filename=hparams["version"],
                                                  dirpath=checkpoint_dir,
                                                  save_top_k=1,
                                                  verbose = True, 
                                                  monitor="val_f1", mode="max")
early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_f1", patience=hparams["patience"], mode='max')
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')

trainer = pl.Trainer(gpus=1, 
                     precision=16,
                     logger=neptune_logger,
                     # num_sanity_val_steps=0,
                     accumulate_grad_batches=hparams["accum_grads"],
                     enable_checkpointing=True, 
                     callbacks=[early_stop_callback, lr_monitor, checkpoint_callback],
                     max_epochs=hparams["max_epochs"])

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['wav2vec2.feature_extractor.conv_layers.6.conv.weight', 'lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

Files in /content/drive/MyDrive/Colab Notebooks/Karen: ['Fine-tuning wav2vec_ov1.ipynb', 'Fine-tuning wav2vec(1).ipynb', 'wav_separate_sounds', 'Fine-tuning wav2vec_ov1s1.ipynb']
Saving checkpoints to /content/drive/MyDrive/Colab Notebooks/Karen


In [None]:
trainer.fit(pl_model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                   | Params
-------------------------------------------------------
0 | model       | Wav2Vec2_4ChannelModel | 93.9 M
1 | projector   | Linear                 | 196 K 
2 | final_layer | Linear                 | 2.8 K 
-------------------------------------------------------
94.1 M    Trainable params
0         Non-trainable params
94.1 M    Total params
188.125   Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

https://app.neptune.ai/kgrosero/IA025-Project-wav2vec2/e/IAP-16
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 110: val_f1 reached 0.29441 (best 0.29441), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 221: val_f1 reached 0.32378 (best 0.32378), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 332: val_f1 reached 0.58201 (best 0.58201), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 443: val_f1 reached 0.70262 (best 0.70262), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 554: val_f1 reached 0.78280 (best 0.78280), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 665: val_f1 reached 0.80042 (best 0.80042), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 776: val_f1 reached 0.86440 (best 0.86440), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 887: val_f1 reached 0.89439 (best 0.89439), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 998: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 1109: val_f1 reached 0.94022 (best 0.94022), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 1220: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 1331: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 1442: val_f1 reached 0.96832 (best 0.96832), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 1553: val_f1 reached 0.97978 (best 0.97978), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 1664: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 1775: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 1886: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 1997: val_f1 reached 0.98070 (best 0.98070), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 2108: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 2219: val_f1 reached 0.98223 (best 0.98223), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 20, global step 2330: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 21, global step 2441: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 22, global step 2552: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 23, global step 2663: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 24, global step 2774: val_f1 reached 0.98642 (best 0.98642), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 25, global step 2885: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 26, global step 2996: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 27, global step 3107: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 28, global step 3218: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 29, global step 3329: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 30, global step 3440: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 31, global step 3551: val_f1 reached 0.99024 (best 0.99024), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 32, global step 3662: val_f1 reached 0.99029 (best 0.99029), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 33, global step 3773: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 34, global step 3884: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 35, global step 3995: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 36, global step 4106: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 37, global step 4217: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 38, global step 4328: val_f1 reached 0.99335 (best 0.99335), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 39, global step 4439: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 40, global step 4550: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 41, global step 4661: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 42, global step 4772: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 43, global step 4883: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 44, global step 4994: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 45, global step 5105: val_f1 reached 0.99379 (best 0.99379), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 46, global step 5216: val_f1 reached 0.99596 (best 0.99596), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 47, global step 5327: val_f1 reached 0.99611 (best 0.99611), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 48, global step 5438: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 49, global step 5549: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 50, global step 5660: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 51, global step 5771: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 52, global step 5882: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 53, global step 5993: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 54, global step 6104: val_f1 reached 0.99801 (best 0.99801), saving model to "/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 55, global step 6215: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 56, global step 6326: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 57, global step 6437: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 58, global step 6548: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 59, global step 6659: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 60, global step 6770: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 61, global step 6881: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 62, global step 6992: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 63, global step 7103: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 64, global step 7214: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 65, global step 7325: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 66, global step 7436: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 67, global step 7547: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 68, global step 7658: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 69, global step 7769: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 70, global step 7880: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 71, global step 7991: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 72, global step 8102: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 73, global step 8213: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 74, global step 8324: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 75, global step 8435: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 76, global step 8546: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 77, global step 8657: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 78, global step 8768: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 79, global step 8879: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 80, global step 8990: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 81, global step 9101: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 82, global step 9212: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 83, global step 9323: val_f1 was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 84, global step 9434: val_f1 was not in top 1


In [None]:
if not os.path.isfile(checkpoint_dir + hparams["version"]):
    print('Saving processor to: ' + checkpoint_dir + hparams["version"])
    processor.save_pretrained(checkpoint_dir + hparams["version"])

Saving processor to: /content/drive/MyDrive/Colab Notebooks/Karenwav2vec2-sound_detection_train1


In [None]:
best_model = checkpoint_callback.best_model_path
# best_model = "/content/drive/MyDrive/Wav2Vec2_ORVP/wav2vec2_huggingface_fairseq_orvp_test1-epoch=4-step=23459.ckpt"
print(best_model)
test_model = Wav2Vec2_sound_detection.load_from_checkpoint(best_model, hparams=hparams).cuda().eval()

/content/drive/MyDrive/Colab Notebooks/Karen/wav2vec2-sound_detection_train1.ckpt


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['wav2vec2.feature_extractor.conv_layers.6.conv.weight', 'lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

In [None]:
trainer.test(test_model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9985923171043396,
 'test_f1': 0.9980103969573975,
 'test_loss': 0.011824632063508034}
--------------------------------------------------------------------------------


[{'test_acc': 0.9985923171043396,
  'test_f1': 0.9980103969573975,
  'test_loss': 0.011824632063508034}]

In [None]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                            collate_fn = data_collator,
                            shuffle=False, num_workers=0)

In [None]:
trainer.test(test_model, test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9930981397628784,
 'test_f1': 0.9892676472663879,
 'test_loss': 0.04698962718248367}
--------------------------------------------------------------------------------


[{'test_acc': 0.9930981397628784,
  'test_f1': 0.9892676472663879,
  'test_loss': 0.04698962718248367}]