<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Karen_Rosero/Fine_tuning_wav2vec_classf_localiz_ANSYNdataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projeto final da disciplina IA025 - Introdução ao Aprendizado Profundo

# Sound classification and localization using transformers

> ## Fine-tuning notebook for ANSYN dataset



## Autora: Karen Rosero

# 1. Configurações iniciais

Instalo as bibliotecas faltantes

In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): only pytorch-lightning is pinned; every other package resolves to
# whatever release is current at install time, so a fresh run may pick up versions
# that are incompatible with the code below — consider pinning them as well.
!pip install transformers
!pip install ipywidgets
!pip install pytorch-lightning==1.5.10
!pip install nvidia-ml-py3
!pip install neptune-client
!pip install lightning-bolts
!pip install torchmetrics

In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab).
# NOTE(review): the data, log and checkpoint paths used by the active cells of this
# notebook point to /home/lab_acustica/..., not to the mounted drive — confirm that
# this cell is still required.
from google.colab import drive
drive.mount('/content/drive')

Importo as bibliotecas necessárias

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2Config
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureEncoder, Wav2Vec2NoLayerNormConvLayer, Wav2Vec2LayerNormConvLayer
from torch import nn
from transformers.activations import ACT2FN
import os
import csv
import torchaudio
import pytorch_lightning as pl
import nvidia_smi
from pytorch_lightning.loggers import CSVLogger
from IPython.display import display, HTML
from dataclasses import dataclass, field
from torch.utils.data import DataLoader
from typing import Any, Dict, List, Optional, Union
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from torchmetrics import F1Score
import torch.nn.functional as F
import numpy as np
import contextlib
import gc

GPU a ser utilizada

In [None]:
!nvidia-smi

In [None]:
# Report the installed PyTorch Lightning version and the name of the GPU at index 0.
print(f"Pytorch Lightning Version: {pl.__version__}")
# Initialise NVML before querying the device.
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")

Defino o logger para salvar as curvas de aprendizado

In [None]:
# CSV logger that stores the learning curves under the results directory.
# NOTE(review): `hparams` is first defined in the NEXT cell, so on a fresh kernel
# (Restart & Run All) this line raises NameError. Run the hyperparameter cell first,
# or move this cell below it.
logger = CSVLogger("/home/lab_acustica/IA025_Project/Results/Treinamento_ov1_loc", name='lightning_logs_v2', version=hparams["version"])

Defino hiperparâmetros

In [None]:
# Hyperparameters, declared as Colab form fields (`#@param`) and then gathered into
# the `hparams` dict that the rest of the notebook reads.
# NOTE(review): the saved output of this cell shows 'bs': 2, so it was produced with a
# different batch size than the `bs = 8` set here.
# NOTE(review): `hold_epochs` and `mask_time_length` are stored in `hparams` but no
# active code in the visible part of the notebook reads them — confirm whether they
# are still needed.
version = "wav2vec2-sound_detection_train_ov1_loc_v2" #@param {type: "string"}
lr = 1e-5#@param {type: "number"}
w_decay = 0#@param {type: "number"}
bs = 8#@param {type: "integer"}
accum_grads = 2#@param {type: "integer"}
patience = 30#@param {type: "integer"}
max_epochs = 300#@param {type: "integer"}
# warmup_steps = 1000#@param {type: "integer"}
hold_epochs = 20#@param {type: "integer"}
pretrained = "facebook/wav2vec2-base-960h"#@param {type: "string"}
wav2vec2_processor = "facebook/wav2vec2-base-960h"#@param {type: "string"}
freeze_finetune_updates = 0#@param {type: "integer"}
warmup_epochs = 100#@param {type: "integer"}
apply_mask=False#@param {type: "boolean"}
mask_time_length= 10#@param {type: "integer"}, was 1

# Define hyperparameters
hparams = {"version": version,
          "lr": lr,
          "w_decay": w_decay,
          "bs": bs,
          "patience": patience,
          "hold_epochs":hold_epochs,
          "accum_grads": accum_grads,
          "pretrained":pretrained,
          "wav2vec2_processor": wav2vec2_processor,
          "freeze_finetune_updates":freeze_finetune_updates,
          "warmup_epochs":warmup_epochs,
          "apply_mask":apply_mask, 
          "mask_time_length":mask_time_length, 
          "max_epochs": max_epochs}
# Display the resulting dict as the cell output.
hparams

{'version': 'wav2vec2-sound_detection_train_ov1_loc_v2',
 'lr': 1e-05,
 'w_decay': 0,
 'bs': 2,
 'patience': 30,
 'hold_epochs': 20,
 'accum_grads': 2,
 'pretrained': 'facebook/wav2vec2-base-960h',
 'wav2vec2_processor': 'facebook/wav2vec2-base-960h',
 'freeze_finetune_updates': 0,
 'warmup_epochs': 100,
 'apply_mask': False,
 'mask_time_length': 10,
 'max_epochs': 300}

# 2. Adaptação do modelo wav2vec2 para receber e processar 4 canais de áudio na entrada

Crio o processador original do wav2vec2

In [None]:
processor = Wav2Vec2Processor.from_pretrained(hparams["wav2vec2_processor"], return_attention_mask=True)

In [None]:
print(processor)

O modelo original recebe um canal na entrada. Isso precisa ser mudado para 4 canais

In [None]:
class Wav2Vec2GroupNormConvLayer(nn.Module):
    """Conv1d -> GroupNorm -> activation block of the feature encoder.

    When used as the input layer (``layer_id`` of 0) the convolution takes
    4 input channels; any later layer takes the previous layer's output width
    from ``config.conv_dim``.

    Args:
        config: model configuration providing ``conv_dim``, ``conv_kernel``,
            ``conv_stride``, ``conv_bias`` and ``feat_extract_activation``.
        layer_id: position of this layer in the convolutional stack.
    """

    def __init__(self, config, layer_id=0):
        super().__init__()
        # The input layer sees the 4 audio channels; deeper layers chain on conv_dim.
        self.in_conv_dim = 4 if layer_id <= 0 else config.conv_dim[layer_id - 1]
        self.out_conv_dim = config.conv_dim[layer_id]
        width = self.out_conv_dim

        self.conv = nn.Conv1d(
            in_channels=self.in_conv_dim,
            out_channels=width,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        # One group per channel, with learnable scale and shift.
        self.layer_norm = nn.GroupNorm(num_groups=width, num_channels=width, affine=True)

    def forward(self, hidden_states):
        """Apply convolution, normalisation and activation, in that order."""
        convolved = self.conv(hidden_states)
        normalised = self.layer_norm(convolved)
        return self.activation(normalised)
    
class Wav2Vec2_4ChannelFeatureEncoder(nn.Module):
    """Construct the features from a 4-channel raw audio waveform.

    Builds the stack of convolutional layers selected by
    ``config.feat_extract_norm`` and makes sure the first convolution accepts
    4 input channels in both the ``"group"`` and the ``"layer"`` variants.

    Args:
        config: model configuration providing ``feat_extract_norm``,
            ``num_feat_extract_layers`` and the ``conv_*`` settings.

    Raises:
        ValueError: if ``config.feat_extract_norm`` is neither ``"group"`` nor
            ``"layer"``.
    """

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            # 4-channel group-norm input layer, followed by the un-normalised layers.
            conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [
                Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
            # The stock layer-norm conv layer builds its first convolution for a
            # single input channel, so a 4-channel waveform would be rejected.
            # Rebuild that convolution with 4 input channels (same kernel, stride
            # and bias settings).
            first_layer = conv_layers[0]
            first_layer.in_conv_dim = 4
            first_layer.conv = nn.Conv1d(
                4,
                first_layer.out_conv_dim,
                kernel_size=config.conv_kernel[0],
                stride=config.conv_stride[0],
                bias=config.conv_bias,
            )
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        """Disable gradients for every parameter of the encoder."""
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        """Run the waveform through the convolutional stack.

        Args:
            input_values: waveform tensor; all of its channels are passed to the
                first convolution (no channel axis is inserted here).

        Returns:
            The output of the last convolutional layer.
        """
        # Take a view so that the requires_grad flag set below does not change
        # the flag of the caller's tensor; every channel is kept.
        hidden_states = input_values[:]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        use_checkpointing = self._requires_grad and self.gradient_checkpointing and self.training
        for conv_layer in self.conv_layers:
            if use_checkpointing:
                # Trade compute for memory: activations are recomputed on backward.
                hidden_states = torch.utils.checkpoint.checkpoint(conv_layer, hidden_states)
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states

# New model that inherits everything from Wav2Vec2Model but uses the 4-channel feature extractor
class Wav2Vec2_4ChannelModel(Wav2Vec2Model):
    """Wav2Vec2Model whose feature extractor is replaced by the 4-channel encoder.

    The parent constructor runs first; its `feature_extractor` attribute is then
    overwritten with a `Wav2Vec2_4ChannelFeatureEncoder` built from the same config.
    """

    def __init__(self, config: Wav2Vec2Config):
        super().__init__(config)

        # del self.feature_extractor
        self.feature_extractor = Wav2Vec2_4ChannelFeatureEncoder(config)    

In [None]:
# Load the pretrained checkpoint into the 4-channel model, overriding the
# feature-extractor configuration with 6 convolutional layers.
# ignore_mismatched_sizes=True lets loading continue when a checkpoint tensor has a
# different shape than the corresponding model tensor.
# NOTE(review): presumably the first convolution (4 input channels) is such a case and
# therefore starts from random weights — confirm against the loading warnings.
model4c = Wav2Vec2_4ChannelModel.from_pretrained("facebook/wav2vec2-base-960h", 
                                                 conv_dim = (512, 512, 512,512,512,512),
                                                 conv_stride = (5, 2, 2,2,2,2),
                                                 conv_kernel = (10, 3, 3,3,3,2),
                                                 num_feat_extract_layers = 6,
                                                 ignore_mismatched_sizes=True)

Agora a primeira camada convolucional do modelo recebe 4 canais como entrada

In [None]:
print(model4c)

# 3. Dataloader da base de dados ANSYN (TUT Sound Events 2018 - Ambisonic, Anechoic and Synthetic Impulse Response Dataset)

Separando os dados em treinamento, validação e teste

In [None]:
audio_path = '/home/lab_acustica/Documentos/ANSYN_Dataset/wav_separate_sounds_ov1/'

In [None]:
# Build the train / validation / test file lists from the split folders.
train_splits = ['ov1s1_wav', 'ov1s2_wav']
val_split = 'ov1s3_wav'

X_train = []
X_val = []
X_test = []

# Files whose name contains 'tra' go to the training list (train splits) or to the
# validation list (validation split); files containing 'tst' always go to the test list.
split_targets = [(split, X_train) for split in train_splits] + [(val_split, X_val)]

for split, tra_list in split_targets:
    # os.listdir returns entries in arbitrary order; sort so the lists are reproducible.
    for file_ in sorted(os.listdir(os.path.join(audio_path, split))):
        # Store the name without its extension; the dataset appends '.wav' / '.csv'.
        stem = os.path.splitext(file_)[0]
        if 'tra' in file_:
            tra_list.append(stem)
        elif 'tst' in file_:
            X_test.append(stem)

In [None]:
print('train:', len(X_train), 'val:', len(X_val), 'test:', len(X_test))

In [None]:
class ANSYN_Dataset_SE(torch.utils.data.Dataset):
    """Dataset of ANSYN audio clips and their target rows.

    Each item is a dict with:
      * ``"input_values"``: the waveform, padded/cut to ``target_length`` samples
        and normalised over the whole tensor;
      * ``"target"``: the float tensor read from the matching ``.csv`` file.

    Args:
        filenames: list of file names without extension. Each name must contain
            ``"ov1_s1"``, ``"ov1_s2"`` or ``"ov1_s3"`` so that its split folder
            can be located.
        audio_root: folder containing the ``ov1sN_wav`` / ``ov1sN_des`` folders.
            Change it to match where the audio data is stored.
        target_length: number of samples every clip is padded or cut to.
    """

    def __init__(self, filenames,
                 audio_root='/home/lab_acustica/Documentos/ANSYN_Dataset/wav_separate_sounds_ov1/',
                 target_length=32767):
        self.filenames = filenames
        self.target_length = target_length
        # Prefixes completed with 'wav/' (audio) or 'des/' (targets) when loading.
        self.audio_paths1 = os.path.join(audio_root, 'ov1s1_')
        self.audio_paths2 = os.path.join(audio_root, 'ov1s2_')
        self.audio_paths3 = os.path.join(audio_root, 'ov1s3_')

    def _split_prefix(self, name):
        """Return the path prefix of the split that ``name`` belongs to.

        Raises:
            ValueError: if ``name`` contains none of the known split tags.
        """
        known_splits = (
            ("ov1_s1", self.audio_paths1),
            ("ov1_s2", self.audio_paths2),
            ("ov1_s3", self.audio_paths3),
        )
        for tag, prefix in known_splits:
            if tag in name:
                return prefix
        raise ValueError(
            f"Cannot determine the split of '{name}': expected one of "
            f"{[tag for tag, _ in known_splits]} in the file name"
        )

    def process_audio(self, signal, new_sr):
        """Right-pad with zeros or cut ``signal`` to ``target_length`` samples.

        Args:
            signal: tensor whose second dimension is the sample axis.
            new_sr: accepted for interface compatibility; not used.

        Returns:
            Tensor with ``target_length`` entries along its second dimension.
        """
        length_signal = signal.shape[1]
        if length_signal < self.target_length:
            # Pad only at the end of the last dimension.
            last_dim_padding = (0, self.target_length - length_signal)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        elif length_signal > self.target_length:
            signal = signal[:, :self.target_length]
        return signal

    def normalize_layer(self, feats):
        """Normalise ``feats`` over all of its elements, without tracking gradients."""
        with torch.no_grad():
            # normalized_shape is the full shape, so statistics span the whole tensor.
            feats = torch.nn.functional.layer_norm(feats, feats.shape)
        return feats

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        """Load, pad/cut and normalise one clip together with its target row."""
        name = self.filenames[index]
        prefix = self._split_prefix(name)

        # The sampling rate returned by torchaudio is not checked here.
        feats, _ = torchaudio.load(prefix + 'wav/' + name + '.wav')
        target = torch.from_numpy(np.loadtxt(prefix + 'des/' + name + '.csv', delimiter=',')).float()

        feats = self.process_audio(feats, 16000)
        feats = self.normalize_layer(feats)
        return {"input_values": feats, "target": target}

In [None]:
train_dataset = ANSYN_Dataset_SE(X_train)                           
val_dataset =  ANSYN_Dataset_SE(X_val)  
test_dataset = ANSYN_Dataset_SE(X_test)

In [None]:
del X_train, X_val, X_test

In [None]:
print('Número de amostras de treinamento:', len(train_dataset))
print('Número de amostras de validação:', len(val_dataset))
print('Número de amostras de teste:', len(test_dataset))

In [None]:
# Sampling rate configured in the processor's feature extractor.
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"Frequência de amostragem aceita pelo modelo: {target_sampling_rate}")
# Check that the input data does not raise an error in the processor
inputs = processor(train_dataset[5]["input_values"], sampling_rate=target_sampling_rate, return_tensors="pt")
print(f'Input values dimensão: {inputs["input_values"].shape}')
# NOTE(review): this prints the full processor output, not just its shape.
print(inputs)

# NOTE(review): no value is printed after this header.
print('Dimensões de entrada do modelo:')
# Forward pass without gradient tracking, only to inspect the output shape.
with torch.no_grad():
    outputs = model4c(**inputs)
last_hidden_states = outputs.last_hidden_state

print('Dimensões de saída do modelo: \n',last_hidden_states.shape)

In [None]:
@dataclass
class DataCollatorWithPadding:
    """
    Collate dataset items into a padded batch and split the targets.

    Each incoming item is a dict with ``"input_values"`` and ``"target"``. The
    audio is padded through ``processor.pad``; the targets are padded through the
    same call inside ``processor.as_target_processor()``. The returned batch holds
    the padded audio entries plus:

    * ``"class"``: first target column, cast to ``long``;
    * ``"doa"``: all remaining target columns.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used to pad the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy for the audio inputs:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch.
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding.
            The targets are always padded with ``padding=True``.
        max_length (:obj:`int`, `optional`):
            Maximum length passed when padding ``input_values``.
        max_length_labels (:obj:`int`, `optional`):
            Maximum length passed when padding the targets.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, passed when padding ``input_values``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, passed when padding the targets.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and targets are padded by different calls, so separate them first.
        audio_items = []
        target_items = []
        for sample in features:
            audio_items.append({"input_values": sample["input_values"]})
            target_items.append({"input_ids": sample["target"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            padded_targets = self.processor.pad(
                target_items,
                padding=True,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        target_matrix = padded_targets["input_ids"]

        # Column 0 holds the class index; the remaining columns hold the DoA values.
        batch["class"] = target_matrix[:, 0].long()
        batch["doa"] = target_matrix[:, 1:]

        return batch

In [None]:
data_collator = DataCollatorWithPadding(processor=processor,
                                        # max_length=188,
                                        padding=True)

In [None]:
batch_size = hparams["bs"]

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              collate_fn = data_collator,
                              shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                            collate_fn = data_collator,
                            shuffle=False, num_workers=4)

# 4. Módulo de PyTorch Lightning para fine-tuning

In [None]:
#Defino as métricas que serão usadas
f1 = F1Score(num_classes=11, average='macro')

class Wav2Vec2_sound_detection(pl.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()

        self.hparams.update(hparams)

        self.freeze_finetune_updates = hparams["freeze_finetune_updates"]
        
        self.model = Wav2Vec2_4ChannelModel.from_pretrained(hparams["pretrained"],
                                                 conv_dim = (512, 512, 512, 512, 512, 512),
                                                 conv_stride = (5, 2, 2, 2, 2, 2),
                                                 conv_kernel = (10, 3, 3, 3, 3, 2),
                                                 num_feat_extract_layers = 6,                                                   
                                                 apply_spec_augment=hparams["apply_mask"],
                                                 #mask_time_length=hparams["mask_time_length"],
                                                 ignore_mismatched_sizes=True)

        # self.model.feature_extractor._freeze_parameters()
        
        # freeze base-model
        # for param in self.model.parameters():
        #     param.requires_grad = False
        
        self.projector = nn.Linear(self.model.config.hidden_size, self.model.config.classifier_proj_size)
        self.final_layer_class = nn.Linear(self.model.config.classifier_proj_size, 11)    
        self.final_layer_doa = nn.Linear(self.model.config.classifier_proj_size, 2)  
        
    def recall_loc(self, loc1, loc2):
        """
        Angular distance between two spherical coordinates
        MORE: https://en.wikipedia.org/wiki/Great-circle_distance
        :return: angular distance in degrees
        """
        loc1 = loc1.cpu()
        loc2 = loc2.cpu()
        dist = np.sin(loc1[:,0]*np.pi/180) * np.sin(loc2[:,0]*np.pi/180) + np.cos(loc1[:,0]*np.pi/180) * np.cos(loc2[:,0]*np.pi/180) * np.cos(np.abs(loc1[:,1]*np.pi/180 - loc2[:,1]*np.pi/180))
        # Making sure the dist values are in -1 to 1 range, else np.arccos kills the job
        dist = np.clip(dist, -1, 1)
        dist = np.arccos(dist) * 180 / np.pi
        dist = dist <= 20
        recall = sum(dist) / loc1.shape[0]
        del loc1, loc2, dist
        return recall        

    def forward(self, samples):

        ft = self.freeze_finetune_updates <= self.trainer.global_step
       
        with torch.no_grad() if not ft else contextlib.ExitStack():
              hidden_states = self.model(**samples).last_hidden_state
        
        padding_mask = self.model._get_feature_vector_attention_mask(hidden_states.shape[1], samples["attention_mask"])

        hidden_states[~padding_mask] = 0.0
        
        pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        proj_pooled = self.projector(pooled_output)

        preds_class = self.final_layer_class(proj_pooled)
        
        preds_doa = self.final_layer_doa(proj_pooled)
        
        return F.log_softmax(preds_class, dim=1), preds_doa
    
    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        
        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations makes sure that all values before the output lengths idxs are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask


    def training_step(self, train_batch, batch_idx):
        
        class_value = train_batch.pop("class")
        doa_value = train_batch.pop("doa")
        
        class_log_softs, doa_preds = self.forward(train_batch)

        ce_loss = F.nll_loss(class_log_softs, class_value)
        mse_loss = F.mse_loss(doa_preds, doa_value)/1000
        loss = 0.2*ce_loss + 0.8*mse_loss
        
        return {'loss':loss, 'ce_loss':ce_loss, 'mse_loss': mse_loss}

    def training_epoch_end(self, outputs):
        loss = torch.stack([x['loss'] for x in outputs]).mean()     
        ce_loss = torch.stack([x['ce_loss'] for x in outputs]).mean() 
        mse_loss = torch.stack([x['mse_loss'] for x in outputs]).mean() 
        #print('loss tr:', loss, ce_loss, mse_loss)

        self.log("train_loss", loss, on_step = False,on_epoch = True, prog_bar=True)
        self.log("train_ce_loss", ce_loss, on_step = False,on_epoch = True, prog_bar=True)
        self.log("train_mse_loss", mse_loss, on_step = False,on_epoch = True, prog_bar=True)
  
    def validation_step(self, val_batch, batch_idx):
        
        class_value = val_batch.pop("class")
        doa_value = val_batch.pop("doa")

        class_log_softs, doa_preds = self.forward(val_batch)
        
        val_ce_loss = F.nll_loss(class_log_softs, class_value)
        val_mse_loss = F.mse_loss(doa_preds, doa_value)/1000
        val_loss = 0.2*val_ce_loss + 0.8*val_mse_loss
        
        class_preds = torch.argmax(class_log_softs, dim=1) 
        val_f1 = f1(class_preds.cpu(), class_value.cpu())
        
        val_recall = self.recall_loc(doa_preds, doa_value)  
        metric_monitor = (val_f1 + val_recall)/2
        
        return {"val_loss_step": val_loss, 'val_ce_loss': val_ce_loss, 'val_mse_loss': val_mse_loss, \
                "val_f1_step": val_f1, 'val_rc':val_recall, 'monitor':metric_monitor}
        #return {"val_acc_step": val_acc, "val_f1_step": val_f1, "val_loss_step": val_loss}

    def validation_epoch_end(self, outputs):
        f1_mean = torch.stack([x['val_f1_step'] for x in outputs]).mean()
        rc_mean = torch.stack([x['val_rc'] for x in outputs]).mean()
        loss_mean = torch.stack([x['val_loss_step'] for x in outputs]).mean()
        loss_mean_ce = torch.stack([x['val_ce_loss'] for x in outputs]).mean()
        loss_mean_mse = torch.stack([x['val_mse_loss'] for x in outputs]).mean()
        monitor = torch.stack([x['monitor'] for x in outputs]).mean()
        #print('val: ', monitor)
        self.log("val_f1_mean", f1_mean, on_step = False, on_epoch = True,prog_bar=True)
        self.log("val_rc_mean", rc_mean, on_step = False,on_epoch = True,prog_bar=True)
        #self.log("val_loss_mean", loss_mean, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_ce_loss_mean", loss_mean_ce, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_mse_loss_mean", loss_mean_mse, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_monitor_mean", monitor, on_step = False, on_epoch = True,prog_bar = True)
        
    def test_step(self, test_batch, batch_idx):
        
        class_value = test_batch.pop("class")
        doa_value = test_batch.pop("doa")

        # predict 
        class_log_softs, doa_preds  = self.forward(test_batch)
        test_ce_loss = F.nll_loss(class_log_softs, class_value)
        test_mse_loss = F.mse_loss(doa_preds, doa_value)/1000
        test_loss = 0.2*test_ce_loss + 0.8*test_mse_loss
        
        class_preds = torch.argmax(class_log_softs, dim=1) 
        test_f1 = f1(class_preds.cpu(), class_value.cpu())

        test_recall = self.recall_loc(doa_preds, doa_value)  
        
        return {"test_loss_step": test_loss, 'test_ce_loss':test_ce_loss, 'test_mse_loss':test_mse_loss, \
                 'test_rc_step':test_recall, 'test_f1_step':test_f1}

    def test_epoch_end(self, outputs):
        t_rc_mean = torch.stack([x['test_rc_step'] for x in outputs]).mean()
        t_f1_mean = torch.stack([x['test_f1_step'] for x in outputs]).mean()
        t_loss_mean = torch.stack([x['test_loss_step'] for x in outputs]).mean()
        ce_t_loss_mean = torch.stack([x['test_ce_loss'] for x in outputs]).mean()
        mse_t_loss_mean = torch.stack([x['test_mse_loss'] for x in outputs]).mean()
        print("recall:", t_rc_mean, "f1:", t_f1_mean, "loss:", t_loss_mean, "ce_loss:", ce_t_loss_mean, "mse_loss:", mse_t_loss_mean)
      

    def configure_optimizers(self):

        optimizer = torch.optim.Adam(self.parameters(),
                         lr=self.hparams["lr"],
                         betas=(0.9,0.98),
                         eps=1e-6,
                         weight_decay=self.hparams["w_decay"])
        
        scheduler = LinearWarmupCosineAnnealingLR(optimizer, 
                                                  eta_min=0, # final-lr
                                                  warmup_start_lr=self.hparams["lr"],
                                                  warmup_epochs=self.hparams["warmup_epochs"],
                                                  max_epochs=self.hparams["max_epochs"])
        
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

### Overfitting de 3 minibatches. Descomentar para testar o código

In [None]:
#model = Wav2Vec2_sound_detection(hparams)

#trainer = pl.Trainer(gpus=1,
#                     logger=logger,
#                     max_epochs=300,
#                     checkpoint_callback=False, # Disable checkpoint saving.
#                     overfit_batches=3,
#                     log_every_n_steps = 1)

#trainer.fit(model, train_dataloader, val_dataloader)
#del model, trainer # Para não ter estouro de mémoria da GPU
#gc.collect()
#torch.cuda.empty_cache()

### Treinamento do modelo adaptado

In [None]:
#!mkdir '/content/drive/MyDrive/Colab Notebooks/Karen/Results/'

In [None]:
# Build the Lightning module, the checkpoint / early-stopping callbacks and the trainer.
pl_model= Wav2Vec2_sound_detection(hparams=hparams)
checkpoint_path = '/home/lab_acustica/IA025_Project/Results/Treinamento_ov1_loc/'
# NOTE(review): os.path.abspath drops the trailing slash, so os.path.dirname returns the
# PARENT folder (.../Results) rather than .../Results/Treinamento_ov1_loc — the saved
# output of this cell confirms checkpoints are written to the parent. Confirm which
# folder is intended before changing it; other cells may load checkpoints from there.
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
# Keep only the best checkpoint according to the validation monitor (higher is better).
checkpoint_callback = pl.callbacks.ModelCheckpoint(filename=hparams["version"],
                                                  dirpath=checkpoint_dir,
                                                  save_top_k=1,
                                                  verbose = True, 
                                                  monitor="val_monitor_mean", mode="max")
# Stop when the monitored value has not improved for `patience` validation checks.
early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_monitor_mean", patience=hparams["patience"], mode='max')
#lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
# Single-GPU, 16-bit precision training with gradient accumulation.
trainer = pl.Trainer(gpus=1, 
                     precision=16,
                     logger=logger,
                     # num_sanity_val_steps=0,
                     accumulate_grad_batches=hparams["accum_grads"],
                     enable_checkpointing=True, 
                     callbacks=[early_stop_callback, checkpoint_callback],
                     max_epochs=hparams["max_epochs"])

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.feature_extractor.conv_layers.6.conv.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

Files in /home/lab_acustica/IA025_Project/Results: ['Treinamento_ov1_loc', 'Treinamento_ov1s1_loc', 'wav2vec2-sound_detection_ov1s1_loc_curve-v1.ckpt', 'wav2vec2-sound_detection_ov1s1_loc_curve.ckpt', 'Treinamento ov1s1', 'wav2vec2-sound_detection_train_ov1_loc.ckpt']
Saving checkpoints to /home/lab_acustica/IA025_Project/Results


In [None]:
# Run the fine-tuning loop: train on the training loader and validate on the
# validation loader (the callbacks configured on the trainer act on the result).
trainer.fit(
    pl_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                   | Params
-------------------------------------------------------------
0 | model             | Wav2Vec2_4ChannelModel | 93.9 M
1 | projector         | Linear                 | 196 K 
2 | final_layer_class | Linear                 | 2.8 K 
3 | final_layer_doa   | Linear                 | 514   
-------------------------------------------------------------
94.1 M    Trainable params
0         Non-trainable params
94.1 M    Total params
188.126   Total estimated model params size (MB)
  rank_zero_warn(


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 1774: 'val_monitor_mean' reached 0.17585 (best 0.17585), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 3548: 'val_monitor_mean' reached 0.27624 (best 0.27624), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 5322: 'val_monitor_mean' reached 0.30078 (best 0.30078), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 7096: 'val_monitor_mean' reached 0.36617 (best 0.36617), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 8870: 'val_monitor_mean' reached 0.43819 (best 0.43819), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 10644: 'val_monitor_mean' reached 0.52718 (best 0.52718), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 12418: 'val_monitor_mean' reached 0.56425 (best 0.56425), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 14192: 'val_monitor_mean' reached 0.59134 (best 0.59134), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 15966: 'val_monitor_mean' reached 0.61819 (best 0.61819), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 17740: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 19514: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 21288: 'val_monitor_mean' reached 0.63430 (best 0.63430), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 23062: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 24836: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 26610: 'val_monitor_mean' reached 0.68768 (best 0.68768), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 28384: 'val_monitor_mean' reached 0.78227 (best 0.78227), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 30158: 'val_monitor_mean' reached 0.89570 (best 0.89570), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 31932: 'val_monitor_mean' reached 0.96726 (best 0.96726), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 33706: 'val_monitor_mean' reached 0.97442 (best 0.97442), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 35480: 'val_monitor_mean' reached 0.97861 (best 0.97861), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 37254: 'val_monitor_mean' reached 0.98676 (best 0.98676), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 39028: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 40802: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 42576: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 44350: 'val_monitor_mean' reached 0.98921 (best 0.98921), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 25, global step 46124: 'val_monitor_mean' reached 0.99204 (best 0.99204), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 26, global step 47898: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 27, global step 49672: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 28, global step 51446: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 29, global step 53220: 'val_monitor_mean' reached 0.99289 (best 0.99289), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 30, global step 54994: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 31, global step 56768: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 32, global step 58542: 'val_monitor_mean' reached 0.99491 (best 0.99491), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 33, global step 60316: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 34, global step 62090: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 35, global step 63864: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 36, global step 65638: 'val_monitor_mean' reached 0.99618 (best 0.99618), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 37, global step 67412: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 38, global step 69186: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 39, global step 70960: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 40, global step 72734: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 41, global step 74508: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 42, global step 76282: 'val_monitor_mean' reached 0.99647 (best 0.99647), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 43, global step 78056: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 44, global step 79830: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 45, global step 81604: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 46, global step 83378: 'val_monitor_mean' reached 0.99684 (best 0.99684), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 47, global step 85152: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 48, global step 86926: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 49, global step 88700: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 50, global step 90474: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 51, global step 92248: 'val_monitor_mean' reached 0.99779 (best 0.99779), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 52, global step 94022: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 53, global step 95796: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 54, global step 97570: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 55, global step 99344: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 56, global step 101118: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 57, global step 102892: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 58, global step 104666: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 59, global step 106440: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 60, global step 108214: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 61, global step 109988: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 62, global step 111762: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 63, global step 113536: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 64, global step 115310: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 65, global step 117084: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 66, global step 118858: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 67, global step 120632: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 68, global step 122406: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 69, global step 124180: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 70, global step 125954: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 71, global step 127728: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 72, global step 129502: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 73, global step 131276: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 74, global step 133050: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 75, global step 134824: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 76, global step 136598: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 77, global step 138372: 'val_monitor_mean' reached 0.99793 (best 0.99793), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 78, global step 140146: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 79, global step 141920: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 80, global step 143694: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 81, global step 145468: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 82, global step 147242: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 83, global step 149016: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 84, global step 150790: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 85, global step 152564: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 86, global step 154338: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 87, global step 156112: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 88, global step 157886: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 89, global step 159660: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 90, global step 161434: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 91, global step 163208: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 92, global step 164982: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 93, global step 166756: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 94, global step 168530: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 95, global step 170304: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 96, global step 172078: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 97, global step 173852: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 98, global step 175626: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 99, global step 177400: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 100, global step 179174: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 101, global step 180948: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 102, global step 182722: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 103, global step 184496: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 104, global step 186270: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 105, global step 188044: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 106, global step 189818: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 107, global step 191592: 'val_monitor_mean' was not in top 1


In [None]:
# Folder that receives the processor files, next to the saved checkpoints.
# Built with os.path.join: plain string concatenation dropped the path
# separator and produced ".../Results<version>" instead of ".../Results/<version>".
processor_dir = os.path.join(checkpoint_dir, hparams["version"])

# save_pretrained() writes a directory, so the guard must use isdir(); the
# previous isfile() check never matched an already-saved processor, which was
# therefore re-saved on every run.
if not os.path.isdir(processor_dir):
    print('Saving processor to: ' + processor_dir)
    processor.save_pretrained(processor_dir)

Saving processor to: /home/lab_acustica/IA025_Project/Resultswav2vec2-sound_detection_train_ov1_loc_v2


In [None]:
# `best_model` holds the PATH of the top-ranked checkpoint tracked by the
# ModelCheckpoint callback (not a model object).
best_model = checkpoint_callback.best_model_path
print(best_model)

# Rebuild the module from that checkpoint, then move it to the GPU and switch
# it to evaluation mode.
test_model = Wav2Vec2_sound_detection.load_from_checkpoint(best_model, hparams=hparams)
test_model = test_model.cuda()
test_model = test_model.eval()

/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_v2.ckpt


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.feature_extractor.conv_layers.6.conv.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

In [None]:
# Run the test loop with the restored checkpoint on the validation loader.
trainer.test(model=test_model, dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

recall: tensor(1.) f1: tensor(0.9959) loss: tensor(0.0258, device='cuda:0') ce_loss: tensor(0.0222, device='cuda:0') mse_loss: tensor(0.0267, device='cuda:0')


[{}]

In [None]:
# Loader over the test dataset: batches are assembled by `data_collator`,
# order is kept fixed (no shuffling) and loading happens in the main process.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

In [None]:
# Run the test loop with the restored checkpoint on the test loader.
trainer.test(model=test_model, dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

recall: tensor(1.) f1: tensor(0.9836) loss: tensor(0.0363, device='cuda:0') ce_loss: tensor(0.0747, device='cuda:0') mse_loss: tensor(0.0267, device='cuda:0')


[{}]