<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Karen_Rosero/Fine_tuning_wav2vec_classf_localiz_REALdataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projeto final da disciplina IA025 - Introdução ao Aprendizado Profundo
## Sound classification and localization using transformers

> ## Fine-tuning notebook for the REAL dataset

### Autora: Karen Rosero

# 1. Configurações iniciais

Importo as bibliotecas necessárias

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2Config
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureEncoder, Wav2Vec2NoLayerNormConvLayer, Wav2Vec2LayerNormConvLayer
from torch import nn
from transformers.activations import ACT2FN
import os
import csv
import torchaudio
import pytorch_lightning as pl
import nvidia_smi
from pytorch_lightning.loggers import CSVLogger
from IPython.display import display, HTML
from dataclasses import dataclass, field
from torch.utils.data import DataLoader
from typing import Any, Dict, List, Optional, Union
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from torchmetrics import F1Score
import torch.nn.functional as F
import numpy as np
import contextlib
import gc

GPU a ser utilizada

Defino hiperparâmetros

In [None]:
# Training configuration. The `#@param` markers expose each value as a Colab form field.
version = "wav2vec2-sound_detection_train_ov1_loc_REALv4"  #@param {type: "string"}
pretrained = "facebook/wav2vec2-base-960h"  #@param {type: "string"}
wav2vec2_processor = "facebook/wav2vec2-base-960h"  #@param {type: "string"}

# Optimisation
lr = 1e-5  #@param {type: "number"}
w_decay = 0  #@param {type: "number"}
bs = 8  #@param {type: "integer"}
accum_grads = 2  #@param {type: "integer"}

# Schedule and stopping criteria
patience = 30  #@param {type: "integer"}
max_epochs = 400  #@param {type: "integer"}
hold_epochs = 20  #@param {type: "integer"}
warmup_epochs = 150  #@param {type: "integer"}
freeze_finetune_updates = 0  #@param {type: "integer"}

# Masking options
apply_mask = False  #@param {type: "boolean"}
# mask_time_length was 1 before
mask_time_length = 10  #@param {type: "integer"}

# Gather everything in one dictionary that the rest of the notebook reads from
hparams = dict(
    version=version,
    lr=lr,
    w_decay=w_decay,
    bs=bs,
    patience=patience,
    hold_epochs=hold_epochs,
    accum_grads=accum_grads,
    pretrained=pretrained,
    wav2vec2_processor=wav2vec2_processor,
    freeze_finetune_updates=freeze_finetune_updates,
    warmup_epochs=warmup_epochs,
    apply_mask=apply_mask,
    mask_time_length=mask_time_length,
    max_epochs=max_epochs,
)
hparams

{'version': 'wav2vec2-sound_detection_train_ov1_loc_REALv4',
 'lr': 1e-05,
 'w_decay': 0,
 'bs': 4,
 'patience': 30,
 'hold_epochs': 20,
 'accum_grads': 2,
 'pretrained': 'facebook/wav2vec2-base-960h',
 'wav2vec2_processor': 'facebook/wav2vec2-base-960h',
 'freeze_finetune_updates': 0,
 'warmup_epochs': 150,
 'apply_mask': False,
 'mask_time_length': 10,
 'max_epochs': 400}

In [None]:
# CSV logger rooted at the results folder; the run version comes from hparams
logger = CSVLogger(
    "/home/lab_acustica/IA025_Project/Results/Treinamento_ov1_loc",
    name='lightning_logs_REALv4',
    version=hparams["version"],
)

# 2. Adaptação do modelo wav2vec2 para receber e processar 4 canais de áudio na entrada

Crio o processador original do wav2vec2

In [None]:
# Stock wav2vec2 processor, loaded from the checkpoint named in hparams and
# configured to return attention masks
processor = Wav2Vec2Processor.from_pretrained(
    hparams["wav2vec2_processor"],
    return_attention_mask=True,
)

O modelo original recebe um canal na entrada. Isso precisa ser mudado para 4 canais

In [None]:
class Wav2Vec2GroupNormConvLayer(nn.Module):
    """Conv1d -> GroupNorm -> activation block for the feature encoder.

    When ``layer_id`` is 0 the convolution takes 4 input channels; for any
    later layer it takes the previous layer's ``config.conv_dim`` entry.
    """

    def __init__(self, config, layer_id=0):
        super().__init__()

        # The first layer sees the raw 4-channel waveform
        if layer_id > 0:
            self.in_conv_dim = config.conv_dim[layer_id - 1]
        else:
            self.in_conv_dim = 4
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            in_channels=self.in_conv_dim,
            out_channels=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        # One group per output channel
        self.layer_norm = nn.GroupNorm(
            num_groups=self.out_conv_dim,
            num_channels=self.out_conv_dim,
            affine=True,
        )

    def forward(self, hidden_states):
        """Apply convolution, group normalisation and the activation, in that order."""
        convolved = self.conv(hidden_states)
        normalised = self.layer_norm(convolved)
        return self.activation(normalised)
    
class Wav2Vec2_4ChannelFeatureEncoder(nn.Module):
    """Convolutional front end that turns a raw multi-channel waveform into feature frames."""

    def __init__(self, config):
        super().__init__()

        norm_kind = config.feat_extract_norm
        num_layers = config.num_feat_extract_layers

        if norm_kind == "group":
            # 4-channel group-norm layer first, followed by the norm-free layers
            layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)]
            layers.extend(
                Wav2Vec2NoLayerNormConvLayer(config, layer_id=idx) for idx in range(1, num_layers)
            )
        elif norm_kind == "layer":
            # NOTE(review): these layers come unchanged from transformers; presumably the
            # first one still expects a single input channel - confirm before using this branch
            layers = [Wav2Vec2LayerNormConvLayer(config, layer_id=idx) for idx in range(num_layers)]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )

        self.conv_layers = nn.ModuleList(layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        """Turn off gradients for every parameter of the encoder."""
        for weight in self.parameters():
            weight.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        """Run the input through every convolutional layer and return the result."""
        # Full slice: all channels are kept, and the caller's tensor object is not the one flagged below
        hidden_states = input_values[:]

        trainable = self._requires_grad and self.training
        if trainable:
            # gradient checkpointing needs an input that requires grad
            hidden_states.requires_grad = True

        use_checkpointing = trainable and self.gradient_checkpointing
        for conv_layer in self.conv_layers:
            if use_checkpointing:
                hidden_states = torch.utils.checkpoint.checkpoint(conv_layer, hidden_states)
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states

# New model: inherits everything from Wav2Vec2Model but uses the 4-channel feature extractor
class Wav2Vec2_4ChannelModel(Wav2Vec2Model):
    """Wav2Vec2Model whose ``feature_extractor`` is a ``Wav2Vec2_4ChannelFeatureEncoder``."""

    def __init__(self, config: Wav2Vec2Config):
        super().__init__(config)

        # Replace the encoder built by the parent constructor with the 4-channel one
        four_channel_encoder = Wav2Vec2_4ChannelFeatureEncoder(config)
        self.feature_extractor = four_channel_encoder

Agora a primeira camada convolucional do modelo recebe 4 canais como entrada

# 3. Dataloader da base de dados REAL (TUT Sound Events 2018 - Ambisonic, Reverberant and Real-life Impulse Response Dataset)

In [None]:
# Root folder of the audio data; split folder names are appended directly to this
# string, so the trailing slash matters
audio_path = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/'

In [None]:
# Split folders: the first two feed training, the third feeds validation.
# Files tagged 'tst' in any of them go to the test list.
train_splits = ['ov1s1_wav', 'ov1s8_wav']
val_split = 'ov1s9_wav'

X_train = []
X_val = []
X_test = []


def _clips_in_split(split):
    """Return the 'tra' and 'tst' clip names (extension removed) found in one split folder.

    Args:
        split: name of the split folder under ``audio_path``.

    Returns:
        Tuple ``(tra_names, tst_names)``, each sorted by file name.
    """
    tra_names, tst_names = [], []
    # os.listdir gives no ordering guarantee; sorting keeps the lists reproducible across runs
    for file_ in sorted(os.listdir(os.path.join(audio_path, split))):
        stem = os.path.splitext(file_)[0]
        if 'tra' in file_:
            tra_names.append(stem)
        elif 'tst' in file_:
            tst_names.append(stem)
    return tra_names, tst_names


for split in train_splits:
    tra_clips, tst_clips = _clips_in_split(split)
    X_train.extend(tra_clips)
    X_test.extend(tst_clips)

tra_clips, tst_clips = _clips_in_split(val_split)
X_val.extend(tra_clips)
X_test.extend(tst_clips)

print(len(X_train), len(X_val), len(X_test))

3185 1594 1187


In [None]:
class REAL_Dataset(torch.utils.data.Dataset):
    """Audio clips of the REAL dataset together with their CSV targets.

    Each item is a dict with ``input_values`` (waveform padded or cut to a fixed
    length and layer-normalised) and ``target`` (float tensor read from the
    clip's CSV descriptor).
    """

    # Number of samples every clip is padded or truncated to
    CLIP_NUM_SAMPLES = 64000

    def __init__(self, filenames):
        """Store the clip names (without extension) served by this dataset."""
        self.filenames = filenames
        # These prefixes must be adapted to wherever the audio data is stored
        self.audio_paths1 = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/ov1s1_'
        self.audio_paths2 = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/ov1s8_'
        self.audio_paths3 = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/ov1s9_'

    def _prefix_for(self, filename):
        """Return the path prefix of the split a clip belongs to.

        Raises:
            ValueError: if the name carries none of the known split tags.
        """
        # Read the attributes at call time so later edits to the paths are honoured
        tagged_prefixes = (
            ("ov1_s1", self.audio_paths1),
            ("ov1_s8", self.audio_paths2),
            ("ov1_s9", self.audio_paths3),
        )
        for tag, prefix in tagged_prefixes:
            if tag in filename:
                return prefix
        raise ValueError(
            f"Cannot tell which split {filename!r} belongs to; "
            "expected one of 'ov1_s1', 'ov1_s8', 'ov1_s9' in its name"
        )

    def process_audio(self, signal, new_sr):
        """Right-pad with zeros or truncate ``signal`` along dim 1 to ``CLIP_NUM_SAMPLES``.

        Args:
            signal: tensor indexed as (channels, samples).
            new_sr: accepted for interface compatibility; not used.

        Returns:
            Tensor with exactly ``CLIP_NUM_SAMPLES`` samples along dim 1.
        """
        wanted = self.CLIP_NUM_SAMPLES
        length_signal = signal.shape[1]
        if length_signal < wanted:
            signal = torch.nn.functional.pad(signal, (0, wanted - length_signal))
        elif length_signal > wanted:
            signal = signal[:, :wanted]
        return signal

    def normalize_layer(self, feats):
        """Layer-normalise ``feats`` over all of its dimensions, without tracking gradients."""
        with torch.no_grad():
            feats = torch.nn.functional.layer_norm(feats, feats.shape)
        return feats

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        """Load, fix the length of and normalise one clip, and read its target.

        Raises:
            ValueError: if the clip name matches no known split.
        """
        name = self.filenames[index]
        prefix = self._prefix_for(name)

        # NOTE(review): the sample rate returned by torchaudio.load is discarded and no
        # resampling is done; assumes the files are already at 16 kHz - TODO confirm
        feats, _ = torchaudio.load(prefix + 'wav/' + name + '.wav')
        target = torch.from_numpy(np.loadtxt(prefix + 'des/' + name + '.csv', delimiter=',')).float()

        feats = self.process_audio(feats, 16000)
        feats = self.normalize_layer(feats)
        return {"input_values": feats, "target": target}

In [None]:
# One dataset object per split
train_dataset, val_dataset, test_dataset = (
    REAL_Dataset(names) for names in (X_train, X_val, X_test)
)

In [None]:
# The datasets hold their own references to these lists; only the notebook-level names are removed
del X_train, X_val, X_test

In [None]:
@dataclass
class DataCollatorWithPadding:
    """Collate dataset items into a padded batch with separate class and DOA targets.

    Attributes:
        processor: ``Wav2Vec2Processor`` whose ``pad`` method is used for both
            the waveforms and the targets.
        padding: padding strategy forwarded to ``processor.pad`` for the
            waveforms (``True``/``'longest'``, ``'max_length'`` or
            ``False``/``'do_not_pad'``).
        max_length: optional maximum length of the padded ``input_values``.
        max_length_labels: optional maximum length of the padded targets.
        pad_to_multiple_of: if set, waveforms are padded to a multiple of it.
        pad_to_multiple_of_labels: if set, targets are padded to a multiple of it.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of ``{"input_values", "target"}`` items.

        Returns:
            The padded waveform batch, extended with ``"class"`` (first target
            column, cast to long) and ``"doa"`` (remaining target columns).
        """
        # Waveforms and targets are padded separately, so split them first
        waveforms = []
        targets = []
        for item in features:
            waveforms.append({"input_values": item["input_values"]})
            targets.append({"input_ids": item["target"]})

        batch = self.processor.pad(
            waveforms,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Targets go through the processor in target mode
        with self.processor.as_target_processor():
            padded_targets = self.processor.pad(
                targets,
                padding=True,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        target_matrix = padded_targets["input_ids"]
        batch["class"] = target_matrix[:, 0].long()
        batch["doa"] = target_matrix[:, 1:]
        return batch

In [None]:
# Collator shared by the dataloaders; padding is enabled and no max_length is set
data_collator = DataCollatorWithPadding(processor=processor, padding=True)

In [None]:
batch_size = hparams["bs"]

# Settings shared by the training and validation loaders
_loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator, num_workers=8)

# Only the training loader shuffles its samples
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

### Criando o Lightning module

In [None]:
# Macro-averaged F1 over the 8 sound classes; evaluated on CPU tensors in the eval steps
f1 = F1Score(num_classes=8, average='macro')


class Wav2Vec2_sound_detection(pl.LightningModule):
    """Joint sound-class and direction-of-arrival model on the 4-channel wav2vec2 backbone.

    The backbone output is averaged over the non-padded frames, projected, and fed
    to two linear heads: an 8-way classifier (returned as log-probabilities) and a
    2-value DOA regressor.
    """

    def __init__(self, *args, **kwargs):
        """Build the backbone and both heads.

        The configuration is taken from the ``hparams`` keyword argument, or from
        the first positional argument when that is a dict; if neither is given the
        module-level ``hparams`` dictionary is used.
        """
        super().__init__()

        config = kwargs.get("hparams")
        if config is None and args and isinstance(args[0], dict):
            config = args[0]
        if config is None:
            config = hparams

        self.hparams.update(config)

        self.freeze_finetune_updates = config["freeze_finetune_updates"]

        self.model = Wav2Vec2_4ChannelModel.from_pretrained(config["pretrained"],
                                                 conv_dim = (512, 512, 512, 512, 512, 512),
                                                 conv_stride = (5, 2, 2, 2, 2, 2),
                                                 conv_kernel = (10, 3, 3, 3, 3, 2),
                                                 num_feat_extract_layers = 6,
                                                 apply_spec_augment=config["apply_mask"],
                                                 ignore_mismatched_sizes=True)

        self.projector = nn.Linear(self.model.config.hidden_size, self.model.config.classifier_proj_size)
        self.final_layer_class = nn.Linear(self.model.config.classifier_proj_size, 8)
        self.final_layer_doa = nn.Linear(self.model.config.classifier_proj_size, 2)

    def recall_loc(self, loc1, loc2):
        """Fraction of rows whose great-circle distance is at most 20 degrees.

        Column 0 enters the formula as latitude and column 1 as longitude, both in
        degrees. MORE: https://en.wikipedia.org/wiki/Great-circle_distance

        :return: 0-d float tensor with the recall over the batch
        """
        # Work on detached float32 CPU copies so half-precision or graph-attached
        # inputs cannot distort or break the metric
        to_rad = np.pi / 180
        loc1 = loc1.detach().float().cpu() * to_rad
        loc2 = loc2.detach().float().cpu() * to_rad

        cos_dist = (torch.sin(loc1[:, 0]) * torch.sin(loc2[:, 0])
                    + torch.cos(loc1[:, 0]) * torch.cos(loc2[:, 0])
                    * torch.cos(torch.abs(loc1[:, 1] - loc2[:, 1])))
        # Keep the value inside [-1, 1] so acos never returns NaN through rounding error
        cos_dist = torch.clamp(cos_dist, -1, 1)
        dist = torch.acos(cos_dist) / to_rad
        return (dist <= 20).float().mean()

    def _backbone_trainable(self):
        """Tell whether the backbone should run with gradients at the current step."""
        try:
            step = self.trainer.global_step
        except (AttributeError, RuntimeError):
            # Module used outside a Trainer (e.g. stand-alone inference): treat as step 0
            step = 0
        return self.freeze_finetune_updates <= step

    def forward(self, samples):
        """Return ``(class log-probabilities, DOA predictions)`` for a padded batch.

        ``samples`` is passed to the backbone as keyword arguments and must contain
        ``attention_mask``.
        """
        grad_context = contextlib.nullcontext() if self._backbone_trainable() else torch.no_grad()
        with grad_context:
            hidden_states = self.model(**samples).last_hidden_state

        padding_mask = self.model._get_feature_vector_attention_mask(hidden_states.shape[1], samples["attention_mask"])

        # Zero the padded frames (out of place) so they do not contribute to the mean
        hidden_states = hidden_states.masked_fill(~padding_mask.unsqueeze(-1), 0.0)
        pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        proj_pooled = self.projector(pooled_output)
        preds_class = self.final_layer_class(proj_pooled)
        preds_doa = self.final_layer_doa(proj_pooled)

        return F.log_softmax(preds_class, dim=1), preds_doa

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        """Turn a sample-level attention mask into a frame-level boolean mask.

        The output lengths are computed by the backbone, which is where
        ``_get_feat_extract_output_lengths`` is available.
        """
        output_lengths = self.model._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths idxs are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask

    def training_step(self, train_batch, batch_idx):
        """Compute the weighted classification + localisation loss for one batch."""
        class_value = train_batch.pop("class")
        doa_value = train_batch.pop("doa")

        class_log_softs, doa_preds = self.forward(train_batch)

        ce_loss = F.nll_loss(class_log_softs, class_value)
        mse_loss = F.mse_loss(doa_preds, doa_value)/1000

        # The mixing weights depend on the range both losses currently fall in
        if 0.6<ce_loss<2 and 0.4<mse_loss< 2:
            loss = 0.2*ce_loss + 0.8*mse_loss
        elif ce_loss<0.6 and mse_loss<0.4:
            loss = 0.3*ce_loss + 0.7*mse_loss
        else:
            loss = 0.12*ce_loss + 0.88*mse_loss

        # The component losses are only used for logging, so return them detached
        return {'loss':loss, 'ce_loss':ce_loss.detach(), 'mse_loss': mse_loss.detach()}

    def training_epoch_end(self, outputs):
        """Log the epoch means of the total, classification and localisation losses."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        ce_loss = torch.stack([x['ce_loss'] for x in outputs]).mean()
        mse_loss = torch.stack([x['mse_loss'] for x in outputs]).mean()

        self.log("train_loss", loss, on_step = False,on_epoch = True, prog_bar=True)
        self.log("train_ce_loss", ce_loss, on_step = False,on_epoch = True, prog_bar=True)
        self.log("train_mse_loss", mse_loss, on_step = False,on_epoch = True, prog_bar=True)

    def _evaluate_batch(self, batch, ce_weight, mse_weight):
        """Shared body of the validation and test steps.

        ``ce_weight``/``mse_weight`` are used when both losses are below their
        thresholds (1.1 and 0.5); otherwise the 0.12/0.88 mix is applied.

        Returns:
            Tuple ``(loss, ce_loss, mse_loss, f1_score, recall)``.
        """
        class_value = batch.pop("class")
        doa_value = batch.pop("doa")

        class_log_softs, doa_preds = self.forward(batch)

        ce_loss = F.nll_loss(class_log_softs, class_value)
        mse_loss = F.mse_loss(doa_preds, doa_value)/1000

        if ce_loss<1.1 and mse_loss<0.5:
            loss = ce_weight*ce_loss + mse_weight*mse_loss
        else:
            loss = 0.12*ce_loss + 0.88*mse_loss

        class_preds = torch.argmax(class_log_softs, dim=1)
        f1_score = f1(class_preds.cpu(), class_value.cpu())
        recall = self.recall_loc(doa_preds, doa_value)
        return loss, ce_loss, mse_loss, f1_score, recall

    def validation_step(self, val_batch, batch_idx):
        """Evaluate one validation batch; the monitored value is the mean of F1 and recall."""
        val_loss, val_ce_loss, val_mse_loss, val_f1, val_recall = self._evaluate_batch(val_batch, 0.4, 0.6)
        metric_monitor = (val_f1 + val_recall)/2

        return {"val_loss_step": val_loss, 'val_ce_loss': val_ce_loss, 'val_mse_loss': val_mse_loss,
                "val_f1_step": val_f1, 'val_rc':val_recall, 'val_monitor': metric_monitor}

    def validation_epoch_end(self, outputs):
        """Log the per-batch means of every validation quantity."""
        f1_mean = torch.stack([x['val_f1_step'] for x in outputs]).mean()
        rc_mean = torch.stack([x['val_rc'] for x in outputs]).mean()
        loss_mean = torch.stack([x['val_loss_step'] for x in outputs]).mean()
        loss_mean_ce = torch.stack([x['val_ce_loss'] for x in outputs]).mean()
        loss_mean_mse = torch.stack([x['val_mse_loss'] for x in outputs]).mean()
        monitor = torch.stack([x['val_monitor'] for x in outputs]).mean()

        self.log("val_f1_mean", f1_mean, on_step = False, on_epoch = True,prog_bar=True)
        self.log("val_rc_mean", rc_mean, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_loss_mean", loss_mean, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_ce_loss_mean", loss_mean_ce, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_mse_loss_mean", loss_mean_mse, on_step = False,on_epoch = True,prog_bar=True)
        self.log("val_monitor_mean", monitor, on_step = False, on_epoch = True,prog_bar = True)

    def test_step(self, test_batch, batch_idx):
        """Evaluate one test batch."""
        test_loss, test_ce_loss, test_mse_loss, test_f1, test_recall = self._evaluate_batch(test_batch, 0.3, 0.7)

        return {"test_loss_step": test_loss, 'test_ce_loss':test_ce_loss, 'test_mse_loss':test_mse_loss,
                 'test_rc_step':test_recall, 'test_f1_step':test_f1}

    def test_epoch_end(self, outputs):
        """Print the per-batch means of every test quantity."""
        t_rc_mean = torch.stack([x['test_rc_step'] for x in outputs]).mean()
        t_f1_mean = torch.stack([x['test_f1_step'] for x in outputs]).mean()
        t_loss_mean = torch.stack([x['test_loss_step'] for x in outputs]).mean()
        ce_t_loss_mean = torch.stack([x['test_ce_loss'] for x in outputs]).mean()
        mse_t_loss_mean = torch.stack([x['test_mse_loss'] for x in outputs]).mean()
        print("recall:", t_rc_mean, "f1:", t_f1_mean, "loss:", t_loss_mean, "ce_loss:", ce_t_loss_mean, "mse_loss:", mse_t_loss_mean)

    def configure_optimizers(self):
        """Adam optimiser with the linear-warmup / cosine-annealing schedule."""
        optimizer = torch.optim.Adam(self.parameters(),
                         lr=self.hparams["lr"],
                         betas=(0.9,0.98),
                         eps=1e-6,
                         weight_decay=self.hparams["w_decay"])

        # NOTE(review): warmup_start_lr equals the base lr, so the warm-up phase presumably
        # keeps the rate constant before the cosine decay - confirm this is intended
        scheduler = LinearWarmupCosineAnnealingLR(optimizer,
                                                  eta_min=0, # final-lr
                                                  warmup_start_lr=self.hparams["lr"],
                                                  warmup_epochs=self.hparams["warmup_epochs"],
                                                  max_epochs=self.hparams["max_epochs"])

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [None]:
# Overfitting de 3 minibatches 
#model = Wav2Vec2_sound_detection(hparams)

#trainer = pl.Trainer(gpus=1,
#                     logger=logger,
#                     max_epochs=300,
#                     check_val_every_n_epoch=10,
#                     checkpoint_callback=False, # Disable checkpoint saving.
#                     overfit_batches=3,
#                     log_every_n_steps = 1)

#trainer.fit(model, train_dataloader, val_dataloader)
#del model, trainer # Para não ter estouro de mémoria da GPU
#gc.collect()
#torch.cuda.empty_cache()

In [None]:
#!mkdir '/content/drive/MyDrive/Colab Notebooks/Karen/Results/'

In [None]:
pl_model = Wav2Vec2_sound_detection(hparams=hparams)

checkpoint_path = '/home/lab_acustica/IA025_Project/Results/Treinamento_ov1_loc/'
# abspath drops the trailing slash, so dirname resolves to the parent of that folder
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')

# Both callbacks follow the same validation metric, which should be maximised
early_stop_callback = pl.callbacks.EarlyStopping(
    monitor="val_monitor_mean",
    mode='max',
    patience=hparams["patience"],
)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=checkpoint_dir,
    filename=hparams["version"],
    monitor="val_monitor_mean",
    mode="max",
    save_top_k=1,
    verbose=True,
)

trainer = pl.Trainer(
    gpus=1,
    precision=16,
    logger=logger,
    accumulate_grad_batches=hparams["accum_grads"],
    enable_checkpointing=True,
    callbacks=[early_stop_callback, checkpoint_callback],
    max_epochs=hparams["max_epochs"],
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2_4ChannelModel: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.feature_extractor.conv_layers.6.conv.weight']
- This IS expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2_4ChannelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2_4ChannelModel were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2

Files in /home/lab_acustica/IA025_Project/Results: ['Treinamento_ov1_loc', 'Treinamento_ov1s1_loc', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v1.ckpt', 'wav2vec2-sound_detection_ov1s1_loc_curve-v1.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v2.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REAL.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_v2.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv3.ckpt', 'wav2vec2-sound_detection_ov1s1_loc_curve.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v3.ckpt', 'Treinamento ov1s1', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v5.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v4.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv2-v6.ckpt', 'wav2vec2-sound_detection_train_ov1_loc_REALv2.ckpt', 'wav2vec2-sound_detection_train_ov1_loc.ckpt']
Saving checkpoints to /home/lab_acustica/IA025_Project/Results


In [None]:
# Run the fine-tuning loop: optimise on train_dataloader, validate on val_dataloader.
trainer.fit(
    pl_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                   | Params
-------------------------------------------------------------
0 | model             | Wav2Vec2_4ChannelModel | 93.9 M
1 | projector         | Linear                 | 196 K 
2 | final_layer_class | Linear                 | 2.1 K 
3 | final_layer_doa   | Linear                 | 514   
-------------------------------------------------------------
94.1 M    Trainable params
0         Non-trainable params
94.1 M    Total params
188.124   Total estimated model params size (MB)
  rank_zero_warn(


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 399: 'val_monitor_mean' reached 0.14944 (best 0.14944), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 798: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 1197: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 1596: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 1995: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 2394: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 2793: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 3192: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 3591: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 3990: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 4389: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 4788: 'val_monitor_mean' reached 0.15716 (best 0.15716), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 5187: 'val_monitor_mean' reached 0.17465 (best 0.17465), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 5586: 'val_monitor_mean' reached 0.18948 (best 0.18948), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 5985: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 6384: 'val_monitor_mean' reached 0.19776 (best 0.19776), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 6783: 'val_monitor_mean' reached 0.22145 (best 0.22145), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 7182: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 7581: 'val_monitor_mean' reached 0.23693 (best 0.23693), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 7980: 'val_monitor_mean' reached 0.24376 (best 0.24376), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 8379: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 8778: 'val_monitor_mean' reached 0.25651 (best 0.25651), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 9177: 'val_monitor_mean' reached 0.29500 (best 0.29500), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 9576: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 9975: 'val_monitor_mean' reached 0.30797 (best 0.30797), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 25, global step 10374: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 26, global step 10773: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 27, global step 11172: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 28, global step 11571: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 29, global step 11970: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 30, global step 12369: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 31, global step 12768: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 32, global step 13167: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 33, global step 13566: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 34, global step 13965: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 35, global step 14364: 'val_monitor_mean' reached 0.32616 (best 0.32616), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 36, global step 14763: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 37, global step 15162: 'val_monitor_mean' reached 0.33816 (best 0.33816), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 38, global step 15561: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 39, global step 15960: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 40, global step 16359: 'val_monitor_mean' reached 0.34205 (best 0.34205), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 41, global step 16758: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 42, global step 17157: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 43, global step 17556: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 44, global step 17955: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 45, global step 18354: 'val_monitor_mean' reached 0.37173 (best 0.37173), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 46, global step 18753: 'val_monitor_mean' reached 0.37361 (best 0.37361), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 47, global step 19152: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 48, global step 19551: 'val_monitor_mean' reached 0.38011 (best 0.38011), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 49, global step 19950: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 50, global step 20349: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 51, global step 20748: 'val_monitor_mean' reached 0.39663 (best 0.39663), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 52, global step 21147: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 53, global step 21546: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 54, global step 21945: 'val_monitor_mean' reached 0.40528 (best 0.40528), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 55, global step 22344: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 56, global step 22743: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 57, global step 23142: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 58, global step 23541: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 59, global step 23940: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 60, global step 24339: 'val_monitor_mean' reached 0.41900 (best 0.41900), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 61, global step 24738: 'val_monitor_mean' reached 0.42926 (best 0.42926), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 62, global step 25137: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 63, global step 25536: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 64, global step 25935: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 65, global step 26334: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 66, global step 26733: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 67, global step 27132: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 68, global step 27531: 'val_monitor_mean' reached 0.42961 (best 0.42961), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 69, global step 27930: 'val_monitor_mean' reached 0.43726 (best 0.43726), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 70, global step 28329: 'val_monitor_mean' reached 0.44854 (best 0.44854), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 71, global step 28728: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 72, global step 29127: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 73, global step 29526: 'val_monitor_mean' reached 0.46608 (best 0.46608), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 74, global step 29925: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 75, global step 30324: 'val_monitor_mean' reached 0.48081 (best 0.48081), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 76, global step 30723: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 77, global step 31122: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 78, global step 31521: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 79, global step 31920: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 80, global step 32319: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 81, global step 32718: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 82, global step 33117: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 83, global step 33516: 'val_monitor_mean' reached 0.48290 (best 0.48290), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 84, global step 33915: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 85, global step 34314: 'val_monitor_mean' reached 0.49977 (best 0.49977), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 86, global step 34713: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 87, global step 35112: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 88, global step 35511: 'val_monitor_mean' reached 0.50822 (best 0.50822), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 89, global step 35910: 'val_monitor_mean' reached 0.52016 (best 0.52016), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 90, global step 36309: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 91, global step 36708: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 92, global step 37107: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 93, global step 37506: 'val_monitor_mean' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 94, global step 37905: 'val_monitor_mean' reached 0.52079 (best 0.52079), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc_REALv4.ckpt' as top 1


In [None]:
# Target directory for the processor files: a folder named after the run
# version, located inside checkpoint_dir. os.path.join supplies the path
# separator that plain string concatenation omits when checkpoint_dir has no
# trailing slash (which yields a sibling path such as '.../Resultswav2vec2-...').
processor_dir = os.path.join(checkpoint_dir, hparams["version"])

# Skip the save only when a regular file already occupies that path.
if not os.path.isfile(processor_dir):
    print('Saving processor to: ' + processor_dir)
    processor.save_pretrained(processor_dir)

In [None]:
# Path of the best checkpoint recorded by the ModelCheckpoint callback.
best_model = checkpoint_callback.best_model_path
# To evaluate a specific checkpoint instead, point best_model at it, e.g.:
# best_model = "/content/drive/MyDrive/Wav2Vec2_ORVP/wav2vec2_huggingface_fairseq_orvp_test1-epoch=4-step=23459.ckpt"
print(best_model)

# Rebuild the module from that checkpoint, then move it to the GPU and
# switch it to evaluation mode.
test_model = Wav2Vec2_sound_detection.load_from_checkpoint(best_model, hparams=hparams)
test_model = test_model.cuda()
test_model = test_model.eval()

Version1 --> VAL recall: tensor(0.9935) f1: tensor(0.9668) loss: tensor(0.0639, device='cuda:0') ce_loss: tensor(0.1308, device='cuda:0') mse_loss: tensor(0.0471, device='cuda:0')
TEST --> recall: tensor(0.9752) f1: tensor(0.9061) loss: tensor(0.1690, device='cuda:0') ce_loss: tensor(0.4810, device='cuda:0') mse_loss: tensor(0.0909, device='cuda:0')
TRAIN --> Epoch 125, global step 27846: 'val_monitor_mean' reached 0.98017 (best 0.98017), saving model to '/home/lab_acustica/IA025_Project/Results/wav2vec2-sound_detection_train_ov1_loc.ckpt' as top 1


Version2 --> 0.3 class, 0.7 localization; parou na época 90
VAL recall: tensor(0.4543) f1: tensor(0.7610) loss: tensor(0.6054, device='cuda:0') ce_loss: tensor(0.7607, device='cuda:0') mse_loss: tensor(0.5388, device='cuda:0')
TEST recall: tensor(0.3878) f1: tensor(0.6883) loss: tensor(1.0342, device='cuda:0') ce_loss: tensor(0.9555, device='cuda:0') mse_loss: tensor(1.0680, device='cuda:0')
val_loss_mean monitored


Com 0.25*ce_loss + 0.75*mse_loss, a perda de classificação melhorou rápido demais e a de localização não, o que levaria a overfitting.


In [None]:
# Evaluate the restored best checkpoint on the validation split.
trainer.test(
    test_model,
    dataloaders=val_dataloader,
)

In [None]:
# Loader for the held-out test split; order is kept fixed (no shuffling)
# and batches are assembled by the shared data_collator.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=8,
    collate_fn=data_collator,
)

In [None]:
# Evaluate the restored best checkpoint on the test split.
trainer.test(
    test_model,
    dataloaders=test_dataloader,
)

In [None]:
# Drop every notebook-level reference that keeps the network alive so the GPU
# does not run out of memory. The names are removed only if they are bound:
# a plain `del model` raises NameError when `model` was never created (e.g.
# the overfitting cell was skipped), which would abort the cell before the
# cache is emptied. `pl_model` and `test_model` are included because they are
# the objects the training/evaluation cells actually create.
for _name in ("model", "pl_model", "test_model", "trainer"):
    globals().pop(_name, None)
del _name

# Collect the now-unreferenced objects, then return cached CUDA blocks to the driver.
gc.collect()
torch.cuda.empty_cache()