In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio

import os
import sys

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive; the CSV paths used when loading the
# dataset live below this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri Feb  4 13:18:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    22W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
%%capture
!pip install transformers
!pip install datasets

In [5]:
%%capture
!sudo apt-get install git-lfs


In [6]:
%%capture
!git lfs install 
!--system 
!--skip-repo

In [7]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; on success the token is saved locally
# (see the cell output), so no credential is hard-coded in the notebook.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [8]:
from datasets import load_dataset, load_metric

# Tab-separated split files stored on the mounted Google Drive.
data_files = dict(
    train='/content/drive/My Drive/type3/data/train_dm.csv',
    valid='/content/drive/My Drive/type3/data/valid_dm.csv',
)

# Load both splits with the generic CSV builder, then pull each split out by name.
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_data, valid_data = dataset['train'], dataset['valid']

# Show the schema and row count of each split.
print(train_data, valid_data, sep="\n")

Using custom data configuration default-062326d00fd6b1dd


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-062326d00fd6b1dd/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-062326d00fd6b1dd/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['file', 'label', 'path'],
    num_rows: 227
})
Dataset({
    features: ['file', 'label', 'path'],
    num_rows: 48
})


In [9]:
# Passed as `output_dir` to TrainingArguments; with `push_to_hub=True` the Hub
# repository cloned when the trainer is built carries the same name (see its output).
repo_name = "wav2vec2-large-xls-r-300m-dm32"

In [10]:
# Dataset column holding the audio file path (model input).
input_col = 'path'
# Dataset column holding the class label (model target).
output_col = 'label'
# Maximum clip length, in seconds, handed to `random_subsample` as `max_length`.
audio_len = 32

In [11]:
# Distinct class names found in the training split, in sorted order.
label_list = sorted(train_data.unique(output_col))
num_classes = len(label_list)
print(f"Number of classes: {num_classes}")
print(f"Classes: {label_list}")

Number of classes: 2
Classes: ['dementia', 'nodementia']


In [12]:
from random import randint

def random_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 16000):
    """Return a random contiguous crop of at most `max_length` seconds.

    Args:
        wav: Sequence of audio samples, measured and sliced along its first axis.
        max_length: Maximum crop duration in seconds.
        sample_rate: Samples per second, used to convert `max_length` to a sample count.

    Returns:
        `wav` itself when it already fits into `max_length` seconds; otherwise a
        slice of `round(sample_rate * max_length)` samples starting at a randomly
        drawn offset.
    """
    sample_length = int(round(sample_rate * max_length))
    if len(wav) <= sample_length:
        return wav
    # `randint` includes both endpoints, so the largest valid start offset is
    # `len(wav) - sample_length`; subtracting one more would make the final
    # window (the one ending on the last sample) unreachable.
    random_offset = randint(0, len(wav) - sample_length)
    return wav[random_offset : random_offset + sample_length]

In [13]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [14]:
# Pretrained checkpoint used for the config, the feature extractor and the model weights.
model_name = "facebook/wav2vec2-xls-r-300m"
# Stored on the config as `pooling_mode`; the model's `merge_strategy` accepts
# "mean", "max" or "sum".
pooling_mode = "mean"

In [15]:
# Model configuration: the label <-> id mappings follow the order of `label_list`.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_classes,
    label2id={name: idx for idx, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)

# Extra attribute read by Wav2Vec2ForSpeechClassification.__init__.
config.pooling_mode = pooling_mode

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [16]:
# Feature extractor matching the pretrained checkpoint; its sampling rate is the
# rate passed to the extractor when the audio is preprocessed.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name,)
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

The target sampling rate: 16000


In [17]:

def speech_to_array(path):
    """Load an audio file and return a randomly cropped first-channel waveform.

    The audio is resampled to `target_sampling_rate` (the feature extractor's rate),
    so the waveform, the crop length and the `sampling_rate` later passed to the
    feature extractor all agree, instead of relying on a hard-coded 16000.

    Args:
        path: Location of the audio file, passed to `torchaudio.load`.

    Returns:
        A numpy array holding the first channel, cropped by `random_subsample`
        to at most `audio_len` seconds.
    """
    speech, sr = torchaudio.load(path)
    # Only resample when the file's rate differs from the target rate.
    if sr != target_sampling_rate:
        speech = torchaudio.transforms.Resample(sr, target_sampling_rate)(speech)
    # Keep the first channel only and convert it to numpy.
    speech = speech[0].numpy().squeeze()
    return random_subsample(speech, max_length=audio_len, sample_rate=target_sampling_rate)
    

def label_to_id(label, label_list):
    """Translate a label into its position within `label_list`.

    Returns the index of `label` in `label_list`, or -1 when the label is not in
    the list. When `label_list` is empty the label is handed back unchanged.
    """
    if len(label_list) == 0:
        return label
    if label not in label_list:
        return -1
    return label_list.index(label)

def preprocess_fn(examples):
    """Convert a batch of dataset rows into feature-extractor output plus labels.

    Audio paths are read from `examples[input_col]` and loaded with
    `speech_to_array`; class names from `examples[output_col]` are mapped to ids
    with `label_to_id`. The returned mapping is the feature extractor's result
    with an added 'labels' entry.
    """
    waveforms = []
    for audio_path in examples[input_col]:
        waveforms.append(speech_to_array(audio_path))

    label_ids = [label_to_id(name, label_list) for name in examples[output_col]]

    result = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
    result['labels'] = label_ids
    return result


In [18]:
# Both splits are preprocessed with identical batching / worker settings.
# The random crop in `speech_to_array` is drawn while this map runs.
map_options = dict(batched=True, batch_size=8, num_proc=4)
train_data = train_data.map(preprocess_fn, **map_options)
valid_data = valid_data.map(preprocess_fn, **map_options)

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [19]:
#train_data[0].keys()

In [20]:

#seq_len = [train_data[i]['path'] for i in range(len(train_data)) if len(train_data[i]['input_values']) < 128000]

In [21]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierModelOutput(ModelOutput):
    """Output container returned by `Wav2Vec2ForSpeechClassification.forward`.

    Every field defaults to ``None``.
    """
    # Cross-entropy loss; the model computes it only when labels are passed.
    loss: Optional[torch.FloatTensor] = None
    # Output of the classification head.
    logits: torch.FloatTensor = None
    # Copied from the wav2vec2 encoder output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Copied from the wav2vec2 encoder output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    

In [22]:
import torch 
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2PreTrainedModel, Wav2Vec2Model)

class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: linear -> dropout -> tanh -> dropout -> linear.

    Reads `hidden_size`, `final_dropout` and `num_labels` from `config`.
    """
    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map `features` to logits; extra keyword arguments are accepted and ignored."""
        projected = self.dropout(self.dense(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec2 encoder followed by pooling over dim 1 and a classification head.

    Besides the usual model settings, `config` must provide `num_labels` and
    `pooling_mode` ("mean", "max" or "sum").
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        # Encoder and classification head.
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the encoder's `feature_extractor` sub-module."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merge_strategy(self, hidden_states, mode="mean"):
        """Reduce `hidden_states` over dim 1 with the reduction named by `mode`.

        Raises:
            ValueError: if `mode` is not "mean", "max" or "sum".
        """
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        if mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            return torch.max(hidden_states, dim=1)[0]
        if mode == "sum":
            return torch.sum(hidden_states, dim=1)
        raise ValueError(f"Unknown merge strategy: {mode}")

    def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,
    ):
        """Encode the audio, pool the encoder output and classify it.

        Returns a `SpeechClassifierModelOutput` when `return_dict` resolves to a
        true value, otherwise a tuple that starts with the loss (only when
        `labels` is given) followed by the logits and the encoder outputs from
        index 2 onward.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # NOTE(review): `attention_mask` is only forwarded to the encoder; the
        # pooling below reduces over every position of dim 1 of the encoder output.
        pooled = self.merge_strategy(outputs[0], self.pooling_mode)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return SpeechClassifierModelOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[2:]
        if loss is None:
            return output
        return (loss,) + output

In [23]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that pads a list of examples into one batch of tensors.

    Each example must provide ``input_values`` and ``labels``. The inputs are
    padded through ``feature_extractor.pad``; the labels are stacked into a single
    tensor (``torch.long`` when the first label is a Python ``int``, otherwise
    ``torch.float``).

    Args:
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
            The feature extractor whose ``pad`` method builds the batch.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Forwarded to ``feature_extractor.pad`` as ``padding``.
        max_length (:obj:`int`, `optional`):
            Forwarded to ``feature_extractor.pad`` as ``max_length``.
        max_length_labels (:obj:`int`, `optional`):
            Not read by ``__call__``.
        pad_to_multiple_of (:obj:`int`, `optional`):
            Forwarded to ``feature_extractor.pad`` as ``pad_to_multiple_of``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Not read by ``__call__``.
    """
    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split every example into the part that needs padding and its label.
        input_features = []
        label_features = []
        for feature in features:
            input_features.append({"input_values": feature["input_values"]})
            label_features.append(feature["labels"])

        # The label dtype is decided from the first label only.
        if isinstance(label_features[0], int):
            label_dtype = torch.long
        else:
            label_dtype = torch.float

        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch['labels'] = torch.tensor(label_features, dtype=label_dtype)
        return batch

In [24]:
# Collator instance later handed to the trainer; `padding=True` is forwarded to
# `feature_extractor.pad` for every batch.
data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

In [25]:
import numpy as np
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    """Compute classification accuracy from an evaluation prediction.

    `p.predictions` may be an array of per-class scores or a tuple whose first
    element is that array; the predicted class is the argmax along axis 1.
    Returns ``{'accuracy': <float>}``.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == p.label_ids).astype(np.float32)
    return {'accuracy': hits.mean().item()}

In [26]:
#from transformers import Wav2Vec2ForSequenceClassification
# Load the pretrained checkpoint into the custom classification model. The
# classifier head has no counterpart in the checkpoint and is newly initialised
# (see the warning printed below).
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name, config=config)

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForSpeechClassification: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.bias', 'project_hid.weight', 'project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['classifier.out_proj.weight', 'classifier.dens

In [27]:
# Freeze the parameters of `model.wav2vec2.feature_extractor` before training.
model.freeze_feature_extractor()

In [28]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints go, how many are kept, and Hub upload.
    output_dir=repo_name,
    save_steps=110,
    save_total_limit=2,
    push_to_hub=True,
    # Logging.
    logging_dir=None,
    logging_steps=110,
    # Evaluation schedule.
    evaluation_strategy="steps",
    eval_steps=34,
    per_device_eval_batch_size=8,
    # Optimisation schedule.
    num_train_epochs=22,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Memory / speed trade-offs.
    gradient_checkpointing=True,
    fp16=True,
)




In [29]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

# apex is optional: `amp` is only bound when the package is available.
if is_apex_available():
    from apex import amp

# `autocast` is only bound for torch >= 1.6.
# NOTE(review): `_is_native_amp_available` is never set to False and is not read
# anywhere in the visible code — confirm whether it is still needed.
if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """`Trainer` subclass with a custom `training_step`.

    NOTE(review): the step reads `self.use_amp`, `self.scaler`, `self.use_apex` and
    `self.deepspeed`, none of which are defined in this file — presumably they are
    provided by the installed `Trainer`; confirm they exist in the transformers
    version in use.
    """
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        # Switch to training mode and let the Trainer prepare the batch.
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Forward pass; wrapped in `autocast` when `self.use_amp` is set.
        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        # Divide the loss by the number of accumulation steps before backward.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Backward pass through whichever backend is active; exactly one branch runs.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        # Return the loss detached from the autograd graph.
        return loss.detach()

In [30]:
# Assemble the trainer: model and arguments first, then data, then helpers.
trainer = CTCTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor is passed in the `tokenizer` slot.
    tokenizer=feature_extractor,
)

Cloning https://huggingface.co/shreyasgite/wav2vec2-large-xls-r-300m-dm32 into local empty directory.


Download file pytorch_model.bin:   0%|          | 9.90k/1.18G [00:00<?, ?B/s]

Download file training_args.bin:  62%|######1   | 1.84k/2.98k [00:00<?, ?B/s]

Download file runs/Feb04_04-51-37_ca941771a459/1643950383.662408/events.out.tfevents.1643950383.ca941771a459.1…

Download file runs/Feb04_04-51-37_ca941771a459/events.out.tfevents.1643950383.ca941771a459.126.0:  22%|##2    …

Clean file runs/Feb04_04-51-37_ca941771a459/1643950383.662408/events.out.tfevents.1643950383.ca941771a459.126.…

Clean file training_args.bin:  34%|###3      | 1.00k/2.98k [00:00<?, ?B/s]

Clean file runs/Feb04_04-51-37_ca941771a459/events.out.tfevents.1643950383.ca941771a459.126.0:  12%|#2        …

Clean file pytorch_model.bin:   0%|          | 1.00k/1.18G [00:00<?, ?B/s]

Using amp half precision backend


In [None]:
# Run fine-tuning with the settings from `training_args`.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: file, path.
***** Running training *****
  Num examples = 227
  Num Epochs = 22
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 308


Step,Training Loss,Validation Loss,Accuracy
34,No log,0.67691,0.645833
68,No log,0.686422,0.520833
102,No log,0.659566,0.604167
136,0.710600,0.620758,0.6875
170,0.710600,0.615168,0.6875
204,0.710600,0.616676,0.6875
238,0.646400,0.578222,0.770833
272,0.646400,0.601116,0.729167
306,0.646400,0.568775,0.791667


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: file, path.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: file, path.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: file, path.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
Saving model checkpoint to wav2vec2-large-xls-r-300m-dm32/checkpoint-110
Configuration saved in wav2vec2-large-xls-r-300m-dm32/checkpoint-110/config.json
Model weights saved in wav2vec2-large-xls-r-300m-dm32/checkpoint-110/pytorch_model.bin
Configuration saved in wav2vec2-large-xls-r-300m-dm32/checkpoint-110/pre

TrainOutput(global_step=308, training_loss=0.6171477924693715, metrics={'train_runtime': 3847.0299, 'train_samples_per_second': 1.298, 'train_steps_per_second': 0.08, 'total_flos': 4.67333502153112e+18, 'train_loss': 0.6171477924693715, 'epoch': 21.97})

In [None]:
# The target repository is already fixed by `training_args` (it is cloned when the
# trainer is built). The first positional parameter of `Trainer.push_to_hub` is the
# commit message, so passing `repo_name` here would only set the commit message to
# the repository name. Use the default commit message instead.
trainer.push_to_hub()