# Part 2. Train model
This notebook trains the model. I uploaded the output of notebook 1 to Google Drive and then downloaded it to a vast.ai server with gdown.

In [None]:
%%capture
# Install the system packages this notebook relies on; %%capture hides the (long) installer output.
!sudo apt update
# Compiler toolchain and development man pages.
!sudo apt install build-essential -y
!sudo apt-get install manpages-dev -y
!sudo apt install gcc -y
# git-lfs: the recorded push output later in the notebook mentions LFS files.
# NOTE(review): this is the only install in this cell that runs without sudo — confirm that is intentional.
!apt-get install git-lfs -y
# unzip is used below to extract saved.zip.
!sudo apt install unzip -y

In [None]:
%%capture
!pip install gdown -q
!pip install transformers -q
!pip install datasets -q
!pip install jiwer -q
!gdown your_own_id
!pip install huggingface_hub -q


In [None]:
# Extract the downloaded archive quietly (-q); later cells read from a "saved" directory, presumably created here.
!unzip -q saved.zip 

In [None]:
# Delete the archive now that it has been extracted.
!rm -rf saved.zip 

In [1]:
import os

# Switch off the Weights & Biases integration before any training objects are created.
os.environ.update({"WANDB_DISABLED": "true"})

In [2]:
# Configure git (globally) to use the `store` credential helper, so the Hub login below can be reused by git pushes.
# NOTE(review): the `store` helper is understood to keep credentials unencrypted on disk — confirm this is acceptable on a rented server.
!git config --global credential.helper store

In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the recorded output shows the token being saved locally.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [4]:
from datasets import load_from_disk
import numpy as np
# Load the preprocessed dataset from the "saved" directory.
urdudata = load_from_disk("saved")

In [5]:
# Display the dataset's splits, feature columns and row counts.
urdudata

DatasetDict({
    train: Dataset({
        features: ['input_values', 'input_length', 'labels'],
        num_rows: 48827
    })
    test: Dataset({
        features: ['input_values', 'input_length', 'labels'],
        num_rows: 3298
    })
})

In [6]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer built from the vocabulary file inside the "saved" directory.
tokenizer = Wav2Vec2CTCTokenizer(
    "./saved/vocab.json",
    word_delimiter_token="|",
    pad_token="[PAD]",
    unk_token="[UNK]",
)

In [7]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for single-channel 16 kHz audio; inputs are normalised and
# an attention mask is returned alongside the padded values.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    do_normalize=True,
    padding_value=0.0,
    return_attention_mask=True,
)

In [8]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and the tokenizer into one processor object.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

In [9]:
import torch
import datasets
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels of a batch.

    Inputs and labels are padded separately because they have different lengths
    and are padded by different parts of the processor: the feature extractor
    pads ``input_values`` and the tokenizer pads the label ids.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate a list of examples into one padded batch.

        Args:
            features: Examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded label positions are set to ``-100``.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad with the processor's feature extractor and tokenizer directly instead of
        # temporarily switching the processor into target mode via `as_target_processor()`.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [10]:
# Collator instance handed to the Trainer; padding=True pads each batch to its longest sequence.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [11]:
# Word error rate ("wer") and character error rate ("cer") metrics, used by compute_metrics.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in newer `datasets` releases — confirm against the installed version.
wer_metric = datasets.load_metric("wer")
cer_metric = datasets.load_metric("cer")

In [12]:
def compute_metrics(pred):
    """
    Compute the word and character error rates for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the last
            axis is taken as the predicted token id) and ``label_ids`` (label
            token ids in which ``-100`` marks positions to ignore).

    Returns:
        dict: ``{"wer": ..., "cer": ...}`` with the values returned by the two metrics.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 ignore marker for the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's `pred.label_ids` is left unmodified.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [13]:
from transformers import Wav2Vec2ForCTC

# Start from the pretrained XLSR-53 checkpoint and configure it for CTC fine-tuning.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    # output vocabulary / CTC loss settings, taken from our tokenizer
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    # dropout / masking settings
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    layerdrop=0.1,
    mask_time_prob=0.05,
)

# Set on the loaded model's config after construction.
model.config.ctc_zero_infinity = True

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_q.bias', 'project_hid.bias', 'project_hid.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to u

In [14]:
# Freeze the model's feature encoder so its parameters are not updated during fine-tuning.
# Newer transformers releases name this method `freeze_feature_encoder` and deprecate
# `freeze_feature_extractor`; call whichever one the installed version provides.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()
# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()



In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # output location and checkpointing
    output_dir="./urdumodel",
    save_steps=500,
    save_total_limit=2,
    push_to_hub=True,
    # batching: 24 per device x 4 accumulation steps
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # optimisation
    learning_rate=3e-4,
    num_train_epochs=20,
    fp16=True,
    # evaluation and logging
    # NOTE(review): with evaluation_strategy="epoch", eval_steps presumably has no effect — confirm.
    evaluation_strategy="epoch",
    eval_steps=500,
    logging_steps=500,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
from transformers import Trainer

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=urdudata["train"],
    eval_dataset=urdudata["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # the feature extractor is passed in the `tokenizer` slot
    tokenizer=processor.feature_extractor,
)

/workspace/./urdumodel is already a clone of https://huggingface.co/Talha/urdumodel. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [17]:
# Run training.
# NOTE(review): the recorded output below shows training resuming from urdumodel/checkpoint-7500,
# but this call passes no resume argument — confirm how the run was actually launched.
trainer.train()

Loading model from urdumodel/checkpoint-7500.
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 48827
  Num Epochs = 20
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 4
  Total optimization steps = 10160
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 14
  Continuing training from global step 7500
  Will skip the first 14 epochs then the first 1552 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/1552 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Wer,Cer
14,0.2912,0.492872,0.376268,0.148619
15,0.2969,0.499024,0.374941,0.148141
16,0.2946,0.494343,0.373498,0.148504
17,0.2851,0.489323,0.371703,0.147708
18,0.279,0.497661,0.369284,0.146382
19,0.2718,0.49387,0.36983,0.146471


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3298
  Batch size = 24
Saving model checkpoint to ./urdumodel/checkpoint-8000
Configuration saved in ./urdumodel/checkpoint-8000/config.json
Model weights saved in ./urdumodel/checkpoint-8000/pytorch_model.bin
Feature extractor saved in ./urdumodel/checkpoint-8000/preprocessor_config.json
Feature extractor saved in ./urdumodel/preprocessor_config.json
Deleting older checkpoint [urdumodel/checkpoint-7000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation **

TrainOutput(global_step=10160, training_loss=0.07465291323624258, metrics={'train_runtime': 24494.024, 'train_samples_per_second': 39.869, 'train_steps_per_second': 0.415, 'total_flos': 2.0885985938243928e+20, 'train_loss': 0.07465291323624258, 'epoch': 20.0})

In [18]:
# Save the processor (feature extractor and tokenizer files) into the model output directory.
processor.save_pretrained('./urdumodel')

Feature extractor saved in ./urdumodel/preprocessor_config.json
tokenizer config file saved in ./urdumodel/tokenizer_config.json
Special tokens file saved in ./urdumodel/special_tokens_map.json


In [19]:
# Save the final model into ./urdumodel and push it to the Hugging Face Hub repository.
trainer.push_to_hub()

Saving model checkpoint to ./urdumodel
Configuration saved in ./urdumodel/config.json
Model weights saved in ./urdumodel/pytorch_model.bin
Feature extractor saved in ./urdumodel/preprocessor_config.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.33k/1.18G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Talha/urdumodel
   390a038..4238c1d  main -> main

Dropping the following result as it does not have all the necessary fields:
{}
To https://huggingface.co/Talha/urdumodel
   4238c1d..3279d9d  main -> main



'https://huggingface.co/Talha/urdumodel/commit/4238c1d4ba5878e4ca456c634312666616b763e9'