# TranScorer for STT

Train a transformer that maps the output vectors of your acoustic model to a character-level representation of your speech.

In [1]:
# Check if you have access to the smi
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Mar  5 22:09:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:43:00.0  On |                  N/A |
|  0%   59C    P3    39W / 170W |    611MiB / 12288MiB |     15%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Installing TranScorerLM

In [None]:
!pip install git+https://github.com/wasertech/TranScorerLM.git

## Dataset
Use `datasets.load_dataset()` to load your datasets.

In [None]:
from pathlib import  Path
from glob import glob
from datasets import  DatasetDict, load_dataset

def load_wav2txt(data_path: "str | Path"):
    """Load the train/eval/test CSV manifests found under ``data_path``.

    The directory tree is searched recursively for files whose names end in
    ``-train.csv``, ``-dev.csv`` and ``-test.csv``. Each group of files is
    loaded as one split (``train``, ``eval`` and ``test`` respectively).

    Args:
        data_path: Directory holding the CSV manifests, as a ``str`` or a
            ``pathlib.Path``.

    Returns:
        The object returned by ``load_dataset('csv', ...)`` when given a
        split-name -> file-list mapping, i.e. one dataset per split keyed by
        ``'train'``, ``'eval'`` and ``'test'``.

    Raises:
        NotADirectoryError: ``data_path`` does not exist or is not a directory.
        FileNotFoundError: no CSV file was found for at least one split.
    """
    # Accept plain strings as well as Path objects.
    data_path = Path(data_path)

    # is_dir() is already False for a missing path, so one check is enough.
    if not data_path.is_dir():
        raise NotADirectoryError(
            f"Invalid data_path. {data_path} doesn't exists or is not a directory."
        )

    # Split name -> file-name suffix used by the manifests on disk.
    suffix_by_split = {"train": "train", "eval": "dev", "test": "test"}

    # recursive=True lets '**' match any depth (including zero directories);
    # sorting keeps the file order, and thus the row order, deterministic.
    data_files = {
        split: sorted(glob(f"{data_path}/**/*-{suffix}.csv", recursive=True))
        for split, suffix in suffix_by_split.items()
    }

    # Fail early with a readable message instead of a loader error.
    missing = [split for split, files in data_files.items() if not files]
    if missing:
        raise FileNotFoundError(
            f"No CSV files found under {data_path} for split(s): {', '.join(missing)}"
        )

    # Passing a mapping yields one flat set of named splits instead of
    # nesting a whole per-call result under each key.
    return load_dataset("csv", data_files=data_files)

# Pass a Path (not a bare string): load_wav2txt is annotated to take a Path
# and calls Path methods on its argument.
w2t = load_wav2txt(Path("./wav2txt"))

w2t

## Training a Scorer

In [None]:
# Launch the `trainscorer` command-line tool with the options listed below
# (presumably the CLI provided by the TranScorerLM package installed earlier — confirm).
# NOTE(review): `--dataset_name "common_voice"` names a dataset by string; the `w2t`
#   object loaded in the previous section is not referenced here — confirm this is intended.
# NOTE(review): `--label_column_name language` and `--metric_for_best_model accuracy`
#   look like classification settings rather than transcription ones — confirm.
# NOTE(review): `--push_to_hub` presumably uploads the result and needs credentials — confirm.
!trainscorer \
    --model_name_or_path "./transcorer" \
    --dataset_name "common_voice" \
    --audio_column_name wav \
    --label_column_name language \
    --output_dir "./transcorer/models" \
    --overwrite_output_dir \
    --remove_unused_columns False \
    --do_train \
    --do_eval \
    --fp16 \
    --learning_rate 3e-4 \
    --max_length_seconds 16 \
    --attention_mask False \
    --warmup_ratio 0.1 \
    --num_train_epochs 10 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 4 \
    --per_device_eval_batch_size 1 \
    --dataloader_num_workers 8 \
    --logging_strategy steps \
    --logging_steps 10 \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --load_best_model_at_end True \
    --metric_for_best_model accuracy \
    --save_total_limit 3 \
    --seed 0 \
    --push_to_hub

# Testing your model

In [None]:
from transformers import Wav2Vec2ProcessorWithLM, AutoModelForCTC
from datasets import Audio, load_dataset
import torch

# load the acoustic model and its processor (feature extractor + LM-backed decoder)
processor = Wav2Vec2ProcessorWithLM.from_pretrained("./transcorer")
model = AutoModelForCTC.from_pretrained("./transcorer")

# The model must be fed audio at the rate its feature extractor was configured for.
sampling_rate = processor.feature_extractor.sampling_rate

# Load the test manifests as ONE Dataset: without `split=` the loader returns a
# dict of splits, which cannot be indexed with `[0]`.
# A CSV column only holds text, so cast "wav" to an Audio feature to get decoded
# samples under ["array"].
# NOTE(review): assumes the "wav" column contains paths to audio files — confirm.
test_ds = load_dataset(
    'csv',
    data_files=['file_1-test.csv', 'file_2-test.csv'],
    split='train',
).cast_column("wav", Audio(sampling_rate=sampling_rate))

# extract input features for the first sample
input_values = processor(
    test_ds[0]["wav"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
    padding="longest",
).input_values  # Batch size 1

# retrieve logits; no gradients are needed for inference
with torch.no_grad():
    logits = model(input_values).logits

# The LM-backed processor decodes from the raw logits (beam search with the
# language model), not from argmax token ids.
transcription = processor.batch_decode(logits.numpy()).text

transcription