# Wav2Vec - English Single

In [2]:
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import numpy as np
model_name = "skpawar1305/wav2vec2-base-finetuned-digits"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)


In [22]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import sounddevice as sd

test_file = "12345_68772.wav"

def get_nparray(audiosegment: AudioSegment):
    samples = audiosegment.get_array_of_samples()
    samples_float = librosa.util.buf_to_float(samples,n_bytes=4, dtype=np.float32)
    if audiosegment.channels==2:
        sample_left= np.copy(samples_float[::2])
        sample_right= np.copy(samples_float[1::2])
        sample_all = np.array([sample_left,sample_right])
    else:
        sample_all = samples_float

    return sample_all


data = AudioSegment.from_wav(test_file)
print(f"db = {data.dBFS}")
sd.play(get_nparray(data), samplerate=16000)

db = -21.15406995820191


In [30]:
from tqdm import tqdm
chunks = split_on_silence(data, min_silence_len=20, keep_silence=40, silence_thresh=-25)
print(len(chunks))
for _c in tqdm(chunks):
    sd.play(get_nparray(_c), samplerate=16000)
    sd.wait()

# sd.play(get_nparray(data))


9


100%|██████████| 9/9 [00:07<00:00,  1.22it/s]


In [31]:
def inference(chunk):
    inputs = feature_extractor(chunk, sampling_rate=16000, padding=True, return_tensors="pt")

    logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]
    if not len(labels):
        print("dead")
    return labels[0]

for _c in chunks:
    print(inference(get_nparray(_c)))

one
_unknown_
_unknown_
two
_unknown_
_unknown_
three
four
five


# Wav2Vec - Chinese Sequence


In [4]:
from huggingsound import SpeechRecognitionModel
from pprint import pprint
import os

model = SpeechRecognitionModel("wbbbbb/wav2vec2-large-chinese-zh-cn", device="cuda")

11/09/2022 20:18:25 - INFO - huggingsound.speech_recognition.model - Loading model...


loading configuration file config.json from cache at C:\Users\24103/.cache\huggingface\hub\models--wbbbbb--wav2vec2-large-chinese-zh-cn\snapshots\369f73139f85a98570ff74e641dc93d421a3860e\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "wbbbbb/wav2vec2-large-chinese-zh-cn",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "

In [2]:
dataset = './zh_seq'
dataset = [os.path.join(dataset, f) for f in os.listdir(dataset)]
eval_files = dataset.copy()
tmp = []
for d in dataset:
    label = os.path.splitext(os.path.split(d)[1])[0].split('_')[0]
    tmp.append({"path": d, "transcription": label})
dataset = tmp
del tmp

In [42]:
pprint(model.transcribe(dataset))

100%|██████████| 24/24 [00:17<00:00,  1.35it/s]

[{'end_timestamps': [980],
  'probabilities': [0.8219219446182251],
  'start_timestamps': [960],
  'transcription': '伊'},
 {'end_timestamps': [860, 1360, 1800, 2260],
  'probabilities': [0.9657955169677734,
                    0.9955781102180481,
                    0.7799317240715027,
                    0.8337723612785339],
  'start_timestamps': [840, 1340, 1780, 2240],
  'transcription': '一一一一'},
 {'end_timestamps': [680, 1180, 1840, 2440, 3000],
  'probabilities': [0.9978392720222473,
                    0.999473512172699,
                    0.9999866485595703,
                    0.9977414608001709,
                    0.9924236536026001],
  'start_timestamps': [660, 1160, 1820, 2420, 2980],
  'transcription': '一二三四五'},
 {'end_timestamps': [960],
  'probabilities': [0.9705238342285156],
  'start_timestamps': [940],
  'transcription': '七'},
 {'end_timestamps': [1120, 1700, 2240],
  'probabilities': [0.9857266545295715, 0.9892627596855164, 0.9728983640670776],
  'start_timestamps':




In [6]:
model.finetune("train_output", dataset)

11/09/2022 20:19:01 - INFO - huggingsound.speech_recognition.model - Loading training data...
11/09/2022 20:19:01 - INFO - huggingsound.speech_recognition.model - Converting data format...
11/09/2022 20:19:01 - INFO - huggingsound.speech_recognition.model - Preparing data input and labels...


  0%|          | 0/24 [00:00<?, ?ex/s]

11/09/2022 20:19:03 - INFO - huggingsound.speech_recognition.model - Starting fine-tuning process...


PyTorch: setting up devices


11/09/2022 20:19:03 - INFO - huggingsound.trainer - Getting dataset stats...
11/09/2022 20:19:03 - INFO - huggingsound.trainer - Training dataset size: 24 samples, 0.017777777777777778 hours


loading configuration file config.json from cache at C:\Users\24103/.cache\huggingface\hub\models--wbbbbb--wav2vec2-large-chinese-zh-cn\snapshots\369f73139f85a98570ff74e641dc93d421a3860e\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "wbbbbb/wav2vec2-large-chinese-zh-cn",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "

11/09/2022 20:19:07 - INFO - huggingsound.trainer - Building trainer...




11/09/2022 20:19:08 - INFO - huggingsound.trainer - Starting training...


Feature extractor saved in train_output\preprocessor_config.json
tokenizer config file saved in train_output\tokenizer_config.json
Special tokens file saved in train_output\special_tokens_map.json
added tokens file saved in train_output\added_tokens.json
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: length. If length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 24
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9
  Number of trainable parameters = 316528819


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 6.00 GiB total capacity; 5.13 GiB already allocated; 0 bytes free; 5.29 GiB reserved in total by PyTorch)