In [None]:
from huggingface_hub import login

login(token="")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import pipeline

model = pipeline(
    "automatic-speech-recognition",
    model="KoelLabs/xlsr-english-01",
    device="cpu",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [3]:
print("Transcription:", model("./audio_files/tly.ogg").get("text", ""))

ValueError: ffmpeg was not found but is required to load audio files from filename

In [None]:
print("Transcription:", model("./audio_files/word_january.mp3").get("text", ""))

Transcription: dʒænjuwɛɹi


In [None]:
print("Transcription:", model("./audio_files/january_tyan1.ogg").get("text", ""))

Transcription: dʒænjuɛɹi


In [None]:
print("Transcription:", model("./audio_files/january_tyan2.ogg").get("text", ""))

Transcription: dʒænuɝɹi


In [None]:
print("Transcription:", model("./audio_files/january_tyan3.ogg").get("text", ""))

Transcription: ɛnudɹi


In [None]:
print("Transcription:", model("./audio_files/january_tyan4.ogg").get("text", ""))

Transcription: dʒɛnɹi


In [None]:
print("Transcription:", model("./audio_files/january_tyan5.ogg").get("text", ""))

Transcription: tʃæɹuɝi


In [None]:
print("Transcription:", model("./audio_files/january_tyan6.ogg").get("text", ""))

Transcription: tʃeɪnnoʊdɝɹi


In [None]:
print("Transcription:", model(
    "./audio_files/word_january_japan1.mp3").get("text", ""))

Transcription: zʌnjuadi


In [None]:
print("Transcription:", model(
    "./audio_files/word_january_japan2.mp3").get("text", ""))

Transcription: zɑmuɑɾi


In [None]:
print("Transcription:", model(
    "./audio_files/word_january_japan3.mp3").get("text", ""))

Transcription: ʒʌnwɑdi


In [None]:
print("Transcription:", model(
    "./audio_files/word_january_japan3.mp3").get("text", ""))

In [None]:
print("Transcription:", model("./audio_files/audio_06.mp3").get("text", ""))

Transcription: hɛloʊθʰaɪɛnaɪsθʰumitjuəɡɛn


In [None]:
print("Transcription:", model("./audio_files/audio_01.mp3").get("text", ""))

Transcription: hɛloʊtaɪɛnnaɪstɪmitjuʌɡɛn


In [None]:
print("Transcription:", model("./audio_files/audio_05.mp3").get("text", ""))

Transcription: baɪannaɪtwiɡoʊəɡɛn


In [None]:
print("Transcription:", model("./audio_files/audio_japan_wrong.mp3").get("text", ""))

Transcription: hɑdɔtiminaɪstumitɔjuɑɡeɪn


In [None]:
print("Transcription:", model("./audio_files/audio_russian.mp3").get("text", ""))

Transcription: hjulʊtɛnaɪstʊmitjuəɡɛn


In [None]:
print("Transcription:", model("./audio_files/audio_16s.mp3").get("text", ""))

Transcription: joʊsʌpdʒoʊzoʊtoʊmɑsɛvɪtʃmɑɹtʃsɪkstinθnaɪntinoʊeɪtɑktoʊbɚfɪftinθnaɪntinnaɪnifɔɹsɚboʊkɹoʊeɪʃɪnjoʊsʌptoʊmɑʃɪvɪtʃwʌzɪnʌmɛɹɪkɪnikɑnʌmɪstɛnhɪstɔɹiɛnhuspɛʃl̩tiwʌzðiɛkɪnɑmɪkɛnsoʊʃl̩hɪstɚiʌvjuɡoʊslɑviʌ


In [None]:
print("Transcription:", model("./audio_files/word_march.mp3").get("text", ""))

Transcription: mɑɹtʃ


In [2]:
import torch
import librosa
import soundfile as sf
from transformers import AutoProcessor, AutoModelForCTC

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
model_id = "KoelLabs/xlsr-english-01"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCTC.from_pretrained(model_id).to(device)

array, sample_rate = sf.read("./audio_files/tly.ogg")
array = librosa.resample(array, orig_sr=sample_rate,
                         target_sr=processor.feature_extractor.sampling_rate)
batch = [array]

input_values = (
    processor(
        batch,
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    .input_values.type(torch.float32)
    .to(model.device)
)
with torch.no_grad():
    logits = model(input_values).logits
predicted_ids_batch = torch.argmax(logits, dim=-1)
transcription_batch = [processor.decode(ids) for ids in predicted_ids_batch]

# get the start and end timestamp for each phoneme
phonemes_with_time_batch = []
for predicted_ids in predicted_ids_batch:
    predicted_ids = predicted_ids.tolist()
    duration_sec = input_values.shape[1] / \
        processor.feature_extractor.sampling_rate

    ids_w_time = [
        (i / len(predicted_ids) * duration_sec, _id)
        for i, _id in enumerate(predicted_ids)
    ]

    current_phoneme_id = processor.tokenizer.pad_token_id
    current_start_time = 0
    phonemes_with_time = []
    for time, _id in ids_w_time:
        if current_phoneme_id != _id:
            if current_phoneme_id != processor.tokenizer.pad_token_id:
                phonemes_with_time.append(
                    (processor.decode(current_phoneme_id), current_start_time, time)
                )
            current_start_time = time
            current_phoneme_id = _id

    phonemes_with_time_batch.append(phonemes_with_time)

print(transcription_batch)
print(phonemes_with_time_batch)

['tʃɛnjuɑɹin']
[[('tʃ', 0.5039244186046512, 0.5240813953488372), ('ɛ', 0.5643953488372093, 0.5845523255813954), ('n', 0.7659651162790698, 0.8062790697674419), ('j', 0.8062790697674419, 0.8264360465116279), ('u', 0.846593023255814, 0.86675), ('ɑ', 1.0481627906976745, 1.0683197674418605), ('ɹ', 1.2497325581395349, 1.269889534883721), ('i', 1.290046511627907, 1.310203488372093), ('n', 1.4714593023255813, 1.4916162790697673)]]
