In [None]:
# Hugging Face model card for the checkpoint this notebook loads ("openai/whisper-large-v3").
'''https://huggingface.co/openai/whisper-large-v3'''

In [None]:
pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

In [12]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Pick the device and precision together: float16 on the first CUDA GPU,
# float32 on CPU when no GPU is available.
has_cuda = torch.cuda.is_available()
device = "cuda:0" if has_cuda else "cpu"
torch_dtype = torch.float16 if has_cuda else torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint in the selected dtype, then move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor for the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Options for the speech-recognition pipeline, collected in one mapping.
pipe_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
pipe = pipeline("automatic-speech-recognition", **pipe_options)

# NOTE(review): `dataset` / `sample` are not used by any cell visible here;
# kept in case a cell outside this view relies on them.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Run the pipeline on the local recording with language "hungarian" and task "translate".
generate_kwargs = {"language": "hungarian", "task": "translate"}
result = pipe(
    "tanuljunk_kutyaul.m4a",
    generate_kwargs=generate_kwargs,
    return_timestamps=True,
)
print(result["text"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Hello everyone, good afternoon! This is a very interesting topic. Why do we have to talk about the dog? Why do we talk about the dog? Why don't we talk about the dog? Well, we say that we have to talk about the dog. We don't hold this presentation by accident, we don't at this moment, we are in the middle of a huge change, a dog paradigm shift. I would like to prove this right away. I would like to start with a question. put your hand on the dog's hand who chose a specific dog for you. So when he went to the dog house, he went to the dog house, then he looked at the little dog with that eye, that he would be a racing dog from Egiliti, or a lion dog, or he would hunt with you. Put your hand on the hand who chose your dog like this. Three, four, five. Thank you. And now I would like to ask you to raise your hand if you chose your dog as a family member. You went to the dog for a walk. Okay, and you raised your hand, by the way, who is the first. If I had raised this question a hundred y

In [19]:
# Show the per-segment entries; each one carries a 'timestamp' pair and its 'text'.
result["chunks"]

[{'timestamp': (0.0, 3.0), 'text': ' Hello everyone, good afternoon!'},
 {'timestamp': (3.0, 6.0), 'text': ' This is a very interesting topic.'},
 {'timestamp': (6.0, 10.0), 'text': ' Why do we have to talk about the dog?'},
 {'timestamp': (10.0, 13.0),
  'text': " Why do we talk about the dog? Why don't we talk about the dog?"},
 {'timestamp': (13.0, 16.0),
  'text': ' Well, we say that we have to talk about the dog.'},
 {'timestamp': (16.0, 35.0),
  'text': " We don't hold this presentation by accident, we don't at this moment, we are in the middle of a huge change, a dog paradigm shift."},
 {'timestamp': (35.0, 37.0),
  'text': ' I would like to prove this right away.'},
 {'timestamp': (37.0, 46.4),
  'text': " I would like to start with a question. put your hand on the dog's hand who chose a specific dog for you."},
 {'timestamp': (46.4, 49.2),
  'text': ' So when he went to the dog house, he went to the dog house,'},
 {'timestamp': (49.2, 54.5),
  'text': ' then he looked at the l

In [23]:
def convert_to_srt(result, output_file):
    """Write timestamped transcript entries to ``output_file`` as SubRip (.srt) cues.

    Args:
        result: Iterable of dicts, each holding a ``'timestamp'`` pair
            ``(start, end)`` in seconds and a ``'text'`` string.
        output_file: Path of the file to create; an existing file is
            overwritten. The file is written as UTF-8.

    Cues are numbered from 1. A ``None`` start falls back to the previous
    cue's end (0 for the first cue) and a ``None`` end falls back to the cue's
    own start, so an open-ended final segment is written instead of raising
    ``TypeError``. Surrounding whitespace is stripped from each cue's text.
    """
    def _srt_time(seconds):
        # Build the stamp from whole milliseconds. Rounding the seconds field
        # separately could print "60,000" for values such as 59.9996.
        total_ms = int(round(seconds * 1000))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    previous_end = 0.0
    with open(output_file, 'w', encoding='utf-8') as file:
        for idx, entry in enumerate(result, start=1):
            start = entry['timestamp'][0]
            end = entry['timestamp'][1]
            if start is None:
                start = previous_end
            if end is None:
                end = start
            previous_end = end

            file.write(f"{idx}\n")
            file.write(f"{_srt_time(start)} --> {_srt_time(end)}\n")
            # Segment text arrives with a leading space; keep it out of the cue.
            file.write(f"{entry['text'].strip()}\n\n")

In [24]:
# Export the timestamped chunks of the result to an .srt file in the working directory.
output_file = 'output2.srt'
convert_to_srt(result["chunks"], output_file)