### Transcribe Example

In [6]:
import whisper

# Load the "large" Whisper checkpoint. The progress bar in this cell's output
# shows roughly 2.88G being fetched the first time this ran.
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [00:41<00:00, 73.6MiB/s]


In [7]:
# Move the model to the GPU.
# NOTE(review): the "cuda" device is hard-coded, so this presumably fails on a
# machine without a CUDA device — confirm before running elsewhere.
model = model.to("cuda")

In [8]:
# Single recording used to try out the transcription step before the full run.
audio_file = "audio_files/Gege - interviu.m4a"

In [9]:
# Transcribe the sample recording; language="ro" (Romanian) is passed
# explicitly rather than left to the model.
result = model.transcribe(audio_file, language="ro")
# The transcript is read from result["text"] in the next cell; uncomment the
# line below to preview it inline:
# print(result["text"])

In [10]:
# Save the transcript next to the recording as "<audio path>.txt".
# The encoding is explicit: the transcript is Romanian text with diacritics,
# and the platform default codec (e.g. cp1252 on Windows) may be unable to
# encode it, which would raise UnicodeEncodeError.
with open(f"{audio_file}.txt", "w", encoding="utf-8") as text_file:
    text_file.write(result["text"])

### Run on all audio files

In [11]:
# Every recording to transcribe, as paths relative to the notebook's working
# directory. The first entry is the same file used in the single-file example.
audio_files = [
    "audio_files/Gege - interviu.m4a",
    "audio_files/Ana B - Interviu.m4a",
    "audio_files/Rebeca Avram - Interviu.m4a",
    "audio_files/Valentin Toc - Interviu.m4a",
    "audio_files/Loredana - Techcelerator.m4a",
    "audio_files/Interview - Alex - Licenseware.m4a",
]

In [14]:
import os
# Fail fast, before the long-running transcription loop, if any recording is
# missing. All missing paths are collected first so the assertion message
# names every one of them instead of stopping silently at the first.
missing_audio_files = [path for path in audio_files if not os.path.exists(path)]
assert not missing_audio_files, f"Missing audio files: {missing_audio_files}"

In [16]:
from tqdm import tqdm

# Transcribe every recording and save each transcript as "<audio path>.txt".
for audio_file in tqdm(audio_files):
    print(f"Transcribing {audio_file}...")
    result = model.transcribe(audio_file, language="ro")
    # Write as UTF-8 explicitly: the transcripts contain Romanian diacritics,
    # and a platform default codec that cannot encode them would raise
    # UnicodeEncodeError here, after minutes of transcription work.
    with open(f"{audio_file}.txt", "w", encoding="utf-8") as text_file:
        text_file.write(result["text"])

  0%|                                                                                             | 0/6 [00:00<?, ?it/s]

Transcribing audio_files/Gege - interviu.m4a...


 17%|██████████████                                                                      | 1/6 [09:03<45:16, 543.31s/it]

Transcribing audio_files/Ana B - Interviu.m4a...


 33%|████████████████████████████                                                        | 2/6 [16:16<31:54, 478.65s/it]

Transcribing audio_files/Rebeca Avram - Interviu.m4a...


 50%|██████████████████████████████████████████                                          | 3/6 [27:09<27:54, 558.32s/it]

Transcribing audio_files/Valentin Toc - Interviu.m4a...


 67%|████████████████████████████████████████████████████████                            | 4/6 [32:35<15:33, 466.66s/it]

Transcribing audio_files/Loredana - Techcelerator.m4a...


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [36:44<06:27, 387.82s/it]

Transcribing audio_files/Interview - Alex - Licenseware.m4a...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [42:06<00:00, 421.10s/it]


### Reformat every transcript: one sentence per line, wrapping lines longer than 80 characters

In [19]:
import re

def split_text_into_sentences(text, max_length=80):
    """Split ``text`` into sentences and wrap the ones that are too long.

    A sentence boundary is one or more spaces that follow ``.``, ``!`` or
    ``?``. Sentences of at most ``max_length`` characters are returned
    unchanged. Longer sentences are re-flowed word by word into chunks whose
    length does not exceed ``max_length``; a single word that is itself longer
    than ``max_length`` is kept whole on its own chunk.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per returned chunk.

    Returns:
        A list of strings, in the original order.
    """
    # The lookbehind keeps the punctuation attached to the sentence it ends.
    sentences = re.split(r'(?<=[.!?]) +', text)
    processed_sentences = []

    for sentence in sentences:
        if len(sentence) <= max_length:
            processed_sentences.append(sentence)
            continue

        # Greedily pack words into chunks of at most max_length characters.
        current_sentence = ''
        for word in sentence.split():
            if not current_sentence:
                # Always start a chunk with the word, even an over-long one;
                # flushing the empty accumulator here would emit a spurious
                # empty chunk (a blank line in the output).
                current_sentence = word
            elif len(current_sentence) + 1 + len(word) > max_length:
                processed_sentences.append(current_sentence)
                current_sentence = word
            else:
                current_sentence += ' ' + word
        if current_sentence:
            processed_sentences.append(current_sentence)

    return processed_sentences

def process_file(input_file, output_file, max_length=80, encoding='utf-8'):
    """Rewrite a transcript file with one sentence (or wrapped chunk) per line.

    Reads all of ``input_file``, splits it with ``split_text_into_sentences``
    and writes each resulting chunk to ``output_file`` followed by a newline.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to create or overwrite.
        max_length: Maximum line length, forwarded to
            ``split_text_into_sentences``.
        encoding: Text encoding used for both files. Defaults to UTF-8 so that
            non-ASCII text (e.g. Romanian diacritics) is handled the same way
            on every platform instead of depending on the locale's default
            codec. Pass another codec if the input was written with one.
    """
    with open(input_file, 'r', encoding=encoding) as file:
        text = file.read()

    sentences = split_text_into_sentences(text, max_length)

    with open(output_file, 'w', encoding=encoding) as file:
        file.writelines(sentence + '\n' for sentence in sentences)

In [20]:
for audio_file in audio_files:
    # Read the transcript saved as "<audio path>.txt" and write the
    # reformatted version to "<audio path>_processed.txt".
    input_file = f'{audio_file}.txt'
    output_file = f'{audio_file}_processed.txt'
    process_file(input_file, output_file)