In [1]:
# %pip install git+https://github.com/m-bain/whisperx.git

In [2]:
import re

# Case mappings for the Turkish letters handled explicitly by the helpers below
# (dotless "ı" <-> "I", dotted "i" <-> "İ", plus ş, ğ, ü, ö, ç).
turkish_upper_chars = {
    "ı": "I",
    "i": "İ",
    "ş": "Ş",
    "ğ": "Ğ",
    "ü": "Ü",
    "ö": "Ö",
    "ç": "Ç",
}
# Reverse lookup: uppercase letter -> lowercase letter.
turkish_lower_chars = dict((upper, lower) for lower, upper in turkish_upper_chars.items())


def turkish_upper(s):
    """Return ``s`` uppercased character by character.

    A character with an entry in ``turkish_upper_chars`` is replaced by the
    mapped value; every other character is converted with ``str.upper``.
    """
    converted = []
    for ch in s:
        special = turkish_upper_chars.get(ch)
        converted.append(special if special is not None else ch.upper())
    return "".join(converted)


def turkish_lower(s):
    """Return ``s`` lowercased character by character.

    A character with an entry in ``turkish_lower_chars`` is replaced by the
    mapped value; every other character is converted with ``str.lower``.
    """
    def lower_one(ch):
        if ch in turkish_lower_chars:
            return turkish_lower_chars[ch]
        return ch.lower()

    return "".join(map(lower_one, s))

def normalize_text(text: str) -> str:
    """
    Apply normalization to text described in Moonshine: Speech Recognition for Live Transcription and Voice Commands
    https://arxiv.org/html/2410.15608v2

    The text is lowercased with the Turkish-aware ``turkish_lower`` helper,
    then every run of characters other than ``a-z`` and ``ç ğ ı ö ş ü``
    (digits, punctuation, whitespace, any other letter) is replaced by a
    single space, and leading/trailing spaces are removed.

    Args:
        text: The string to normalize.

    Returns:
        The kept lowercase letters, grouped into words separated by exactly
        one space; ``""`` if no kept letter is present.
    """

    text = turkish_lower(text)
    # Match whole runs ("+") so any number of adjacent discarded characters
    # yields one space. A single str.replace("  ", " ") pass only halves runs
    # of spaces and leaves doubles behind for three or more.
    # NOTE(review): letters outside this set that occur in Turkish text
    # (e.g. circumflexed â/î/û) are also replaced and therefore split words --
    # confirm that is intended.
    text = re.sub(r'[^a-zçğıöşü]+', ' ', text)
    return text.strip()

In [None]:
import whisperx

# import gc
import json
import time
import random
from pathlib import Path
from ytk.normalizer import normalize_dictation
from evaluate import load

from asrtk.utils.text import normalize_text

# Sanity check: print the normalized form of a Turkish sentence that contains
# typographic apostrophes and a tail of punctuation.
# NOTE(review): normalize_text here is the one imported from asrtk.utils.text
# just above, which shadows the function of the same name defined in an
# earlier cell -- confirm which implementation is meant to be used.
print(normalize_text("Çok iyi ve nazik biriydi. Prusya’daki ilk karşılaşmamızda onu konuşturmayı başarmıştım. Bana o yaz North Cape’de bulunduğunu ve Nijni Novgorod panayırına gitmeyi çok istediğini anlatmıştı.,;)([-*])"))

# Word-error-rate metric object obtained through `evaluate.load`.
wer_metric = load("wer")
SAMPLE_RATE = 16000  # Sample rate in Hz
# Leading samples to drop from each recording; 0 disables trimming
# (the previous value, 4 * SAMPLE_RATE, is the number of samples in 4 seconds).
NUM_SAMPLES_TO_TRIM = 0  # 4 * SAMPLE_RATE  # Number of samples in 4 seconds
# Flag referenced only by the commented-out trimming code in transcript().
SYNT_LABELS = False
device = "cuda"
batch_size = 1  # reduce if low on GPU mem
# "int8" is the low-GPU-memory setting (may reduce accuracy); the commented
# "float32" line below is the alternative.
compute_type = "int8"
# compute_type = "float32"

# Options handed to whisperx.load_model(..., asr_options=asr_options).
asr_options = {
    # "temperatures": 0.0,
    # "beam_size": 1,
    "condition_on_previous_text": False,
    # "initial_prompt": ""
    "hotwords": None,
    "multilingual": False,
}

# Model selection: exactly one load_model call is active; the commented lines
# are alternative checkpoints kept with the author's evaluation notes.
# All calls use the `device`, `compute_type` and (mostly) `asr_options`
# defined above and force language="tr".
# model = whisperx.load_model("base", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  #
model = whisperx.load_model("N:/models/faster/ysdede/whisper-large-v3-turbo", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  #

# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-15k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  #
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-e1", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  #
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-large-v3-turbo-med-tr-30k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Very good. First fine-tune: frozen, dropout active, all parameters present. It turned out the earlier bad results were caused by incorrectly generated CTranslate2 tokenizer/JSON files.
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-large-v3-turbo-med-tr-2-40k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Good; the encoder should be frozen. Switched to this one after the first run was left unfinished and inference had been done incorrectly. Has not fully learned abbreviations. The one above is better.
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-khanacademy-large-v3-turbo-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Successful; does not know some terms and abbreviations.


# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-base-med-tr-120k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Promising, but makes obvious mistakes on some terms. Has not seen them yet.
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-base-med-tr-2nd-30k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Promising; has not learned some terms. Has not seen them yet.
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-tiny-med-tr-65k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Getting there.


# model = whisperx.load_model("N:/models/faster/ysdede/whisper-tiny-dsntt1", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # , asr_options=asr_options) bad

# model = whisperx.load_model("N:/models/faster/ysdede/base-dsntt1-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/small-dsntt1-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # good
# model = whisperx.load_model("N:/models/faster/ysdede/medium-dsntt1-LoRA", device, compute_type=compute_type, download_root="n:/whisperx_models", language="tr", asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/v2-dsntt1-LoRA-2-ck32000", device, compute_type=compute_type, download_root="n:/whisperx_models", language="tr")  # , asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/small-re-dsn1-ck6k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr")  # , asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/large-v2-dsntt1-tr-ck71k", device, compute_type=compute_type, download_root="n:/whisperx_models", language="tr", asr_options=asr_options)

# model = whisperx.load_model("N:/models/faster/ysdede/large-v3-dsntt1-tr", device, compute_type=compute_type, download_root="n:/whisperx_models", language="tr")  # , asr_options=asr_options)
# model = whisperx.load_model("small", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr")  # , asr_options=asr_options)

# Disabled override that would rebuild the feature extractor's mel filters with n_mels=128.
# model.model.feature_extractor.mel_filters = model.model.feature_extractor.get_mel_filters(model.model.feature_extractor.sampling_rate, model.model.feature_extractor.n_fft, n_mels=128)

In [3]:
def get_largest_files(dataset_dir, n=25, file_type="opus"):
    """Return the ``n`` largest files with a given extension under a directory.

    The directory is searched recursively for ``*.<file_type>``. The number of
    matches is printed, followed by one ``<path>: <size> KB`` line per
    selected file.

    Args:
        dataset_dir: Root directory to search (string or ``Path``).
        n: Maximum number of files to return.
        file_type: File extension to match, without the leading dot.

    Returns:
        List of ``Path`` objects ordered from largest to smallest.
    """
    root = Path(dataset_dir)
    candidates = [*root.rglob(f'*.{file_type}')]
    print(len(candidates))

    def size_of(path):
        return path.stat().st_size

    # In-place descending sort by size on disk, then keep the head of the list.
    candidates.sort(key=size_of, reverse=True)
    largest = candidates[:n]

    report = []
    for path in largest:
        report.append((str(path), size_of(path)))

    for name, size in report:
        print(f"{name}: {size / 1024:.0f} KB")
    return largest

In [4]:
def get_random_files(dataset_dir, n=25, file_type="opus", seed=42):
    """Return up to ``n`` pseudo-randomly selected files with a given extension.

    The directory is searched recursively for ``*.<file_type>``. The matches
    are sorted before being shuffled with a dedicated ``random.Random(seed)``
    instance, so the selection depends only on the set of paths and the seed,
    not on the order in which the filesystem happens to list entries. One
    ``<path>: <size> KB`` line is printed per selected file.

    Args:
        dataset_dir: Root directory to search (string or ``Path``).
        n: Maximum number of files to return.
        file_type: File extension to match, without the leading dot.
        seed: Seed for the shuffle; the default keeps the previously
            hard-coded value.

    Returns:
        List of at most ``n`` ``Path`` objects in shuffled order.
    """
    dataset_dir = Path(dataset_dir)
    # rglob yields entries in directory-listing order, which is not stable
    # across filesystems; sorting first makes the seeded shuffle reproducible.
    candidates = sorted(dataset_dir.rglob(f'*.{file_type}'))
    random.Random(seed).shuffle(candidates)
    chosen = candidates[:n]

    for path in chosen:
        print(f"{path}: {path.stat().st_size / 1024:.0f} KB")
    return chosen

In [5]:
def transcript(audio_path, normalize=True):
    """Transcribe one audio file with the module-level WhisperX ``model``.

    Args:
        audio_path: Location of the audio file; passed to
            ``whisperx.load_audio``.
        normalize: When true (the default), the joined segment text is passed
            through ``normalize_dictation`` before being returned. When false
            the raw joined text is returned.

    Returns:
        A ``(prediction, transcription_time, audio_length)`` tuple:
        ``prediction`` is the text of every returned segment, each followed by
        a space (post-processed when ``normalize`` is true);
        ``transcription_time`` is the wall-clock time in seconds from just
        before the audio is loaded until ``model.transcribe`` returns, rounded
        to 2 decimals; ``audio_length`` is ``samples / SAMPLE_RATE`` rounded
        to 3 decimals.
    """
    start = time.time()
    audio = whisperx.load_audio(audio_path)

    # if not SYNT_LABELS:
    #     audio = audio[NUM_SAMPLES_TO_TRIM:]  # If not using synthetic labels, trim the first 4 seconds cause RL recordings have 4 seconds of garbage at the beginning.

    # Duration derived from the sample count; assumes the loaded audio is at
    # SAMPLE_RATE Hz -- TODO confirm against whisperx.load_audio's default.
    num_samples = audio.shape[0]
    audio_length = round(num_samples / SAMPLE_RATE, 3)

    result = model.transcribe(audio, batch_size=batch_size, print_progress=False, language="tr")
    transcription_time = round(time.time() - start, 2)

    # Concatenate segment texts, each followed by one space, without
    # rebinding `result` while iterating over it.
    prediction = "".join(segment["text"] + " " for segment in result["segments"])

    # Honour the `normalize` flag, which used to be accepted but ignored.
    if normalize:
        prediction = normalize_dictation(prediction)

    return prediction, transcription_time, audio_length


In [None]:
# from ytk.normalizer import fill_dates
# from ytk.utils import turkish_capitalize
from webvtt import read as read_vtt

# Run configuration for the evaluation loop below.
SYNT_LABELS = False  # re-assigned here; same value as in the configuration cell
n = 5000  # maximum number of audio files to evaluate

# Dataset root; the commented lines are alternative datasets.
# INPUT_DIR = r"N:\dataset_v3\YENI_SPLIT_LQ_NOISY"
INPUT_DIR = r"N:\dataset_v3\YENI_SPLIT"
# INPUT_DIR = r"N:\dataset_v3\commonvoice_17_tr\commonvoice_17_tr_fixed\test"

# Select up to `n` .mp3 files under INPUT_DIR and report how many were found.
file_list = get_random_files(INPUT_DIR, n, file_type="mp3")
print(len(file_list))

# Accumulators updated by the evaluation loop.
total_wer = 0  # sum of per-file WER values
avg_wer = 0  # running mean: total_wer / processed_files
processed_files = 0  # number of files that produced a WER value
label_pairs = []  # (formatted WER %, reference, prediction) tuples for the TSV dump

def get_text_from_file(file_path):
    """Read a transcript from a WebVTT or plain-text file as a single line.

    A ``.vtt`` suffix (case-insensitive) is parsed with ``read_vtt`` and the
    caption texts are joined with spaces; any other suffix is read as UTF-8
    text. Newlines are replaced by spaces and the result is stripped.

    Args:
        file_path: ``Path`` of the transcript file.

    Returns:
        The transcript text, or ``None`` if reading or parsing raised (the
        error is printed).
    """
    try:
        if file_path.suffix.lower() != '.vtt':
            # Anything that is not WebVTT is treated as a plain text file.
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw = handle.read()
            transcription = raw.replace('\n', ' ')
        else:
            cue_texts = [cue.text.replace('\n', ' ') for cue in read_vtt(str(file_path))]
            transcription = ' '.join(cue_texts)

        return transcription.strip()
    except Exception as e:
        print(f"Error reading file {file_path}: {str(e)}")
        return None

# Evaluation loop: for every selected audio file, find its transcript,
# transcribe the audio, normalize both texts and accumulate the WER.
for audio_path in file_list:
    try:
        # Try VTT first, then TXT
        vtt_file = audio_path.with_suffix('.vtt')
        txt_file = audio_path.with_suffix('.txt')
        
        if vtt_file.exists():
            reference = get_text_from_file(vtt_file)
            transcript_file = vtt_file
        elif txt_file.exists():
            reference = get_text_from_file(txt_file)
            transcript_file = txt_file
        else:
            print(f"No transcript file found for {audio_path}")
            continue
            
        # Skip unreadable (None), empty or very short (< 5 characters) references.
        if not reference or len(reference) < 5:
            continue


        prediction, transcription_time, audio_length = transcript(audio_path)
        # prediction = prediction.replace(" x ", "x")

        # Skip files whose prediction text contains this marker.
        # NOTE(review): presumably produced upstream when no speech is
        # detected -- confirm that it can actually appear in the returned text.
        if "No active speech found in audio" in prediction:
            continue

        # Keep the un-normalized texts for printing and for the TSV output.
        rf, pre = reference, prediction

        prediction = normalize_text(prediction)
        reference = normalize_text(reference)

        wer = wer_metric.compute(references=[reference], predictions=[prediction])
        # NOTE(review): avg_wer is the mean of per-file WER values, so every
        # file weighs the same regardless of its word count.
        total_wer += wer
        processed_files += 1
        avg_wer = total_wer / processed_files

        label_pairs.append((f"{wer * 100:0.02f}", rf, pre))

        print(rf)
        print(pre)
        # "Speed" is the time reported by transcript() divided by the audio duration.
        # NOTE(review): an audio_length of 0 would raise ZeroDivisionError here,
        # after the counters above were already updated; the except below
        # would then report the file as an error.
        print(f"{processed_files}/{len(file_list)} Avg WER: {avg_wer * 100:0.02f}%, WER: {wer * 100:0.02f}%, {transcript_file} - Duration: {audio_length} sec, Speed: {transcription_time / audio_length:0.02f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {audio_path}: {e}")

# Dump one "<WER %>\t<reference>\t<prediction>" row per evaluated file.
# NOTE(review): fields are written unescaped; a tab or newline inside a
# reference or prediction would break the row layout.
with open("labels-base-new.tsv", "w", encoding="utf-8") as lf:
    for wer, r, p in label_pairs:
        lf.write(f"{wer}\t{r}\t{p}\n")

