In [1]:
# %pip install git+https://github.com/m-bain/whisperx.git

In [None]:
import re

# Explicit lowercase -> uppercase pairs for Turkish letters. The i/ı pairs differ from
# Python's locale-independent str.upper()/str.lower() ("i".upper() == "I").
turkish_upper_chars = {"ı": "I", "i": "İ", "ş": "Ş", "ğ": "Ğ", "ü": "Ü", "ö": "Ö", "ç": "Ç"}

# Circumflexed ("hatted") vowels mapped to their plain counterparts.
turkish_hatted_chars = {
    "â": "a",
    "Â": "A",
    "î": "i",
    # "î" folds to dotted "i", so its capital must fold to dotted "İ". Folding to "I"
    # made turkish_lower() produce dotless "ı" (e.g. "MİLLÎ" -> "millı").
    "Î": "İ",
    "û": "u",
    "Û": "U",
    "ô": "o",  # Occurs in some texts; included even though it is not part of the standard.
    "Ô": "O"
}
# Inverse mapping (uppercase -> lowercase) used for Turkish-aware lowercasing.
turkish_lower_chars = {v: k for k, v in turkish_upper_chars.items()}

def replace_hatted_characters(s):
    """Return ``s`` with every key of ``turkish_hatted_chars`` replaced by its mapped value.

    The replacements are applied one after another, in the dictionary's iteration order.
    """
    result = s
    for hatted in turkish_hatted_chars:
        result = result.replace(hatted, turkish_hatted_chars[hatted])
    return result

def turkish_lower(s):
    """Lowercase ``s`` character by character.

    Characters listed in ``turkish_lower_chars`` use that mapping; every other
    character falls back to ``str.lower()``.
    """
    lowered = []
    for ch in s:
        if ch in turkish_lower_chars:
            lowered.append(turkish_lower_chars[ch])
        else:
            lowered.append(ch.lower())
    return "".join(lowered)

def normalize_text(text: str) -> str:
    """
    Apply normalization to text described in Moonshine: Speech Recognition for Live Transcription and Voice Commands
    https://arxiv.org/html/2410.15608v2

    Handle Turkish specific characters, use helper functions defined earlier.

    Steps, in order:
      1. Apostrophes are removed so a suffix stays attached to its word
         ("Prusya'daki" -> "prusyadaki").
      2. Hatted vowels are folded via ``replace_hatted_characters``.
      3. The text is lowercased via ``turkish_lower``.
      4. Every run of characters outside ``a-z`` and ``çğıöşü`` becomes one space.

    Args:
        text: Raw reference or predicted transcript.

    Returns:
        The normalized text, stripped of leading and trailing whitespace.
    """
    # Typographic apostrophes (U+2019, U+02BC) must behave like the ASCII one; otherwise
    # "Cape’de" turns into two words ("cape de") while "Cape'de" turns into one.
    text = text.replace("\u2019", "'").replace("\u02bc", "'")
    text = text.replace(" '", "'").replace("' ", " ").replace("'", "")
    text = replace_hatted_characters(text)
    text = turkish_lower(text)
    # The "+" collapses a whole run of removed characters into one space. A single
    # .replace("  ", " ") pass left double spaces behind for runs of three or more.
    text = re.sub(r'[^a-zçğıöşü]+', ' ', text)
    return text.strip()

# Quick visual check: show the fully normalized sentence, then the same sentence with
# only the hatted characters replaced.
sample_sentence = "âîôû Çok iyi ve nazik biriydi. Prusya'daki ilk karşılaşmamızda onu konuşturmayı başarmıştım. Bana o yaz North Cape’de bulunduğunu ve Nijni Novgorod panayırına gitmeyi çok istediğini anlatmıştı.,;)([-*])"
for demo_fn in (normalize_text, replace_hatted_characters):
    print(demo_fn(sample_sentence))

In [None]:
import whisperx
import numpy as np
# import gc
import json
import time
import random
from pathlib import Path
from evaluate import load


# Metric object obtained from `evaluate.load("wer")`; used later to score predictions.
wer_metric = load("wer")
SAMPLE_RATE = 16000  # Sample rate in Hz
NUM_SAMPLES_TO_TRIM = 0  # 4 * SAMPLE_RATE  # Number of samples in 4 seconds
SYNT_LABELS = False
device = "cuda"
batch_size = 1  # reduce if low on GPU mem
compute_type = "int8"  # "int8" is the low-GPU-memory setting (may reduce accuracy); float32 alternative below
# compute_type = "float32"

# Options passed to whisperx.load_model(..., asr_options=asr_options) below.
# Commented-out keys are alternatives that are currently left unset.
asr_options = {
    # "temperatures": 0.0,
    # "beam_size": 5,
    "condition_on_previous_text": False,
    # "initial_prompt": ""
    "hotwords": None,
    "multilingual": False,
}

# vad_options = {"vad_onset": 0.500, "vad_offset": 0.363}
                                                                                                                                                                                                                #    cv 17 test   # tr tts       | dsn test
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-large-v3-turbo", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # 12.91   0.058   / 12    0.032 | 14.75  0.071 | 20
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-turkish-0", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)   # 20.5           / 19.3  0.066 | 36.34  0.137 | 33   0.08
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-turkish-1", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)   # 17.7    0.069  / 14.86 0.06  | 37.9   0.147 | 32.8 0.08 
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-turkish-1.1", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options) # 17      0.065  / 13.95 0.056 | 36  0.143
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-small-turkish-0", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # 13.6    0.054  / 8.48  0.048 | 22.6   0.094 | 28
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-small-turkish-1", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # 15.05   0.061  / 8.65  0.026 | 24.7  0.099  | 27.5  0.063
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-tiny-turkish-0", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)   # 25.8    0.096  / 20.5  0.065
# model = whisperx.load_model("N:/models/faster/Systran/faster-whisper-base", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)     # 33      0.130   / 44  0.157
# model = whisperx.load_model("N:/models/faster/Systran/faster-whisper-small", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)    # 21      0.084  | 29.25 0.295             | 27.6  0.106



# model = whisperx.load_model("N:/models/faster/ysdede/base-dsntt1-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/small-dsntt1-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-small-turkish-0-med-0", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-base-turkish-1.1-med", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)


# model = whisperx.load_model("Systran/faster-whisper-tiny", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)                      # 


# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-large-v3-turbo-med-tr-30k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Very good. First fine-tune: frozen, drop active, all parameters were present. It turned out this one gave bad results only because the CTranslate2 tokenizer and other JSON files had been generated incorrectly.
# model = whisperx.load_model(r"N:\models\faster\ysdede\whisper-large-v3-turbo-med-tr-2-40k", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Good; the encoder must be frozen. I switched to this one after the first run was left unfinished and I had run inference incorrectly. It has not fully learned abbreviations. The one above is better.
# model = whisperx.load_model("N:/models/faster/ysdede/whisper-khanacademy-large-v3-turbo-tr", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)  # Successful; it does not know some terms and abbreviations.


# model = whisperx.load_model(r"n:\models\faster\emre\whisper-medium-turkish-2", device, compute_type=compute_type, download_root="n:\\whisperx_models", language="tr", asr_options=asr_options)
# Model currently under evaluation; the commented-out calls above are alternatives.
model = whisperx.load_model(
    "deepdml/faster-whisper-large-v3-turbo-ct2",
    device,
    compute_type=compute_type,
    download_root="n:\\whisperx_models",
    language="tr",
    asr_options=asr_options,
)

In [4]:
def get_largest_files(dataset_dir, n=25, file_type="opus"):
    """Return the ``n`` largest files with extension ``file_type`` under ``dataset_dir``.

    The directory is searched recursively. The total number of matching files is
    printed first, followed by one "<path>: <size> KB" line per selected file.

    Args:
        dataset_dir: Root directory to search (str or Path).
        n: Maximum number of files to return.
        file_type: File extension without the leading dot.

    Returns:
        list[Path]: The selected files, largest first.
    """
    candidates = [*Path(dataset_dir).rglob(f'*.{file_type}')]
    print(len(candidates))

    # Descending by on-disk size; list.sort is stable, like sorted().
    candidates.sort(key=lambda path: path.stat().st_size, reverse=True)
    largest = candidates[:n]

    for path in largest:
        print(f"{str(path)}: {path.stat().st_size / 1024:.0f} KB")
    return largest

In [5]:
def get_random_files(dataset_dir, n=25, file_type="opus", seed=42):
    """Return a reproducible random sample of files under ``dataset_dir``.

    The directory is searched recursively for ``*.{file_type}``. One
    "<path>: <size> KB" line is printed per selected file.

    Args:
        dataset_dir: Root directory to search (str or Path).
        n: Maximum number of files to return.
        file_type: File extension without the leading dot.
        seed: Seed for the private random generator. The default (42) is the value
            that used to be hard-coded.

    Returns:
        list[Path]: Up to ``n`` files in shuffled order.
    """
    # rglob yields entries in filesystem order, which varies between machines and runs.
    # Sorting first makes the sample depend only on the seed and the set of files.
    candidates = sorted(Path(dataset_dir).rglob(f'*.{file_type}'))
    random.Random(seed).shuffle(candidates)
    sample = candidates[:n]

    for path in sample:
        print(f"{path}: {path.stat().st_size / 1024:.0f} KB")
    return sample

In [6]:
def transcript(audio, normalize=True):
    """Transcribe ``audio`` with the module-level ``model`` and time the call.

    Args:
        audio: Either a file path (``str`` or ``Path``), loaded with
            ``whisperx.load_audio``, or an already-loaded array whose first
            dimension is the number of samples.
        normalize: Currently unused; kept so existing callers keep working.
            Callers normalize the returned text themselves.

    Returns:
        tuple: ``(prediction, transcription_time, audio_length)``:
            the segment texts joined by single spaces and stripped;
            elapsed wall-clock seconds, rounded to 2 decimals, including file
            loading when a path was given;
            ``num_samples / SAMPLE_RATE`` in seconds, rounded to 3 decimals.
    """
    start = time.time()

    # get_largest_files / get_random_files return Path objects. Without this check a
    # Path skipped loading and failed on `.shape` below.
    if isinstance(audio, (str, Path)):
        audio = whisperx.load_audio(str(audio))

    num_samples = audio.shape[0]  # Get the number of samples from the shape
    audio_length = round(num_samples / SAMPLE_RATE, 3)

    result = model.transcribe(audio, batch_size=batch_size, print_progress=False, language="tr")
    transcription_time = round(time.time() - start, 2)

    # Use a separate loop name so `result` is not overwritten while iterating.
    prediction = " ".join(segment["text"] for segment in result["segments"])

    return prediction.strip(), transcription_time, audio_length


In [7]:
def levenshtein_distance(str1: str, str2: str) -> int:
    """
    Calculate the Levenshtein distance between two strings.

    Only two rows of the dynamic-programming table are kept at a time, so memory
    use is proportional to the shorter string rather than to the product of both
    lengths.

    Args:
        str1: First string
        str2: Second string

    Returns:
        int: The minimum number of single-character edits needed to change str1 into str2
    """
    # The distance is symmetric, so put the shorter string on the row axis.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # previous[j] = distance between the empty prefix of str1 and str2[:j].
    previous = list(range(len(str2) + 1))

    for i, ch1 in enumerate(str1, start=1):
        current = [i]  # distance between str1[:i] and the empty prefix of str2
        for j, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(
                    previous[j],       # deletion
                    current[j - 1],    # insertion
                    previous[j - 1],   # substitution
                ))
        previous = current

    return previous[-1]


In [8]:
def normalized_levenshtein_distance(str1: str, str2: str) -> float:
    """
    Calculate the normalized Levenshtein distance between two strings.
    Returns a value between 0.0 (identical) and 1.0 (completely different).

    Args:
        str1: First string
        str2: Second string

    Returns:
        float: Levenshtein distance divided by the length of the longer string,
        rounded to 3 decimals. Two empty strings yield 0.0.
    """
    # Normalize by the length of the longer string
    max_length = max(len(str1), len(str2))

    # max_length is 0 only when both strings are empty, i.e. identical. Returning here
    # avoids a division by zero and skips the distance computation.
    if max_length == 0:
        return 0.0

    distance = levenshtein_distance(str1, str2)
    return round(distance / max_length, 3)

In [9]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained sentence embedding model fine-tuned on semantic similarity tasks.
# Exactly one assignment is active; the commented-out lines are alternatives that were tried.
# st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # handles hatted chars
# st_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
# st_model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")  # good *
st_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")  # good, but hatted characters remain an issue (translated note; TODO confirm exact meaning)
# st_model = SentenceTransformer("oguuzhansahin/bi-encoder-mnrl-dbmdz-bert-base-turkish-cased-margin_3.0-msmarco-tr-10k")  # , device='cpu'

# st_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
# st_model = SentenceTransformer("sentence-transformers/LaBSE")

def sentence_similarity(sent1, sent2):
    """Return the cosine similarity of the two sentences' embeddings as a Python number.

    Each sentence is encoded separately with the module-level ``st_model``
    (``convert_to_tensor=True``), and ``util.cos_sim`` compares the two embeddings.
    """
    first_vec, second_vec = (
        st_model.encode(sentence, convert_to_tensor=True) for sentence in (sent1, sent2)
    )
    return util.cos_sim(first_vec, second_vec).item()

In [None]:
from datasets import Audio, load_dataset
from ytk.normalizer import fill_dates

SYNT_LABELS = False
n = 9999  # Maximum number of samples to evaluate
# from huggingface_hub import notebook_login, login

# login(token="TOKEN")
# Load the Huggingface dataset

ds = "ysdede/yeni-split-0"
# ds = "ysdede/yeni-split-lq-noisy"
# ds = "ysdede/ds_test"
# ds = "ysdede/rad-vits-1"
# ds = "erenfazlioglu/turkishvoicedataset"
# ds = "ysdede/commonvoice_17_tr_fixed"

split = "test"
dataset = load_dataset(ds, split=split, streaming=False, trust_remote_code=True)
# transcript() derives the duration from SAMPLE_RATE, so make sure every clip is decoded
# at that rate. The per-example sampling rate used to be read but ignored.
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
num_to_evaluate = min(n, len(dataset))  # denominator for the progress output

# Running totals and averages, updated after each successfully scored example.
total_wer = 0
avg_wer = 0
processed_files = 0
label_pairs = []
total_dist = 0
avg_dist = 0
total_sim = 0
avg_sim = 0  # was a duplicated `avg_dist = 0`, leaving avg_sim undefined before the loop

total_audio_duration = 0
total_transcription_time = 0
avg_speed = 0

for i, example in enumerate(dataset):
    if i >= n:
        break
    try:
        audio_array = example["audio"]["array"].astype(np.float32)
        reference = example["transcription"]

        # Placeholder dates in the reference are filled in by ytk's fill_dates.
        if "/aa/yyyy" in reference:
            reference = fill_dates(reference)

        prediction, transcription_time, audio_length = transcript(audio_array)

        if "No active speech found in audio" in prediction:
            continue  # Skip this example

        # Keep the raw texts for the log, the similarity score and the TSV.
        rf, pre = reference, prediction

        prediction = normalize_text(prediction)
        reference = normalize_text(reference)

        # Compute every metric before touching the totals, so an exception raised here
        # leaves the running totals consistent.
        wer = wer_metric.compute(references=[reference], predictions=[prediction])
        lev_dist = normalized_levenshtein_distance(reference, prediction)
        similarity = sentence_similarity(replace_hatted_characters(rf), replace_hatted_characters(pre))

        total_wer += wer
        processed_files += 1
        avg_wer = total_wer / processed_files
        total_dist += lev_dist
        avg_dist = total_dist / processed_files
        total_sim += similarity
        avg_sim = total_sim / processed_files

        total_audio_duration += audio_length
        total_transcription_time += transcription_time
        # transcription_time is rounded to 2 decimals and can be 0.0 for very short clips;
        # guard the divisions so they cannot raise after the totals were updated.
        speed = audio_length / transcription_time if transcription_time else float("inf")
        if total_transcription_time:
            avg_speed = round(total_audio_duration / total_transcription_time, 2)
        else:
            avg_speed = float("inf")

        label_pairs.append((f"{wer * 100:0.02f}", lev_dist, rf, pre))

        print(rf)
        print(pre)
        print(
            f"{processed_files}/{num_to_evaluate} Avg WER: {avg_wer * 100:0.02f}%, WER: {wer * 100:0.02f}%, "
            f"Avg Dist: {avg_dist:0.03f}, Distance: {lev_dist}, Sim: {similarity:0.03f}, AvgSim: {avg_sim:0.03f}, "
            f"Dur: {audio_length} sec, time: {transcription_time}, Speed: {speed:0.02f}x, AvgSpeed: {avg_speed}x"
        )

    except Exception as e:
        # Deliberately best-effort: report the failure and continue with the next example.
        print(f"Error processing example: {e}")


# One row per scored example: WER (%), normalized distance, raw reference, raw prediction.
with open("labels-base-new.tsv", "w", encoding="utf-8") as lf:
    for wer_pct, dist, r, p in label_pairs:
        # Tabs or line breaks inside the texts would split a row; replace them with spaces.
        r, p = (re.sub(r"[\t\r\n]+", " ", field) for field in (r, p))
        lf.write(f"{wer_pct}\t{dist}\t{r}\t{p}\n")
