In [63]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import jiwer

In [74]:
# Short codes for the data categories covered by the benchmark. get_labels
# reads one `<code>_<dataset>.csv` label file per entry, and
# normalize_transcription selects its normalization rule from the code.
# NOTE(review): "bcd" appears next to ".../bank-cards/..." paths in the
# inference output; the meaning of the other codes is presumed from their
# abbreviations -- confirm against the dataset documentation.
TYPES = ["bcd", "car", "dat", "id", "mon", "phn", "tim"]

def normalize_transcription(text="", type=""):
    """Normalize a transcript so labels and model output can be compared.

    Two rules are applied, selected by the data-type code:

    * ``car``, ``id``, ``bcd``, ``phn``: every run of non-alphanumeric
      characters (including underscores) is removed and the result is
      lower-cased, e.g. ``"29-74, 51-98"`` -> ``"29745198"``.
    * ``dat``, ``mon``, ``tim``: surrounding whitespace is stripped and the
      text is lower-cased.

    Non-string values (for example the float NaN that pandas produces for an
    empty CSV cell) are converted with ``str()`` before either rule runs.
    Previously only the second rule did this, so a missing value in a
    ``car``/``id``/``bcd``/``phn`` row raised ``TypeError`` inside ``re.sub``.

    Parameters:
        text: The raw transcript.
        type (str): Data-type code selecting the normalization rule.

    Returns:
        The normalized string, or ``text`` unchanged when ``type`` is not one
        of the known codes.
    """
    strip_symbols = type in ["car", "id", "bcd", "phn"]
    keep_symbols = type in ["dat", "mon", "tim"]

    # Unknown codes are passed through untouched, exactly as before.
    if not (strip_symbols or keep_symbols):
        return text

    # Same coercion for both rules: NaN becomes the literal text "nan".
    if not isinstance(text, str):
        text = str(text)

    if strip_symbols:
        return re.sub(r'[\W_]+', '', text).lower()
    return text.strip().lower()


def get_type_and_audio(path = ""):
    """Split a '/'-separated audio path into its type folder and file stem.

    The type is the third '/'-separated component (index 2) and the audio id
    is the final component truncated at its first '.'.

    Parameters:
        path (str): Path such as ``/train/bank-cards/voices/<audio>.wav``.

    Returns:
        pd.Series: Two values, ``[type, audio]``.
    """
    segments = path.split("/")
    stem, _, _ = segments[-1].partition(".")
    return pd.Series([segments[2], stem])

def get_labels(path = "", dataset = 'full', types = "all", delimiter = " ~ "):
    """Load the label CSVs for the requested data types and normalize them.

    For every type code a file named ``<type>_<dataset>.csv`` is read from
    ``path`` (all columns as strings) and a ``label`` column is added holding
    the normalized ``transcript``.

    Parameters:
        path (str or Path): Directory containing the label CSV files.
        dataset (str): Dataset name used in the file name; ``'full'`` maps to
            the file suffix ``'dataset'``.
        types (str or list of str): ``"all"`` for every code in ``TYPES``, a
            list of codes, or a single code. A single code given as a plain
            string is wrapped in a list; previously it was iterated character
            by character.
        delimiter (str): Not used by this function; kept so existing calls
            that pass it keep working.

    Returns:
        pd.DataFrame: The per-type frames concatenated row-wise, each with
        its original row index.

    Raises:
        ValueError: From ``pd.concat`` when ``types`` is empty.
    """
    if isinstance(types, str):
        types = TYPES.copy() if types == "all" else [types]

    if dataset == 'full':
        dataset = 'dataset'

    base_dir = Path(path)
    frames = []
    for type_code in types:
        frame = pd.read_csv(base_dir / f'{type_code}_{dataset}.csv', dtype='str')
        frame['label'] = frame['transcript'].apply(normalize_transcription, type=type_code)
        frames.append(frame)
    return pd.concat(frames, axis = 0)

def get_inferences(file = "", types = "all"):
    """Read a model's inference CSV and add a normalized transcription column.

    Every column is read as a string. Each row's ``transcription`` is
    normalized according to that row's ``data_type`` and stored in a new
    ``normalized_inference`` column.

    Parameters:
        file (str or Path): Inference CSV with at least the columns
            ``transcription`` and ``data_type``.
        types: Accepted but not used here; no filtering by type is performed.

    Returns:
        pd.DataFrame: The CSV contents plus ``normalized_inference``.
    """
    def _normalize_row(row):
        return normalize_transcription(text = row['transcription'],
                                       type = row['data_type'])

    frame = pd.read_csv(file, dtype='str')
    frame["normalized_inference"] = frame.apply(_normalize_row, axis = 1)
    return frame

def exact_match(y_true, y_pred):
    """Return the fraction of positions where reference and prediction are equal.

    Items are compared pairwise in order, so plain lists work as well as
    pandas Series (the previous ``(y_true == y_pred).sum()`` failed on lists
    because comparing two lists yields a single bool).

    Parameters:
        y_true (pd.Series or list): Reference values.
        y_pred (pd.Series or list): Predicted values, same length as ``y_true``.

    Returns:
        float: Matches divided by the number of items; ``0.0`` for empty
        input, mirroring how ``cer`` treats an empty reference instead of
        dividing zero by zero.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred), "y_true and y_pred must have the same length"

    total = len(y_true)
    if total == 0:
        return 0.0

    matches = sum(1 for reference, prediction in zip(y_true, y_pred)
                  if reference == prediction)
    return float(matches / total)

def levenshtein_distance(s1, s2):
    """Compute the edit distance between two sequences.

    Counts the minimum number of single-item insertions, deletions and
    substitutions needed to turn ``s1`` into ``s2``. Only the previous row of
    the dynamic-programming table is kept while the next one is built.

    Parameters:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        int: The edit distance.
    """
    # Row 0: turning an empty prefix of s1 into each prefix of s2.
    previous_row = list(range(len(s2) + 1))

    for row_idx, left in enumerate(s1, start=1):
        # Column 0: deleting the first row_idx items of s1.
        current_row = [row_idx]
        for col_idx, right in enumerate(s2, start=1):
            substitution_cost = 0 if left == right else 1
            current_row.append(min(
                previous_row[col_idx] + 1,                     # deletion
                current_row[col_idx - 1] + 1,                  # insertion
                previous_row[col_idx - 1] + substitution_cost  # substitution
            ))
        previous_row = current_row

    return previous_row[-1]


def cer(y_true, y_pred):
    """Compute the character error rate over paired references and hypotheses.

    The edit distances of all pairs are summed and divided by the summed
    length of the references.

    Parameters:
        y_true: Iterable of reference strings.
        y_pred: Iterable of hypothesis strings, paired with ``y_true`` by position.

    Returns:
        Total edit distance divided by total reference length, or ``0`` when
        the references contain no characters.
    """
    edits = 0
    reference_chars = 0

    for reference, hypothesis in zip(y_true, y_pred):
        edits += levenshtein_distance(reference, hypothesis)
        reference_chars += len(reference)

    # Nothing to divide by when there are no reference characters.
    if reference_chars == 0:
        return 0
    return edits / reference_chars

def wer(y_true, y_pred):
    """
    Calculate corpus-level WER by joining all sentences and computing WER on the concatenated texts.

    All references are joined with single spaces into one string, likewise
    all predictions, and the two strings are scored as a single pair by
    ``jiwer.wer``. No custom transformation is passed to jiwer; the
    previously constructed but unused ``jiwer.Compose`` object was removed
    because it had no effect on the result.

    Parameters:
        y_true (pd.Series or list of str): Series (or list) of reference transcripts.
        y_pred (pd.Series or list of str): Series (or list) of predicted transcripts.

    Returns:
        float: Corpus-level Word Error Rate.
    """
    # Convert lists to pandas Series if necessary
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(y_true)
    if not isinstance(y_pred, pd.Series):
        y_pred = pd.Series(y_pred)

    # Join the entire corpus into single strings
    ref_corpus = " ".join(y_true.tolist())
    hyp_corpus = " ".join(y_pred.tolist())

    # Compute the corpus-level WER
    return jiwer.wer(ref_corpus, hyp_corpus)


def model_benchmark(data_path, model, dataset = "full", types = "all"):
    """Score one model's inferences per data type and over all requested types.

    Inferences are read from ``<data_path>/inferences/<model>.csv``. Labels
    are read from ``<data_path>/dataset`` for the full dataset, or from
    ``<data_path>/dataset/<dataset>`` otherwise. For each requested type, and
    once more for all requested types together, the inferences are
    inner-joined to the labels on ``audio`` and EM, CER and WER are computed.
    Rows without a match on ``audio`` are dropped by the join and therefore
    not counted.

    Parameters:
        data_path (str or Path): Root directory of the data.
        model (str): Model name; selects the inference CSV.
        dataset (str): ``"full"`` or the name of a dataset sub-directory.
        types (str or list of str): ``"all"`` for every code in ``TYPES``, a
            list of codes, or a single code.

    Returns:
        list of dict: One record per requested type followed by an aggregate
        record, each with keys ``data-type``, ``count``, ``EM``, ``CER`` and
        ``WER``. The aggregate record is always labelled ``"all"``;
        previously it was labelled with the type code when only one type was
        requested.
    """
    data_root = Path(data_path)
    model_path = data_root / "inferences" / f'{model}.csv'
    label_path = data_root / 'dataset'
    if dataset != "full":
        label_path = label_path / dataset

    inference = get_inferences(model_path, types = types)

    # A bare code such as "bcd" would otherwise be iterated character by character.
    if isinstance(types, str):
        types = TYPES if types == "all" else [types]
    types = list(types)

    # Pair every run with its label explicitly rather than inferring "all"
    # from the number of types in the run.
    runs = [(type_code, [type_code]) for type_code in types] + [("all", types)]

    result = []
    for data_type, type_item in runs:
        labels = get_labels(label_path, types=type_item, dataset=dataset)
        df = inference.merge(labels, on = 'audio')

        result.append({
            "data-type": data_type,
            "count": df.shape[0],
            "EM": exact_match(df['label'], df['normalized_inference']),
            "CER": cer(df['label'], df['normalized_inference']),
            "WER": wer(df['label'], df['normalized_inference'])
        })
    return result

def print_asr_benchmark(data, model):
    """
    Print an ASR benchmark as a fixed-width text table.

    A title line (upper-cased model name followed by "BENCHMARK"), a column
    header and a dashed separator are printed, followed by one line per
    record with the metrics formatted to four decimals. The record whose
    data type is 'all' is wrapped in ANSI bold-yellow escape codes.

    Parameters:
    - data: list of dictionaries with keys 'data-type', 'count', 'EM', 'CER', and 'WER'
    - model: model name shown in the title line
    """
    # ANSI escape sequences used to highlight the aggregate row
    highlight_on = "\033[1;33m"
    highlight_off = "\033[0m"
    row_template = "{:<12} {:<7} {:.4f}   {:.4f}   {:.4f}"

    column_titles = "{:<12} {:<7} {:<8} {:<8} {:<8}".format("Data Type", "Count", "EM", "CER", "WER")

    print(f"{model} benchmark\n".upper())
    print(column_titles)
    print("-" * len(column_titles))

    for entry in data:
        line = row_template.format(
            entry["data-type"],
            entry["count"],
            entry["EM"],
            entry["CER"],
            entry["WER"]
        )
        is_aggregate = entry["data-type"] == "all"
        print(highlight_on + line + highlight_off if is_aggregate else line)


In [75]:
# Ad-hoc inspection: join one model's inferences to the 'bcd' labels and
# display the merged frame.
data_path = '../Data'
model = 'whisper-large-v3'
# NOTE(review): `dataset` is assigned here but never passed to get_labels
# below, so get_labels runs with its default dataset ('full').
dataset = 'train'
types = ['bcd']
model_path = Path(data_path) / "inferences" / f'{model}.csv'
label_path = Path(data_path) / 'dataset' 

# get_inferences accepts `types` but does not filter on it; rows are limited
# to 'bcd' only because the merge keeps rows whose 'audio' has a loaded label.
inference = get_inferences(model_path, types = types)
labels = get_labels(label_path, types=types)
df = inference.merge(labels, on = 'audio') 
df


Unnamed: 0,audio,audio_filepath_x,data_type,transcription,normalized_inference,audio_filepath_y,speaker,transcript,label
0,575298922_459913_459922_25170,/train/bank-cards/voices/575298922_459913_4599...,bcd,"29-74, 51-98, 17-81, 61-72",2974519817816172,../Data/audio/bcd/voices/575298922_459913_4599...,575298922,2974519817816172,2974519817816172
1,1229817727_454607_454617_25403,/train/bank-cards/voices/1229817727_454607_454...,bcd,17-88-23-12-41-59-89-61,1788231241598961,../Data/audio/bcd/voices/1229817727_454607_454...,1229817727,1788231241598961,1788231241598961
2,828296631_470091_470123_25362,/train/bank-cards/voices/828296631_470091_4701...,bcd,12-31-99-93-33-99-37-82,1231999333993782,../Data/audio/bcd/voices/828296631_470091_4701...,828296631,1231999333993782,1231999333993782
3,314849420_422111_422114_25221,/train/bank-cards/voices/314849420_422111_4221...,bcd,22-48-7369-22-684-447,2248736922684447,../Data/audio/bcd/voices/314849420_422111_4221...,314849420,2248736922684447,2248736922684447
4,341881303_439547_439556_25304,/train/bank-cards/voices/341881303_439547_4395...,bcd,"2, 6, 4, 7, 3, 9, 4, 6, 2, 8, 4, 9, 8, 1, 5, 8",2647394628498158,../Data/audio/bcd/voices/341881303_439547_4395...,341881303,2647394628498158,2647394628498158
...,...,...,...,...,...,...,...,...,...
750,314849420_476149_476188_25173,/train/bank-cards/voices/314849420_476149_4761...,bcd,"58, 24, 16, 13, 71, 51, 76, 63",5824161371517663,../Data/audio/bcd/voices/314849420_476149_4761...,314849420,5824161371517663,5824161371517663
751,1347919237_445797_445869_25324,/train/bank-cards/voices/1347919237_445797_445...,bcd,"14, 82, 69, 87, 28, 18, 15, 15",1482698728181515,../Data/audio/bcd/voices/1347919237_445797_445...,1347919237,1482698728181515,1482698728181515
752,1297961140_475537_475548_25883,/train/bank-cards/voices/1297961140_475537_475...,bcd,"1, 4, 9, 4, 1, 1, 3, 4, 7, 5, 8, 7, 8, 6, 3, 9",1494113475878639,../Data/audio/bcd/voices/1297961140_475537_475...,1297961140,1494113475878639,1494113475878639
753,619589413_437632_437637_25483,/train/bank-cards/voices/619589413_437632_4376...,bcd,24-65-95-94-64-29-14-82,2465959464291482,../Data/audio/bcd/voices/619589413_437632_4376...,619589413,2465959464291482,2465959464291482


In [76]:
# Benchmark whisper-large-v3 on the 'test' dataset for every type in TYPES.
types = TYPES
model = "whisper-large-v3"
result = model_benchmark("../Data", model, dataset='test', types = types)
print_asr_benchmark(result, model = model)

WHISPER-LARGE-V3 BENCHMARK

Data Type    Count   EM       CER      WER     
-----------------------------------------------
bcd          111     0.8559   0.0152   0.1441
car          122     0.5328   0.1756   0.4672
dat          655     0.1435   0.4835   0.7763
id           391     0.4041   0.3073   0.5959
mon          503     0.1193   0.3262   0.5317
phn          125     0.8480   0.0504   0.1520
tim          27      0.0000   0.3780   0.6778
[1;33mall          1934    0.2989   0.3589   0.6218[0m


In [78]:
# Benchmark whisper-turbo on the 'test' dataset for every type in TYPES.
types = TYPES
model = "whisper-turbo"
result = model_benchmark("../Data", model, dataset='test',types = types)
print_asr_benchmark(result, model = model)

WHISPER-TURBO BENCHMARK

Data Type    Count   EM       CER      WER     
-----------------------------------------------
bcd          111     0.6847   1.0890   0.3153
car          122     0.2869   1.8829   0.7131
dat          655     0.0107   1.0091   1.1483
id           391     0.2890   1.2889   0.7110
mon          503     0.1352   0.7298   0.6943
phn          125     0.5120   1.4768   0.4880
tim          27      0.0370   0.9319   1.1556
[1;33mall          1934    0.1882   1.0058   0.8795[0m


In [68]:
# Benchmark whisper-large-v3 on the full dataset for the 'phn' and 'bcd' types.
# NOTE(review): this cell's execution count (68) is lower than that of the
# definitions cell (74); re-run it to confirm the output below reflects the
# current function definitions.
types = ['phn', 'bcd']
model = "whisper-large-v3"
result = model_benchmark("../Data", model, types = types)
print_asr_benchmark(result, model = model)

WHISPER-LARGE-V3 BENCHMARK

Data Type    Count   EM       CER      WER     
-----------------------------------------------
phn          789     0.8365   0.0736   0.1635
bcd          755     0.8596   0.0864   0.1404
[1;33mall          1544    0.8478   0.0814   0.1522[0m


In [69]:
# Benchmark whisper-large-v3 on the full dataset for the 'id' and 'car' types.
# NOTE(review): this cell's execution count (69) is lower than that of the
# definitions cell (74); re-run it to confirm the output below reflects the
# current function definitions.
types = ['id', 'car']
model = "whisper-large-v3"
result = model_benchmark("../Data", model, types = types)
print_asr_benchmark(result, model = model)

WHISPER-LARGE-V3 BENCHMARK

Data Type    Count   EM       CER      WER     
-----------------------------------------------
id           2343    0.4106   0.2786   0.5894
car          693     0.4935   0.2482   0.5065
[1;33mall          3036    0.4295   0.2730   0.5705[0m


In [70]:
# Benchmark whisper-large-v3 on the full dataset for the 'dat', 'tim' and 'mon' types.
# NOTE(review): this cell's execution count (70) is lower than that of the
# definitions cell (74); re-run it to confirm the output below reflects the
# current function definitions.
types = ['dat', 'tim', 'mon']
model = "whisper-large-v3"
result = model_benchmark("../Data", model, types = types)
print_asr_benchmark(result, model = model)

WHISPER-LARGE-V3 BENCHMARK

Data Type    Count   EM       CER      WER     
-----------------------------------------------
dat          4079    0.1701   0.4179   0.6910
tim          169     0.0296   0.3203   0.6003
mon          2650    0.0962   0.3685   0.5673
[1;33mall          6898    0.1383   0.3978   0.6332[0m
