# Main Metrics

Set the language and whether to normalize the transcripts/ground truth.

In [148]:
# Language of the survey/transcriptions to analyse: "ml", "en" or "ar".
LANG = "en"
# When True, text is passed through a normaliser before the metrics are computed.
NORM = False

## Load Data

In [149]:
import pandas as pd

# Survey CSV for each supported language code. A lookup table (instead of an
# if/elif chain with no else) makes an unsupported LANG fail right here with a
# clear message, rather than with a NameError on `file_path` or, worse, by
# silently reusing a stale `file_path` left in the kernel by an earlier run.
SURVEY_FILES = {
    "ml": "survey/survey_malayalam.csv",
    "en": "survey/survey_english.csv",
    "ar": "survey/survey_arabic.csv",
}
if LANG not in SURVEY_FILES:
    raise ValueError(
        f"Unsupported LANG {LANG!r}; expected one of {sorted(SURVEY_FILES)}"
    )
file_path = SURVEY_FILES[LANG]
df = pd.read_csv(file_path)

In [150]:
# The ground-truth file is pipe-separated with no header row; column 1 is the
# one kept as `ground` and later indexed by question number.
ground_path = f"transcriptions/{LANG}/ground.txt"
ground = (
    pd.read_csv(ground_path, header=None, sep="|")
    .loc[:, 1]
)

## Collate and Modify

The cell below sets up the phonemizer used to calculate the Phoneme Error Rate (PER).

In [151]:
from phonemizer import phonemize
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend
from difflib import ndiff
import re


# '-' is placed between phones and ';' between words in the phonemizer output.
separator = Separator(word=';', phone='-')
# "en" is mapped to the "en-us" language code; any other LANG is passed as-is.
backend = EspeakBackend("en-us" if LANG == "en" else LANG)

def phonemizer(text):
    """Return the phonemes of `text` as a flat list of strings.

    The text is phonemized with the module-level `backend` and `separator`,
    then split on the '-' (phone) and ';' (word) separator characters.
    Empty fragments produced by the split are discarded, so word boundaries
    are not preserved in the result.
    """
    phonemized = backend.phonemize([text], separator=separator)[0]
    return list(filter(None, re.split(r'[-;]', phonemized)))

def per(str1, str2):
    """Phoneme Error Rate of `str2` (hypothesis) against `str1` (reference).

    Both strings are converted to phoneme lists with `phonemizer` and compared
    with `difflib.ndiff`. Every run of differing lines between two matching
    phonemes costs max(insertions, deletions), so a paired deletion and
    insertion counts as a single substitution. The total cost is divided by
    the number of reference phonemes.

    Raises ZeroDivisionError when `str1` yields no phonemes.
    """
    reference = phonemizer(str1)
    hypothesis = phonemizer(str2)

    edits = 0
    inserted = deleted = 0
    for diff_line in ndiff(reference, hypothesis):
        tag = diff_line[0]
        if tag == " ":
            # A matching phoneme closes the current run of edits.
            edits += max(inserted, deleted)
            inserted = deleted = 0
        elif tag == "+":
            inserted += 1
        elif tag == "-":
            deleted += 1
        # '?' lines are ndiff's intraline hints and carry no edit of their own.

    # Account for a run of edits that reaches the end of the sequences.
    edits += max(inserted, deleted)
    return edits / len(reference)

Calculates all metrics and collates them into a single nested list (`data`), with one entry per question and, within it, one record per transcript.

In [152]:
from jiwer import wer, cer

if NORM:
    from whisper.normalizers import EnglishTextNormalizer
    # NOTE(review): the English normaliser is used whatever LANG is set to -
    # confirm that this is intended for "ml" and "ar".
    norm = EnglishTextNormalizer()

# data[i][j] holds the transcript, evaluator scores and error rates of
# transcript j (of 4) for question i (of 50).
data = []
for i in range(50):
    # Normalise the reference with the same normaliser as the hypotheses.
    # Comparing a normalised transcript against a raw reference would count
    # casing/punctuation differences as errors and inflate every metric.
    reference = norm(ground[i]) if NORM else ground[i]
    question = []
    for j in range(4):
        q = df[f"Q{i+1}_{j+1}"]
        # Row 0 of each survey column is the transcript that was rated; the
        # remaining rows are the scores given to it.
        trans, scores = q[0], q[1:]
        if NORM:
            trans = norm(trans)
        subdata = {
            'transcript': trans,
            'scores': [float(s) for s in scores],
            'wer': wer(reference, trans),
            'cer': cer(reference, trans),
            'per': per(reference, trans)
        }
        question.append(subdata)
    data.append(question)

## Find Correlations

The correlation between the raw evaluator scores and the metrics is calculated below.

In [153]:
import numpy as np
from scipy.stats import rankdata

# scores has shape (questions, transcripts per question, evaluators).
scores = np.array([[list(x['scores']) for x in q] for q in data])
# Rank the transcripts of each question (axis 1) separately for every
# evaluator; tied scores share the average rank.
scores_rank = rankdata(scores, method='average', axis=1)

metrics = {}
metrics_rank = {}
metrics_flat = {}
for met in ['wer', 'cer', 'per']:
    # One metric value per (question, transcript).
    per_transcript = np.array([[x[met] for x in q] for q in data])
    # Repeat the metric along a new trailing evaluator axis so that it lines up
    # element-wise with `scores`. The shape is taken from the data itself
    # instead of a hard-coded (50, 4), so the cell keeps working if the number
    # of questions or transcripts changes.
    metrics[met] = np.tile(per_transcript[:, :, np.newaxis], (1, 1, scores.shape[-1]))
    metrics_rank[met] = rankdata(metrics[met], method='average', axis=1)
    metrics_flat[met] = metrics[met].flatten()

# flatten scores
scores_flat = scores.flatten()

# Error rates fall as scores rise, so the Pearson coefficient is negated (and
# scaled to a percentage) to report the strength of the relationship as a
# positive number.
print("Score Correlation")
for met in ['wer', 'cer', 'per']:
    print(f"{met}: {round(np.corrcoef(metrics_flat[met], scores_flat)[0, 1]*-100, 2)}")


Score Correlation
wer: 52.99
cer: 54.69
per: 52.63


The rank correlation between the rankings of the metrics and the evaluator scores is calculated below.

In [154]:
from scipy.stats import spearmanr, ttest_rel
from itertools import combinations

print("\nSpearman's Rank Correlation")
# spearmanr_metric[met] collects one coefficient per (question, evaluator):
# the evaluator's ranking of that question's transcripts against the ranking
# induced by the metric.
spearmanr_metric = {}
for met in ['wer', 'cer', 'per']:
    coefficients = []
    for question_scores, question_metric in zip(scores_rank, metrics_rank[met]):
        # Transposing yields one row of transcript ranks per evaluator.
        for evaluator_ranks, metric_ranks in zip(question_scores.T, question_metric.T):
            rho = spearmanr(evaluator_ranks, metric_ranks)[0]
            # spearmanr gives nan when it cannot compute a coefficient;
            # those entries are counted as zero correlation.
            coefficients.append(0 if np.isnan(rho) else rho)
    spearmanr_metric[met] = coefficients

    # Negated and scaled to a percentage for display only; the stored
    # coefficients keep their original sign.
    print(f"{met}: {round(np.mean(spearmanr_metric[met]) * -100, 2)}")


Spearman's Rank Correlation


wer: 68.51
cer: 73.47
per: 43.12


## Additional Calculations

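Runs a paired, one-sided t-test (`alternative='greater'`) on the per-question, per-evaluator Spearman coefficients of each pair of metrics, to determine whether the difference in correlation between the two metrics is statistically significant.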

In [155]:
print("\nT-test using Spearman's Rank Correlation")
# spearmanr_metric stores the raw (un-negated) coefficients, so with
# alternative='greater' a small p-value means the FIRST metric's coefficients
# are larger, i.e. less negative, than the second's.
for met_comb in combinations(['wer', 'cer', 'per'], 2):
    rv1, rv2 = (spearmanr_metric[name] for name in met_comb)
    ttest_res = ttest_rel(rv1, rv2, alternative='greater')
    print(f"{met_comb[0]} vs {met_comb[1]}: {ttest_res.pvalue}")


T-test using Spearman's Rank Correlation
wer vs cer: 1.1073872458614983e-12
wer vs per: 1.0
cer vs per: 1.0


Kendall's W is an inter-rater reliability metric, calculated here across the 20 evaluators.

In [156]:
def kendall_w(expt_ratings):
    """Mean tie-corrected Kendall's W over all questions.

    `expt_ratings` is indexed as (question, item, rater) and is expected to
    hold ranks (it is called with `scores_rank`). W is computed separately for
    every question and the mean over questions is returned.
    """
    n_raters = expt_ratings.shape[2]
    n_items = expt_ratings.shape[1]

    def tie_term(question):
        # Sum of (t**3 - t) over every group of tied ranks, over all raters.
        total = 0
        for rater_ranks in question.T:
            group_sizes = np.unique(rater_ranks, axis=0, return_counts=True)[1]
            total += (group_sizes**3 - group_sizes).sum()
        return total

    tie_correction = np.array([tie_term(question) for question in expt_ratings])

    # Rank sum of each item over the raters, laid out as (item, question).
    rank_sums = np.sum(expt_ratings.T, axis=0)
    numerator = (
        12 * np.sum(rank_sums**2, axis=0)
        - 3 * n_raters**2 * n_items * (n_items + 1)**2
    )
    denominator = n_raters**2 * (n_items**3 - n_items) - n_raters * tie_correction
    return (numerator / denominator).mean()

print("Kendall's W")
# Agreement between the evaluators' rankings, averaged over the questions;
# left as the cell's last expression so the notebook displays it.
round(kendall_w(expt_ratings=scores_rank), 4)

Kendall's W


0.6211