In [None]:
import os, sys, glob
import git

# Walk up from the current directory to the enclosing git repository and make
# its working tree the process CWD, so the relative `data/...` paths used
# below resolve no matter where the notebook was launched from.
repo = git.Repo('.', search_parent_directories=True)
root = repo.working_tree_dir
os.chdir(root)
print(f"Changed working directory to {root}")

import pandas as pd

In [18]:
# Collect the per-shard CSV files produced by the ASR and VSR runs.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, and
# read_transcriptions() lets later files overwrite earlier ones on duplicate
# keys, so sort the paths to make the result reproducible across machines.
asr_outputs = sorted(glob.glob('data/asr_outputs/*.csv'))
vsr_outputs = sorted(glob.glob('data/vsr_outputs/*.csv'))
print(f"{len(asr_outputs)=} {len(vsr_outputs)=}")

def _utterance_key(path):
    """Reduce `<prefix path>/id08701/z8t-KFSoYLI/00478.<ext>` to `id08701/z8t-KFSoYLI/00478`."""
    stem, _ext = os.path.splitext(path)
    return '/'.join(stem.split('/')[-3:])


def read_transcriptions(paths):
    """Merge transcription CSVs into a single `{utterance key: text}` dict.

    Each CSV must have at least two columns: the first holds the media file
    path and the second the transcribed text (column names are not checked).
    If a `language` column is present, only rows whose language is `'en'`
    are kept.

    Args:
        paths: Iterable of CSV file paths, processed in the given order.

    Returns:
        dict mapping the last three path components of each file, without
        extension (e.g. `id08701/z8t-KFSoYLI/00478`), to the transcription
        with surrounding whitespace stripped. When a key occurs more than
        once, the last occurrence wins.

    Raises:
        ValueError: if a CSV has fewer than two columns.
    """
    transcriptions = {}
    for path in paths:
        # Read everything as text: with type inference, a column of purely
        # numeric transcriptions (e.g. "1984") would be parsed as int/float
        # and could not be stripped as a string.
        df = pd.read_csv(path, dtype=str)

        # path_col is the first column, text_col the second
        path_col, text_col = df.columns[:2]

        # Skip rows without a usable path or transcription. The literal
        # 'None' check covers pandas versions that do not already parse
        # that string as a missing value.
        keep = (
            df[path_col].notna()
            & df[text_col].notna()
            & (df[text_col] != 'None')
        )
        if 'language' in df.columns:
            # A missing language compares unequal to 'en' and is dropped too.
            keep &= df['language'] == 'en'

        kept = df.loc[keep]
        keys = kept[path_col].map(_utterance_key)
        texts = kept[text_col].str.strip()

        # Row order is preserved, so later rows and later files override
        # earlier ones on duplicate keys.
        transcriptions.update(zip(keys, texts))
    return transcriptions

asr = read_transcriptions(asr_outputs)
vsr = read_transcriptions(vsr_outputs)

# Keep only utterances transcribed by both systems, in VSR order. The ASR
# text is used as the reference and the VSR text as the hypothesis.
ref, hyp = [], []
for k, vsr_text in vsr.items():
    asr_text = asr.get(k)
    if asr_text is None:
        continue
    ref.append(asr_text)
    hyp.append(vsr_text)
print(f"Loaded {len(ref)} reference and hypothesis pairs from ASR and VSR outputs.")

len(asr_outputs)=64 len(vsr_outputs)=497
Loaded 31341 reference and hypothesis pairs from ASR and VSR outputs.


In [22]:
import random
import copy
import jiwer

# Independently shuffled copies of the paired lists.
# NOTE(review): these are not passed to jiwer below -- they look like a
# leftover from a shuffled-pairing baseline. They are kept in case a later
# cell uses them, but are now drawn from a seeded, private RNG so any such
# use is reproducible and does not disturb the global `random` state.
_rng = random.Random(0)
_ref = copy.deepcopy(ref)
_hyp = copy.deepcopy(hyp)
_rng.shuffle(_ref)
_rng.shuffle(_hyp)

# Corpus-level word error rate of the VSR hypotheses against the ASR
# references, applying jiwer's standard WER transform to both sides.
out = jiwer.process_words(
    reference=ref,
    hypothesis=hyp,
    reference_transform=jiwer.wer_standardize,
    hypothesis_transform=jiwer.wer_standardize,
)

print(f"WER: {out.wer:.4f}")

WER: 0.3899
