In [1]:
import jiwer
from math import floor, ceil
import nltk
import numpy as np
import string
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.manifold as skmf
import re


In [2]:
# Names of the ten transcript files, numbered 1-10 and zero-padded to three
# digits ('transcript 001.txt' ... 'transcript 010.txt'). Built with a
# comprehension so no loop variable is left behind in the notebook namespace.
file_name = [f'transcript {index:03}.txt' for index in range(1, 11)]

In [3]:
def _load_file_as_string(filepath) -> str:
    """Read the file at ``filepath`` and return its full contents as one string.

    The file is opened in text mode and decoded as UTF-8, so this suits any
    UTF-8 text format (the notebook uses it for TXT; JSON works the same way).
    """
    with open(filepath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [4]:
 #multi_transcript=_load_file_as_string(file_name[0]).split('\n')

In [5]:
# multi_transcript

In [6]:
file_name

['transcript 001.txt',
 'transcript 002.txt',
 'transcript 003.txt',
 'transcript 004.txt',
 'transcript 005.txt',
 'transcript 006.txt',
 'transcript 007.txt',
 'transcript 008.txt',
 'transcript 009.txt',
 'transcript 010.txt']

In [7]:
def error_func(s1, s2):
    """Compute the four jiwer error measures for a pair of transcripts.

    ``s1`` is forwarded as the first argument and ``s2`` as the second
    argument of every jiwer call.

    Returns:
        A 4-tuple ``(wer, wil, mer, cer)``: word error rate, word information
        loss, match error rate and character error rate, in that order.
    """
    word_error = jiwer.wer(s1, s2)    # word error rate
    info_loss = jiwer.wil(s1, s2)     # word information loss
    match_error = jiwer.mer(s1, s2)   # match error rate
    char_error = jiwer.cer(s1, s2)    # character error rate
    return (word_error, info_loss, match_error, char_error)

In [8]:
# Function to calculate the
# Jaro Similarity of two strings
def jaro_distance(s1, s2):
    """Return the Jaro similarity of ``s1`` and ``s2`` as a float in [0, 1].

    Despite the name, the value is a similarity: 1.0 is returned for equal
    inputs and 0.0 when no characters match.

    Two characters count as a match when they are equal and their positions
    differ by at most ``max(len(s1), len(s2)) // 2 - 1``. Each character of
    ``s2`` can be matched at most once.
    """
    # Identical inputs (including two empty strings) are a perfect match.
    if s1 == s2:
        return 1.0

    n1, n2 = len(s1), len(s2)

    # Furthest apart two equal characters may sit and still count as matching.
    window = max(n1, n2) // 2 - 1

    # Flags recording which positions of each string took part in a match.
    used1 = [False] * n1
    used2 = [False] * n2
    matches = 0

    for idx1, ch in enumerate(s1):
        start = max(0, idx1 - window)
        stop = min(n2, idx1 + window + 1)
        for idx2 in range(start, stop):
            # Take the first still-unused equal character inside the window.
            if not used2[idx2] and s2[idx2] == ch:
                used1[idx1] = True
                used2[idx2] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Line up the matched characters of both strings in their original order;
    # every position where the two sequences disagree is half a transposition.
    matched_s1 = [ch for ch, flag in zip(s1, used1) if flag]
    matched_s2 = [ch for ch, flag in zip(s2, used2) if flag]
    out_of_order = sum(1 for a, b in zip(matched_s1, matched_s2) if a != b)
    transpositions = out_of_order // 2

    return (matches / n1 + matches / n2 +
            (matches - transpositions) / matches) / 3.0
 
# Driver code: sanity-check jaro_distance on a known pair of strings
s1, s2 = "CRATE", "TRACE"

# Print the Jaro similarity of the two strings, rounded to six decimals
print(round(jaro_distance(s1, s2), 6))
 

0.733333


In [9]:
def leven(s1, s2):
    """Return ``nltk.edit_distance`` between ``s1`` and ``s2``.

    The call uses a substitution cost of 1 and disables transpositions.
    """
    return nltk.edit_distance(s1, s2, substitution_cost=1, transpositions=False)

In [10]:
def jaccard(s1, s2):
    """Return the Jaccard coefficient of the whitespace-separated words in two strings.

    Each string is split on whitespace and reduced to its set of distinct
    words; the result is ``len(intersection) / len(union)`` of the two sets.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        A float in [0, 1]. When neither string contains any word (both empty
        or whitespace-only) the two word sets are identical, so 1.0 is
        returned instead of dividing by zero.
    """
    words1 = set(s1.split())
    words2 = set(s2.split())
    union = words1 | words2

    # Both inputs have no words: the sets are equal, and the ratio is undefined.
    if not union:
        return 1.0

    return len(words1 & words2) / len(union)

In [11]:
def find_outliers_IQR(df):
    """Select the entries of ``df`` that lie outside the 1.5 * IQR range.

    The quartiles come from ``df.quantile(0.25)`` and ``df.quantile(0.75)``.
    An entry is kept when it is strictly below ``q1 - 1.5 * IQR`` or strictly
    above ``q3 + 1.5 * IQR``; the result is ``df`` indexed by that boolean mask.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = df < (lower_quartile - 1.5 * spread)
    too_high = df > (upper_quartile + 1.5 * spread)
    return df[too_low | too_high]

In [12]:
def metrics(s1, s2):
    """Score ``s2`` against ``s1`` with every comparison helper in this notebook.

    Calls, in order, ``jaccard``, ``jaro_distance``, ``error_func`` and
    ``leven`` on the same pair of strings.

    Returns:
        A 7-tuple ``(jaccard, jaro, levenshtein, wer, wil, mer, cer)``.
    """
    jaccard_score = jaccard(s1, s2)
    jaro_score = jaro_distance(s1, s2)
    word_err, info_loss, match_err, char_err = error_func(s1, s2)
    edit_dist = leven(s1, s2)
    return (jaccard_score, jaro_score, edit_dist,
            word_err, info_loss, match_err, char_err)

In [13]:
def main(transcript_first, transcript_others):
    """Compare one transcript against each of the others and tabulate the scores.

    Args:
        transcript_first: The transcript every other one is compared with.
        transcript_others: Iterable of transcripts to score against it.

    Returns:
        A DataFrame with one row per entry of ``transcript_others`` and the
        columns 'file name', 'nova jaccard', 'nova jaro', 'nova leven',
        'nova wer', 'nova wil', 'nova mer', 'nova cer' and 'length'.
        'file name' is a running number that starts at 2 for the first
        compared transcript; 'length' is the absolute difference in
        whitespace-separated word count from ``transcript_first``.
    """
    column_names = ('file name', 'nova jaccard', 'nova jaro', 'nova leven',
                    'nova wer', 'nova wil', 'nova mer', 'nova cer', 'length')
    columns = {label: [] for label in column_names}

    reference_word_count = len(transcript_first.split())

    # Numbering starts at 2 because the reference transcript counts as number 1.
    for position, candidate in enumerate(transcript_others, start=2):
        jacc, jaro, lev, wer, wil, mer, cer = metrics(transcript_first, candidate)
        print('Metrics Complete')
        word_gap = abs(reference_word_count - len(candidate.split()))

        row = (position, jacc, jaro, lev, wer, wil, mer, cer, word_gap)
        for label, value in zip(column_names, row):
            columns[label].append(value)

    return pd.DataFrame(columns)

In [14]:
# Report whether PyTorch can see a CUDA device from this kernel.
# NOTE(review): none of the helper functions defined earlier in this notebook
# reference torch, so this check may be unrelated to the metrics — confirm.
import torch
torch.cuda.is_available()

True

In [15]:
def get_comparison(num):
    """Load one transcript file and score its first line against all the others.

    Args:
        num: Zero-based index into the module-level ``file_name`` list.

    Returns:
        The DataFrame produced by ``main`` (first line as the reference, the
        remaining lines as the compared transcripts), or ``None`` when any line
        other than the last one is empty.
    """
    # Use the argument rather than whatever a global loop variable happens to hold.
    path = file_name[num]
    print(path)

    multi_transcript = _load_file_as_string(path).split('\n')
    print(len(multi_transcript))

    # A blank line anywhere but the final position makes the file unusable.
    # NOTE(review): a blank final entry (e.g. from a trailing newline) passes
    # this check but is still handed to main() as a transcript — confirm intended.
    if '' in multi_transcript[:-1]:
        print(True)
        return None

    print(False)
    metric_frame = main(multi_transcript[0], multi_transcript[1:])
    print('Comparison Complete')
    return metric_frame
       

In [21]:
# Run the comparison for three selected transcript files and print each result.
# The numbers are zero-based positions in file_name, so 3, 7 and 9 select
# 'transcript 004.txt', 'transcript 008.txt' and 'transcript 010.txt'.
# NOTE(review): none of the helper functions defined in this notebook reference
# torch, so the CUDA initialisation and device context look unnecessary for
# these metrics — confirm before relying on them.
# NOTE(review): the returned DataFrames are only printed, not stored, so later
# cells have no results frame to work with.
torch.cuda.init()
device = "cuda"
with torch.cuda.device(device):
    for i in [3,7,9]:
        print(get_comparison(i))

False
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Metrics Complete
Comparison Complete
    file name  nova jaccard  nova jaro  nova leven  nova wer  nova wil  \
0           2          1.00   1.000000           0   0.00000  0.000000   
1           3          1.00   1.000000           0   0.00000  0.000000   
2           4          1.00   1.000000           0   0.00000  0.000000   
3           5          1.00   1.000000           0   0.00000  0.000000   
4           6          0.99   0.904906           4   0.00597  0.011905   
5           7       

In [17]:
# NOTE(review): `metrics` is the scoring function defined earlier in this
# notebook, not a DataFrame of results, so this only displays the function repr.
metrics

<function __main__.metrics(s1, s2)>

In [18]:
#metrics.to_excel('Final LLM Metrics.xlsx', engine='xlsxwriter',sheet_name='LLM Metrics')

In [None]:
# NOTE(review): this cell cannot work as written. `metrics` is the scoring
# function defined earlier in this notebook, not a DataFrame, and the frames
# built by main() use 'nova ...' column names, so "whisper jaccard" is not one
# of them. A results DataFrame (for example a stored return value of
# get_comparison) and the intended column need to be supplied — confirm intent.
import seaborn as sns
sns.violinplot(data=metrics,x="whisper jaccard", split=True, inner="quart")