In [None]:
from itertools import chain
from typing import List


def _word2char(reference: List[List[str]], hypothesis: List[List[str]]):
    # tokenize each word into an integer
    vocabulary = set(chain(*reference, *hypothesis))

    if "" in vocabulary:
        raise ValueError(
            "Empty strings cannot be a word. "
            "Please ensure that the given transform removes empty strings."
        )

    word2char = dict(zip(vocabulary, range(len(vocabulary))))

    reference_chars = [
        "".join([chr(word2char[w]) for w in sentence]) for sentence in reference
    ]
    hypothesis_chars = [
        "".join([chr(word2char[w]) for w in sentence]) for sentence in hypothesis
    ]

    return reference_chars, hypothesis_chars


In [4]:
from itertools import chain
from typing import List


def _word2char(reference: List[List[str]], hypothesis: List[List[str]]):
    # Tokenize each word into an integer
    vocabulary = set(chain(*reference, *hypothesis))

    if "" in vocabulary:
        raise ValueError(
            "Empty strings cannot be a word. "
            "Please ensure that the given transform removes empty strings."
        )

    word2char = dict(zip(vocabulary, range(len(vocabulary))))

    reference_chars = [
        "".join([chr(word2char[w]) for w in sentence]) for sentence in reference
    ]
    hypothesis_chars = [
        "".join([chr(word2char[w]) for w in sentence]) for sentence in hypothesis
    ]

    return word2char, reference_chars, hypothesis_chars

In [5]:
# Original reference and hypothesis
ref = ['안녕하세요, 반갑습니다 123']
hyp = ['안 반하습니다 123 3']

# Get the word2char mapping and encoded character lists
word2char, word2char_ref, word2char_hyp = _word2char(ref, hyp)

# Create a reverse mapping from characters back to words
char2word = {chr(v): k for k, v in word2char.items()}

# Decipher the encoded character lists back to words
deciphered_ref = [
    [char2word[char] for char in sentence] for sentence in word2char_ref
]
deciphered_hyp = [
    [char2word[char] for char in sentence] for sentence in word2char_hyp
]

# Print the original and deciphered versions
print("Original Reference:", ref)
#print("Encoded Reference:", word2char_ref)
print("Character divided Reference:", deciphered_ref)

print("Original Hypothesis:", hyp)
#print("Encoded Hypothesis:", word2char_hyp)
print("Character divided Hypothesis:", deciphered_hyp)


Original Reference: ['안녕하세요, 반갑습니다 123']
Character divided Reference: [['안', '녕', '하', '세', '요', ',', ' ', '반', '갑', '습', '니', '다', ' ', '1', '2', '3']]
Original Hypothesis: ['안 반하습니다 123 3']
Character divided Hypothesis: [['안', ' ', '반', '하', '습', '니', '다', ' ', '1', '2', '3', ' ', '3']]


In [6]:
import jiwer

out = jiwer.process_words(
    ref,
    hyp,
)

print(jiwer.visualize_alignment(out))


sentence 1
REF: 안녕하세요, 반갑습니다 123 *
HYP:      안 반하습니다 123 3
          S     S     I

number of sentences: 1
substitutions=2 deletions=0 insertions=1 hits=1

mer=75.00%
wil=91.67%
wip=8.33%
wer=100.00%



In [7]:
import jiwer

out = jiwer.process_characters(
    ref,
    hyp,
)

print(jiwer.visualize_alignment(out))

sentence 1
REF: 안녕하세요, 반갑습니다 12**3
HYP: 안***** 반하습니다 123 3
      DDDDD  S      II 

number of sentences: 1
substitutions=1 deletions=5 insertions=2 hits=10

cer=50.00%



In [19]:
import mecab

mecab = mecab.MeCab()
def wer(ref, hyp ,debug=False, new=True):
    # # 형태소기반 어절분리
    if new:
        r = mecab.morphs(ref)
        h = mecab.morphs(hyp)
    else:
        r = ref.split()
        h = hyp.split()
    
    # # 토큰 기반 어절분리
    # r = [x.replace('▁', '') for x in tokenizer.tokenize(ref)]
    # h = [x.replace('▁', '') for x in tokenizer.tokenize(hyp)]
    
    #costs will holds the costs, like in the Levenshtein distance algorithm
    costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
    # backtrace will hold the operations we've done.
    # so we could later backtrace, like the WER algorithm requires us to.
    backtrace = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY=1 # Tact
    INS_PENALTY=1 # Tact
    SUB_PENALTY=1 # Tact
    # First column represents the case where we achieve zero
    # hypothesis words by deleting all reference words.
    for i in range(1, len(r)+1):
        costs[i][0] = DEL_PENALTY*i
        backtrace[i][0] = OP_DEL

    # First row represents the case where we achieve the hypothesis
    # by inserting all hypothesis words into a zero-length reference.
    for j in range(1, len(h) + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                costs[i][j] = costs[i-1][j-1]
                backtrace[i][j] = OP_OK
            else:
                substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                insertionCost    = costs[i][j-1] + INS_PENALTY   # penalty is always 1
                deletionCost     = costs[i-1][j] + DEL_PENALTY   # penalty is always 1

                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
                if costs[i][j] == substitutionCost:
                    backtrace[i][j] = OP_SUB
                elif costs[i][j] == insertionCost:
                    backtrace[i][j] = OP_INS
                else:
                    backtrace[i][j] = OP_DEL

    # back trace though the best route:
    i = len(r)
    j = len(h)
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    if debug:
        # print("OP\tREF\tHYP")
        lines = []
    while i > 0 or j > 0:
        if backtrace[i][j] == OP_OK:
            numCor += 1
            i-=1
            j-=1
            if debug:
                lines.append("OK\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_SUB:
            numSub +=1
            i-=1
            j-=1
            if debug:
                lines.append("SUB\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_INS:
            numIns += 1
            j-=1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif backtrace[i][j] == OP_DEL:
            numDel += 1
            i-=1
            if debug:
                lines.append("DEL\t" + r[i]+"\t"+"****")
                
    
    if debug:
        reversed_lines = reversed(lines)
        for line in reversed_lines:
            print(line)
        print("Ncor " + str(numCor))
        print("Nsub " + str(numSub))
        print("Ndel " + str(numDel))
        print("Nins " + str(numIns))
        print("WER " + str((numSub + numDel + numIns) / (float) (len(r)))) 
        
        
    return numCor, numSub, numDel, numIns, (numSub + numDel + numIns) / (float) (len(r)), lines, (float) (len(r))

In [20]:
mecab.morphs(ref[0])
mecab

<mecab.mecab.MeCab at 0x7fbb15fb92d0>

In [22]:
numCor, numSub, numDel, numIns, wer_score, lines, _ = wer(ref[0].replace(",", ''), hyp[0], debug=True)
print("C", numCor)
print("S", numSub)
print("D", numDel)
print("I", numIns)
print("WER: ", wer_score)

DEL	안녕	****
DEL	하	****
SUB	세요	안
SUB	반갑	반하
OK	습니다	습니다
OK	123	123
INS	****	3
Ncor 2
Nsub 2
Ndel 2
Nins 1
WER 0.8333333333333334
C 2
S 2
D 2
I 1
WER:  0.8333333333333334


In [10]:
from soynlp.hangle import jamo_levenshtein
from soynlp.hangle import levenshtein

def cer2(ref, hyp):
    ref = ref.replace(' ', '')
    hyp = hyp.replace(' ', '')
    
    # print(ref)
    # print(hyp)
    dist = jamo_levenshtein(hyp, ref)
    length = len(ref)
    return dist, length, dist/length


In [23]:
jamo_levenshtein('안', '아')

0.3333333333333333

In [24]:
jamo_levenshtein('안', 'ㅇ')

0.6666666666666666

In [13]:
dist, length, cer = cer(ref[0], hyp[0])
dist, length, cer

(6.666666666666667, 14, 0.4761904761904762)

In [None]:
mistakes = 2 + 1 + 1 
total = 14

In [15]:
from soynlp.hangle import decompose
for c1, c2 in zip(ref[0], hyp[0]):
    print(decompose(c1), decompose(c2))

('ㅇ', 'ㅏ', 'ㄴ') ('ㅇ', 'ㅏ', 'ㄴ')
('ㄴ', 'ㅕ', 'ㅇ') None
('ㅎ', 'ㅏ', ' ') ('ㅂ', 'ㅏ', 'ㄴ')
('ㅅ', 'ㅔ', ' ') ('ㅎ', 'ㅏ', ' ')
('ㅇ', 'ㅛ', ' ') ('ㅅ', 'ㅡ', 'ㅂ')
None ('ㄴ', 'ㅣ', ' ')
None ('ㄷ', 'ㅏ', ' ')
('ㅂ', 'ㅏ', 'ㄴ') None
('ㄱ', 'ㅏ', 'ㅂ') None
('ㅅ', 'ㅡ', 'ㅂ') None
('ㄴ', 'ㅣ', ' ') None
('ㄷ', 'ㅏ', ' ') None
None None


In [35]:

def ser(ref, hyp ,debug=False, new=True):
    lines = []
    # split by phoneme
    r = list(ref)
    h = list(hyp)
    
    #costs will holds the costs, like in the Levenshtein distance algorithm
    costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
    # backtrace will hold the operations we've done.
    # so we could later backtrace, like the WER algorithm requires us to.
    backtrace = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY=1 # Tact
    INS_PENALTY=1 # Tact
    SUB_PENALTY=1 # Tact
    # First column represents the case where we achieve zero
    # hypothesis words by deleting all reference words.
    for i in range(1, len(r)+1):
        costs[i][0] = DEL_PENALTY*i
        backtrace[i][0] = OP_DEL

    # First row represents the case where we achieve the hypothesis
    # by inserting all hypothesis words into a zero-length reference.
    for j in range(1, len(h) + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                costs[i][j] = costs[i-1][j-1]
                backtrace[i][j] = OP_OK
            else:
                substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                insertionCost    = costs[i][j-1] + INS_PENALTY   # penalty is always 1
                deletionCost     = costs[i-1][j] + DEL_PENALTY   # penalty is always 1

                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
                if costs[i][j] == substitutionCost:
                    backtrace[i][j] = OP_SUB
                elif costs[i][j] == insertionCost:
                    backtrace[i][j] = OP_INS
                else:
                    backtrace[i][j] = OP_DEL

    # back trace though the best route:
    i = len(r)
    j = len(h)
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    if debug:
        # print("OP\tREF\tHYP")
        lines = []
    while i > 0 or j > 0:
        if backtrace[i][j] == OP_OK:
            numCor += 1
            i-=1
            j-=1
            if debug:
                lines.append("OK\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_SUB:
            numSub +=1
            i-=1
            j-=1
            if debug:
                lines.append("SUB\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_INS:
            numIns += 1
            j-=1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif backtrace[i][j] == OP_DEL:
            numDel += 1
            i-=1
            if debug:
                lines.append("DEL\t" + r[i]+"\t"+"****")
                
    
    if debug:
        reversed_lines = reversed(lines)
        for line in reversed_lines:
            print(line)
        print("Ncor " + str(numCor))
        print("Nsub " + str(numSub))
        print("Ndel " + str(numDel))
        print("Nins " + str(numIns))
        print("WER " + str((numSub + numDel + numIns) / (float) (len(r)))) 
        
        
    return numCor, numSub, numDel, numIns, (numSub + numDel + numIns) / (float) (len(r)), lines, (float) (len(r))

In [28]:
from soynlp.hangle import decompose

def cer(ref, hyp ,debug=False, new=True):
    lines = []
    # split by phoneme
    ref = list(ref)
    r = []
    for c in ref:
        ctuple = decompose(c)
        if not ctuple:
            continue
        for ct in ctuple:
            if ct != ' ':
                r.append(ct)
    hyp = list(hyp)
    h = []
    for c in hyp:
        ctuple = decompose(c)
        if not ctuple:
            continue
        for ct in ctuple:
            if ct != ' ':
                h.append(ct)
    
    #costs will holds the costs, like in the Levenshtein distance algorithm
    costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
    # backtrace will hold the operations we've done.
    # so we could later backtrace, like the WER algorithm requires us to.
    backtrace = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY=1 # Tact
    INS_PENALTY=1 # Tact
    SUB_PENALTY=1 # Tact
    # First column represents the case where we achieve zero
    # hypothesis words by deleting all reference words.
    for i in range(1, len(r)+1):
        costs[i][0] = DEL_PENALTY*i
        backtrace[i][0] = OP_DEL

    # First row represents the case where we achieve the hypothesis
    # by inserting all hypothesis words into a zero-length reference.
    for j in range(1, len(h) + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                costs[i][j] = costs[i-1][j-1]
                backtrace[i][j] = OP_OK
            else:
                substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                insertionCost    = costs[i][j-1] + INS_PENALTY   # penalty is always 1
                deletionCost     = costs[i-1][j] + DEL_PENALTY   # penalty is always 1

                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
                if costs[i][j] == substitutionCost:
                    backtrace[i][j] = OP_SUB
                elif costs[i][j] == insertionCost:
                    backtrace[i][j] = OP_INS
                else:
                    backtrace[i][j] = OP_DEL

    # back trace though the best route:
    i = len(r)
    j = len(h)
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    if debug:
        # print("OP\tREF\tHYP")
        lines = []
    while i > 0 or j > 0:
        if backtrace[i][j] == OP_OK:
            numCor += 1
            i-=1
            j-=1
            if debug:
                lines.append("OK\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_SUB:
            numSub +=1
            i-=1
            j-=1
            if debug:
                lines.append("SUB\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_INS:
            numIns += 1
            j-=1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif backtrace[i][j] == OP_DEL:
            numDel += 1
            i-=1
            if debug:
                lines.append("DEL\t" + r[i]+"\t"+"****")
                
    
    if debug:
        reversed_lines = reversed(lines)
        for line in reversed_lines:
            print(line)
        print("Ncor " + str(numCor))
        print("Nsub " + str(numSub))
        print("Ndel " + str(numDel))
        print("Nins " + str(numIns))
        print("WER " + str((numSub + numDel + numIns) / (float) (len(r)))) 
        
        
    return numCor, numSub, numDel, numIns, (numSub + numDel + numIns) / (float) (len(r)), lines, (float) (len(r))

In [29]:
ref = ['안녕하세요 반갑습니다 123']
hyp = ['안 반하습니다 123 3']

numCor, numSub, numDel, numIns, ser_score, lines, _ = ser(ref[0], hyp[0], debug=True)
print("-"*50)
numCor, numSub, numDel, numIns, cer_score, lines, _ = cer(ref[0], hyp[0], debug=True)

OK	안	안
DEL	녕	****
DEL	하	****
DEL	세	****
DEL	요	****
OK	 	 
OK	반	반
SUB	갑	하
OK	습	습
OK	니	니
OK	다	다
OK	 	 
OK	1	1
OK	2	2
INS	****	3
INS	****	 
OK	3	3
Ncor 10
Nsub 1
Ndel 4
Nins 2
WER 0.4666666666666667
--------------------------------------------------
OK	ㅇ	ㅇ
OK	ㅏ	ㅏ
DEL	ㄴ	****
OK	ㄴ	ㄴ
DEL	ㅕ	****
DEL	ㅇ	****
DEL	ㅎ	****
DEL	ㅏ	****
DEL	ㅅ	****
DEL	ㅔ	****
DEL	ㅇ	****
DEL	ㅛ	****
OK	ㅂ	ㅂ
OK	ㅏ	ㅏ
OK	ㄴ	ㄴ
SUB	ㄱ	ㅎ
OK	ㅏ	ㅏ
DEL	ㅂ	****
OK	ㅅ	ㅅ
OK	ㅡ	ㅡ
OK	ㅂ	ㅂ
OK	ㄴ	ㄴ
OK	ㅣ	ㅣ
OK	ㄷ	ㄷ
OK	ㅏ	ㅏ
Ncor 14
Nsub 1
Ndel 10
Nins 0
WER 0.44


In [37]:
with open('stuff/predictions_VC-JHJ_Woman_40s-01-08-35.48_test.txt', 'r') as f:
    ref = f.readlines()

with open('stuff/references_VC-JHJ_Woman_40s-01-08-35.48_test.txt', 'r') as f:
    pred = f.readlines()

In [38]:
ref = [x.strip() for x in ref]
pred = [x.strip() for x in pred]

import re

ref = [re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', ref.replace('\n', '')).strip() for ref in ref]
pred = [re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', pred.replace('\n', '')).strip() for pred in pred]

In [32]:
import evaluate

# Evaluate the model
metric = evaluate.load("wer")
metric_cer = evaluate.load("cer")

wer = 100 * metric.compute(predictions=pred, references=ref)
cer_ = 100 * metric_cer.compute(predictions=pred, references=ref)
print(f"WER: {wer:.2f}%")
print(f"CER: {cer_:.2f}%")

WER: 47.70%
CER: 27.81%


In [33]:
ref = ['안녕하세요 반갑습니다 123']
hyp = ['안 반하습니다 123 3']

dist, length, cer_score = cer2(ref[0], hyp[0])
ser = 100 * metric_cer.compute(predictions=hyp, references=ref)
print(f"CER: {cer_score*100}%")
print(f"SER: {ser}%")

CER: 43.58974358974359%
SER: 46.666666666666664%


In [8]:
import mecab
mecab = mecab.MeCab()


def remove_sound(text: str) -> str:
    results = []
    for w, p in mecab.pos(text):
        if p == 'IC' and len(w) == 1:
            continue # something do you want
        results.append(w + ' ')
    return ''.join(results)

remove_sound('그래 아 어  hello')

'그래 hello is '

In [12]:
from soynlp.hangle import decompose, compose

def ignore_similar(text: str) -> str:
    results = []

    for c in text:
        # Try decomposing the character
        phonemes = decompose(c)
        if phonemes is None :
            # If decomposition fails, keep the character as is
            results.append(c)
            continue
        onset, nucleus, coda = phonemes

        if nucleus in ['ㅔ', 'ㅐ', 'ㅒ', 'ㅖ']:
            nucleus = 'ㅔ'
        elif nucleus in  ['ㅙ', 'ㅚ', 'ㅞ']:
            nucleus = 'ㅙ'
        results.append(compose(onset, nucleus, coda))
    return ''.join(results)

# Test the function
print(ignore_similar('안녕하세요 애 게 세 외국인'))


안녕하세요 에 게 세 왜국인


In [117]:
decompose('왜')

('ㅇ', 'ㅙ', ' ')

In [72]:
total_cer = 0
for r, p in zip(ref, pred):
    dist, length, cer_score = cer2(r, p)
    total_cer += cer_score
total_cer /= len(ref)
print(f"CER: {total_cer:.4f}%")

CER: 0.2559%


In [39]:
total_ser = 0
total_cer = 0

total_ins1 = 0
total_del1 = 0
total_sub1 = 0

total_ins2 = 0
total_del2 = 0
total_sub2 = 0

for r, p in zip(ref, pred):
    #r = remove_sound(r)
    #p = remove_sound(p)
    #r = ignore_similar(r)
    #p = ignore_similar(p)
    #if not r or not p:
    #    continue
    numCor1, numSub1, numDel1, numIns1, ser_score, lines, total1 = ser(r, p, debug=False)
    numCor2, numSub2, numDel2, numIns2, cer_score, lines, total2 = cer(r, p, debug=False)
    orig_ser = metric_cer.compute(predictions=[p], references=[r])
    orig_cer = cer2(p, r)[2]
    if ser_score != orig_ser:
        print("SER")
        print('ref:', r)
        print('hyp:', p)
        print('or', orig_ser)
        print('my', ser_score)
        print('-'*10)
    if cer_score != orig_cer:
        print("CER")
        print('ref:', r)
        print('hyp:', p)
        print('or', orig_cer)
        print('my', cer_score)
        print('-'*10)
    #dist, length, cer_score = cer2(r, p)
    total_ser += ser_score
    total_cer += cer_score

    total_ins1 += numIns1/total1
    total_del1 += numDel1/total1
    total_sub1 += numSub1/total1

    total_ins2 += numIns2/total2
    total_del2 += numDel2/total2
    total_sub2 += numSub2/total2
total_ser /= len(ref)
total_cer /= len(ref)

total_ins1 /= len(ref)
total_del1 /= len(ref)
total_sub1 /= len(ref)

total_ins2 /= len(ref)
total_del2 /= len(ref)
total_sub2 /= len(ref)



print(f"SER: {total_ser*100:.2f}%")
print(f"CER: {total_cer*100:.2f}%")

print(f"INS SER: {total_ins1*100:.2f}%")
print(f"DEL SER: {total_del1*100:.2f}%")
print(f"SUB SER: {total_sub1*100:.2f}%")

print(f"INS CER: {total_ins2*100:.2f}%")
print(f"DEL CER: {total_del2*100:.2f}%")
print(f"SUB CER: {total_sub2*100:.2f}%")

CER
ref: 얼리 말하는 말투가 그런거든
hyp: 원래 말하는 말투가 그런 거더라고
or 0.2619047619047619
my 0.26666666666666666
----------
CER
ref: 그게 오는 거래
hyp: 떼오는 거래
or 0.26666666666666666
my 0.23076923076923078
----------
CER
ref: 어 이 안 하고
hyp: 그래요 이제 안 가고
or 0.45833333333333337
my 0.7272727272727273
----------
CER
ref: 우리 엄마가 때는 하는 말이야 뭐 할 때 꼭 엄마랑 사치해라 엄마도 아프다운 거지
hyp: 우리 엄마가 맨날 하는 말이 뭐 할 때 그 엄마랑 상의해라 엄마도 안타까운거지
or 0.18279569892473116
my 0.21621621621621623
----------
CER
ref: 되게 좋았네
hyp: 커피를 되게 좋아서
or 0.5
my 0.8333333333333334
----------
CER
ref: 근데 동생이 언니 언니 혜원은 진짜 뭐 그래 언니가 왜 그러니까
hyp: 이렇게 얘기를 했는데 그 동생이 언니 혀는 진짜 그래 그래서 내가 왜 그러니까
or 0.47311827956989244
my 0.5932203389830508
----------
CER
ref: 아유 내가 밥 살게
hyp: 아이 내가 밥 살게
or 0.047619047619047616
my 0.0625
----------
CER
ref: 10키로가 얼마야 10키로가
hyp: 그게 얼마야 10kg가
or 0.6
my 0.47368421052631576
----------
CER
ref: 한동안 안 못 봤어 거기 있어가지고
hyp: 한동안 안가져갔어 꼴뵈기 싫어가지고
or 0.3125
my 0.37142857142857144
----------
CER
ref: 했지 그러니까 내가 안 한다고 하면
hyp: 했어야 돼 그러니까 내가 안 한다고 하면
or 0.166666666

In [20]:
ref= '그게 오는 거래'
hyp= '떼오는 거래'

numCor2, numSub2, numDel2, numIns2, cer_score, lines, total2 = cer(ref, hyp, debug=True)

DEL	ㄱ	****
DEL	ㅡ	****
SUB	ㄱ	ㄸ
OK	ㅔ	ㅔ
OK	ㅇ	ㅇ
OK	ㅗ	ㅗ
OK	ㄴ	ㄴ
OK	ㅡ	ㅡ
OK	ㄴ	ㄴ
OK	ㄱ	ㄱ
OK	ㅓ	ㅓ
OK	ㄹ	ㄹ
OK	ㅐ	ㅐ
Ncor 10
Nsub 1
Ndel 2
Nins 0
WER 0.23076923076923078
