In [2]:
import re
import konlpy
# Shared MeCab analyzer instance; wer(..., split='pos') calls mecab.morphs() on it.
mecab = konlpy.tag.Mecab()

In [7]:
# Sino-Korean name of each decimal digit, indexed by the digit's value (0-9).
digit_name = [
    '영', '일', '이', '삼', '사',
    '오', '육', '칠', '팔', '구',
]

# Place name inside one four-digit group, indexed by the power of ten
# (index 0 is the ones place and carries no suffix).
unit = [
    '',
    '십',
    '백',
    '천',
]

# Name attached to each base-10000 group, indexed by the group's position
# (index 0 is the lowest group and carries no suffix).
unit_10k = [
    '',
    '만', '억', '조', '경',
    '해', '자', '양', '구',
    '간', '정', '재', '극',
    '항하사', '아승기', '나유타', '불가사의',
    '무량대수',
]

# (1234, 100) -> [34, 12]
def split_digit(num: int, div: int = 10) -> list:
    """Split a non-negative integer into base-``div`` digits, lowest first.

    Example: ``split_digit(1234, 100)`` returns ``[34, 12]``.
    Zero has no digits and returns ``[]``.

    Args:
        num: Non-negative integer to split.
        div: Base to split by; must be at least 2.

    Returns:
        List of remainders, least significant first.

    Raises:
        ValueError: If ``num`` is negative or ``div`` is smaller than 2.
    """
    # Floor division of a negative number by a positive base settles at -1
    # and never reaches 0, so the loop below would not terminate.
    if num < 0:
        raise ValueError("num must be non-negative")
    # With div == 1 the quotient never shrinks; smaller bases are meaningless.
    if div < 2:
        raise ValueError("div must be at least 2")

    ret = []
    while num:
        num, rem = divmod(num, div)
        ret.append(rem)
    return ret

def num2kr(num: int, mode=0) -> str:
    """Render a non-negative integer with Korean large-number units.

    The value is cut into base-10000 groups and every non-zero group is
    followed by its name from ``unit_10k``.

    Args:
        num: Integer to convert, ``0 <= num < 10000 ** len(unit_10k)``.
        mode: ``1`` spells every digit in Hangul and joins the groups without
            spaces (12345 -> "일만이천삼백사십오"; a leading one is written out,
            so 10 -> "일십"). Any other value keeps each group in Arabic
            digits and joins the groups with a space (12345 -> "1만 2345").

    Returns:
        The Korean reading. Zero is returned as "영" in mode 1 and "0"
        otherwise.

    Raises:
        ValueError: If ``num`` is negative or too large for ``unit_10k``.
    """
    # Splitting a negative number into digits never terminates.
    if num < 0:
        raise ValueError("Negative values cannot be read")
    # There is exactly one unit name per base-10000 group, so the first
    # unreadable value is 10000 ** len(unit_10k).
    if num >= pow(10000, len(unit_10k)):
        raise ValueError("Value exceeds 10e72; cannot be read")

    spell_out = mode == 1

    # Zero has no non-zero group and would otherwise produce an empty string.
    if num == 0:
        return digit_name[0] if spell_out else '0'

    parts = []
    for group, big_unit in zip(split_digit(num, 10000), unit_10k):
        # An all-zero group is silent: it contributes neither digits nor
        # its unit name (100000000 is "일억", not "일억만").
        if group == 0:
            continue
        if spell_out:
            places = list(enumerate(split_digit(group)))
            text = ''.join(
                digit_name[digit] + unit[place]
                for place, digit in reversed(places)
                if digit != 0
            )
        else:
            text = str(group)
        parts.append(text + big_unit)

    glue = '' if spell_out else ' '
    # Groups were collected lowest first; read them highest first.
    return glue.join(reversed(parts))

def numeric_to_korean(string):
    """Replace every run of ASCII digits in a sentence with its Korean reading.

    Each maximal run of ``0-9`` is parsed as a single integer and spelled
    with ``num2kr(value, 1)``. Leading zeros are dropped by the integer
    conversion.

    Args:
        string (str): sentence that may contain numbers

    Returns:
        str: the sentence with each digit run replaced; all other characters
        are left untouched.
    """
    # A single regex pass substitutes each run where it was found. Replacing
    # by value with str.replace would also rewrite the same digits inside
    # other, longer numbers (e.g. the "1" inside a later "12").
    return re.sub('[0-9]+', lambda match: num2kr(int(match.group()), 1), string)

def string_preprocessing(string):
    """Normalize a sentence so that only Hangul syllables and spaces remain.

    Steps, in order:
      1. Remove parenthesized spans together with their parentheses.
      2. Delete ASCII punctuation (deleted outright, not replaced by a
         space, so "3,000" becomes "3000" before number conversion).
      3. Convert digit runs to Korean with ``numeric_to_korean``.
      4. Drop every character that is not a Hangul syllable or a space.

    Args:
        string (str): raw sentence

    Returns:
        str: the cleaned sentence with leading/trailing whitespace stripped.
    """
    # Raw strings keep the regex backslashes from being read as (invalid)
    # Python string escapes.
    # Remove parentheses and everything inside them.
    string = re.sub(r'\([^)]*\)', '', string)
    # Remove punctuation.
    string = re.sub(r'''[!"#$%&'()*+,\-./:;<=>?@[\]^_`{|}~]''', '', string)
    # Convert any numbers to their Korean reading.
    string = numeric_to_korean(string)
    # Remove everything except Hangul syllables and spaces.
    string = re.sub(r'[^가-힣 ]', '', string)

    return string.strip()

In [8]:
# WER
def wer(ref, hyp, debug=False, split='whitespace'):
    """Compute word-error-rate statistics between a reference and a hypothesis.

    Both sentences are tokenized, aligned with a unit-cost Levenshtein
    dynamic program, and the alignment is walked backwards to count each
    kind of edit.

    Args:
        ref (string): reference sentence
        hyp (string): prediction sentence
        debug (bool, optional): Collect one "OP\\tREF\\tHYP" line per alignment
            step. Defaults to False.
        split (str, optional): tokenization strategy. 'whitespace' uses
            ``str.split()``; 'pos' uses ``mecab.morphs``. Defaults to
            'whitespace'.

    Returns:
        tuple: (number of correct tokens, substitutions, deletions,
        insertions, reference length, WER score, detail lines).
        The WER score is (substitutions + deletions + insertions) divided by
        the reference length. For an empty reference it is 0.0 when the
        hypothesis is empty too and ``float('inf')`` otherwise.
        The detail lines are in backtrace order (last token first) and the
        list is empty when ``debug`` is False.
    """
    assert split in ['whitespace', 'pos'], "split must be 'whitespace' or 'pos'"

    if split == 'whitespace':
        # Whitespace-based tokenization.
        r = ref.split()
        h = hyp.split()
    elif split == 'pos':
        # Morpheme-based tokenization.
        r = mecab.morphs(ref)
        h = mecab.morphs(hyp)

    n_ref = len(r)
    n_hyp = len(h)

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY = 1
    INS_PENALTY = 1
    SUB_PENALTY = 1

    # costs[i][j]: cheapest way to turn the first i reference tokens into the
    # first j hypothesis tokens. backtrace[i][j]: the operation that got there.
    costs = [[0] * (n_hyp + 1) for _ in range(n_ref + 1)]
    backtrace = [[OP_OK] * (n_hyp + 1) for _ in range(n_ref + 1)]

    # First column: reach an empty hypothesis by deleting reference tokens.
    for i in range(1, n_ref + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: build the hypothesis from an empty reference by inserting.
    for j in range(1, n_hyp + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if r[i - 1] == h[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue

            substitution_cost = costs[i - 1][j - 1] + SUB_PENALTY
            insertion_cost = costs[i][j - 1] + INS_PENALTY
            deletion_cost = costs[i - 1][j] + DEL_PENALTY

            best = min(substitution_cost, insertion_cost, deletion_cost)
            costs[i][j] = best
            # Ties prefer substitution, then insertion, then deletion.
            if best == substitution_cost:
                backtrace[i][j] = OP_SUB
            elif best == insertion_cost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk the best route back from the bottom-right corner.
    i = n_ref
    j = n_hyp
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    # Always bound, so the return below also works when debug is False.
    lines = []
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            numCor += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("OK\t" + r[i] + "\t" + h[j])
        elif op == OP_SUB:
            numSub += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("SUB\t" + r[i] + "\t" + h[j])
        elif op == OP_INS:
            numIns += 1
            j -= 1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif op == OP_DEL:
            numDel += 1
            i -= 1
            if debug:
                lines.append("DEL\t" + r[i] + "\t" + "****")

    num_errors = numSub + numDel + numIns
    if n_ref:
        score = num_errors / float(n_ref)
    else:
        # No reference tokens to normalize by: avoid dividing by zero.
        score = 0.0 if num_errors == 0 else float('inf')

    return numCor, numSub, numDel, numIns, n_ref, score, lines


In [9]:
from soynlp.hangle import jamo_levenshtein
from soynlp.hangle import levenshtein

def ser(ref, hyp):
    """Compute an error rate from soynlp's ``levenshtein`` distance.

    All space characters are removed from both sentences before comparing.
    NOTE(review): ``levenshtein`` presumably counts edits per character
    (one Hangul syllable block each) - confirm against the soynlp docs.

    Args:
        ref (str): reference sentence
        hyp (str): prediction sentence

    Returns:
        tuple: (distance, reference length without spaces, distance / length).
        For an empty reference the rate is 0.0 when the distance is 0 and
        ``float('inf')`` otherwise.
    """
    ref = ref.replace(' ', '')  # remove every space
    hyp = hyp.replace(' ', '')  # remove every space

    dist = levenshtein(hyp, ref)
    length = len(ref)

    if length == 0:
        # Nothing to normalize by: avoid dividing by zero.
        rate = 0.0 if dist == 0 else float('inf')
    else:
        rate = dist / length

    return dist, length, rate

def cer(ref, hyp):
    """Compute an error rate from soynlp's ``jamo_levenshtein`` distance.

    All space characters are removed from both sentences before comparing.
    NOTE(review): ``jamo_levenshtein`` presumably compares syllables at the
    jamo level and may return a fractional distance - confirm against the
    soynlp docs.

    Args:
        ref (str): reference sentence
        hyp (str): prediction sentence

    Returns:
        tuple: (distance, reference length without spaces, distance / length).
        For an empty reference the rate is 0.0 when the distance is 0 and
        ``float('inf')`` otherwise.
    """
    ref = ref.replace(' ', '')  # remove every space
    hyp = hyp.replace(' ', '')  # remove every space

    dist = jamo_levenshtein(hyp, ref)
    length = len(ref)

    if length == 0:
        # Nothing to normalize by: avoid dividing by zero.
        rate = 0.0 if dist == 0 else float('inf')
    else:
        rate = dist / length

    return dist, length, rate

In [10]:
# Demo: score one prediction against its reference with every metric above.
# The prediction contains one extra word compared with the reference.
ref = "으아악 집에 가고싶어"
pred = "으아악 살려줘 집에 가고싶어"


# WER with whitespace
# debug=True is passed positionally so the call also collects the alignment lines.
nCor, nSub, nDel, nIns, ref_len, wer_score, lines = wer(ref, pred, True, split='whitespace') 
print("WER(whitespace) : ", wer_score)
print("WER(whitespace w/o insert) : ", (nSub + nDel) / ref_len) # without insertion
# WER with pos
# The names below are rebound, so after this call they hold the 'pos' results.
nCor, nSub, nDel, nIns, ref_len, wer_score, lines = wer(ref, pred, True, split='pos')
print("WER(pos) : ", wer_score)
print("WER(pos w/o insert) : ", (nSub + nDel) / ref_len) # without insertion


# ser() and cer() return (distance, length, rate); [-1] selects the rate.
print("SER : ", ser(ref, pred)[-1]) # ser
print("CER : ", cer(ref, pred)[-1]) # cer


WER(whitespace) :  0.3333333333333333
WER(whitespace w/o insert) :  0.0
WER(pos) :  0.14285714285714285
WER(pos w/o insert) :  0.0
SER :  0.3333333333333333
CER :  0.3333333333333333
