# Korean Phonetic Vectors Evaluation

In [1]:
import pandas as pd
import pickle
from itertools import product
from collections import Counter
import sys
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
from KoG2P.g2p import runKoG2P
from generate_phonetic_feature_vectors import *

In [2]:
# Sanity check: convert one Korean word to its space-separated phone string
# using the rulebook shipped with KoG2P.
runKoG2P('박물관', 'KoG2P/rulebook.txt')

'p0 aa ng mm uu ll k0 wa nf'

In [3]:
# Build the phone -> feature-tuple lookup from the IPA feature table.
df = pd.read_csv('data/ipa_feats.csv')
# Each row keeps its features as one whitespace-separated string; turn it
# into a tuple of feature names.
df['features'] = df['features'].apply(lambda feats: tuple(feats.split()))
# dict() over the raw rows pairs the first column (key) with the second (value).
phone_feature_map = dict(df.values)
# Two extra pseudo-phones with the single features 'bgn' / 'end'
# (presumably word-boundary markers -- confirm against feature_bigrams).
phone_feature_map.update({'^': ('bgn',), '$': ('end',)})
phone_feature_map

{'p0': ('blb', 'stp', 'pln', 'vcd'),
 'ph': ('blb', 'stp', 'asp'),
 'pp': ('blb', 'stp', 'tns'),
 't0': ('alv', 'stp', 'pln', 'vcd'),
 'th': ('alv', 'stp', 'asp'),
 'tt': ('alv', 'stp', 'tns'),
 'k0': ('vlr', 'stp', 'pln', 'vcd'),
 'kh': ('vlr', 'stp', 'tns'),
 'kk': ('vlr', 'stp', 'tns'),
 's0': ('alv', 'frc', 'pln'),
 'ss': ('alv', 'frc', 'tns'),
 'h0': ('glt', 'frc', 'pln'),
 'c0': ('plt', 'stp', 'pln', 'vcd'),
 'ch': ('plt', 'stp', 'asp'),
 'cc': ('plt', 'stp', 'tns'),
 'mm': ('blb', 'nsl', 'vcd'),
 'nn': ('alv', 'nsl', 'vcd'),
 'rr': ('alv', 'lqd', 'vcd'),
 'pf': ('blb', 'stp', 'pln', 'vcd'),
 'tf': ('alv', 'stp', 'pln', 'vcd'),
 'kf': ('vlr', 'stp', 'pln', 'vcd'),
 'mf': ('blb', 'nsl', 'vcd'),
 'nf': ('alv', 'nsl', 'vcd'),
 'ng': ('vlr', 'nsl', 'vcd'),
 'll': ('alv', 'lqd', 'vcd'),
 'ii': ('fnt', 'unr', 'hgh', 'vwl'),
 'ee': ('fnt', 'unr', 'umd', 'vwl'),
 'qq': ('fnt', 'unr', 'lmd', 'vwl'),
 'aa': ('cnt', 'unr', 'low', 'vwl'),
 'xx': ('bck', 'unr', 'hgh', 'vwl'),
 'vv': ('bck', '

In [4]:
# Load the feature -> vector lookup used by word2phonvec.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a feat2vec.pickle that comes from a trusted source.
with open ('feat2vec.pickle', 'rb') as fp:
    feat2vec = pickle.load(fp)

In [56]:
def word2phonvec(word):
    """Return a phonetic feature vector for a Korean word.

    Characters outside the Hangul syllable range ``가-힣`` are stripped, the
    remaining syllables are converted to a phone sequence with ``runKoG2P``,
    and the ``feat2vec`` vectors of the resulting feature bigrams are
    averaged, weighted by how often each bigram occurs.  The average is then
    multiplied by the number of Hangul syllables.

    Args:
        word: Text containing the Korean word.  Non-Hangul characters
            (spaces, digits, punctuation, Latin letters) are ignored and do
            not contribute to the syllable count.

    Returns:
        A NumPy array of length 512.  It is all zeros when ``word`` contains
        no Hangul syllables or when no feature bigrams are produced.

    Raises:
        KeyError: Presumably, if a feature bigram is absent from ``feat2vec``
            (assumes ``feat2vec`` is a plain dict -- confirm).
    """
    hangul = re.sub('[^가-힣]', '', word)
    word_phonvec = np.zeros(512)
    if not hangul:
        # Nothing to pronounce: return the zero vector without handing an
        # empty string to the G2P converter.
        return word_phonvec

    phones = runKoG2P(hangul, 'KoG2P/rulebook.txt')
    features = Counter(feature_bigrams(phones.split(), phone_feature_map))

    count = 0
    for feature, occurrences in features.items():
        word_phonvec += feat2vec[feature] * occurrences
        count += occurrences

    if count > 0:
        word_phonvec = word_phonvec / count

    # Scale by the syllable count.  Use the cleaned string so that stripped
    # characters do not inflate the scale.
    return word_phonvec * len(hangul)

## Phonetic Similarity & Cosine Similarity

In [7]:
# ranges -1 to 1
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector (array-like accepted by ``numpy.dot`` / ``norm``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``.  When either vector has zero
        norm the cosine is undefined; 0.0 is returned in that case instead
        of the NaN (and RuntimeWarning) the raw division would produce.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # At least one vector is all zeros, so there is no direction to compare.
        return 0.0
    return dot(a, b) / denominator

In [57]:
# Case: similar pronunciation (only the first syllable differs: 박 vs 퐉)
cosine_similarity(word2phonvec('박물관'), word2phonvec('퐉물관'))

0.9314748044769416

In [58]:
# Case: similar pronunciation (only the last syllable differs: 관 vs 콴)
cosine_similarity(word2phonvec('박물관'), word2phonvec('박물콴'))

0.9494476961192483

In [59]:
# Case: dissimilar pronunciation (two different words)
cosine_similarity(word2phonvec('박물관'), word2phonvec('전시회'))

0.29366140575036814

In [60]:
# Case: similar pronunciation (only the first syllable differs: 수 vs 쑤)
cosine_similarity(word2phonvec('수영'), word2phonvec('쑤영'))

0.9136299173965139

In [61]:
# Case: dissimilar pronunciation (the same two syllables in swapped order)
cosine_similarity(word2phonvec('수영'), word2phonvec('영수'))

0.5325486369502718

## Arithmetic

In [62]:
# Baseline: the first syllable alone (참) compared with the whole word (참기름)
cosine_similarity(word2phonvec('참'), (word2phonvec('참기름')))

0.6068710788634828

In [63]:
# Baseline: the last two syllables alone (기름) compared with the whole word (참기름)
cosine_similarity(word2phonvec('기름'), (word2phonvec('참기름')))

0.8751706035195114

In [64]:
# 참 + 기름 = 참기름
# Sum of the two parts' vectors compared with the vector of the whole word.
cosine_similarity((word2phonvec('참') + word2phonvec('기름')), word2phonvec('참기름'))

0.9430762726744816

In [65]:
# 참기 + 름 = 참기름
# Same word as above, split after the second syllable instead of the first.
cosine_similarity((word2phonvec('참기') + word2phonvec('름')), word2phonvec('참기름'))

0.9462407254796422

In [66]:
# 참기름 - 참 = 기름
# Subtract the first part's vector from the whole and compare with the remaining part.
cosine_similarity((word2phonvec('참기름') - word2phonvec('참')), word2phonvec('기름'))

0.9099441563065853

In [74]:
# 조식 - 조 + 중 = 중식
# Replace the first syllable's vector with another syllable's and compare with the target word.
cosine_similarity((word2phonvec('조식') - word2phonvec('조') + word2phonvec('중')), word2phonvec('중식'))

0.8672185659299296

In [88]:
# 해 + 도지 = 해돋이
# The parts are spelled differently from the whole word (도지 vs 돋이);
# presumably 도지 writes out how 돋이 is pronounced -- confirm.
cosine_similarity((word2phonvec('해') + word2phonvec('도지')), word2phonvec('해돋이'))

0.912673896140464

In [91]:
# 꼰 + 닙 = 꽃잎
# The parts are spelled differently from the whole word (꼰닙 vs 꽃잎);
# presumably 꼰닙 writes out how 꽃잎 is pronounced -- confirm.
cosine_similarity((word2phonvec('꼰') + word2phonvec('닙')), word2phonvec('꽃잎'))

0.865353758996784

In [92]:
# 꽃잎 - 꼰 = 닙
# Subtract the respelled first syllable's vector from the whole word and
# compare with the respelled second syllable.
cosine_similarity((word2phonvec('꽃잎') - word2phonvec('꼰')), word2phonvec('닙'))

0.7597654734583635

In [101]:
# 으 + 이 = 의
# Sum of two single-syllable vectors compared with the vector of one syllable.
cosine_similarity((word2phonvec('으') + word2phonvec('이')), word2phonvec('의'))

0.9561226741931529