In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.spatial import distance
import scipy.stats as stats
import csv

In [2]:
from collections import defaultdict, Counter

In [3]:
# instantiate a dict phone --> feature vector
with open('data/phoneme_vectors/feature_dict.p', "rb") as input_file:
    phone2vec = pickle.load(input_file)

In [4]:
word2ipa = {}

with open('data/words_ipa/de.tsv') as csv_file:
    
    csv_reader = csv.DictReader(csv_file, delimiter='\t')#(csv_file, delimiter='\t', )
    
    
    #print(csv_reader)
    
    for row in csv_reader:
        word2ipa[row["Word"]] =  row["IPA transcription"]
        

In [5]:
word2ipa['Lösung']

'l øː z ʊ ŋ'

In [6]:
# phoneme set
PHONEME_SET = set([ipa  for w, ipa in set(word2ipa.items()) for ipa in ipa.split() ])

In [7]:
PHONEME_SET

{'a',
 'aː',
 'b',
 'd',
 'eː',
 'f',
 'h',
 'i',
 'iː',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oː',
 'p',
 'r',
 's',
 't',
 'uː',
 'v',
 'x',
 'yː',
 'z',
 'ç',
 'øː',
 'ŋ',
 'œ',
 'ɐ',
 'ɔ',
 'ə',
 'ɛ',
 'ɛː',
 'ɡ',
 'ɪ',
 'ʀ',
 'ʃ',
 'ʊ',
 'ʏ',
 'ʒ'}

In [8]:
phone2dist = defaultdict(lambda: defaultdict(float))

for p1 in PHONEME_SET:
    for p2 in PHONEME_SET:
        
        dist = distance.hamming(phone2vec[p1], phone2vec[p2])#**(1/2)

        phone2dist[p1][p2] = dist

In [9]:
phone2dist['z']['s'], phone2dist['p']['b'], phone2dist['p']['z'], phone2dist['p']['ʊ'],  

(0.02631578947368421, 0.02631578947368421, 0.2631578947368421, 0.5)

In [10]:
# String-based Levenstein Distance (PWLD)
def LD(s, t):
        ''' From Wikipedia article; Iterative with two matrix rows. '''
        if s == t: return 0
        elif len(s) == 0: return len(t)
        elif len(t) == 0: return len(s)
        v0 = [None] * (len(t) + 1)
        v1 = [None] * (len(t) + 1)
        for i in range(len(v0)):
            v0[i] = i
        for i in range(len(s)):
            v1[0] = i + 1
            for j in range(len(t)):
                cost = 0 if s[i] == t[j] else 1
                v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
            for j in range(len(v0)):
                v0[j] = v1[j]
                
        return v1[len(t)]

In [11]:
# Phonologically-Weighted Levenstein Distance (PWLD)

def PWLD(s, t):
        ''' From Wikipedia article; Iterative with two matrix rows. '''
        if s == t: return 0
        elif len(s) == 0: return len(t)
        elif len(t) == 0: return len(s)
        v0 = [None] * (len(t) + 1)
        v1 = [None] * (len(t) + 1)
        for i in range(len(v0)):
            v0[i] = i
        for i in range(len(s)):
            v1[0] = i + 0.5
            for j in range(len(t)):
                cost = 0 if s[i] == t[j] else phone2dist[s[i]][t[j]]
                v1[j + 1] = min(v1[j] + 0.5, v0[j + 1] + 0.5, v0[j] + cost)
            for j in range(len(v0)):
                
                
                v0[j] = v1[j]
                
        return v1[len(t)]

In [13]:
LD('k ɪ ʀ ç ə'.split(), 'k ɪ ʀ ʃ ə'.split())

1

In [14]:
PWLD('k ɪ ʀ ç ə'.split(), 'k ɪ ʀ ʃ ə'.split())

0.15789473684210525

In [15]:
PWLD(word2ipa['Lösung'].split(), word2ipa['Lösung'].split())

0

In [16]:
PWLD(word2ipa['Lösung'].split(), word2ipa['schließen'].split())

1.2105263157894737

In [17]:
PWLD(word2ipa['schlossen'].split(), word2ipa['schließen'].split())

0.21052631578947367