In this work, we utilized a Word2Vec model to generate vector representations of IPA segments, inspired by the approach described in the paper “IPA Alignment Using Vector Representations” by Pavel Sofroniev and Çağri Çöltekin(https://github.com/pavelsof/ipavec/blob/master/paper/thesis.pdf). The Word2Vec model was trained specifically for generating these vector representations and is based on the methodology provided in their research.

In [1]:
!pip install gensim
!pip install ipatok

Collecting ipatok
  Downloading ipatok-0.4.2-py2.py3-none-any.whl.metadata (6.3 kB)
Downloading ipatok-0.4.2-py2.py3-none-any.whl (15 kB)
Installing collected packages: ipatok
Successfully installed ipatok-0.4.2


In [2]:
import pandas as pd
from gensim.models import Word2Vec
from google.colab import drive
import os
import warnings
from ipatok.ipa import is_letter, is_tie_bar
import numpy as np

In [41]:
# Mount Google Drive
drive.mount('/content/gdrive')
forms = pd.read_csv("/content/gdrive/My Drive/Data Science/forms.csv")
# Check the data
forms

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,ID,Local_ID,Language_ID,Parameter_ID,Value,Form,Segments,Comment,Source,Cognacy,Loan,Graphemes,Profile,Prosody,Morpheme_Glosses,Partial_Cognacy,Chinese_Characters
0,Beijing-91_vomit-1,,Beijing,91_vomit,tʰu⁵¹,tʰu⁵¹,tʰ u ⁵¹,,Liu2007,,,,,i n t,spit/吐,1,吐
1,Haerbin-91_vomit-1,,Haerbin,91_vomit,tʰu⁵³,tʰu⁵³,tʰ u ⁵³,,Liu2007,,,,,i n t,spit/吐,1,吐
2,Jinan-91_vomit-1,,Jinan,91_vomit,tʰu³¹,tʰu³¹,tʰ u ³¹,,Liu2007,,,,,i n t,spit/吐,1,吐
3,Rongcheng-91_vomit-1,,Rongcheng,91_vomit,ou²¹³⁻³⁵ tʰu²¹⁴,ou²¹³⁻³⁵ tʰu²¹⁴,ou ²¹³ + tʰ u ²¹⁴,copulative synonyme,Liu2007,,,,,n t + i n t,nausea/嘔 spit/吐,2 1,嘔 吐
4,Taiyuan-91_vomit-1,,Taiyuan,91_vomit,tʰu⁵³ lə⁰,tʰu⁵³ lə⁰,tʰ u ⁵³ + l ə ⁰,,Liu2007,,,,,i n t + i n t,nausea/嘔 _:PERFECTIVE/了,2 5,嘔 嘞
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4297,Guangzhou-90_woman-1,,Guangzhou,90_woman,nøy²³ iɐn²¹⁻²³,nøy²³ iɐn²¹⁻²³,n øy ²³ + j ɐ n ²¹,,Liu2007,,,,,i n t + i n c t,female/女 _person/人,39 38,女 人
4298,Fuzhou-90_woman-1,,Fuzhou,90_woman,i⁵⁵⁻⁵³ tsia³²,i⁵⁵⁻⁵³ tsia³²,i ⁵⁵ + ts j a ³²,,Liu2007,,,,,n t + i m n t,woman/伊 sister/姐,824 715,伊 姐
4299,Fuzhou-90_woman-2,,Fuzhou,90_woman,ny³²⁻⁵⁵ ɛ²¹²,ny³²⁻⁵⁵ ɛ²¹²,n y ³² + ɛ ²¹²,,Liu2007,,,,,i n t + n t,female/女 _world/界,39 825,女 界
4300,Fuzhou-90_woman-3,,Fuzhou,90_woman,tsy⁵⁵ nøyŋ⁵³⁻⁵⁵ nøyŋ⁵³,tsy⁵⁵ nøyŋ⁵³⁻⁵⁵ nøyŋ⁵³,ts y ⁵⁵ + n øy ŋ ⁵³ + n øy ŋ ⁵³,,Liu2007,,,,,i n t + i n c t + i n c t,woman/諸 female/娘 _person/人,749 31 38,諸 娘 儂


In [42]:
# Training a word2vec model to generate the vector representations of IPA segments
# First, we'll use the traget data conmbined with the annotated chinese dialets data
# as the training data, for the model's only purpose is to generate the embedding for the
# target data, so overfitting is not a problem.


def normalise_token(token):
	return ''.join([char for char in token
					if not is_tie_bar(char) and char != '◌̯'[1]])

def read_from_dir(dataset_path):
  # Initialize a list to store the result
  ipa_data = []
  # Iterate over all the file in the dir
  for file in os.listdir(dataset_path):
    file_path = os.path.join(dataset_path, file)
    # Check if the file is in tsv form
    if os.path.isfile(file_path) and file_path.endswith('.tsv'):
      # Read and process the ipa
      with open(file_path, encoding='utf-8') as f:
        for line in f:
          col = line.strip().split('\t')
          if col[0] == 'language':
            continue
          # The IPA tokens is the 6th column of the tsv table
          tokens = col[5]
          ipa_data.append(normalise_token(tokens).split())
  return ipa_data

# Read from the target data
form = forms['Form']
tar_data = []
for ipa in form:
    if pd.notna(ipa):
        segment = [normalise_token(ipa).strip()]
        tar_data.append(segment)

dir_data = read_from_dir("/content/gdrive/My Drive/Data Science/data")
ipa_data = tar_data
# Add target data and the extra training data together
ipa_data.extend(dir_data)
ipa_data


[['tʰu⁵¹'],
 ['tʰu⁵³'],
 ['tʰu³¹'],
 ['ou²¹³⁻³⁵ tʰu²¹⁴'],
 ['tʰu⁵³ lə⁰'],
 ['ŋou²¹ tʰu⁵³'],
 ['fa²¹⁻⁵⁵ tʰu⁵³ lo²¹'],
 ['tʰu⁵³⁻⁵⁵ lo²¹'],
 ['ŋəu⁵³⁻⁴⁵ lo²¹'],
 ['tʰu⁴⁴'],
 ['əɯ¹¹⁻²² tʰu⁴⁴⁻²²'],
 ['fɔ⁵⁵⁻⁴⁵ ŋoʔ³²*'],
 ['ʏ⁵²'],
 ['tʰøy⁵¹'],
 ['tʰu⁵⁵'],
 ['xue⁴²'],
 ['ta³⁵⁻⁵ xue⁴²'],
 ['ŋiɛu²¹³'],
 ['tʰu²¹³'],
 ['pʰon³³'],
 ['tʰu²⁵'],
 ['ŋɐu³⁵ tʰou³³'],
 ['tʰou²¹²'],
 ['tʰo²¹'],
 ['pʰᴀ⁵¹'],
 ['pʰa⁵³'],
 ['pʰa³¹'],
 ['pʰa³³⁴'],
 ['pʰa⁴⁵'],
 ['pʰɑ⁴⁴'],
 ['pʰa²¹³'],
 ['tsʰa²¹³'],
 ['pʰɑ⁴⁴'],
 ['pʰo³⁵'],
 ['pʰo⁴¹²'],
 ['ho⁴¹'],
 ['pʰa⁵⁵'],
 ['pʰɔ³⁵'],
 ['pʰa²¹³'],
 ['pʰa⁵³'],
 ['vi⁵³'],
 ['kiaŋ⁴⁴'],
 ['pʰᴀ²⁵'],
 ['pʰa³³'],
 ['kiaŋ⁵⁵'],
 ['kiã⁵⁵'],
 ['pʰi³⁵ fu⁰'],
 ['ʐou⁵³ pʰiər²⁴'],
 ['pʰi⁵³ fu²¹⁴'],
 ['pʰi³⁵ fu⁵²'],
 ['pʰi²¹ fu²¹'],
 ['pʰi²⁴'],
 ['pʰi²¹⁻⁵⁵ pʰi²¹⁻⁵⁵'],
 ['pʰi²¹⁻⁵⁵ tsɿ⁵³'],
 ['pʰi¹³ fu³¹'],
 ['pʰɿ⁴⁴'],
 ['bi¹³⁻²² fu⁴⁴⁻³¹'],
 ['bei³⁴¹⁻³³ fu⁴⁴⁻²²'],
 ['pi¹³ fu³³'],
 ['pʰi¹³'],
 ['pʰi²⁴ fu⁰'],
 ['pʰi¹¹'],
 ['pi²² fu⁴²⁻²²'],
 ['pʰei²¹ fu⁵³'],
 ['pʰuoi⁵³⁻⁵⁵ u⁵⁵'],
 ['pʰuoi⁵³'],
 ['pʰe

In [30]:
# Train the model
model = Word2Vec(
				sentences=ipa_data,
				vector_size=15,  # the length of the output vectors
				window=1,  # that many to the left and that many to the right
				seed=42,  # random seed
				workers=1,  # needed for reproducibility
        # I lower the min_count because the training data is small.
				min_count=1,  # ignore tokens occurring less often than that
				sg=1,  # 0 for cbow, 1 for skip-gram
				negative=1,  # number of negative samples (per positive one?)
				epochs=5,  # number of epochs
				null_word=True)  # reached by ['\0']

In [47]:
# Return the vector representation of the IPA, if it's in the set of  word2vec word vectors.
def get_vector(token):
  # Normalize and strip the token
	token = normalise_token(token).strip()
  # Check if the token exists in the model's vocabulary
	if token in model.wv:
		return token
  # If the token is an empty string, return a zero vector
	if token == '':
		return '\0'
  # Create an alternative token by removing non-letter characters
	alt_token = ''.join([char for char in token if is_letter(char, False)])
  # Check if the alternative token exists in the model's vocabulary
	if alt_token in model.wv:
		return alt_token
  # Warn that the token cannot be recognized by the model
	warnings.warn('phon2vec: cannot recognise {}'.format(token))
	return '\0'

# Calculate the similiraty between two IPA segments
def calc_sim(ipa_a, ipa_b):
  # Retrieve vectors for both IPA segments
  vec_a = get_vector(ipa_a)
  vec_b = get_vector(ipa_b)
  # If either vector is a zero vector, return 0 as similarity
  if vec_a == "\0" or vec_b == "\0":
    return 0
  # Normalize similarity to the range [0, 1]
  normalized_sim = (model.wv.similarity(vec_a, vec_b) + 1) / 2
  return normalized_sim

test = calc_sim('tʰu⁵¹', 'ŋəu⁵³⁻⁴⁵ lo²¹')
test


0.7863156795501709