In [1]:
import pandas as pd
import re 
import numpy as np

1. Lining up and mapping between CELEX and IPA syllables

In [None]:
subtlex = pd.read_csv('/home/neel/Desktop/MOUS_hierarchical-representations/subtlex_v3_IPA_syllables.csv')
subtlex

In [3]:
subtlex_IPA = subtlex[['Word','Syllables']]
subtlex_IPA.head()

Unnamed: 0,Word,Syllables
0,ik,ɪk
1,je,jə
2,het,hət
3,de,də
4,dat,dɑt


In [4]:
# Load the CSV file
celex = pd.read_csv('/home/neel/Desktop/MOUS_hierarchical-representations/dutch_celex_database_updatedv2.csv')

# Check the columns of the DataFrame
print(celex.columns)

# Assuming the correct column name is 'phone_full', if it is not, replace 'phone_full' with the correct column name
celex_syllables = celex[['Head', 'phone_full']]   

# Rename columns
celex_syllables = celex_syllables.rename(columns={'Head': 'Word', 'phone_full': 'CELEX'})

# Remove apostrophes (stress marks) from every entry in 'CELEX'
celex_syllables['CELEX'] = celex_syllables['CELEX'].str.replace("'", "")

# Display the first few rows
celex_syllables.head()

Index(['n_word', 'IdNum', 'Head', 'phone_full', 'syl_num', 'phone_num',
       'phone_clean', 'phone_CV', 'syl_num_sequence', 'phone_position_syl',
       'phone_position_syl_onc', 'phone_cluster_size',
       'phone_cluster_position', 'stress_pattern'],
      dtype='object')


Unnamed: 0,Word,CELEX
0,a,a
1,Aagje,ax-j@
2,aagt,axt
3,aagtappel,axt-A-p@l
4,aai,aj


In [5]:
#merge the two dataframes on Word
merged = pd.merge(subtlex_IPA, celex_syllables, on='Word', how='inner')
#insert a space before and after every dash in the phone_full column
merged['CELEX'] = merged['CELEX'].str.replace("-", " - ")
#rename Syllables to IPA
merged = merged.rename(columns={'Syllables': 'IPA'})
merged.to_csv('/home/neel/Desktop/MOUS_hierarchical-representations/merged-IPA_CELEX.csv', index=False)
merged

Unnamed: 0,Word,IPA,CELEX
0,ik,ɪk,Ik
1,je,jə,j@
2,het,hət,@t
3,de,də,d@
4,dat,dɑt,dAt
...,...,...,...
47642,aanbidster,aːn - bɪt - stər,am - bIt - st@r
47643,aanbesteden,aːn - bə - steː - dən,am - b@ - ste - d@
47644,aanbelanden,aːn - bə - lɑn - dən,am - b@ - lAn - d@
47645,aanbaksel,aːn - bɑk - səl,am - bAk - s@l


In [6]:
#create an empty column for whether the number of syllables matches
merged['Equal # of Syllables'] = np.nan



Syllable comparison

In [8]:

syllables_mapping_master = {}
conflict_mapping_master = []
for row, word in enumerate(merged.iterrows()):
    Celex2IPA_syllables_mapping = {}
    IPA = word[1]['IPA']
    CELEX = word[1]['CELEX']
    # Split the IPA and CELEX strings into lists
    IPA_list = IPA.split(" - ")
    CELEX_list = CELEX.split(" - ")
    # If the number of syllables in the lists is equal, create a mapping
    if len(IPA_list) == len(CELEX_list):
        merged.at[row, 'Equal # of Syllables'] = True
        for i in range(len(IPA_list)):
            Celex2IPA_syllables_mapping[CELEX_list[i]] = IPA_list[i]
        # Concatenate the mappings for all words
        # If an entry already exists in the dictionary, check if the mapping is the same
        for key, value in Celex2IPA_syllables_mapping.items():
            if key in syllables_mapping_master:
                if syllables_mapping_master[key] != value:
                    print(f"Key {key} already exists in syllables_mapping_master with a different value.")
                    conflict_mapping_master.append((key, syllables_mapping_master[key], value))
            else:
                syllables_mapping_master[key] = value
    else:
        merged.at[row, 'Equal # of Syllables'] = False

Key war already exists in syllables_mapping_master with a different value.
Key Om already exists in syllables_mapping_master with a different value.
Key t@ already exists in syllables_mapping_master with a different value.
Key t@ already exists in syllables_mapping_master with a different value.
Key m@ already exists in syllables_mapping_master with a different value.
Key d@ already exists in syllables_mapping_master with a different value.
Key t@ already exists in syllables_mapping_master with a different value.
Key l@ already exists in syllables_mapping_master with a different value.
Key t@ already exists in syllables_mapping_master with a different value.
Key d@r already exists in syllables_mapping_master with a different value.
Key en already exists in syllables_mapping_master with a different value.
Key l@ already exists in syllables_mapping_master with a different value.
Key t@ already exists in syllables_mapping_master with a different value.
Key d@ already exists in syllables_m

In [9]:
#Check for convergence of syllable mapping conflicts (n = 37565)
#e.g if the same CELEX syllable maps to different IPA syllables
#eventually, pool: combine the frequency counts of all the IPA syllables that map to the same CELEX syllable
conflict_mapping_master # (CELEX, IPA1, IPA2)
#REFORMAT into a dictionary, where the first value in each tuple is the key, and the second and third values are the values. Merge all the entries which have the same key
conflict_mapping_dict = {}
for conflict in conflict_mapping_master:
    if conflict[0] in conflict_mapping_dict:
        conflict_mapping_dict[conflict[0]].append(conflict[1:])
    else:
        conflict_mapping_dict[conflict[0]] = [conflict[1:]]
#only keep unique values for each key
for key, value in conflict_mapping_dict.items():
    conflict_mapping_dict[key] = list(set(value))

#combine all the values for each key into a single list, then keep the unique elements
for key, value in conflict_mapping_dict.items():
    conflict_mapping_dict[key] = list(set([item for sublist in value for item in sublist]))
conflict_mapping_dict


{'war': ['ʋaː', 'ʋaːr', 'ʋaːrh', 'aːr'],
 'Om': ['dɔm',
  'ɔmz',
  'ɔmh',
  'fɔm',
  'tɔm',
  'rɔmh',
  'rɔm',
  'ɔm',
  'oː',
  'ɔn',
  'oːm',
  'lɔm',
  'ɔmʋ',
  'sɔn',
  'nɔm',
  'lɔmsx',
  'sɔm'],
 't@': ['təɲ',
  'stəŋ',
  'teːh',
  'sti',
  'təŋ',
  'tɛk',
  'stɛŋ',
  'tɛnh',
  'stə',
  'tɛn',
  'tɛsx',
  'stɪː',
  'stən',
  'taː',
  'teː',
  'tɛr',
  'stɛn',
  'tɛ',
  'tɛɲ',
  'tɛx',
  'stənt',
  'tɑ',
  'stɛ',
  'tən',
  'steːh',
  'stəh',
  'tər',
  'ti',
  'tə',
  'tɛŋ',
  'tɛp',
  'stər',
  'tənh',
  'təh',
  'tɪː',
  'steː'],
 'm@': ['mɛ',
  'mɛŋ',
  'mən',
  'məŋ',
  'meːh',
  'mi',
  'mɛn',
  'mɛtʲ',
  'smə',
  'mɛz',
  'mɛk',
  'mə',
  'meː',
  'mɛx',
  'mɛnh',
  'mɪː',
  'məɲ'],
 'd@': ['dɛ',
  'də',
  'dɛt',
  'dəs',
  'tɛn',
  'dɛnh',
  'dɪ',
  'teː',
  'dən',
  'tən',
  'dəl',
  'dɛnʋ',
  'dək',
  'dəŋk',
  'dɛn',
  'dəŋ',
  'dər',
  'dənsh',
  'dəh',
  'deː',
  'dɛsh',
  'deːh',
  'døː',
  'dɛh',
  'di',
  'dɪː'],
 'l@': ['ləh',
  'lɛtʲ',
  'ləŋk',
  'lɛ',
  'ləɲ',


In [10]:
num_keys = len(conflict_mapping_dict)
print(num_keys)

2510


In [11]:
#Number of syllabification conflicts (n = 6292)
merged[merged['Equal # of Syllables'] == False]


Unnamed: 0,Word,IPA,CELEX,Equal # of Syllables
16,zijn,zɛ - ɪn,zKn,False
28,jij,jɛ - ɪ,jK,False
31,mijn,mɛ - ɪn,mKn,False
99,kijk,kɛ - ɪk,kKk,False
108,altijd,ɑl - tɛ - ɪt,Al - tKt,False
...,...,...,...,...
47629,aangrijnzen,aːn - ɣrɛ - ɪn - zən,aN - GrKn - z@,False
47632,aangenaamheid,aːn - ɣə - naːmh - ɛ - ɪt,aN - G@ - nam - hKt,False
47637,aandweilen,aːnd - ʋɛ - ɪ - lən,an - dwK - l@,False
47638,aandrijver,aːn - drɛ - ɪ - vər,an - drK - v@r,False


2. Frequencies of IPA Syllables

In [14]:
subtlex

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,CDlow,FREQlemma,SUBTLEXWF,Zipf,SUBTLEXCD,Lg10CD,dominant.pos,dominant.pos.freq,dominant.pos.lemma,dominant.pos.lemma.freq,all.pos,all.pos.freq,all.pos.lemma.freq,Status,IPA,Syllables
0,ik,1744062,8054,778704,3125,1744527,39883.0334,7.597064,99.8017,3.9061,VNW,1743609,ik,1743944,.VNW.SPEC.N.VZ.,.1743609.448.4.1.,.1743944.448.134.1.,keep,ˈɪk,ɪk
1,je,1600888,8060,1315051,6535,1600923,36608.9449,7.559864,99.8761,3.9064,VNW,1600798,je,1600798,.VNW.SPEC.N.BW.LID.,.1600798.72.15.2.1.,.1600798.72.50.2.1.,keep,jˈə,jə
2,het,1068396,8066,780771,5578,1913811,24431.9717,7.384235,99.9504,3.9067,VNW,735390,het,735395,.VNW.LID.SPEC.WW.N.,.735390.332929.53.22.2.,.735395.332929.53.845403.31.,keep,hˈət,hət
3,de,1061177,8070,903872,6512,1063827,24266.8883,7.381291,100.0000,3.9069,LID,1060098,de,1062748,.LID.VNW.SPEC.VZ.,.1060098.806.272.1.,.1062748.806.272.1.,keep,dˈə,də
4,dat,965424,8063,715570,6107,965431,22077.2184,7.340221,99.9133,3.9066,VNW,532576,dat,532576,.VNW.VG.SPEC.N.WW.,.532576.432794.51.2.1.,.532576.432794.51.9.1.,keep,dˈɑt,dɑt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270349,aagh,1,1,0,0,1,0.0229,1.656532,0.0124,0.3010,SPEC,1,aagh,1,.SPEC.,.1.,.1.,keep,ˈaːx,aːx
270350,aagghh,1,1,0,0,1,0.0229,1.656532,0.0124,0.3010,N,1,aagghh,1,.N.,.1.,.1.,keep,ˈaːx,aːx
270351,aagezien,1,1,1,1,1,0.0229,1.656532,0.0124,0.3010,WW,1,aazien,1,.WW.,.1.,.1.,keep,ˌaːɣəzˈin,aː - ɣə - zin
270352,aaf,1,1,0,0,1,0.0229,1.656532,0.0124,0.3010,SPEC,1,aaf,1,.SPEC.,.1.,.1.,keep,ˈaːf,aːf


In [13]:
IPA_syllables = subtlex['IPA']
for i in range(len(IPA_syllables)):
    IPA_syllables[i] = IPA_syllables[i].replace("-", " - ")
