In [None]:
import pandas as pd
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from segments import Tokenizer

from collections import Counter

import unicodedata

np.random.seed(123)

In [None]:
# Instantiate the tokenizer class imported from the `segments` package.
# NOTE(review): TOKENIZER is not referenced again in the visible part of this
# notebook — confirm it is still needed.
TOKENIZER = Tokenizer()

Reference for the correlation-based feature selection used in this notebook: https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

In [None]:
# Load 'phoible.csv' with every column read as a string.
phoible = pd.read_csv('phoible.csv', dtype=str)

In [None]:
# Load 'phoible_features.csv' (one row per phoneme, one column per feature),
# again with every column read as a string.
unique_phonemes = pd.read_csv('phoible_features.csv', dtype=str)

In [None]:
# Dataset names to process. Each name is used as the TSV file stem under
# 'wikipron_clean/high/', and its first three characters are matched against
# the 'ISO6393' column of `phoible`.
langs = ['eng_latn_uk_broad_filtered', 
         'eng_latn_uk_narrow', 
         'eng_latn_us_broad_filtered', 
         'eng_latn_us_narrow', 
         'fra_latn_broad_filtered', 
         'ell_grek_broad_filtered', 
         'ell_grek_narrow', 
         'cmn_hani_broad', 
         'deu_latn_broad_filtered', 
         'deu_latn_narrow', 
         'eus_latn_broad', 
         'fin_latn_broad', 
         'fin_latn_narrow', 
         'ind_latn_broad', 
         'ind_latn_narrow', 
         'kat_geor_broad_filtered', 
         'mya_mymr_broad_filtered',
         'spa_latn_ca_broad_filtered', 
         'spa_latn_ca_narrow', 
         'spa_latn_la_broad_filtered', 
         'kor_hang_narrow_filtered', 
         'jpn_hira_narrow_filtered', 
         'spa_latn_la_narrow', 
         'tgl_latn_broad', 
         'tgl_latn_narrow', 
         'tha_thai_broad', 
         'tur_latn_broad', 
         'tur_latn_narrow_filtered', 
         'hin_deva_broad_filtered', 
         'hin_deva_narrow', 
         'zul_latn_broad', 
         'vie_latn_hanoi_narrow_filtered', 
         'rus_cyrl_narrow', 
]

In [None]:
# For every dataset in `langs`: select the non-consonantal phonemes that occur in
# the pronunciation data, drop features that correlate with an earlier feature,
# give every remaining feature combination a two-letter code, and write a copy of
# the data in which each encoded phoneme is followed by its code.

# Features whose correlation with an earlier feature reaches this value are dropped.
CORRELATION_THRESHOLD = 0.5
# Pronunciations with more tokens than this are counted as "too long" in the report.
MAX_PRONUNCIATION_TOKENS = 30

# The phoneme inventory does not depend on the language, so normalize it once
# instead of on every iteration.
unique_phonemes['Phoneme'] = [
    unicodedata.normalize('NFC', char.strip()) for char in unique_phonemes['Phoneme'].to_list()
]

for lang in langs:

    print(f'CURRENT LANGUAGE: {lang}')

    # reading the phoible subset for this language (first three characters of the dataset name)
    lang_phoible = phoible[phoible['ISO6393'] == lang[:3]]

    # get the cleaned wikipron data
    current_lang = pd.read_csv('wikipron_clean/high/' + lang + '.tsv', dtype=str, sep='\t', names=['grapheme', 'phoneme'], na_filter=False)

    # get the set of phonemes in the actual training data. All chars are normalized to match the other data
    tokenized = ' '.join(current_lang['phoneme'].to_list())
    current_phoneme_set = {unicodedata.normalize('NFC', char.strip()) for char in tokenized.split()}

    # get the phoible features for those phonemes in the language
    phoible_subset = unique_phonemes[unique_phonemes.Phoneme.isin(current_phoneme_set)]
    # NOTE(review): the allophones are not NFC-normalized, unlike the phonemes
    # they are compared with below — confirm whether that is intended.
    phoible_allophones = [
        char
        for allophones in lang_phoible['Allophones'].dropna().to_list()
        for char in allophones.split()
    ]

    # phonemes of the data without a feature row, and those that are not listed as allophones either
    diff = current_phoneme_set.difference(phoible_subset.Phoneme.to_list())
    diff_allo = diff.difference(phoible_allophones)

    # Synthesize rows for long phonemes from their short counterpart.
    # Sorting makes the row order (and with it the code assignment further down)
    # independent of Python's per-process string hash randomization.
    long_rows = []
    for phon in sorted(diff_allo):
        if not phon.endswith('ː'):
            continue
        base_rows = unique_phonemes[unique_phonemes['Phoneme'] == phon[:-1]]
        if base_rows.empty:
            # no short counterpart either: the phoneme stays in `diff` so that it is still reported
            continue
        # assign() returns a new frame, so the shared inventory is never written to
        long_rows.append(base_rows.assign(long='+', Phoneme=phon))
        diff.remove(phon)
    if long_rows:
        phoible_subset = pd.concat([phoible_subset] + long_rows)

    # only the non-consonantal segments are encoded
    phoible_subset = phoible_subset[phoible_subset['consonantal'] == '-']

    # encode the dataset such that we can see what features correlate
    # (built as a new float frame; `phoible_subset` keeps its string values)
    encoded = pd.DataFrame(
        {col: LabelEncoder().fit_transform(phoible_subset[col]) for col in phoible_subset.columns[1:]},
        index=phoible_subset.index,
        dtype='float64',
    )

    # get the correlation matrix
    current_corr = encoded.corr()

    # print heatmap for correlation if necessary
    # plt.subplots(figsize=(10, 7))
    # sns.heatmap(current_corr)

    # kick those features out whose correlation with an earlier feature reaches the threshold
    # (NaN correlations, e.g. for constant columns, never reach it)
    columns = np.full((current_corr.shape[0],), True, dtype=bool)
    for row in range(current_corr.shape[0]):
        for col in range(row + 1, current_corr.shape[0]):
            if current_corr.iloc[row, col] >= CORRELATION_THRESHOLD:
                columns[col] = False

    # get those features, i.e. the column names, and retrieve them from the dataset;
    # the leading False excludes the first column, which was not part of `encoded`
    columns = np.insert(columns, 0, False)
    selected_columns = phoible_subset.columns[columns]
    # selected_columns = ['consonantal']
    features = phoible_subset[selected_columns].drop_duplicates()
    ratio = round(len(features) / len(phoible_subset), 3) if len(phoible_subset) else 0.0

    # create phoneme lists and their corresponding features
    all_phons_list = phoible_subset['Phoneme'].tolist()
    all_phones_features_list = phoible_subset[selected_columns].values.tolist()

    # ratio of how many flags we have compared to the number of phonemes in the dataset, i.e. how many phonemes we can distinguish using the features
    print(f'Ratio of number of phonemes and number of different flags: {ratio}')
    print(f'Number of phonemes in this language: {len(current_phoneme_set)}')

    chars = 'ABCDEFGHIJKLMN'
    if len(features) > len(chars) ** 2:
        raise ValueError(
            f'{lang}: {len(features)} feature combinations but only {len(chars) ** 2} two-letter codes'
        )
    gen = itertools.product(chars, repeat=2)

    # create mapping from features to encoding
    f1_dict = {}
    for feature_row in features.values:
        f1_dict[''.join(feature_row)] = ''.join(next(gen))

    # create mapping from phoneme to encoded phoneme
    phon_features = {
        p: p + ' ' + f1_dict[''.join(f1)]
        for p, f1 in zip(all_phons_list, all_phones_features_list)
    }

    # count the pronunciations that contain (as a substring) a phoneme that is
    # not in the phoible phonemes (without allophones)
    # NOTE(review): this counts pronunciations, not phonemes, although the
    # message printed below says "phonemes" — confirm which is intended.
    not_expanded = sum(
        1 for ph in current_lang.phoneme.to_list() if any(missing in ph for missing in diff)
    )

    # append the feature code to every phoneme we have one for; other tokens are kept as they are
    # NOTE(review): tokens are looked up without NFC normalization although the
    # keys of `phon_features` are normalized — confirm the data is already NFC.
    new_phones = []
    for graph, phon in zip(current_lang['grapheme'].to_list(), current_lang['phoneme'].to_list()):
        expanded = ' '.join(phon_features.get(token, token) for token in phon.split(' '))
        new_phones.append([graph, expanded.strip()])

    too_long = [p for p in new_phones if len(p[1].split()) > MAX_PRONUNCIATION_TOKENS]

    new_df = pd.DataFrame(new_phones)
    new_df.to_csv('wikipron_features/' + lang + '_FEATURES_v3.tsv', sep='\t', header=False, index=False)

    print(f'Number of phonemes that are not expanded with a feature encoding: {not_expanded}')
    print(f'Phonemes that are not found like that in phoible (without allophones): {diff}')
    print(f'Number of pronunciations that are longer than 30 chars: {len(too_long)}\n\n')

CURRENT LANGUAGE: eng_latn_uk_broad_filtered
Ratio of number of phonemes and number of different flags: 0.306
Number of phonemes in this language: 60
Number of phonemes that are not expanded with a feature encoding: 0
Phonemes that are not found like that in phoible (without allophones): set()
Number of pronunciations that are longer than 30 chars: 74


CURRENT LANGUAGE: eng_latn_uk_narrow
Ratio of number of phonemes and number of different flags: 0.236
Number of phonemes in this language: 177
Number of phonemes that are not expanded with a feature encoding: 101
Phonemes that are not found like that in phoible (without allophones): {'ʊʷ', 'aʰ', 'ŭ̥', 'ɹʲ', 'k̚', 't̚', 'ɹ̥ʷ', 'kˡ', 'ǀ', 'ʌˑ', 'ɹ̠̊ʷ', 'ʌˀ', 'ɒˀ', 'ɹ̠̊', 'ɹ̠ʷ', 'ɹ̠̝ʷ', 'ˀe', 'ɔ̹', 'ɹʷ', 'ɹ̝̊', 'bˡ', 'uːʷ', 'ĭ̥', 'ʉ̯', 'ɪˑ', 'æˀ', 'b̚', 'l̥ʰ', 'd̚', 'p̚', 'ɹ̠', 'iːʲ', 'æ̙', 'k̠ʰ', '˞', 'ɪʰ', 'ɫ̩'}
Number of pronunciations that are longer than 30 chars: 1


CURRENT LANGUAGE: eng_latn_us_broad_filtered
Ratio of number of pho

In [None]:
# Replace every column of `df` by its label-encoded values (cast to float64)
# so that a correlation matrix can be computed from it.
# NOTE(review): `df` is not defined in the visible part of this notebook, and
# later cells filter on the string values of df['consonantal'] — this in-place
# overwrite only fits an out-of-order execution. TODO confirm the intended order.
label_encoder = LabelEncoder()
for idx in range(df.shape[1]):
    # the encoder is re-fitted for every column
    df.iloc[:,idx] = label_encoder.fit_transform(df.iloc[:,idx]).astype('float64')

In [None]:
# Print the distinct values of every column except the first one.
for idx in range(1, df.shape[1]):
    print(np.unique(df.iloc[:,idx]))

In [None]:
# Pairwise correlation matrix of the columns of `df`.
corr = df.corr()

In [None]:
# Draw the correlation matrix as a heatmap on a 15x12 inch figure.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr, ax=ax)

In [None]:
# Keep a column only if its correlation with every column to its left stays
# below 0.4. np.triu(..., k=1) restricts the comparison to the pairs (i, j)
# with i < j; NaN correlations compare as False and therefore never drop a column.
reaches_threshold = corr.to_numpy() >= 0.4
columns = ~np.triu(reaches_threshold, k=1).any(axis=0)

selected_columns = df.columns[columns]
data = df[selected_columns]

In [None]:
# Summary (columns, non-null counts, dtypes) of the selected features.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3164 entries, 0 to 3163
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tone                3164 non-null   object
 1   stress              3164 non-null   object
 2   consonantal         3164 non-null   object
 3   spreadGlottis       3164 non-null   object
 4   constrictedGlottis  3164 non-null   object
dtypes: object(5)
memory usage: 123.7+ KB


In [None]:
# NOTE(review): `total_df` is not defined in the visible part of this notebook.
total_df.describe()

Unnamed: 0,Phoneme,tone,stress,syllabic,short,long,consonantal,sonorant,continuant,delayedRelease,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
count,3164,3164,3164,3164,3164,3164,3164,3164,3164,3164,...,3164,3164,3164,3164,3164,3164,3164,3164,3164,3164
unique,3164,3,3,9,5,7,7,11,10,10,...,6,4,10,4,11,8,4,8,6,8
top,χʷ,0,-,-,-,-,+,+,+,0,...,0,0,+,-,-,-,-,-,-,-
freq,1,3102,3102,1990,3019,2582,1876,1546,1730,1596,...,1978,1978,1970,3092,2625,2692,1964,2915,3070,1828


In [None]:
# Alphabets from which the three-letter feature codes are built.
chars1 = 'ABCDEFGH'
chars2 = 'NOPQRST'

In [None]:
# Generators of three-letter codes. Note the crossed numbering: gen2 draws from
# chars1 (8 letters, 512 codes) and gen1 from chars2 (7 letters, 343 codes).
# Both are consumed with next() by later cells, so re-running those cells
# without re-running this one continues where the generators stopped.
gen2 = itertools.product(chars1, repeat=3)
gen1 = itertools.product(chars2, repeat=3)

In [None]:
# Scratch cell: a bare tuple of feature names that is not assigned to anything.
'syllabic', 'nasal', 'labial', 

In [None]:
# One example phoneme (the first occurrence) for every distinct combination of
# the five listed features.
total_df[['Phoneme', 'tone', 'consonantal', 'stress', 'spreadGlottis', 'constrictedGlottis']].drop_duplicates(subset=['tone', 'consonantal', 'stress', 'spreadGlottis', 'constrictedGlottis'])

In [None]:
# Limit DataFrame displays to 20 rows, then summarize the rows of `df` whose
# 'consonantal' value is '+'.
pd.set_option('display.max_rows', 20)
df[df['consonantal'] == '+'].describe()

Unnamed: 0,Phoneme,tone,stress,syllabic,short,long,consonantal,sonorant,continuant,delayedRelease,approximant,tap,trill,nasal,lateral,labial,round,labiodental,coronal,anterior,distributed,strident,dorsal,high,low,front,back,tense,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
count,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876,1876
unique,1876,1,1,2,2,4,1,9,8,9,5,5,6,8,7,11,5,5,9,6,10,9,14,5,3,4,4,1,2,2,8,2,9,6,2,6,4,6
top,n̤s,0,-,-,-,-,+,-,-,-,-,-,-,-,-,-,0,0,+,+,-,-,-,0,0,0,0,0,0,0,-,-,-,-,-,-,-,-
freq,1,1876,1876,1851,1861,1617,1876,1237,1131,715,1611,1784,1804,1461,1663,1391,1392,1392,1158,945,638,926,1018,1021,1021,1021,1021,1876,1792,1792,876,1874,1517,1551,1832,1694,1844,1701


In [None]:
pd.set_option('display.max_rows', 20)

# Rows of `df` whose 'consonantal' value is '+'.
consonant_rows = df[df['consonantal'] == '+']

# Distinct value combinations of the first group of consonant features ...
cons_f1 = consonant_rows[
    ['continuant', 'sonorant', 'distributed', 'nasal', 'lateral', 'strident', 'delayedRelease']
].drop_duplicates()

# ... and of the second group.
cons_f2 = consonant_rows[
    ['anterior', 'dorsal', 'coronal', 'labial', 'labiodental', 'retractedTongueRoot', 'advancedTongueRoot']
].drop_duplicates()

cons_f1.info()
cons_f2.info()

In [None]:
# Map every feature combination (joined into one string and prefixed with '-')
# to the next unused three-letter code. Both mappings draw from the same
# generator, so no code is shared between the two groups.
cons_f1_dict = {
    '-' + ''.join(feature_row): ''.join(next(gen2))
    for feature_row in cons_f1.values
}

cons_f2_dict = {
    '-' + ''.join(feature_row): ''.join(next(gen2))
    for feature_row in cons_f2.values
}

In [None]:
# Number of entries in each consonant code mapping.
for mapping in (cons_f1_dict, cons_f2_dict):
    print(len(mapping))

147
91


In [None]:
# Phonemes and the two feature groups of the rows whose 'consonantal' value is
# '+', as parallel lists.
is_consonant = df['consonantal'] == '+'
cons = df[is_consonant][['Phoneme']].to_numpy().flatten().tolist()
cons_f1_list = df[is_consonant][
    ['continuant', 'sonorant', 'distributed', 'nasal', 'lateral', 'strident', 'delayedRelease']
].to_numpy().tolist()
cons_f2_list = df[is_consonant][
    ['anterior', 'dorsal', 'coronal', 'labial', 'labiodental', 'retractedTongueRoot', 'advancedTongueRoot']
].to_numpy().tolist()

In [None]:
# phoneme -> "<phoneme> <code of group 1> <code of group 2>"
phon_features_cons = {}
# concatenated code pair of every consonant, in row order
num_cons = []

for phoneme, group1, group2 in zip(cons, cons_f1_list, cons_f2_list):
    code1 = cons_f1_dict['-' + ''.join(group1)]
    code2 = cons_f2_dict['-' + ''.join(group2)]
    num_cons.append(code1 + code2)
    phon_features_cons[phoneme] = ' '.join([phoneme, code1, code2])

In [None]:
# Display the complete consonant mapping.
# NOTE(review): this prints every entry; consider showing a sample only.
phon_features_cons

In [None]:
pd.set_option('display.max_rows', 20)

# Rows of `df` whose 'consonantal' value is '-'.
non_consonant_rows = df[df['consonantal'] == '-']

# Distinct value combinations of the first group of features ...
non_cons_f1 = non_consonant_rows[['high', 'low', 'front', 'back', 'tense']].drop_duplicates()

# ... and of the second group.
non_cons_f2 = non_consonant_rows[
    ['nasal', 'labial', 'labiodental', 'syllabic', 'long', 'round']
].drop_duplicates()

non_cons_f2.info()
non_cons_f1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 3109
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nasal        101 non-null    object
 1   labial       101 non-null    object
 2   labiodental  101 non-null    object
 3   syllabic     101 non-null    object
 4   long         101 non-null    object
 5   round        101 non-null    object
dtypes: object(6)
memory usage: 5.5+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 209 entries, 0 to 3110
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   high    209 non-null    object
 1   low     209 non-null    object
 2   front   209 non-null    object
 3   back    209 non-null    object
 4   tense   209 non-null    object
dtypes: object(5)
memory usage: 9.8+ KB


In [None]:
# Map every feature combination (joined into one string and prefixed with '-')
# to the next unused three-letter code. Both mappings draw from the same
# generator, so no code is shared between the two groups.
non_cons_f1_dict = {
    '-' + ''.join(feature_row): ''.join(next(gen1))
    for feature_row in non_cons_f1.values
}

non_cons_f2_dict = {
    '-' + ''.join(feature_row): ''.join(next(gen1))
    for feature_row in non_cons_f2.values
}

In [None]:
# Number of entries in each non-consonant code mapping.
for mapping in (non_cons_f1_dict, non_cons_f2_dict):
    print(len(mapping))

103
206


In [None]:
# Phonemes and the two feature groups of the rows whose 'consonantal' value is
# '-', as parallel lists.
# The lookup keys built from f1_list / f2_list must consist of exactly the same
# columns, in the same order, as the keys of non_cons_f1_dict / non_cons_f2_dict.
# The column lists used to be repeated here with 'tense' in the other group,
# which made every lookup fail; they are now taken from the frames those
# dictionaries are built from.
non_consonant_rows = df[df['consonantal'] == '-']
non_cons = non_consonant_rows['Phoneme'].tolist()
f1_list = non_consonant_rows[list(non_cons_f1.columns)].values.tolist()
f2_list = non_consonant_rows[list(non_cons_f2.columns)].values.tolist()

In [None]:
# phoneme -> "<phoneme> <code of group 1> <code of group 2>"
phon_features_non_cons = {}
# concatenated code pair of every non-consonant, in row order
num = []

for phoneme, group1, group2 in zip(non_cons, f1_list, f2_list):
    code1 = non_cons_f1_dict['-' + ''.join(group1)]
    code2 = non_cons_f2_dict['-' + ''.join(group2)]
    num.append(code1 + code2)
    phon_features_non_cons[phoneme] = ' '.join([phoneme, code1, code2])

In [None]:
 # Display the complete non-consonant mapping.
 phon_features_non_cons

In [None]:
# Merge both mappings into one; if a phoneme occurs in both, the consonant
# entry wins because it is unpacked last.
all_features = {**phon_features_non_cons, **phon_features_cons}

In [None]:
# Number of phonemes that have a feature encoding.
len(all_features)

3071

In [None]:
# Load the German grapheme ('g') / phoneme ('p') pairs.
# dtype=str and na_filter=False keep every cell as the literal text of the file.
# With the pandas defaults, entries such as "null" or "nan" are parsed as
# missing values (float NaN), which breaks the string handling further down
# and silently blanks the entry in the written file.
# NOTE(review): '/content/...' is an absolute, environment-specific path.
deu = pd.read_csv('/content/deu_latn_broad_filtered.tsv', sep='\t', names=['g', 'p'], dtype=str, na_filter=False)

In [None]:
# Pronunciations and spellings as parallel lists of one-element lists.
dp = deu[['p']].to_numpy().tolist()
dg = deu[['g']].to_numpy().tolist()

In [None]:
# NOTE(review): this import sits in the middle of the notebook rather than in
# the import cell at the top.
import re
# Character class of the symbols that are removed from a phoneme token before
# it is looked up: a literal '.' and three combining marks (they look like the
# tie bars U+0361 / U+035C and the mark U+030D — TODO confirm the code points).
PATTERN = r'[͡.̍͜]'

In [None]:
# Build [spelling, expanded pronunciation] pairs: every space-separated token
# is stripped of the PATTERN characters and replaced by its entry in
# `all_features` when there is one; other tokens are kept as they are.
new_phones = []
for graph, phon in zip(dg, dp):
    expanded_tokens = []
    for token in phon[0].split(' '):
        token = re.sub(PATTERN, '', token)
        expanded_tokens.append(all_features.get(token, token))
    new_phones.append([graph[0], ' '.join(expanded_tokens).strip()])

In [None]:
# Display the expanded pronunciations.
# NOTE(review): this prints the whole list; consider showing a slice only.
new_phones

In [None]:
# Two-column frame: spelling, expanded pronunciation.
new_deu = pd.DataFrame(new_phones)

In [None]:
# Write the result as a tab-separated file without header row and index column.
new_deu.to_csv('deu_latn_broad_filtered_FEATURES.tsv', sep='\t', header=False, index=False)