# PHOIBLE

An exploratory notebook to inspect the PHOIBLE dataset.

PHOIBLE project page: https://phoible.org/  
PHOIBLE data on GitHub: https://github.com/phoible/dev/tree/master/data

In [1]:
import pandas as pd

In [2]:
# Read the PHOIBLE segment table from the project's GitHub repository into a pandas DataFrame.
# low_memory=False makes pandas parse the file in one piece instead of in chunks,
# so each column's dtype is inferred from the whole column.
PHOIBLE_CSV_URL = 'https://raw.githubusercontent.com/phoible/dev/master/data/phoible.csv'
df = pd.read_csv(PHOIBLE_CSV_URL, low_memory=False)

In [3]:
df

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
0,1,kore1280,kor,Korean,,0068,h,ç h ɦ,,consonant,...,0,0,-,-,+,-,-,-,-,-
1,1,kore1280,kor,Korean,,006A,j,j,,consonant,...,0,0,+,-,-,-,-,-,-,-
2,1,kore1280,kor,Korean,,006B,k,k̚ ɡ k,,consonant,...,0,0,-,-,-,-,-,-,-,-
3,1,kore1280,kor,Korean,,006B+02B0,kʰ,kʰ,,consonant,...,0,0,-,-,+,-,-,-,-,-
4,1,kore1280,kor,Korean,,006B+02C0,kˀ,kˀ,,consonant,...,0,0,-,-,-,+,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105482,3020,lamu1254,lby,Tableland Lamalama,,0294,ʔ,,False,consonant,...,0,0,-,-,-,+,-,-,-,-
105483,3020,lamu1254,lby,Tableland Lamalama,,03B8,θ,,False,consonant,...,0,0,-,-,-,-,-,-,-,-
105484,3020,lamu1254,lby,Tableland Lamalama,,0061,a,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
105485,3020,lamu1254,lby,Tableland Lamalama,,0069,i,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [4]:
# Map each ISO 639-3 language code to a language name.  Rows missing either value are dropped.
# Several rows (inventories, name variants) can share one code; the dict keeps only the name
# from the last such row, so some naming information is lost.
code_name_pairs = df[['ISO6393', 'LanguageName']].dropna()
languages = dict(zip(code_name_pairs['ISO6393'], code_name_pairs['LanguageName']))

In [5]:
# Map each language code to the rows of `df` carrying that code, so inventories can be
# looked up by ISO 639-3 code.  All rows sharing a code are kept together.
# A single groupby pass splits the table once; filtering `df` separately for every code
# would rescan the whole table once per language.  sort=False keeps the groups' rows in
# their original order, and the final dict follows the key order of `languages`.
rows_by_code = {code: rows for code, rows in df.groupby('ISO6393', sort=False)}
phoible = {language: rows_by_code[language] for language in languages}

In [6]:
phoible['eng']

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
6029,160,stan1293,eng,English,,0062,b,bⁿ p͉ b̚ b b,,consonant,...,0,0,+,-,-,-,-,-,-,-
6030,160,stan1293,eng,English,,0064,d,t͉ dˡ d̪ dⁿ ɖ d dz d̚ d,,consonant,...,0,0,+,-,-,-,-,-,-,-
6031,160,stan1293,eng,English,,0064+0320+0292,d̠ʒ,d̠ʒ t̠ʃ͉ d̠ʒ,,consonant,...,0,0,+,-,-,-,-,-,-,-
6032,160,stan1293,eng,English,,0066,f,f,,consonant,...,0,0,-,-,-,-,-,-,-,-
6033,160,stan1293,eng,English,,0068,h,h ç ɦ,,consonant,...,0,0,-,-,+,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90925,2515,stan1293,eng,English,English (Liverpool),025B,ɛ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90926,2515,stan1293,eng,English,English (Liverpool),025B+0289,ɛʉ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90927,2515,stan1293,eng,English,English (Liverpool),026A,ɪ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90928,2515,stan1293,eng,English,English (Liverpool),0289+02D0,ʉː,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [7]:
# define some helper functions
def jaccard(s1, s2):
    """Return the Jaccard index |s1 & s2| / |s1 | s2| of two sets.

    Two empty sets are treated as identical and score 1.0, which also avoids
    dividing by zero.
    """
    union_size = len(s1 | s2)
    if union_size == 0:
        return 1.0
    shared_size = len(s1 & s2)
    return shared_size / union_size

def phonemes(language):
    """Return the set of distinct 'Phoneme' values in the rows stored under `language` in `phoible`."""
    return set(phoible[language]['Phoneme'])

def consonants(language):
    """Return the set of 'Phoneme' values of `language` whose 'SegmentClass' is exactly 'consonant'."""
    inventory = phoible[language]
    is_consonant = inventory['SegmentClass'] == 'consonant'
    return set(inventory.loc[is_consonant, 'Phoneme'])

def vowels(language):
    """Return the set of 'Phoneme' values of `language` whose 'SegmentClass' is exactly 'vowel'."""
    inventory = phoible[language]
    is_vowel = inventory['SegmentClass'] == 'vowel'
    return set(inventory.loc[is_vowel, 'Phoneme'])


In [8]:
# Summarise the English inventory: all phonemes, then consonants only, then vowels only.
# The sets pool every row filed under 'eng', so several inventories contribute to the counts.
eng_all, eng_consonants, eng_vowels = phonemes('eng'), consonants('eng'), vowels('eng')
print(f'''{languages['eng']} has {len(eng_all)} phonemes: {eng_all}
... and {len(eng_consonants)} consonants: {eng_consonants}
... and {len(eng_vowels)} vowels: {eng_vowels}
''')

English has 94 phonemes: {'ʊ', 'aɪ', 'oʊ', 'ɛː', 'ɔɪ', 'ɒ', 'r', 'iː', 'ɪə', 'o̞ː', 'eɪ̯', 'eː', 'øː', 'w', 'm', 'ʉː', 'ɑː', 'd', 'u', 's', 'ɔː', 'n', 'ʊə', 'ɐʉ', 'ʉə', 't', 'ɑe', 'ɚ', 'ʔ', 'e̞', 'l', 'pʰ', 'ts', 'ə', 'ɒɯ', 'æo', 'ei', 'uː', 'iɪ', 'əʊ', 'ɘ', 'ɵː', 'oɪ', 'ɐ', 'oe', 'aː', 'x', 'ɐː', 'ɹ', 'ɑ', 'ɪ', 'tʰ', 'oː', 'eɪ', 'ʃ', 'ɒː', 'k', 't̠ʃ', 'æɪ', 'ɔ', 'æɔ', 'kx', 'p', 'eə', 'z', 'əː', 'ð', 'iə', 'h', 'θ', 'v', 'e', 'b', 'i', 'ɚː', 'æ', 'j', 'a', 'iɛ', 'd̠ʒ', 'ʌ', 'ɛ', 'ŋ', 'ʒ', 'ɜː', 'æe', 'əʉ', 'f', 'aʊ', 'ʍ', 'ɻ', 'ɡ', 'ɛʉ', 'kʰ'}
... and 34 consonants: {'j', 't̠ʃ', 'd̠ʒ', 'kx', 'p', 'r', 'ŋ', 'ʒ', 'z', 'ð', 'w', 'm', 'f', 'd', 'h', 'b', 'ʍ', 's', 'ɻ', 'x', 'n', 'ɹ', 'θ', 'v', 'ɡ', 'tʰ', 'ʃ', 't', 'kʰ', 'ʔ', 'k', 'l', 'pʰ', 'ts'}
... and 60 vowels: {'ʊ', 'a', 'iɛ', 'ə', 'aɪ', 'æɪ', 'ɔ', 'æɔ', 'oʊ', 'ʌ', 'ɛ', 'ɛː', 'ɒɯ', 'ɔɪ', 'ɒ', 'ɜː', 'æo', 'ei', 'uː', 'iɪ', 'iː', 'eə', 'ɪə', 'o̞ː', 'eɪ̯', 'eː', 'øː', 'əː', 'ɚː', 'əʊ', 'æe', 'əʉ', 'ɘ', 'ɵː', 'oɪ', 'ɐ', 'oe', 'ʉː', 'iə',

In [9]:
# Every row for the phoneme 'ə' among the rows filed under 'eng' (one per inventory that lists it)
eng = phoible['eng']
eng.loc[eng['Phoneme'].eq('ə')]

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
6065,160,stan1293,eng,English,,259,ə,ɜ ə ə ə,,vowel,...,-,-,+,-,-,-,0,-,-,0
76433,2175,stan1293,eng,English (American),Western and Mid-Western US; Southern California,259,ə,ə,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76556,2178,stan1293,eng,English (British),Liverpool,259,ə,ə,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76641,2180,stan1293,eng,English (British),Tyneside English (spoken in Newcastle),259,ə,ə ɐ,False,vowel,...,-,-,+,-,-,-,0,-,-,0
79779,2252,stan1293,eng,English,English (RP),259,ə,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90924,2515,stan1293,eng,English,English (Liverpool),259,ə,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [10]:
# Show the first English 'ə' row vertically (one line per column) so no field is elided
first_eng_schwa = eng.loc[eng['Phoneme'] == 'ə'].iloc[0]
first_eng_schwa.to_frame()

Unnamed: 0,6065
InventoryID,160
Glottocode,stan1293
ISO6393,eng
LanguageName,English
SpecificDialect,
GlyphID,0259
Phoneme,ə
Allophones,ɜ ə ə ə
Marginal,
SegmentClass,vowel


In [11]:
# Summarise the Hebrew inventory: all phonemes, then consonants only, then vowels only.
# The sets pool every row filed under 'heb'.
heb_all, heb_consonants, heb_vowels = phonemes('heb'), consonants('heb'), vowels('heb')
print(f'''{languages['heb']} has {len(heb_all)} phonemes: {heb_all}
... and {len(heb_consonants)} consonants: {heb_consonants}
... and {len(heb_vowels)} vowels: {heb_vowels}
''')

Israeli Hebrew has 43 phonemes: {'ʊ', 'j', 'a', 't̠ʃ', 'o', 'p͉', 't̠ʃ͉', 'p', 'ɛ', 't͉', 'r', 'ŋ', 'ʒ', 'ə̆', 'z', 'ʃ͉', 'χ', 'm', 'f', 'h', 'd', 'ħ', 'u', 's', 'k͉', 'x', 'n', 'v', 'i', 'ɡ', 'ɪ', 'ʕ', 't', 'ʃ', 'f͉', 's͉', 'e', 'ʁ', 'ʔ', 'k', 'l', 'b', 'ts'}
... and 34 consonants: {'j', 't̠ʃ', 'p͉', 't̠ʃ͉', 'p', 't͉', 'r', 'ŋ', 'ʒ', 'z', 'ʃ͉', 'χ', 'm', 'f', 'h', 'd', 'ħ', 's', 'k͉', 'x', 'n', 'v', 'ɡ', 'ʕ', 't', 'ʃ', 'f͉', 's͉', 'ʁ', 'ʔ', 'k', 'l', 'b', 'ts'}
... and 9 vowels: {'i', 'ʊ', 'a', 'o', 'e', 'u', 'ə̆', 'ɛ', 'ɪ'}



In [12]:
# All vowel rows among the rows filed under 'heb'
heb = phoible['heb']
heb.loc[heb['SegmentClass'].eq('vowel')]

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
4989,135,hebr1245,heb,Modern Hebrew,,0061,a,ɒ a,,vowel,...,-,-,+,-,-,-,0,-,-,0
4990,135,hebr1245,heb,Modern Hebrew,,006F,o,o,,vowel,...,-,-,+,-,-,-,0,-,-,0
4991,135,hebr1245,heb,Modern Hebrew,,0259+0306,ə̆,ə̆ ə̆,,vowel,...,-,-,+,-,-,-,0,-,-,0
4992,135,hebr1245,heb,Modern Hebrew,,025B,ɛ,e ɛ ei ɛ,,vowel,...,-,-,+,-,-,-,0,-,-,0
4993,135,hebr1245,heb,Modern Hebrew,,026A,ɪ,ɪ i,,vowel,...,-,-,+,-,-,-,0,-,-,0
4994,135,hebr1245,heb,Modern Hebrew,,028A,ʊ,ʊ,,vowel,...,-,-,+,-,-,-,0,-,-,0
76997,2189,hebr1245,heb,Hebrew,Oriental Dialect,0061,a,a,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76998,2189,hebr1245,heb,Hebrew,Oriental Dialect,0065,e,e ei,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76999,2189,hebr1245,heb,Hebrew,Oriental Dialect,0069,i,i,False,vowel,...,-,-,+,-,-,-,0,-,-,0
77000,2189,hebr1245,heb,Hebrew,Oriental Dialect,006F,o,o,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [13]:
# Show the first Hebrew 'ʕ' row vertically (one line per column) so no field is elided
first_heb_ayin = heb.loc[heb['Phoneme'] == 'ʕ'].iloc[0]
first_heb_ayin.to_frame()

Unnamed: 0,76995
InventoryID,2189
Glottocode,hebr1245
ISO6393,heb
LanguageName,Hebrew
SpecificDialect,Oriental Dialect
GlyphID,0295
Phoneme,ʕ
Allophones,ʕ
Marginal,False
SegmentClass,consonant


In [14]:
# Summarise the Japanese inventory: all phonemes, then consonants only, then vowels only.
# The sets pool every row filed under 'jpn'.
jpn_all, jpn_consonants, jpn_vowels = phonemes('jpn'), consonants('jpn'), vowels('jpn')
print(f'''{languages['jpn']} has {len(jpn_all)} phonemes: {jpn_all}
... and {len(jpn_consonants)} consonants: {jpn_consonants}
... and {len(jpn_vowels)} vowels: {jpn_vowels}
''')

Japanese has 51 phonemes: {'çː', 'j', 'a', 't̠ʃ', 'o', 'ɴ', 'd̠ʒ', 'ɔ', 'ɛː', 'p', 'ɛ', 'ŋ', 'ɯː', 'z', 'iː', 'z̪', 'd̪', 't̠ʃː', 'w', 'm', 'tː', 'd', 'h', 'aː', 'u', 's', '˥', 'ɔː', 'n', 'ʃː', 'i', 'sː', 'ɡ', 's̪', 't̪', 't', 'ɾ', 'ʃ', 'kː', 'tsː', 'pː', 'ɯ', 'ɯ̃', 'n̪', 'd̠', 'e', 'ʔ', 'k', 'b', 'ts', '˧'}
... and 35 consonants: {'çː', 'j', 't̠ʃ', 'ɴ', 'd̠ʒ', 'p', 'ŋ', 'z', 'z̪', 'd̪', 't̠ʃː', 'w', 'm', 'tː', 'd', 'h', 's', 'n', 'ʃː', 'sː', 'ɡ', 's̪', 't̪', 't', 'ɾ', 'ʃ', 'kː', 'tsː', 'pː', 'n̪', 'd̠', 'ʔ', 'k', 'b', 'ts'}
... and 14 vowels: {'ɯ̃', 'aː', 'ɯ', 'a', 'ɯː', 'e', 'o', 'u', 'iː', 'ɔ', 'ɔː', 'ɛː', 'ɛ', 'i'}



In [15]:
# All consonant rows among the rows filed under 'jpn'
jpn = phoible['jpn']
jpn.loc[jpn['SegmentClass'].eq('consonant')]

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
7526,197,nucl1643,jpn,Japanese,,0062,b,bʲ b,,consonant,...,0,0,+,-,-,-,-,-,-,-
7527,197,nucl1643,jpn,Japanese,,0064,d,d,,consonant,...,0,0,+,-,-,-,-,-,-,-
7528,197,nucl1643,jpn,Japanese,,0064+0320+0292,d̠ʒ,ʒ d̠ʒ,,consonant,...,0,0,+,-,-,-,-,-,-,-
7529,197,nucl1643,jpn,Japanese,,0068,h,ɸ h ç ɸː fː hʲ f,,consonant,...,0,0,-,-,+,-,-,-,-,-
7530,197,nucl1643,jpn,Japanese,,006A,j,j j̥,,consonant,...,0,0,+,-,-,-,-,-,-,-
7531,197,nucl1643,jpn,Japanese,,006B,k,k̟ʲ k,,consonant,...,0,0,-,-,-,-,-,-,-,-
7532,197,nucl1643,jpn,Japanese,,006B+02D0,kː,k̟ʲː kː,,consonant,...,0,0,-,-,-,-,-,-,-,-
7533,197,nucl1643,jpn,Japanese,,006D,m,mʲ m,,consonant,...,0,0,+,-,-,-,-,-,-,-
7534,197,nucl1643,jpn,Japanese,,006E,n,n nʲ,,consonant,...,0,0,+,-,-,-,-,-,-,-
7535,197,nucl1643,jpn,Japanese,,0070,p,pʲ p,,consonant,...,0,0,-,-,-,-,-,-,-,-


In [16]:
# Show the first Japanese 'ɾ' row vertically (one line per column) so no field is elided
first_jpn_tap = jpn.loc[jpn['Phoneme'] == 'ɾ'].iloc[0]
first_jpn_tap.to_frame()

Unnamed: 0,7549
InventoryID,197
Glottocode,nucl1643
ISO6393,jpn
LanguageName,Japanese
SpecificDialect,
GlyphID,027E
Phoneme,ɾ
Allophones,ɾʲ ɾ ɺ
Marginal,
SegmentClass,consonant


In [17]:
# more helper functions for measuring similarities (as Jaccard index) of each pair of phonemic inventories
from operator import itemgetter
from itertools import combinations
from itertools import product

# Column labels of the ranking tables built by the similarity helpers
columns = (
    'lang1:ISO6393',
    'lang2:ISO6393',
    'lang1:LanguageName',
    'lang2:LanguageName',
    'JaccardIndex',
)

def rank_similar(phonemes=phonemes):
    """Rank every unordered pair of languages in `phoible` by inventory similarity.

    Parameters
    ----------
    phonemes : callable, optional
        Maps a language code (a key of `phoible`) to the set of segments to
        compare.  Defaults to the full inventory; pass `vowels` or
        `consonants` to compare only that segment class.

    Returns
    -------
    pandas.DataFrame
        One row per language pair with the columns listed in `columns`,
        sorted by 'JaccardIndex' in descending order.  The sort is stable, so
        tied pairs keep the order in which `itertools.combinations` produced them.
    """
    # Build each inventory once.  Calling `phonemes` inside the pair loop would
    # repeat the same DataFrame filtering for every pair a language takes part in.
    inventories = {language: phonemes(language) for language in phoible}
    scored_pairs = [
        (l1, l2, languages[l1], languages[l2], jaccard(inventories[l1], inventories[l2]))
        for l1, l2 in combinations(phoible, r=2)
    ]
    # index 4 is the Jaccard score
    scored_pairs.sort(key=itemgetter(4), reverse=True)
    return pd.DataFrame(scored_pairs, columns=columns)

def rank_similar_to_languge(language, phonemes=phonemes):
    """Rank all other languages in `phoible` by inventory similarity to `language`.

    Parameters
    ----------
    language : str
        Language code of the reference language (a key of `phoible`).
    phonemes : callable, optional
        Maps a language code to the set of segments to compare.  Defaults to
        the full inventory; pass `vowels` or `consonants` to compare only
        that segment class.

    Returns
    -------
    pandas.DataFrame
        One row per other language with the columns listed in `columns`
        (`lang1` is always `language`), sorted by 'JaccardIndex' in descending
        order.  Tied rows keep the key order of `phoible`, so the result is
        the same on every run.

    Raises
    ------
    KeyError
        With the default helpers, if `language` is not a key of `phoible`.
    """
    # The reference inventory is the same for every comparison: build it once.
    reference = phonemes(language)
    # Walk `phoible` in its own key order.  Going through a set difference would
    # make the order of tied rows depend on string hashing.
    scored_pairs = [
        (language, other, languages[language], languages[other], jaccard(reference, phonemes(other)))
        for other in phoible
        if other != language
    ]
    # index 4 is the Jaccard score
    scored_pairs.sort(key=itemgetter(4), reverse=True)
    return pd.DataFrame(scored_pairs, columns=columns)


# Correctly spelled alias; the original (misspelled) name is kept so existing calls still work.
rank_similar_to_language = rank_similar_to_languge

In [18]:
# rank the most similar phonemic inventory pairs (takes a while to compute)
rank_similar()

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,xwe,xwl,xwela,western xwla,1.0
1,xwe,kqk,xwela,kotafon,1.0
2,xwl,kqk,western xwla,kotafon,1.0
3,nzi,aha,Nzema,Ahanta,1.0
4,gru,mvz,Goggot,Masqan,1.0
...,...,...,...,...,...
2199748,wrs,mvi,WARIS,Miyako,0.0
2199749,jup,mvi,Hup,Miyako,0.0
2199750,cro,mvi,Crow,Miyako,0.0
2199751,yab,mvi,Yuhup,Miyako,0.0


In [19]:
# rank all phonemic inventories by similarity to English
rank_similar_to_languge('eng')

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,eng,deu,English,German,0.407407
1,eng,agq,English,Aghem,0.386792
2,eng,jer,English,Jere,0.366337
3,eng,lag,English,Langi,0.364486
4,eng,xsm,English,Kasɩm,0.362745
...,...,...,...,...,...
2092,eng,yxm,English,Yinwum,0.080357
2093,eng,zmv,English,Rimanggudinhma,0.078947
2094,eng,new,English,NEWARI,0.076923
2095,eng,ggk,English,Kungarakany,0.074074


In [20]:
# find the most similar vocalic inventories (takes a while to compute)
rank_similar(phonemes=vowels)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,zsm,bum,Standard Malay,Bulu,1.0
1,khl,fia,KALIAI,NUBIAN,1.0
2,mph,tun,Mawng,TUNICA,1.0
3,mph,swh,Mawng,Swahili,1.0
4,mph,bom,Mawng,Berom,1.0
...,...,...,...,...,...
2199748,uig,xrd,Uyghur,Gundungurra,0.0
2199749,uig,xni,Uyghur,Ngarigu,0.0
2199750,uig,gcd,Uyghur,Gangalidda,0.0
2199751,uig,gyd,Uyghur,Yangkaralda,0.0


In [21]:
# rank all vocalic inventories by similarity to English
rank_similar_to_languge('eng', phonemes=vowels)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,eng,shk,English,Shilluk,0.303030
1,eng,nld,English,Dutch,0.294118
2,eng,kfa,English,Kod̩agu,0.287879
3,eng,btg,English,Bété,0.283582
4,eng,xtc,English,Kadugli (Katcha),0.281250
...,...,...,...,...,...
2092,eng,kaa,English,Karakalpak,0.028169
2093,eng,ega,English,Ega,0.014706
2094,eng,dib,English,"Dinka, South Central",0.000000
2095,eng,abk,English,Abkhaz,0.000000


In [22]:
# find the most similar consonantal inventories (takes a while to compute)
rank_similar(phonemes=consonants)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,bao,cbc,Waimaha,Karapanã,1.0
1,bao,sri,Waimaha,Siriano,1.0
2,bao,tue,Waimaha,Tuyuca,1.0
3,bao,yui,Waimaha,Yurutí,1.0
4,ahp,ahi,AIZI,"Aizi, Tiagba",1.0
...,...,...,...,...,...
2199748,xav,slr,Xavánte,Salar,0.0
2199749,yab,mvi,Yuhup,Miyako,0.0
2199750,bsn,kim,Barasana-Eduria,Tofa,0.0
2199751,nhd,slr,Nhandeva,Salar,0.0


In [23]:
# rank all consonantal inventories by similarity to English
rank_similar_to_languge('eng', phonemes=consonants)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,eng,rmy,English,Vlax Romani,0.658537
1,eng,deu,English,German,0.650000
2,eng,bgj,English,Bangolan,0.648649
3,eng,bzj,English,Belizean Creole,0.647059
4,eng,kcn,English,Nubi,0.641026
...,...,...,...,...,...
2092,eng,wnc,English,WANTOAT,0.090909
2093,eng,ggk,English,Kungarakany,0.085106
2094,eng,xgm,English,Dharumbal,0.083333
2095,eng,ggr,English,Aghu Tharnggala,0.080000


## UMAP
Paper: https://arxiv.org/abs/1802.03426  
McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426.  
Homepage: https://github.com/lmcinnes/umap  
SciPy 2018 talk: https://www.youtube.com/watch?v=nq6iPZVUxZU  

The following code computes a UMAP projection of the English phonemic inventory. Each segment is represented by its vector of PHOIBLE feature values, and UMAP embeds these high-dimensional vectors in a 2D space so that phoneme similarity can be visualized.

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
%matplotlib notebook
np.random.seed(0)

In [25]:
# The UMAP class is taken from the `umap.umap_` submodule and the submodule is aliased as `umap`.
import umap.umap_ as umap
# import umap # (use this if the previous import doesn't work)
# Reducer with default settings; no random_state is passed here.
# NOTE(review): reproducibility presumably relies on the np.random.seed(0) call in the
# previous cell — confirm, or pass random_state explicitly.
reducer = umap.UMAP()

In [26]:
# Feature matrix for the selected language: one row per segment entry, one column per feature.
language = 'eng'
inventory = phoible[language]
# Everything from column position 11 onwards is used as a feature.
# Assumes the first 11 columns are all metadata — TODO confirm against the CSV header.
data = inventory.iloc[:, 11:].values
data

array([['0', '-', '-', ..., '-', '-', '-'],
       ['0', '-', '-', ..., '-', '-', '-'],
       ['0', '-', '-', ..., '-', '-', '-'],
       ...,
       ['0', '-', '+', ..., '-', '-', '0'],
       ['0', '-', '+', ..., '-', '-', '0'],
       ['0', '-', '+', ..., '-', '-', '0']], dtype=object)

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
# Encode the feature values as integers on a symmetric scale.
# Any value that is not a key of this table raises KeyError in the lookup below.
value_map = {'+': 2, '+,-': 1, '0': 0, '-,+': -1, '-': -2}
mapped_data = np.array([list(map(value_map.__getitem__, row)) for row in data])
# Standardise each feature column before embedding
scaler = StandardScaler()
scaled_data = scaler.fit_transform(mapped_data)
# Fit UMAP on the scaled features and project every row into 2D
embedding = reducer.fit_transform(scaled_data)
embedding.shape

(370, 2)

In [29]:
# Plot the 2D UMAP embedding of the selected language's segments, coloured by segment class.
# Uses `language` (set when the feature matrix was built) rather than a hard-coded code,
# so labels, colours and title stay in step with the data that was embedded.
inventory = phoible[language]
# separate x, y dimensions
x = embedding[:, 0]
y = embedding[:, 1]
# get phoneme labels for each data point
labels = inventory['Phoneme'].values
# colour each point by its 'SegmentClass'; any class other than consonant/vowel gets a
# third palette colour instead of failing on a missing mapping
palette = sns.color_palette()
class_colors = {'consonant': palette[0], 'vowel': palette[1]}
colors = [class_colors.get(segment_class, palette[2]) for segment_class in inventory['SegmentClass']]
# plot the 2D embeddings (takes a while to plot and adjust the text labels)
fig, ax = plt.subplots()  # fresh figure, so re-running the cell does not draw over an old one
ax.scatter(x, y, c=colors)
texts = []
seen = set()
# xi/yi keep the loop from rebinding the x/y arrays to scalars
for label, xi, yi in zip(labels, x, y):
    if label not in seen: # some phonemes are repeated, so we skip labeling them multiple times
        texts.append(ax.text(xi, yi, f'/{label}/'))
        seen.add(label)
adjust_text(texts) # this is a nice function that tries to keep text labels legible and not overlapping

ax.set_aspect('equal', 'datalim')
ax.set_title(f'UMAP projection of {languages[language]} phonemes')

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'UMAP projection of English phonemes')