# PHOIBLE

An exploratory notebook to inspect the PHOIBLE dataset.

PHOIBLE project page: https://phoible.org/  
PHOIBLE data on GitHub: https://github.com/phoible/dev/tree/master/data

In [1]:
import pandas as pd

In [2]:
# read the PHOIBLE phoneme table straight from GitHub into a pandas DataFrame
# (low_memory=False: parse the file in one pass instead of in chunks, which avoids mixed-dtype columns)
df = pd.read_csv(
    'https://raw.githubusercontent.com/phoible/dev/master/data/phoible.csv',
    low_memory=False,
)

In [3]:
# display the table; for a frame this large pandas elides the middle rows and columns
df

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
0,1,kore1280,kor,Korean,,0068,h,ç h ɦ,,consonant,...,0,0,-,-,+,-,-,-,-,-
1,1,kore1280,kor,Korean,,006A,j,j,,consonant,...,0,0,+,-,-,-,-,-,-,-
2,1,kore1280,kor,Korean,,006B,k,k̚ ɡ k,,consonant,...,0,0,-,-,-,-,-,-,-,-
3,1,kore1280,kor,Korean,,006B+02B0,kʰ,kʰ,,consonant,...,0,0,-,-,+,-,-,-,-,-
4,1,kore1280,kor,Korean,,006B+02C0,kˀ,kˀ,,consonant,...,0,0,-,-,-,+,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105482,3020,lamu1254,lby,Tableland Lamalama,,0294,ʔ,,False,consonant,...,0,0,-,-,-,+,-,-,-,-
105483,3020,lamu1254,lby,Tableland Lamalama,,03B8,θ,,False,consonant,...,0,0,-,-,-,-,-,-,-,-
105484,3020,lamu1254,lby,Tableland Lamalama,,0061,a,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
105485,3020,lamu1254,lby,Tableland Lamalama,,0069,i,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [4]:
# map each ISO 639-3 language code to a language name
# (lossy: a code can cover several named variants, and a later row overwrites an earlier one,
# so only the last name listed for each code is kept)
named_rows = df[['ISO6393', 'LanguageName']].dropna()
languages = dict(zip(named_rows['ISO6393'], named_rows['LanguageName']))

In [5]:
# map language codes to the relevant language rows so we can look up the inventories by language code
# The table is split in a single groupby pass instead of re-scanning every row once per language.
# sort=False keeps groups in order of first appearance; each group keeps its original row index.
rows_by_code = {code: rows for code, rows in df.groupby('ISO6393', sort=False)}
# iterate over `languages` so the keys (and their order) are exactly the codes that have a name
phoible = {language: rows_by_code[language] for language in languages}

In [6]:
# display every row stored under the code 'eng' (rows from more than one inventory share this code)
phoible['eng']

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
6029,160,stan1293,eng,English,,0062,b,bⁿ p͉ b̚ b b,,consonant,...,0,0,+,-,-,-,-,-,-,-
6030,160,stan1293,eng,English,,0064,d,t͉ dˡ d̪ dⁿ ɖ d dz d̚ d,,consonant,...,0,0,+,-,-,-,-,-,-,-
6031,160,stan1293,eng,English,,0064+0320+0292,d̠ʒ,d̠ʒ t̠ʃ͉ d̠ʒ,,consonant,...,0,0,+,-,-,-,-,-,-,-
6032,160,stan1293,eng,English,,0066,f,f,,consonant,...,0,0,-,-,-,-,-,-,-,-
6033,160,stan1293,eng,English,,0068,h,h ç ɦ,,consonant,...,0,0,-,-,+,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90925,2515,stan1293,eng,English,English (Liverpool),025B,ɛ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90926,2515,stan1293,eng,English,English (Liverpool),025B+0289,ɛʉ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90927,2515,stan1293,eng,English,English (Liverpool),026A,ɪ,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90928,2515,stan1293,eng,English,English (Liverpool),0289+02D0,ʉː,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [7]:
# define some helper functions
def jaccard(s1, s2):
    """Return the Jaccard index of two sets: |s1 & s2| / |s1 | s2|.

    Two empty sets are treated as identical and score 1.0, which also avoids
    dividing by zero.
    """
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)

def phonemes(language):
    """Return the set of distinct 'Phoneme' values among the rows stored for `language` in `phoible`."""
    return set(phoible[language]['Phoneme'])

def consonants(language):
    """Return the set of distinct phonemes of `language` whose 'SegmentClass' is 'consonant'."""
    inventory = phoible[language]
    is_consonant = inventory['SegmentClass'] == 'consonant'
    return set(inventory.loc[is_consonant, 'Phoneme'])

def vowels(language):
    """Return the set of distinct phonemes of `language` whose 'SegmentClass' is 'vowel'."""
    inventory = phoible[language]
    is_vowel = inventory['SegmentClass'] == 'vowel'
    return set(inventory.loc[is_vowel, 'Phoneme'])


In [8]:
# summarise the inventory stored under 'eng': all phonemes, then consonants and vowels separately
# (each set is built once and reused for both its size and its contents)
eng_phonemes, eng_consonants, eng_vowels = phonemes('eng'), consonants('eng'), vowels('eng')
print(f'''{languages['eng']} has {len(eng_phonemes)} phonemes: {eng_phonemes}
... and {len(eng_consonants)} consonants: {eng_consonants}
... and {len(eng_vowels)} vowels: {eng_vowels}
''')

English has 94 phonemes: {'ɘ', 'oː', 'ð', 'ɔ', 'pʰ', 'a', 't̠ʃ', 'd̠ʒ', 'ɵː', 'əʊ', 'ɒː', 'ʍ', 'ɐ', 'ɔɪ', 'n', 'x', 'j', 'ɛː', 'f', 'u', 'ʊ', 'l', 'kʰ', 'ɑ', 'm', 'aɪ', 'ɪə', 'æo', 'ɻ', 'z', 'ʃ', 'i', 'r', 'ɪ', 'iɪ', 'æɔ', 'ʒ', 'əː', 'ɔː', 'θ', 'aʊ', 'oʊ', 's', 'ɜː', 'ɐʉ', 'ɛ', 'o̞ː', 'iː', 'ɹ', 'ɚ', 'əʉ', 'ɛʉ', 'eə', 'b', 'iɛ', 'ei', 'eː', 'æɪ', 'ʊə', 'ɑː', 'æe', 'ʉə', 'w', 'ts', 'v', 'kx', 'e̞', 'k', 'ə', 'aː', 'e', 'uː', 't', 'eɪ', 'æ', 'ʌ', 'ɒɯ', 'ɚː', 'ŋ', 'ɐː', 'eɪ̯', 'ɡ', 'ɑe', 'h', 'd', 'ɒ', 'ʉː', 'øː', 'tʰ', 'ʔ', 'oɪ', 'p', 'oe', 'iə'}
... and 34 consonants: {'k', 'm', 'ɹ', 'ɻ', 't', 'ð', 'z', 'ʃ', 'pʰ', 't̠ʃ', 'd̠ʒ', 'b', 'r', 'ʍ', 'ŋ', 'n', 'ʒ', 'x', 'θ', 's', 'j', 'ɡ', 'h', 'w', 'f', 'd', 'ts', 'l', 'tʰ', 'kʰ', 'ʔ', 'v', 'p', 'kx'}
... and 60 vowels: {'o̞ː', 'ɘ', 'e̞', 'iː', 'ə', 'aː', 'ɚ', 'e', 'uː', 'aɪ', 'ɪə', 'əʉ', 'æo', 'oː', 'ɐʉ', 'ɔ', 'a', 'eɪ', 'ɛʉ', 'æ', 'ʌ', 'i', 'ɒɯ', 'eə', 'ɚː', 'ɵː', 'əʊ', 'ɪ', 'iɛ', 'ɒː', 'iɪ', 'æɔ', 'ɐ', 'ei', 'ɔɪ', 'ɐː', 'eː', 'əː', 'ɔː', 'e

In [9]:
# keep a handle on the English rows, then show only those whose phoneme is 'ə'
# (one row per English variant that lists it)
eng = phoible['eng']
eng.loc[eng['Phoneme'] == 'ə']

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
6065,160,stan1293,eng,English,,259,ə,ɜ ə ə ə,,vowel,...,-,-,+,-,-,-,0,-,-,0
76433,2175,stan1293,eng,English (American),Western and Mid-Western US; Southern California,259,ə,ə,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76556,2178,stan1293,eng,English (British),Liverpool,259,ə,ə,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76641,2180,stan1293,eng,English (British),Tyneside English (spoken in Newcastle),259,ə,ə ɐ,False,vowel,...,-,-,+,-,-,-,0,-,-,0
79779,2252,stan1293,eng,English,English (RP),259,ə,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
90924,2515,stan1293,eng,English,English (Liverpool),259,ə,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [10]:
# show the first English 'ə' row as a single-column table (one line per field)
eng.loc[eng['Phoneme'] == 'ə'].iloc[0].to_frame()

Unnamed: 0,6065
InventoryID,160
Glottocode,stan1293
ISO6393,eng
LanguageName,English
SpecificDialect,
GlyphID,0259
Phoneme,ə
Allophones,ɜ ə ə ə
Marginal,
SegmentClass,vowel


In [11]:
# summarise the inventory stored under 'heb': all phonemes, then consonants and vowels separately
# (each set is built once and reused for both its size and its contents)
heb_phonemes, heb_consonants, heb_vowels = phonemes('heb'), consonants('heb'), vowels('heb')
print(f'''{languages['heb']} has {len(heb_phonemes)} phonemes: {heb_phonemes}
... and {len(heb_consonants)} consonants: {heb_consonants}
... and {len(heb_vowels)} vowels: {heb_vowels}
''')

Israeli Hebrew has 43 phonemes: {'f͉', 'k', 'm', 'e', 't', 'z', 'ʃ', 'ə̆', 'a', 't̠ʃ', 'ʕ', 'i', 'k͉', 'b', 'r', 'ɪ', 'ŋ', 'ʁ', 'χ', 'p͉', 'ħ', 'n', 'ʒ', 'ʃ͉', 'x', 's', 'j', 't̠ʃ͉', 'ɡ', 's͉', 'h', 'o', 'f', 'd', 'u', 't͉', 'ts', 'ʊ', 'l', 'ʔ', 'v', 'p', 'ɛ'}
... and 34 consonants: {'f͉', 'k', 'm', 't', 'z', 'ʃ', 'ʕ', 't̠ʃ', 'k͉', 'b', 'r', 'ŋ', 'ʁ', 'χ', 'p͉', 'ħ', 'n', 'ʒ', 'ʃ͉', 'x', 's', 'j', 't̠ʃ͉', 'ɡ', 's͉', 'h', 'f', 'd', 't͉', 'ts', 'l', 'ʔ', 'v', 'p'}
... and 9 vowels: {'ə̆', 'a', 'o', 'i', 'e', 'u', 'ʊ', 'ɪ', 'ɛ'}



In [12]:
# keep a handle on the Hebrew rows, then show only the vowel segments
heb = phoible['heb']
heb.loc[heb['SegmentClass'] == 'vowel']

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
4989,135,hebr1245,heb,Modern Hebrew,,0061,a,ɒ a,,vowel,...,-,-,+,-,-,-,0,-,-,0
4990,135,hebr1245,heb,Modern Hebrew,,006F,o,o,,vowel,...,-,-,+,-,-,-,0,-,-,0
4991,135,hebr1245,heb,Modern Hebrew,,0259+0306,ə̆,ə̆ ə̆,,vowel,...,-,-,+,-,-,-,0,-,-,0
4992,135,hebr1245,heb,Modern Hebrew,,025B,ɛ,e ɛ ei ɛ,,vowel,...,-,-,+,-,-,-,0,-,-,0
4993,135,hebr1245,heb,Modern Hebrew,,026A,ɪ,ɪ i,,vowel,...,-,-,+,-,-,-,0,-,-,0
4994,135,hebr1245,heb,Modern Hebrew,,028A,ʊ,ʊ,,vowel,...,-,-,+,-,-,-,0,-,-,0
76997,2189,hebr1245,heb,Hebrew,Oriental Dialect,0061,a,a,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76998,2189,hebr1245,heb,Hebrew,Oriental Dialect,0065,e,e ei,False,vowel,...,-,-,+,-,-,-,0,-,-,0
76999,2189,hebr1245,heb,Hebrew,Oriental Dialect,0069,i,i,False,vowel,...,-,-,+,-,-,-,0,-,-,0
77000,2189,hebr1245,heb,Hebrew,Oriental Dialect,006F,o,o,False,vowel,...,-,-,+,-,-,-,0,-,-,0


In [13]:
# show the first Hebrew 'ʕ' row as a single-column table (one line per field)
heb.loc[heb['Phoneme'] == 'ʕ'].iloc[0].to_frame()

Unnamed: 0,76995
InventoryID,2189
Glottocode,hebr1245
ISO6393,heb
LanguageName,Hebrew
SpecificDialect,Oriental Dialect
GlyphID,0295
Phoneme,ʕ
Allophones,ʕ
Marginal,False
SegmentClass,consonant


In [14]:
# summarise the inventory stored under 'jpn': all phonemes, then consonants and vowels separately
# (each set is built once and reused for both its size and its contents)
jpn_phonemes, jpn_consonants, jpn_vowels = phonemes('jpn'), consonants('jpn'), vowels('jpn')
print(f'''{languages['jpn']} has {len(jpn_phonemes)} phonemes: {jpn_phonemes}
... and {len(jpn_consonants)} consonants: {jpn_consonants}
... and {len(jpn_vowels)} vowels: {jpn_vowels}
''')

Japanese has 51 phonemes: {'pː', 'sː', 'iː', 'k', 'm', '˧', 'aː', 'e', 'n̪', 'd̪', 'ɯ̃', 's̪', 't', 'z', 'ʃ', 'ɔ', 'a', 't̠ʃ', 'ɾ', 'i', 'd̠ʒ', 'b', 'ŋ', 't̠ʃː', 'kː', 'n', 'ɯː', 'ɔː', 's', 'j', 'ʃː', 'ɡ', 'ɛː', 't̪', 'tsː', 'd̠', 'h', 'w', 'o', 'ɯ', 'd', 'u', 'çː', 'ts', 'z̪', 'ɴ', 'ʔ', 'p', 'tː', 'ɛ', '˥'}
... and 35 consonants: {'pː', 'sː', 'k', 'm', 'n̪', 'd̪', 's̪', 't', 'z', 'ʃ', 't̠ʃ', 'ɾ', 'd̠ʒ', 'b', 'ŋ', 't̠ʃː', 'kː', 'n', 's', 'j', 'ʃː', 'ɡ', 't̪', 'tsː', 'd̠', 'h', 'w', 'd', 'çː', 'ts', 'z̪', 'ɴ', 'ʔ', 'p', 'tː'}
... and 14 vowels: {'ɔ', 'a', 'iː', 'i', 'o', 'ɯ', 'aː', 'ɯː', 'e', 'u', 'ɔː', 'ɯ̃', 'ɛː', 'ɛ'}



In [15]:
# keep a handle on the Japanese rows, then show only the consonant segments
jpn = phoible['jpn']
jpn.loc[jpn['SegmentClass'] == 'consonant']

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
7526,197,nucl1643,jpn,Japanese,,0062,b,bʲ b,,consonant,...,0,0,+,-,-,-,-,-,-,-
7527,197,nucl1643,jpn,Japanese,,0064,d,d,,consonant,...,0,0,+,-,-,-,-,-,-,-
7528,197,nucl1643,jpn,Japanese,,0064+0320+0292,d̠ʒ,ʒ d̠ʒ,,consonant,...,0,0,+,-,-,-,-,-,-,-
7529,197,nucl1643,jpn,Japanese,,0068,h,ɸ h ç ɸː fː hʲ f,,consonant,...,0,0,-,-,+,-,-,-,-,-
7530,197,nucl1643,jpn,Japanese,,006A,j,j j̥,,consonant,...,0,0,+,-,-,-,-,-,-,-
7531,197,nucl1643,jpn,Japanese,,006B,k,k̟ʲ k,,consonant,...,0,0,-,-,-,-,-,-,-,-
7532,197,nucl1643,jpn,Japanese,,006B+02D0,kː,k̟ʲː kː,,consonant,...,0,0,-,-,-,-,-,-,-,-
7533,197,nucl1643,jpn,Japanese,,006D,m,mʲ m,,consonant,...,0,0,+,-,-,-,-,-,-,-
7534,197,nucl1643,jpn,Japanese,,006E,n,n nʲ,,consonant,...,0,0,+,-,-,-,-,-,-,-
7535,197,nucl1643,jpn,Japanese,,0070,p,pʲ p,,consonant,...,0,0,-,-,-,-,-,-,-,-


In [16]:
# show the first Japanese 'ɾ' row as a single-column table (one line per field)
jpn.loc[jpn['Phoneme'] == 'ɾ'].iloc[0].to_frame()

Unnamed: 0,7549
InventoryID,197
Glottocode,nucl1643
ISO6393,jpn
LanguageName,Japanese
SpecificDialect,
GlyphID,027E
Phoneme,ɾ
Allophones,ɾʲ ɾ ɺ
Marginal,
SegmentClass,consonant


In [17]:
# measure similarities (as Jaccard index) of each pair of phonemic inventories
from itertools import combinations
# Build each language's phoneme set once up front. Calling phonemes() inside the pair loop
# would rebuild both sets from the DataFrame for every one of the n*(n-1)/2 pairs.
phoneme_sets = {language: phonemes(language) for language in phoible}
similarities = {
    (l1, l2): jaccard(phoneme_sets[l1], phoneme_sets[l2])
    for (l1, l2) in combinations(phoible, r=2)
}

In [18]:
# rank the phonemic inventory pairs from most to least similar
from operator import itemgetter
columns = (
    'lang1:ISO6393',
    'lang2:ISO6393',
    'lang1:LanguageName',
    'lang2:LanguageName',
    'JaccardIndex',
)
# one row per pair: both codes, both language names, then the score (highest score first)
pd.DataFrame(
    [
        (l1, l2, languages[l1], languages[l2], score)
        for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)
    ],
    columns=columns,
)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,xwe,xwl,xwela,western xwla,1.0
1,xwe,kqk,xwela,kotafon,1.0
2,xwl,kqk,western xwla,kotafon,1.0
3,nzi,aha,Nzema,Ahanta,1.0
4,gru,mvz,Goggot,Masqan,1.0
...,...,...,...,...,...
2199748,wrs,mvi,WARIS,Miyako,0.0
2199749,jup,mvi,Hup,Miyako,0.0
2199750,cro,mvi,Crow,Miyako,0.0
2199751,yab,mvi,Yuhup,Miyako,0.0


In [19]:
# rank all phonemic inventories by similarity to English
from itertools import product
# `product` returns a one-shot iterator; store the pairs as a list so that `eng_pairs`
# can be iterated again by later cells instead of silently yielding nothing.
# The other languages are taken in `phoible` order (not from a set difference) so that
# pairs with equal scores come out in the same order on every run.
eng_pairs = list(product(['eng'], [language for language in phoible if language != 'eng']))
# the English set is the same for every pair, so build it once
english_phonemes = phonemes('eng')
similarities = {(l1, l2): jaccard(english_phonemes, phonemes(l2)) for (l1, l2) in eng_pairs}
pd.DataFrame(
    ((l1, l2, languages[l1], languages[l2], score)
    for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)),
    columns=columns
)

Unnamed: 0,lang1:ISO6393,lang2:ISO6393,lang1:LanguageName,lang2:LanguageName,JaccardIndex
0,eng,deu,English,German,0.407407
1,eng,agq,English,Aghem,0.386792
2,eng,jer,English,Jere,0.366337
3,eng,lag,English,Langi,0.364486
4,eng,xsm,English,Kasɩm,0.362745
...,...,...,...,...,...
2092,eng,ggr,English,Aghu Tharnggala,0.080357
2093,eng,zmv,English,Rimanggudinhma,0.078947
2094,eng,new,English,NEWARI,0.076923
2095,eng,ggk,English,Kungarakany,0.074074


In [None]:
# find the most similar vocalic inventories
# Build each language's vowel set once up front. Calling vowels() inside the pair loop
# would re-filter the DataFrame twice for every one of the n*(n-1)/2 pairs.
vowel_sets = {language: vowels(language) for language in phoible}
similarities = {
    (l1, l2): jaccard(vowel_sets[l1], vowel_sets[l2])
    for (l1, l2) in combinations(phoible, r=2)
}
pd.DataFrame(
    ((l1, l2, languages[l1], languages[l2], score)
    for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)),
    columns=columns
)

In [None]:
# rank all vocalic inventories by similarity to English
# The pairs are derived directly from `phoible` instead of reusing `eng_pairs`, so the result
# does not depend on whether that iterable has already been consumed by an earlier cell.
# The English set is the same for every pair, so it is built once.
english_vowels = vowels('eng')
similarities = {
    ('eng', other): jaccard(english_vowels, vowels(other))
    for other in phoible
    if other != 'eng'
}
pd.DataFrame(
    ((l1, l2, languages[l1], languages[l2], score)
    for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)),
    columns=columns
)

In [None]:
# find the most similar consonantal inventories
# Build each language's consonant set once up front. Calling consonants() inside the pair loop
# would re-filter the DataFrame twice for every one of the n*(n-1)/2 pairs.
consonant_sets = {language: consonants(language) for language in phoible}
similarities = {
    (l1, l2): jaccard(consonant_sets[l1], consonant_sets[l2])
    for (l1, l2) in combinations(phoible, r=2)
}
pd.DataFrame(
    ((l1, l2, languages[l1], languages[l2], score)
    for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)),
    columns=columns
)

In [None]:
# rank all consonantal inventories by similarity to English
# The pairs are derived directly from `phoible` instead of reusing `eng_pairs`, so the result
# does not depend on whether that iterable has already been consumed by an earlier cell.
# The English set is the same for every pair, so it is built once.
english_consonants = consonants('eng')
similarities = {
    ('eng', other): jaccard(english_consonants, consonants(other))
    for other in phoible
    if other != 'eng'
}
pd.DataFrame(
    ((l1, l2, languages[l1], languages[l2], score)
    for (l1, l2), score in sorted(similarities.items(), key=itemgetter(1), reverse=True)),
    columns=columns
)