In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import glob
import pandas as pd
import bibtexparser
import re

import configparser
config = configparser.RawConfigParser()

Collecting all paths to .ini files from glottolog and converting them into Path objects

In [2]:
# List every entry below the languoid tree once, then keep only the .ini files.
tree_entries = glob.glob('glottolog/languoids/tree/**/*', recursive=True)
paths = [entry for entry in tqdm(tree_entries) if entry.endswith('.ini')]

  0%|          | 0/53503 [00:00<?, ?it/s]

In [3]:
# Wrap every collected path in a pathlib.Path so that .parts can be used later.
paths = list(map(Path, paths))

Parsing each .ini file into a dict with information on one languoid, including its glottocode, the path to the corresponding .ini file, level, sources, etc.

In [4]:
mds = []

for path in tqdm(paths):
    # A new parser is created for every file.
    config = configparser.RawConfigParser()
    config.read(path, encoding='utf8')
    md = {section: dict(config[section].items()) for section in config.sections()}

    # The glottocode is the name of the directory holding the .ini file; the
    # stored path drops the two leading path components and the file name.
    dct = {
        'glottocode': path.parts[-2],
        'path': Path(*path.parts[2:-1]).as_posix(),
    }

    # Options of the [core] section become top-level keys (overriding the two
    # keys above on a name clash); all remaining sections go under 'other'.
    dct.update(md.pop('core'))
    dct['other'] = md
    mds.append(dct)

  0%|          | 0/26737 [00:00<?, ?it/s]

In [5]:
# Show the first parsed record to check the structure of the dicts.
mds[0]

{'glottocode': 'abin1243',
 'path': 'tree/abin1243',
 'name': 'Abinomn',
 'hid': 'bsa',
 'level': 'language',
 'iso639-3': 'bsa',
 'latitude': '-2.92281',
 'longitude': '138.891',
 'macroareas': '\nPapunesia',
 'countries': '\nID',
 'links': '\n[Abinomn](https://endangeredlanguages.com/lang/1763)\nhttps://en.wikipedia.org/wiki/Abinomn_language\nhttps://www.wikidata.org/entity/Q56648',
 'other': {'sources': {'glottolog': '\n**elcat:6f91926917d8609bd798174c58ad27cf**\n**hh:e:Lagerberg:Moegip**\n**hh:h:SilzerClouse:Index**\n**hh:h:SilzerHeikkinen:Irian**\n**hh:hv:Foley:Northwest-New-Guinea**\n**hh:hvtyp:DonohueMusgrave:Melanesia**\n**hh:w:Fiwei:Abinomn**'},
  'altnames': {'multitree': '\n"Baso"\nAbinomn\nAvinomen\nFoja\nFoya',
   'lexvo': '\nAbinomn [en]\nAbinomn language [en]\nAbinomneg [br]\nLingua abinomn [gl]\nLlingua Abinomn [ast]',
   'hhbib_lgcode': '\nBaso',
   'elcat': '\n"Baso"\nAbinomn\nAvinomen\nFoja\nFoya'},
  'triggers': {'lgcode': '\nmacrohistory\nmoegip'},
  'identifier': 

Keeping only languages (excluding dialects and families)

In [15]:
# One row per parsed .ini file.
md_data = pd.DataFrame(mds)

# Keep the rows whose level is 'language'; .copy() so later column
# assignments do not touch md_data.
is_language = md_data['level'] == 'language'
raw_languages_data = md_data.loc[is_language].copy()

# Glottocodes of the kept rows, in row order.
language_list = raw_languages_data['glottocode'].tolist()

Extracting literature

In [7]:
# One item per language row: the dict of non-core .ini sections stored in the 'other' column.
sources = raw_languages_data.other.tolist()

In [8]:
# Raw reference string of every language, in row order of raw_languages_data.
# The list is built straight from the 'other' column (not from a previous
# value of `sources`), so re-running this cell gives the same result.
# A missing [sources] section or a missing 'glottolog' option yields ''.
# Leading/trailing newlines, spaces and asterisks are stripped.
sources = [
    other.get('sources', {}).get('glottolog', '').strip('\n *')
    for other in raw_languages_data.other
]

Parsing all bibtex files

In [9]:
# Parse every .bib file of the Glottolog reference directory, keyed by file stem
# (the library name used in source references, e.g. 'hh').
# Only *.bib files are read, and the sorted list gives tqdm a known total and
# a deterministic order.
# NOTE: the parser may report malformed blocks while reading; such blocks are
# not handled here.
libraries = {}
bib_files = sorted(Path('glottolog/references/bibtex/').glob('*.bib'))
for file in tqdm(bib_files):
    libraries[file.stem] = bibtexparser.parse_file(file)

0it [00:00, ?it/s]

 Unexpected block start: `@incollection`. Was still looking for field-value closing `,` or `}` 
 Expected comma after entry key, but found "
 Expected comma after entry key, but found "
 Expected comma after entry key, but found "


In [10]:
def find_entry(library_name, key, libraries):
    """
    Look up one bibliography entry by its key.

    library_name -- key into ``libraries`` selecting the library to search
    key          -- value compared against each entry's ``key`` attribute
    libraries    -- mapping from library name to an object exposing ``entries``

    Returns the first entry whose ``key`` equals ``key``, or None when the
    library has no such entry. Raises KeyError for an unknown library name.
    """
    candidates = libraries[library_name].entries
    return next((candidate for candidate in candidates if candidate.key == key), None)

In [11]:
def find_pages_of_grammar(entry):
    """
    Return the value of the ``pages`` field of a grammar entry.

    entry -- object whose ``fields`` is an iterable of objects with ``key``
             and ``value`` attributes, or None (e.g. a failed lookup)

    Returns None when ``entry`` is None, when the entry has an ``hhtype``
    field whose value does not contain the substring 'grammar', or when it
    has no ``pages`` field. An entry without any ``hhtype`` field is not
    filtered out. If several ``pages`` fields exist, the last one wins.
    """
    if entry is None:
        # A reference that could not be resolved has no pages to report.
        return None

    pages = None
    for field in entry.fields:
        if field.key == 'pages':
            pages = field.value
        elif field.key == 'hhtype' and 'grammar' not in field.value:
            return None
    return pages

In [12]:
def string2int(numeric):
    """
    Convert one page expression into a number of pages.

    Supported forms (surrounding whitespace is ignored):
      ''          -> 0
      '345'       -> 345
      'a+b+...'   -> sum of the parts, each converted recursively
      'start-end' -> end - start + 1 (a run of hyphens counts as one
                     separator, so the BibTeX form '12--34' works)
      roman       -> value of a roman numeral, upper or lower case

    '+' is split before '-', so 'v+10-20' is 5 front-matter pages plus the
    11-page range, not a range starting at 'v+10'.

    Raises KeyError when a character is not a roman numeral letter.
    """
    import re

    numeric = numeric.strip()
    if not numeric:
        return 0
    if numeric.isdigit():
        return int(numeric)
    if '+' in numeric:
        return sum(string2int(part) for part in numeric.split('+'))
    if '-' in numeric:
        start, end = re.split(r'-+', numeric, maxsplit=1)
        return string2int(end) - string2int(start) + 1

    rom_val = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
    int_val = 0
    previous = 0
    for char in numeric.lower():
        value = rom_val[char]
        if value > previous:
            # Subtractive notation (e.g. 'iv'): the smaller numeral was
            # already added, so take it back twice.
            int_val += value - 2 * previous
        else:
            int_val += value
        previous = value
    return int_val

In [13]:
def count_pages(string_pages: str):
    """
    Sum the page counts of all comma- or semicolon-separated parts of a
    page description.

    Each part has en dashes replaced by hyphens, parenthesised text removed
    and a fixed set of markers (plates, f/ff, S., pp, map, tables, '?',
    square brackets) stripped before it is converted with string2int.
    Any exception raised by string2int propagates to the caller.
    """
    total = 0
    for chunk in re.split(r'[,;] ?', string_pages):
        cleaned = chunk.replace('–', '-')
        cleaned = re.sub(r'\(.*?\)', '', cleaned).strip()
        cleaned = re.sub(r'(?:plates?|ff?|S\.|pp|\+? ?map|tables|\?|\[|\])', '', cleaned).strip()
        total += string2int(cleaned)
    return total

For each source, I try to convert the string describing the pages into an integer giving the number of pages. I use only the hh database.

In [14]:
# For every language, collect the page counts of its hh grammar references.
# Each item of sources_pages is a list holding ints (parsed counts) and, for
# page strings that could not be parsed, the raw string itself.
sources_pages = []
for source in tqdm(sources):
    n_pages = []
    for reference in source.split('**\n**'):
        # A reference has the form "library:key"; empty or malformed
        # references (no ':') are skipped instead of raising.
        library, separator, name = reference.partition(':')
        if not separator or library != 'hh':
            continue

        entry = find_entry(library, name, libraries)
        if entry is None:
            # The key is not present in the parsed library.
            continue

        pages = find_pages_of_grammar(entry)
        if not pages:
            continue

        try:
            n_pages.append(count_pages(pages))
        except Exception:
            # Best effort: keep the unparsed page string for manual inspection.
            n_pages.append(pages)
    sources_pages.append(n_pages)

  0%|          | 0/8578 [00:00<?, ?it/s]

In [16]:
# sources_pages holds one list per item of `sources`, which was taken row by
# row from raw_languages_data, so the lists line up with the rows
# (assuming the cells were run in document order).
raw_languages_data['pages'] = sources_pages

In [139]:
# Save the language table (including the new 'pages' column) as a tab-separated file.
raw_languages_data.to_csv('languages_with_pages.tsv', sep='\t')

In [None]:
class NodeWithGenus(NodeMixin):
    """
    Tree node for one languoid, annotated with a genus.

    NOTE(review): NodeMixin is presumably anytree.NodeMixin; it is not
    imported in the visible cells of this notebook -- TODO add the import.

    name       -- node identifier (the notebook passes path components)
    genus      -- genus label, or None when unknown
    parent     -- parent node, or None
    confidence -- free-text note on how the genus was assigned, or None
    md         -- optional metadata attached to the node; defaults to None.
                  Previously this attribute silently picked up whatever the
                  notebook-level variable ``md`` happened to hold.
    """
    def __init__(self, name, genus=None, parent=None, confidence=None, md=None):
        self.name = name
        self.genus = genus
        self.parent = parent
        self.md = md
        self.confidence = confidence

    def __repr__(self):
        return f'Node {self.name}, Genus {self.genus}'

def find_common_ancestor(node1, node2):
    """
    Return the last item of ``node1.ancestors`` that also occurs in
    ``node2.ancestors`` (compared with ==), or None if there is none.

    The nodes themselves are not considered, only their ancestors.
    """
    other_line = list(node2.ancestors)
    # Scan node1's ancestors from the end of the sequence backwards.
    for candidate in reversed(list(node1.ancestors)):
        if any(candidate == other for other in other_line):
            return candidate
    return None

In [218]:
wals = pd.read_csv('wals_languages.csv')

# Manual correction of the Glottocode in the row labelled 2296.
# NOTE(review): the notebook does not record which language this is or why
# the value is replaced -- confirm the row label against the CSV in use.
wals.at[2296, 'Glottocode'] = 'shah1254'

# Glottocode -> genus lookup; rows missing either value are dropped.
genus_rows = wals.loc[:, ['Genus', 'Glottocode']].dropna()
genera = genus_rows.set_index('Glottocode')['Genus'].to_dict()

In [249]:
# Spot check: show the WALS rows whose genus is "Yaruro".
wals.query('Genus == "Yaruro"')

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID
2614,yrr,Yaruro,South America,7.0,-68.0,pume1238,yae,Yaruro,,Yaruro,,yae,False,False,VR,Mosonyi-et-al-2000a,genus-yaruro


In [None]:
# Build one NodeWithGenus per distinct path component and link every
# component to the component preceding it in the path.
nodes = {}
lst = [path.split('/') for path in md_data.path]

for components in lst:
    parent_node = None
    for component in components:
        node = nodes.get(component)
        if node is None:
            # First time this name is seen: create it with its WALS genus, if any.
            node = nodes[component] = NodeWithGenus(component, genera.get(component, None))
        if parent_node is not None:
            node.parent = parent_node
        parent_node = node

# Exactly one node without a parent is expected; unpacking fails otherwise.
[root] = [node for node in nodes.values() if node.parent is None]

In [None]:
# NOTE: despite the name, `leaves` holds every descendant of the root,
# not only leaf nodes.
leaves = root.descendants

for leaf in leaves:
    # A genus present at this point was presumably set from the WALS lookup
    # when the node was created.
    if leaf.genus:
        leaf.confidence = 'initial'
    # Every direct child of the root is labelled 'isolate'. This overwrites any
    # genus the node already had, while a confidence of 'initial' set just
    # above stays in place.
    # NOTE(review): this applies to all top-level nodes, including families --
    # confirm that this is intended.
    if leaf.parent == root:
        leaf.genus = 'isolate'

In [None]:
# `combinations` is not imported in the notebook's visible import cell, so it
# is imported here to keep this cell runnable on a fresh kernel.
from itertools import combinations

# Propagate each genus to nodes without a genus: for every pair of nodes that
# share a genus, take their common ancestor and, if no descendant of that
# ancestor carries a different genus, label its unlabelled descendants.
for genus in tqdm(wals.Genus.dropna().unique()):
    # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    if genus in ['Creoles and Pidgins', 'Sign Languages']:
        continue

    # Find all nodes with the same genus
    langs = findall(root, filter_=lambda node: node.genus == genus)

    print(genus, len(langs))

    # ids of the ancestors already processed for this genus; replaces the
    # per-genus list of all descendants and its linear-time remove().
    seen_ancestor_ids = set()

    for node1, node2 in combinations(langs, 2):
        # find the first common ancestor
        anc = find_common_ancestor(node1, node2)

        # No common ancestor, or the root itself (never processed before either,
        # because it is not among its own descendants).
        if anc is None or anc is root:
            continue

        if anc.name in ['uncl1493', 'book1242']:
            continue

        # each ancestor is handled only once per genus
        if id(anc) in seen_ancestor_ids:
            continue
        seen_ancestor_ids.add(id(anc))

        # if any descendant has another genus, nothing is changed
        if any(des.genus and des.genus != genus for des in anc.descendants):
            continue

        for des in anc.descendants:
            if des.genus:
                continue
            des.genus = genus
            des.confidence = f'common ancestor ({node1.name}, {node2.name})'

In [None]:
# Propagate a language's genus to its sibling languages that have no genus,
# provided that no sibling carries a different genus.

# Set membership instead of scanning language_list for every node.
language_set = set(language_list)

for node in tqdm(root.descendants):
    if node.name not in language_set:
        continue
    # Skip children of the root and of the two excluded groups. The root is
    # checked by identity because its name comes from the first path component
    # ('tree' in the parsed data), so the literal 'root' alone never matches.
    if node.parent is root or node.parent.name in ['uncl1493', 'root', 'book1242']:
        continue
    # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    if not node.genus or node.genus in ['Creoles and Pidgins', 'Sign Languages']:
        continue

    siblings = node.siblings

    # if any sibling has another genus, nothing is changed
    if any(sibling.genus and sibling.genus != node.genus for sibling in siblings):
        continue

    for sibling in siblings:
        if sibling.genus or sibling.name not in language_set:
            continue
        sibling.genus = node.genus
        sibling.confidence = f'sibling of {node.name}'

In [None]:
# All descendants of the root (every node of the tree except the root itself).
leaves = root.descendants

In [None]:
# Keep only the nodes that are languages. Built directly from the tree (so the
# cell does not depend on an earlier value of `leaves`) and with a set for
# constant-time membership tests.
language_set = set(language_list)
leaves = [leaf for leaf in root.descendants if leaf.name in language_set]

In [None]:
# Number of language nodes.
len(leaves)

In [None]:
# Number of language nodes that have a genus.
sum(leaf.genus is not None for leaf in leaves)

In [None]:
# Share of language nodes that have a genus, computed from the data instead of
# numbers copied from earlier outputs (7134 / 8578 in the run this notebook
# was saved from).
sum(leaf.genus is not None for leaf in leaves) / len(leaves)

In [None]:
# One record per language node; despite the name this is a list of dicts,
# which is turned into a DataFrame where it is used.
final_df = [
    dict(glottocode=leaf.name, genus=leaf.genus, confidence=leaf.confidence)
    for leaf in leaves
]

In [None]:
# Preview the genus assignments; append .query('glottocode == "tari1263"')
# to inspect a single languoid.
pd.DataFrame(final_df)

In [None]:
# Save the genus assignments as a tab-separated file without the index column.
pd.DataFrame(final_df).to_csv('hmm.tsv', sep='\t', index=False)

In [179]:
# Reload the saved genus assignments.
# NOTE: this rebinds `genera`, which earlier held the Glottocode -> genus dict.
genera = pd.read_csv('hmm.tsv', delimiter='\t')

In [180]:
# Drop the rows that have no confidence value.
genera = genera[~pd.isna(genera.confidence)]

In [181]:
# Attach language name and page counts. pd.merge without `on` joins on all
# columns the two frames share -- presumably only 'glottocode' here; verify
# that the .ini [core] section never contributes a 'genus' or 'confidence' column.
genera = pd.merge(genera, raw_languages_data)[['glottocode', 'genus', 'name', 'pages', 'confidence']]

In [182]:
from natsort import natsorted

In [184]:
# Sort each language's list of page values in descending natural order
# (the lists may mix parsed ints with unparsed page strings).
genera.pages = genera.pages.apply(lambda x: natsorted(x, reverse=True))

In [214]:
# Write one Excel file per genus, rows sorted by 'pages' in descending order.
# The output directory is created first so the export also works on a fresh
# checkout.
Path('genera').mkdir(exist_ok=True)

for key, df in genera.groupby('genus'):
    sorted_df = df.sort_values('pages', ascending=False)
    # File name: the genus with leading/trailing '|' and '=' removed, cut to 30 characters.
    # NOTE(review): genera sharing the same first 30 characters would overwrite each other.
    sorted_df.to_excel(f'genera/{key.strip("|=")[:30]}.xlsx', index=False)

In [213]:
# List all genus names in sorted order, one per line.
print('\n'.join(sorted(genera.genus.unique())))

=|Hoan
Abau
Abipon
Achumawi
Agneby
Aimore
Ainu
Alacalufan
Aleut
Algonquian
Alor-Pantar
Alto-Orinoco
Amuzgoan
Angaataha
Angami-Pochuri
Anindilyakwa
Antillean Arawakan
Ap Ma
Arauan
Araucanian
Arhuacic
Arikem
Armenian
Aslian
Asmat-Kamrau Bay
Atayalic
Athapaskan
Atsugewi
Avar-Andic-Tsezic
Avikam-Alladian
Awju
Aymaran
Aztecan
Baatonum
Bahnaric
Baibai-Fas
Baining
Baltic
Bantu
Barbacoan
Bargam
Barito
Barí
Batanic
Beboid
Becking-Dawi
Beja
Bella Coola
Benue-Congo Plateau
Berber
Bilic
Binanderean
Birri
Biu-Mandara
Boazi
Bodic
Bogia
Bolivia-Parana
Bongo-Bagirmi
Boran
Border
Bororoan
Bosavi
Brahmaputran
Bringen
Bua
Bulaka River
Bunuban
Burarran
Burmese-Lolo
Cacua-Nukak
Caddo
Cahita
Cahuapanan
Cangin
Cariban
Catawban
Celebic
Celtic
Central Arawakan
Central Cushitic
Central Delta
Central Kainji
Central Luzon
Central Malayo-Polynesian
Central Naga
Central Pama-Nyungan
Central Salish
Central Tariku
Central Wapei
Chamorro
Chapacura-Wanham
Chibcha-Duit
Chichimec
Chimakuan
Chimbu-Wahgi
Chimila
Chinanteca