# Parsing Glottolog

In [2]:
from pathlib import Path
from tqdm.notebook import tqdm
import glob
import pandas as pd
from anytree import Node, RenderTree, NodeMixin
from anytree.search import findall
from itertools import combinations
import bibtexparser
import re

import configparser
config = configparser.RawConfigParser()

Getting all `.ini` files that are stored in `glottolog/langoids/tree/` with all information on anitem

In [11]:
paths = []

for path in tqdm(glob.glob('../glottolog/languoids/tree/**/*', recursive=True)):
    if path.endswith('.ini'):
        paths.append(Path(path))

  0%|          | 0/53346 [00:00<?, ?it/s]

Getting all the data for all nodes. Important: if run on Windows, make sure `LongsPathEnabled` if set to 1.

In [3]:
mds = []

for path in tqdm(paths):
#     print(path)
    config = configparser.RawConfigParser()
    config.read(path, encoding='utf8')
    md = {s: dict(config[s].items()) for s in config.sections()}
    dct = {
        'glottocode': path.parts[-2],
        'path': Path(*path.parts[3:-1]).as_posix(),
        'toplevelfamily': path.parts[4]
    }
#     if 'core' not in md:
#         print(md)
    core = md['core'].copy()
    dct.update(core)
    del md['core']
    dct['other'] = md
    mds.append(dct)

  0%|          | 0/26673 [00:00<?, ?it/s]

KeyboardInterrupt: 

Saving all data and filtering only languages (excluding families).

In [66]:
md_data = pd.DataFrame(mds)
md_data.to_csv("all_data_from_glottolog.csv")

raw_languages_data = md_data[md_data.level == 'language']
language_list = raw_languages_data.glottocode.to_list()

Getting sources

In [26]:
sources = raw_languages_data.other.tolist()

In [27]:
sources = [x['sources']['glottolog'].strip('\n *') if 'sources' in x else '' for x in sources]

Reading `hh.bib`.

In [30]:
libraries = {}
with open('../glottolog/references/bibtex/hh.bib', encoding='utf8') as f:
    libraries['hh'] = bibtexparser.load(f)

Entry type pdhthesis not standard. Not considered.


In [42]:
libraries['hh'].entries[0]

{'macro_area': 'Eurasia',
 'lgcode': 'Tati, Harzani [hrz]',
 'inlg': 'Farsi [pes]',
 'hhtype': 'grammar_sketch',
 'glottolog_ref_id': '41999',
 'year': '1334 [1953]',
 'pages': '6+160',
 'address': 'Tabriz',
 'publisher': 'Tabriz: Tabriz University Press',
 'title': 'Tāti va Harzani, do lahja az zabān-i bāstān-e Āẕarbāyjān',
 'author': "'Abd-al-'Ali Kārang",
 'ENTRYTYPE': 'book',
 'ID': 's:Karang:Tati-Harzani'}

In [48]:
def find_entry(library_name, key, libraries):
    """
    finding entry by library_name, key and libraries
    """
    for entry in libraries[library_name].entries:
        if entry['ID'] == key:
            return entry

In [52]:
def find_pages_and_inlg_of_grammar(entry):
    """
    pages and language by entry
    """
    pages = None
    inlg = None
    for field in entry.keys():
        if field == 'pages':
            pages = entry[field]
        if field == 'inlg':
            inlg = entry[field]
        if field == 'hhtype':
            if 'grammar' not in entry[field]:
                return None, None
    return pages, inlg

In [45]:
def string2int(numeric):
    """
    Converting different types of string-encoded numerals to int
    """
    numeric = numeric.strip()
    if not numeric:
        return 0
    if numeric.isdigit():
        return int(numeric)
    if '-' in numeric:
        start, end = numeric.split('-', 1)
        return string2int(end) - string2int(start) + 1
    if '+' in numeric:
        parts_plus = numeric.split('+')
        return sum([string2int(x) for x in parts_plus])
    rom_val = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
    int_val = 0
    for i in range(len(numeric)):
        if i > 0 and rom_val[numeric[i]] > rom_val[numeric[i - 1]]:
            int_val += rom_val[numeric[i]] - 2 * rom_val[numeric[i - 1]]
        else:
            int_val += rom_val[numeric[i]]
    return int_val

In [46]:
def count_pages(string_pages: str):
    """
    Counting pages
    """
    parts_comma = re.split(r'[,;] ?', string_pages)
    for i, part in enumerate(parts_comma):
        part = part.replace('–', '-')
        part = re.sub(r'\(.*?\)', '', part).strip()
        part = re.sub(r'(?:plates?|ff?|S\.|pp|\+? ?map|tables|\?|\[|\])', '', part).strip()
        parts_comma[i] = string2int(part)
    return sum(parts_comma)

In [57]:
name

'typ:Ko:Crow'

In [58]:
sources_pages = []
sources_langs = []
for source in tqdm(sources):
    n_pages = []
    for entry in source.split('**\n**'):
        if not entry:
            continue
        library, name = entry.split(':', 1)
        
        if library != 'hh':
            continue
            
        entry = find_entry(library, name, libraries)
        if not entry:
            continue
        pages, inlg = find_pages_and_inlg_of_grammar(entry)
        
        if not pages:
            continue
            
        try:
            n_pages.append(count_pages(pages))
        except:
            if pages:
                n_pages.append(pages)
    sources_pages.append(n_pages)
    sources_langs.append(inlg)

  0%|          | 0/8565 [00:00<?, ?it/s]

In [73]:
raw_languages_data.loc[:, 'pages'] = sources_pages
raw_languages_data.loc[:, 'inlg'] = sources_langs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_languages_data.loc[:, 'pages'] = sources_pages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_languages_data.loc[:, 'inlg'] = sources_langs


In [75]:
raw_languages_data.to_csv('languages_with_pages_v05.11.2024.tsv', sep='\t')

In [3]:
raw_languages_data = pd.read_csv('languages_with_pages_v05.11.2024.tsv', sep='\t')
md_data = pd.read_csv("all_data_from_glottolog.csv")

In [4]:
genera = pd.read_csv('genera.tsv', delimiter='\t')

In [5]:
genera = genera[~pd.isna(genera.confidence)]

In [6]:
genera = pd.merge(
    genera,
    raw_languages_data
)[['glottocode', 'genus', 'name', 'toplevelfamily', 'pages', 'confidence']]

In [7]:
polina = "Arawakan, Nuclear-Macro-Je, Tucanoan, Chapacuran, Chocoan, Matacoan, Lengua-Mascoy, Yanomamic, Guahiboan, Guaicuruan, Chonan, Naduhup, Saliban, Zamucoan, Boran, Hibito-Cholon, Uru-Chipaya, Andoque, Betoi-Jirara, Hoti, Itonama, Kanoê, Movima, Kariri, Mosetén-Chimané, Páez, Puinave, Pumé"
families = polina.split(', ')
families_codes = md_data[md_data.name.isin(families)].toplevelfamily.unique()

In [14]:
names_codes = md_data[md_data.name.isin(families)][['toplevelfamily', 'name']].drop_duplicates()
families_mapping = {k:v for k, v in zip(names_codes.toplevelfamily, names_codes.name)}
families_mapping

{'ando1256': 'Andoque',
 'araw1281': 'Arawakan',
 'beto1236': 'Betoi-Jirara',
 'bora1262': 'Boran',
 'chap1271': 'Chapacuran',
 'choc1280': 'Chocoan',
 'chon1288': 'Chonan',
 'guah1252': 'Guahiboan',
 'guai1249': 'Guaicuruan',
 'hibi1242': 'Hibito-Cholon',
 'iton1250': 'Itonama',
 'kano1245': 'Kanoê',
 'kari1254': 'Kariri',
 'leng1261': 'Lengua-Mascoy',
 'mata1289': 'Matacoan',
 'mose1249': 'Mosetén-Chimané',
 'movi1243': 'Movima',
 'nada1235': 'Naduhup',
 'nucl1710': 'Nuclear-Macro-Je',
 'paez1247': 'Páez',
 'puin1248': 'Puinave',
 'pume1238': 'Pumé',
 'sali1297': 'Saliban',
 'tuca1253': 'Tucanoan',
 'uruc1242': 'Uru-Chipaya',
 'yano1268': 'Yanomamic',
 'yuwa1244': 'Hoti',
 'zamu1243': 'Zamucoan'}

In [15]:
genera_for_polia = genera[genera.toplevelfamily.isin(families_codes)][['genus', 'toplevelfamily']].drop_duplicates()

In [16]:
genera_for_polia

Unnamed: 0,genus,toplevelfamily
417,isolate,ando1256
440,Alto-Orinoco,araw1281
443,Japura-Colombia,araw1281
445,Antillean Arawakan,araw1281
448,Guajiro-Paraujano,araw1281
450,Central Arawakan,araw1281
457,Palikur,araw1281
469,Negro-Roraima,araw1281
479,Bolivia-Parana,araw1281
486,Pre-Andine Arawakan,araw1281


In [17]:
genera_for_polia['toplevelfamily'] = genera_for_polia['toplevelfamily'].apply(lambda x: families_mapping[x])

In [18]:
genera['pages'] = genera.pages.apply(eval)

In [21]:
pages_by_genera = genera.groupby('genus')[['pages']].aggregate(lambda x: sum(x, []))
pages_mapping = dict(pages_by_genera['pages'].apply(lambda x: natsorted(x, reverse=True)))

In [22]:
genera_for_polia['pages'] = genera_for_polia.genus.apply(lambda x: pages_mapping[x] if x != 'isolate' else '')

In [23]:
genera_for_polia

Unnamed: 0,genus,toplevelfamily,pages
417,isolate,Andoque,
440,Alto-Orinoco,Arawakan,"[245, 236, 216, 98, 92, 69, 40, 26, 15, 6]"
443,Japura-Colombia,Arawakan,"[1089, 729, 543, 482, 479, 384, 381, 374, 328,..."
445,Antillean Arawakan,Arawakan,"[536, 462, 396, 323, 313, 313, 306, 296, 293, ..."
448,Guajiro-Paraujano,Arawakan,"[708, 582, 352, 322, 268, 224, 204, 204, 179, ..."
450,Central Arawakan,Arawakan,"[603, 480, 458, 297, 244, 211, 122, 118, 110, ..."
457,Palikur,Arawakan,"[256, 109, 102]"
469,Negro-Roraima,Arawakan,"[298, 179, 148]"
479,Bolivia-Parana,Arawakan,"[1034, 893, 644, 500, 223, 206, 192, 136, 127,..."
486,Pre-Andine Arawakan,Arawakan,"[895, 709, 701, 502, 487, 412, 394, 366, 335, ..."


In [25]:
genera_for_polia.to_excel('genera_for_polia_with_pages.xlsx')

In [20]:
from natsort import natsorted

In [184]:
genera.pages = genera.pages.apply(lambda x: natsorted(x, reverse=True))

In [214]:
for key, df in genera.groupby('genus'):
    sorted_df = df.sort_values('pages', ascending=False)
    sorted_df.to_excel(f'genera/{key.strip("|=")[:30]}.xlsx', index=False)

In [213]:
print('\n'.join(sorted(genera.genus.unique())))

=|Hoan
Abau
Abipon
Achumawi
Agneby
Aimore
Ainu
Alacalufan
Aleut
Algonquian
Alor-Pantar
Alto-Orinoco
Amuzgoan
Angaataha
Angami-Pochuri
Anindilyakwa
Antillean Arawakan
Ap Ma
Arauan
Araucanian
Arhuacic
Arikem
Armenian
Aslian
Asmat-Kamrau Bay
Atayalic
Athapaskan
Atsugewi
Avar-Andic-Tsezic
Avikam-Alladian
Awju
Aymaran
Aztecan
Baatonum
Bahnaric
Baibai-Fas
Baining
Baltic
Bantu
Barbacoan
Bargam
Barito
Barí
Batanic
Beboid
Becking-Dawi
Beja
Bella Coola
Benue-Congo Plateau
Berber
Bilic
Binanderean
Birri
Biu-Mandara
Boazi
Bodic
Bogia
Bolivia-Parana
Bongo-Bagirmi
Boran
Border
Bororoan
Bosavi
Brahmaputran
Bringen
Bua
Bulaka River
Bunuban
Burarran
Burmese-Lolo
Cacua-Nukak
Caddo
Cahita
Cahuapanan
Cangin
Cariban
Catawban
Celebic
Celtic
Central Arawakan
Central Cushitic
Central Delta
Central Kainji
Central Luzon
Central Malayo-Polynesian
Central Naga
Central Pama-Nyungan
Central Salish
Central Tariku
Central Wapei
Chamorro
Chapacura-Wanham
Chibcha-Duit
Chichimec
Chimakuan
Chimbu-Wahgi
Chimila
Chinanteca

In [None]:
class NodeWithGenus(NodeMixin):
    def __init__(self, name, genus=None, parent=None, confidence=None):
        self.name = name
        self.genus = genus
        self.parent = parent
        self.md = md
        self.confidence = confidence
    def __repr__(self):
        return f'Node {self.name}, Genus {self.genus}'

def find_common_ancestor(node1, node2):
    anc1 = list(node1.ancestors)
    anc2 = list(node2.ancestors)
    
    anc1.reverse()
    anc2.reverse()
    
    for i in anc1:
        for j in anc2:
            if i == j:
                return i

In [218]:
wals = pd.read_csv('wals_languages.csv')
wals.at[2296,'Glottocode'] = 'shah1254'
genera = wals[['Genus', 'Glottocode']].dropna()
genera = pd.Series(genera.Genus.values,index=genera.Glottocode).to_dict()

In [249]:
wals.query('Genus == "Yaruro"')

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID
2614,yrr,Yaruro,South America,7.0,-68.0,pume1238,yae,Yaruro,,Yaruro,,yae,False,False,VR,Mosonyi-et-al-2000a,genus-yaruro


In [None]:
nodes = {}
lst = [path.split('/') for path in md_data.path]

for path in lst:
    for i, par in enumerate(path):
        nk = nodes[par] = nodes.get(par) or NodeWithGenus(par, genera.get(par, None))
        if i != 0:
            nv = nodes[path[i - 1]] = nodes.get(path[i - 1]) or NodeWithGenus(path[i - 1], genera.get(path[i - 1], None))
            nk.parent = nv
            
[root] = [n for n in nodes.values() if n.parent is None]

In [None]:
leaves = root.descendants

for leaf in leaves:
    if leaf.genus:
        leaf.confidence = 'initial'
    if leaf.parent == root:
        leaf.genus = 'isolate'

In [None]:
for genus in tqdm(wals.Genus.dropna().unique()):
    if genus in ['Creoles and Pidgins', 'Sign Languages']:
        continue
    # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    
    langs = findall(root, filter_=lambda node: node.genus == genus)
    # Find all languages with the same genus
    
    print(genus, len(langs))
    
    pairs = combinations(langs, 2)
    # Make all possible combinations
    
    all_nodes = list(root.descendants)
    
    for node1, node2 in pairs:
        anc = find_common_ancestor(node1, node2)
        # find the first common ancestor
        
        if anc.name in ['uncl1493', 'book1242']:
            continue
        
        # if an ancestor has already been taken into account it is removed from the list
        if anc not in all_nodes:
            continue
        else:
            all_nodes.remove(anc)
        
        for des in anc.descendants:
            if des.genus and des.genus != genus:
                # if any descendant has another genus, nothing is changed
                break
        else:
            for des in anc.descendants:
                if des.genus:
                    continue
                des.genus = genus
                des.confidence = f'common ancestor ({node1.name}, {node2.name})'

In [None]:
for node in tqdm(root.descendants):
    if node.name not in language_list:
        continue
    if node.parent.name in ['uncl1493', 'root', 'book1242']:
        continue
    if not node.genus or node.genus in ['Creoles and Pidgins', 'Sign Languages']:
        continue
    # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    
    siblings = node.siblings
    
    for sibling in siblings:
        if sibling.genus and sibling.genus != node.genus:
            # if any sibling has another genus, nothing is changed
            break
    else:
        for sibling in siblings:
            if sibling.genus or sibling.name not in language_list:
                continue
            sibling.genus = node.genus
            sibling.confidence = f'sibling of {node.name}'

In [None]:
leaves = root.descendants

In [None]:
leaves = [leaf for leaf in leaves if leaf.name in language_list]

In [None]:
len(leaves)

In [None]:
sum([x.genus is not None for x in leaves])

In [None]:
7134 / 8578

In [None]:
final_df = [
    {
        'glottocode': x.name,
        'genus': x.genus,
        'confidence': x.confidence
    } for x in leaves
]

In [None]:
pd.DataFrame(final_df)#.query('glottocode == "tari1263"')

In [None]:
pd.DataFrame(final_df).to_csv('genera.tsv', sep='\t', index=False)