In [2]:
import pandas as pd
from anytree import Node, RenderTree, NodeMixin
from anytree.search import findall
from itertools import combinations
from tqdm.notebook import tqdm

In [3]:
class NodeWithGenus(NodeMixin):
    """Tree node that carries a genus label and a note on where it came from.

    Attributes:
        name: Identifier of the node (the notebook passes one path component).
        genus: Genus assigned to the node, or ``None`` when unknown.
        confidence: Free-form label describing how ``genus`` was obtained,
            or ``None`` when nothing has been recorded.
    """

    def __init__(self, name, genus=None, parent=None, confidence=None):
        """Store the payload attributes, then attach the node under ``parent``."""
        # Plain attributes are set first so the node is fully populated
        # before the NodeMixin ``parent`` assignment links it into a tree.
        self.name, self.genus, self.confidence = name, genus, confidence
        self.parent = parent

    def __repr__(self):
        """Short human-readable form showing the name and the current genus."""
        return f'Node {self.name}, Genus {self.genus}'

def find_common_ancestor(node1, node2):
    """Return the nearest node that appears in both nodes' ancestor chains.

    Only ``node.ancestors`` is consulted, so the two nodes themselves are
    never candidates: when ``node1`` is an ancestor of ``node2`` the result
    is an ancestor of ``node1``, not ``node1`` itself.

    Args:
        node1: Node exposing an ``ancestors`` iterable.
        node2: Node exposing an ``ancestors`` iterable.

    Returns:
        The first shared entry found when walking ``node1.ancestors`` from
        its last element to its first, or ``None`` if the chains are disjoint.
    """
    other_chain = list(node2.ancestors)

    # Assumes ``ancestors`` is ordered root-first (as anytree documents), so
    # walking it backwards visits the closest ancestor of node1 first.
    for candidate in reversed(list(node1.ancestors)):
        if any(candidate == other for other in other_chain):
            return candidate
    return None

In [4]:
# Load the WALS language table.
wals = pd.read_csv('wals_languages.csv')

# Manual correction: set the Glottocode of row 2296 to 'shah1254'.
# NOTE(review): 2296 is a hard-coded row label; confirm it still points at
# the intended language whenever the CSV is regenerated.
wals.at[2296, 'Glottocode'] = 'shah1254'

# Glottocode -> genus lookup, built only from rows where both fields are present.
genus_by_code = wals[['Genus', 'Glottocode']].dropna()
genera = dict(zip(genus_by_code['Glottocode'], genus_by_code['Genus']))

In [5]:
# Spot-check: display the WALS row(s) whose Genus is "Yaruro".
wals.query('Genus == "Yaruro"')

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID
2614,yrr,Yaruro,South America,7.0,-68.0,pume1238,yae,Yaruro,,Yaruro,,yae,False,False,VR,Mosonyi-et-al-2000a,genus-yaruro


In [6]:
# Tab-separated table with one row per entry; the first column is the index.
glottolog_parsed = pd.read_csv(
    'languages_with_pages.tsv',
    sep='\t',
    index_col=0,
)

# '/'-separated path for every row (split into tree nodes further down).
paths = glottolog_parsed['path']

# Glottocodes of the rows in this table, used later to filter tree nodes.
language_list = glottolog_parsed['glottocode'].tolist()

In [7]:
# Registry of every node created so far, keyed by its path component.
nodes = {}
split_paths = [raw_path.split('/') for raw_path in paths]

for components in split_paths:
    above = None  # node for the previous component of this path
    for code in components:
        # Reuse the node if this component was seen before, otherwise create
        # it with the genus recorded for that code (None if there is none).
        current = nodes.get(code) or NodeWithGenus(code, genera.get(code))
        nodes[code] = current
        if above is not None:
            current.parent = above
        above = current

# The tree must have exactly one parentless node; unpacking fails otherwise.
parentless = [node for node in nodes.values() if node.parent is None]
[root] = parentless

In [8]:
# Share of listed languages that already have a genus at this point.
# Note: despite the name, `leaves` holds every descendant of the root whose
# name is in `language_list`, not only terminal nodes.
leaves = [node for node in root.descendants if node.name in language_list]
sum(1 for node in leaves if node.genus is not None) / len(leaves)

0.2809512706924691

In [9]:
leaves = root.descendants

# Pass 1: every node that has a genus at this point is tagged 'initial'.
for node in leaves:
    if node.genus:
        node.confidence = 'initial'

# Pass 2: every direct child of the root gets the genus 'isolate'.
# NOTE(review): this also replaces a genus that was already present on such
# a node, while its confidence stays 'initial'; nodes that had no genus keep
# confidence None. Confirm that this is intended.
for top_level in (node for node in leaves if node.parent == root):
    top_level.genus = 'isolate'

In [10]:
# Propagate each WALS genus through the tree: for every pair of nodes that
# share a genus, take their nearest common ancestor and, if nothing below
# that ancestor carries a different genus, give the genus to every
# still-unlabelled node below it.
for genus in tqdm(wals.Genus.dropna().unique()):
    # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    if genus in ['Creoles and Pidgins', 'Sign Languages']:
        continue

    # Find all nodes that currently carry this genus
    langs = findall(root, filter_=lambda node: node.genus == genus)

    print(genus, len(langs))

    # Make all possible pairs of those nodes
    pairs = combinations(langs, 2)

    # Ancestors already examined for this genus. A set gives constant-time
    # lookups; rebuilding and scanning a list of every tree node for each
    # genus and each pair was the dominant cost of this cell.
    handled = set()

    for node1, node2 in pairs:
        # Find the first common ancestor
        anc = find_common_ancestor(node1, node2)

        # No shared ancestor, or only the root is shared: nothing to fill.
        # (The root was never a candidate before either, because it is not
        # among its own descendants.)
        if anc is None or anc is root:
            continue

        # These two subtrees are never filled.
        # NOTE(review): presumably Glottolog pseudo-families; verify the codes.
        if anc.name in ['uncl1493', 'book1242']:
            continue

        # Each ancestor is examined at most once per genus
        if anc in handled:
            continue
        handled.add(anc)

        # Traverse the subtree once and reuse it for the check and the fill
        subtree = anc.descendants

        # If any descendant has another genus, nothing is changed
        if any(des.genus and des.genus != genus for des in subtree):
            continue

        for des in subtree:
            if des.genus:
                continue
            des.genus = genus
            des.confidence = f'common ancestor ({node1.name}, {node2.name})'

  0%|          | 0/625 [00:00<?, ?it/s]

Kombio-Arapesh 4
South Omotic 3
Abau 1
Semitic 32
Agneby 2
Abipon 1
Northwest Caucasian 5
Muskogean 6
Central Pama-Nyungan 15
Lowland East Cushitic 12
Abun 0
Alor-Pantar 6
Algonquian 28
Malayo-Sumbawan 20
Japura-Colombia 8
Maweti-Guarani 16
Mayan 27
Western Nilotic 10
Achumawi 1
Burmese-Lolo 14
Keresan 2
Jivaroan 4
Andoke 0
Oceanic 146
Aleut 1
Germanic 19
Gauwa 5
Upper Cross 2
Greater Central Philippine 19
Wide Grassfields 13
Lezgic 9
Angami-Pochuri 3
Northern Pama-Nyungan 32
Southern Cushitic 5
Athapaskan 22
Awju 2
Aikaná 0
Ainu 1
Kru 9
Kresh 2
Gbe 4
Bantu 128
Great Andamanese 4
Tano 7
Northern Caddoan 4
Cariban 17
Sepik Hill 3
Albanian 0
Huarpe 1
Avikam-Alladian 2
Central Malayo-Polynesian 29
Alsea 0
Northern Chukotko-Kamchatkan 4
Mara 3
Northwest Sumatra-Barrier Islands 8
Ndu 5
Panoan 11
Bodic 31
Mabuso 3
East Formosan 2
Harakmbet 1
Left May 2
Border 5
Central Kainji 2
South Halmahera - West New Guinea 10
Yanesha' 1
Josephstaal 1
Amuzgoan 1
Tacanan 4
West Chadic 14
Anêm 0
Senagi 2
K

In [14]:
# Absolute numbers after propagation: (languages with a genus, all listed languages).
leaves = [node for node in root.descendants if node.name in language_list]
(sum(1 for node in leaves if node.genus is not None), len(leaves))

(6506, 8578)

In [12]:
# Share of listed languages that have a genus after propagation.
leaves = [node for node in root.descendants if node.name in language_list]
labelled = sum(1 for node in leaves if node.genus is not None)
labelled / len(leaves)

0.7584518535789229

In [71]:
# Export one row per listed language: glottocode, assigned genus and the
# note describing how the genus was obtained.
#
# The nodes are re-selected from the tree here instead of reusing the global
# `leaves`: that name is rebound by several cells (once to the unfiltered
# descendants), so relying on it makes the export depend on which cell
# happened to run last.
known_languages = set(language_list)

# Despite the name, `final_df` is a list of records; the DataFrame is built
# only for writing.
final_df = [
    {
        'glottocode': node.name,
        'genus': node.genus,
        'confidence': node.confidence
    }
    for node in root.descendants
    if node.name in known_languages
]

pd.DataFrame(final_df).to_csv('final_genera_75.tsv', sep='\t', index=False)

In [59]:
# for node in tqdm(root.descendants):
#     if node.name not in language_list:
#         continue
#     if node.parent.name in ['uncl1493', 'root', 'book1242']:
#         continue
#     if not node.genus or node.genus in ['Creoles and Pidgins', 'Sign Languages']:
#         continue
#     # Creoles and Pidgins as well as Sign Languages do not form a genealogical genus
    
#     siblings = node.siblings
    
#     for sibling in siblings:
#         if sibling.genus and sibling.genus != node.genus:
#             # if any sibling has another genus, nothing is changed
#             break
#     else:
#         for sibling in siblings:
#             if sibling.genus or sibling.name not in language_list:
#                 continue
#             sibling.genus = node.genus
#             sibling.confidence = f'sibling of {node.name}'

# leaves = root.descendants
# leaves = [leaf for leaf in leaves if leaf.name in language_list]
# sum([x.genus is not None for x in leaves]) / len(leaves)

  0%|          | 0/13324 [00:00<?, ?it/s]