In [51]:
import csv

In [233]:
languages = []
id_to_language = {}

with open("data/glottolog_languoid_5.2.csv") as csv_file:
    glottolog_languages = csv.reader(csv_file)

    for row in glottolog_languages:
        languages.append(
            {
                "language_id": row[0],
                "parent_id": row[2],
                "name": row[3],
            }
        )

        id_to_language[row[0]] = {
            "language_id": row[0],
            "parent_id": row[2],
            "name": row[3],
        }

In [191]:
import re


def clean_label(label):
    label = label.replace("sequence", "")
    label = label.replace("seq", "")
    return re.sub(r"[^A-Za-z ]+", " ", label).lower().strip()

In [232]:
from rapidfuzz import fuzz, process

language_names = [clean_label(l["name"]) for l in languages]

matches = process.extractOne(
    clean_label("seq_zurich german"),
    language_names,
    scorer=fuzz.WRatio,
    score_cutoff=80,
)
matches

('z rich german', 92.3076923076923, 27028)

In [242]:
def extend_lineage(lineage: list):
    next_parent_id = lineage[-1]["parent_id"]
    next_parent = id_to_language.get(next_parent_id)

    if next_parent:
        lineage.append(next_parent)
        extend_lineage(lineage)

    return lineage

In [243]:
extend_lineage([languages[27028]])

[{'language_id': 'zuri1239', 'parent_id': 'high1290', 'name': 'Zürich German'},
 {'language_id': 'high1290',
  'parent_id': 'swis1247',
  'name': 'High Alemannic'},
 {'language_id': 'swis1247',
  'parent_id': 'sout3294',
  'name': 'Central Alemannic'},
 {'language_id': 'sout3294',
  'parent_id': 'alem1243',
  'name': 'South Alemannic'},
 {'language_id': 'alem1243', 'parent_id': 'mode1258', 'name': 'Alemannic'},
 {'language_id': 'mode1258',
  'parent_id': 'midd1349',
  'name': 'Modern High German'},
 {'language_id': 'midd1349',
  'parent_id': 'high1286',
  'name': 'Middle-Modern High German'},
 {'language_id': 'high1286', 'parent_id': 'high1289', 'name': 'Upper German'},
 {'language_id': 'high1289', 'parent_id': 'west2793', 'name': 'High German'},
 {'language_id': 'west2793', 'parent_id': 'nort3152', 'name': 'West Germanic'},
 {'language_id': 'nort3152',
  'parent_id': 'germ1287',
  'name': 'Northwest Germanic'},
 {'language_id': 'germ1287', 'parent_id': 'clas1257', 'name': 'Germanic'},

In [240]:
lineage = [languages[27028]]
next_parent_id = lineage[-1]["parent_id"]
next_parent = id_to_language.get(next_parent_id)

In [241]:
next_parent

{'language_id': 'high1290', 'parent_id': 'swis1247', 'name': 'High Alemannic'}

In [1]:
from phylodata.language_utils import lookup_language

In [21]:
lookup_language("Italian")

[ClassificationEntry(id='ital1282', scientific_name='Italian'),
 ClassificationEntry(id='ital1287', scientific_name='Italian Romance'),
 ClassificationEntry(id='ital1286', scientific_name='Italo-Dalmatian'),
 ClassificationEntry(id='ital1285', scientific_name='Italo-Western Romance'),
 ClassificationEntry(id='roma1334', scientific_name='Romance'),
 ClassificationEntry(id='impe1234', scientific_name='Imperial Latin'),
 ClassificationEntry(id='lati1263', scientific_name='Latinic'),
 ClassificationEntry(id='lati1262', scientific_name='Latino-Faliscan'),
 ClassificationEntry(id='ital1284', scientific_name='Italic'),
 ClassificationEntry(id='clas1257', scientific_name='Classical Indo-European'),
 ClassificationEntry(id='indo1319', scientific_name='Indo-European')]