In [None]:
import numpy as np
import pandas as pd
from functools import reduce #for summing a list of pandas dataframes
import re #regular expressions

from IPython.display import HTML

import difflib

!pip install lingtypology #only because colab doesn't have it by default
import lingtypology

In [None]:
def importwordlists(lang_ids):
    """downloads the ABVD's word lists of languages, indicated by their ABVD id,
    and returns them stacked in one pandas DataFrame.

    lang_ids: dict mapping a language name (used as label in the
        "language name" column) to the ABVD id of the word list to download.

    Returns a DataFrame with the columns "id", "word_id", "word", "item",
    "annotation", "loan", "cognacy", "pmpcognacy", "language name" and "ABVD id".
    The row index is NOT reset, so index labels repeat across languages.
    If lang_ids is empty, an empty DataFrame with only the ABVD columns is returned.

    Raises ValueError if a downloaded file does not contain the header row."""
    colnames = ["id","word_id","word","item","annotation","loan","cognacy","pmpcognacy"]
    csv_url = "https://abvd.shh.mpg.de/utils/save/?type=csv&section=austronesian&language="
    wordlists = [] #collected per language and concatenated once at the end
    for name,ABVD_id in lang_ids.items():
        print("Downloading ABVD wordlist id =",ABVD_id,"of",name, end='')
        data = pd.read_csv(csv_url+str(ABVD_id), names=colnames, index_col=False, escapechar='\\')
        #everything up to and including the row that repeats the column names is discarded
        header_rows = np.where((data==colnames).all(axis=1))[0]
        if len(header_rows) == 0:
            raise ValueError("No header row found in ABVD wordlist id = "+str(ABVD_id)+" of "+str(name))
        data = data.iloc[header_rows[0]+1:].copy() #copy, so the two columns below are not set on a slice
        data["language name"]=name
        data["ABVD id"]=ABVD_id
        wordlists.append(data)
        print(" - done.")
    if not wordlists:
        return pd.DataFrame(columns = colnames)
    return pd.concat(wordlists)

def charfreq(languagename, sortbyfreq=False):
    """returns two lists with all unique characters in a given language's wordlist
    and their respective frequencies. By default the list is sorted alphabetically.
    Sorting by descending frequency is also possible; characters with the same
    frequency then stay in alphabetical order, so the result is reproducible.

    Reads the global DataFrame mydata (columns "language name" and "item").
    Rows without an item are ignored. An unknown language gives two empty lists."""
    df_lang = mydata[mydata["language name"]==languagename]
    allitems = ''.join(df_lang["item"].dropna().astype(str))
    chars = sorted(set(allitems))
    counts = {char: allitems.count(char) for char in chars}
    if sortbyfreq:
        #list.sort is stable (also with reverse=True), so ties keep their alphabetical order
        chars.sort(key=counts.get, reverse=True)
    charfreqs = [counts[char] for char in chars]
    return chars,charfreqs

def findsubstrings(strings, languagename):
    """given a list of strings (e.g. ['N','ñ']) and a language name (e.g. 'Paser'),
    returns a pandas DataFrame with the rows of the given language whose item
    contains any of these strings.

    The strings are passed to str.contains, so they are treated as regular expressions.
    Reads the global DataFrame mydata (columns "language name" and "item").
    Rows without an item never match. An empty list of strings gives an empty DataFrame."""
    df_lang = mydata[mydata["language name"]==languagename]
    if len(strings) == 0:
        return df_lang.iloc[0:0] #nothing to search for: no rows, same columns
    #na=False keeps every mask strictly boolean, also for rows with a missing item
    df_strings = [df_lang["item"].str.contains(i, na=False) for i in strings]
    return df_lang[reduce(lambda x, y: x | y, df_strings)]

def findPMPwords(regex):
    """given a regular expression (e.g. '^[qk]'), this returns a pandas dataframe
    containing the PMP items that match the regular expression.

    Only rows of the global DataFrame mydata whose "language name" is
    "Proto-Malayo-Polynesian" are considered. Rows without a cognacy value are
    left out, and rows without an item never match."""
    df_PMP = mydata[mydata["language name"]=="Proto-Malayo-Polynesian"]
    #na=False: a missing item would otherwise put NaN in the mask, which cannot be used for indexing
    df_matchitems = df_PMP["item"].str.contains(regex, na=False)
    return df_PMP[df_matchitems & df_PMP.cognacy.notna()]

def _cognacy_numbers(cognacy):
    """splits a cognacy value such as '1,52' or '1, 52' into the set {'1','52'}."""
    return {nr.strip() for nr in str(cognacy).split(',') if nr.strip()}

def matchreflexes(df_PMP,language,cogn_only=True):
    """returns descendants of the words in df_PMP in the specified language.
    Optionally returns a list of words with the same id, not just descendants

    df_PMP: DataFrame with at least the columns "word_id" and "cognacy"
        (e.g. the output of findPMPwords).
    language: value of the "language name" column in the global DataFrame mydata.
    cogn_only: if True (default), a row is kept only when it shares at least one
        cognacy number with a df_PMP row of the same word_id. If False, all rows
        with a matching word_id and a non-missing cognacy value are returned.

    Returns a DataFrame with "word_id" as an ordinary column. With cogn_only=True
    the row index is not renumbered after filtering, so it may contain gaps."""
    df_lang = mydata[mydata["language name"]==language].set_index("word_id")
    df_lang = df_lang.loc[df_lang.index.intersection(df_PMP.word_id)]
    df_lang = df_lang[df_lang.cognacy.notna()] #exclude rows with NaN cognacy value
    df_lang = df_lang.reset_index()
    if not cogn_only:
        return df_lang

    #collect the PMP cognacy numbers per word_id once, instead of re-indexing df_PMP for every word
    df_PMP_known = df_PMP[df_PMP.cognacy.notna()]
    PMPcognrs = {}
    for wordid,cognacy in zip(df_PMP_known.word_id, df_PMP_known.cognacy):
        PMPcognrs.setdefault(wordid, set()).update(_cognacy_numbers(cognacy))

    #a row is a reflex if it shares at least one cognacy number with PMP for the same word_id
    is_reflex = [not PMPcognrs.get(wordid, set()).isdisjoint(_cognacy_numbers(cognacy))
                 for wordid,cognacy in zip(df_lang.word_id, df_lang.cognacy)]
    return df_lang[pd.Series(is_reflex, index=df_lang.index, dtype=bool)]

In [None]:
#Get the data
#lang_ids maps the language names used in this notebook to their ABVD ids.
#The order matters: other cells select the languages to analyse with [:-3],
#i.e. everything except the last three entries.
#note 1: A few languages have multiple entries in the ABVD (under different ids). I manually selected the most complete entries.
#note 2: I didn't use the Tandroy Malagasy and Proto Malagasy data in the end.
lang_ids = {
    "Kadorih": 487,
    "Dayak Ngaju": 360,
    "Katingan": 158,
    "Yakan": 200,
    "Ma'anyan": 215,
    "Paser (~Taboyan)": 1209,
    "Tunjung": 189,
    "Malagasy (Tandroy)": 1186,
    "Proto Malagasy": 1526,
    "Proto-Malayo-Polynesian": 269,
}
mydata = importwordlists(lang_ids)
display(mydata)

In [None]:
#Put Smith's segments with respective contexts in lists, for both vowels and consonants.
#I didn't have time to analyze the vowels for my Bachelor's thesis, but the program is ready for it
#Adding additional segments+contexts can also be done here, e.g. *j-, *-t or *m.
#Each entry below pairs a label with the regular expression used to search the ABVD list,
#so the label lists and the regex lists cannot get out of step.

_V = "[aeiouí]"   #a vowel, as spelled in the ABVD list
_C = "[^aeiouí]"  #any character that is not one of those vowels

#Smith's PMP phonemes and contexts (word-initial, between vowels and word-final)
#note: ABVD writes ʀ, not R
#Raw strings are used where the regex contains \* (a literal asterisk), because
#"\*" in a normal string is an invalid escape sequence.
_PMP_C = [
    ("-p-", _V+"p"+_V), ("-t-", _V+"t"+_V), ("-k-", _V+"k"+_V),
    ("q-", r"^\*q"), ("-q-", _V+"q"+_V), ("-q", "q$"),
    ("b-", r"^\*b"), ("-b-", _V+"b"+_V), ("-b", "b$"),
    ("d-", r"^\*d"), ("-d-", _V+"d"+_V), ("-d", "d$"),
    ("-j-", _V+"j"+_V), ("-j", "j$"),
    ("z-", r"^\*z"), ("-z-", _V+"z"+_V),
    ("s-", r"^\*s"), ("-s-", _V+"s"+_V), ("-s", "s$"),
    ("l-", r"^\*l"), ("-l-", _V+"l"+_V), ("-l", "l$"),
    ("R-", r"^\*ʀ"), ("-R-", _V+"ʀ"+_V), ("-R", "ʀ$"),
    ("y", "y"), ("w", "w"),
]
#note: the ABVD list writes e for schwas, as is tradition
#NOTE(review): "aw" has no leading hyphen and no "$" in its regex, unlike "-ay"/"ay$" - confirm this is intended
_PMP_V = [
    ("-a", "a$"), ("-aC", "a"+_C+"$"), ("-aCVC", "a"+_C+_V+_C+"$"),
    ("-u", "u$"), ("-uC", "u"+_C+"$"), ("-uCVC", "u"+_C+_V+_C+"$"),
    ("-i", "i$"), ("-iC", "i"+_C+"$"), ("-iCVC", "i"+_C+_V+_C+"$"),
    ("-əC", "e"+_C+"$"), ("-əCVC", "e"+_C+_V+_C+"$"),
    ("-ay", "ay$"), ("aw", "aw"),
]

PMP_C_AlexSmith = [pair[0] for pair in _PMP_C]
PMP_V_AlexSmith = [pair[0] for pair in _PMP_V]

#the corresponding regular expressions, to search the ABVD list for the right words
PMP_C_AlexSmith_re = [pair[1] for pair in _PMP_C]
PMP_V_AlexSmith_re = [pair[1] for pair in _PMP_V]

In [None]:
#Generate a table of the number of reflexes per language per segment, as found in the cell above.
#Rows are the PMP segments/contexts, columns are the languages.

#All languages except the last three entries of lang_ids
#(Malagasy (Tandroy), Proto Malagasy and Proto-Malayo-Polynesian itself).
languages = list(lang_ids.keys())[:-3]

counts = [] #one row per PMP segment, one number per language
for regex in PMP_C_AlexSmith_re:
    df_PMP = findPMPwords(regex) #looked up once per segment, reused for every language
    counts.append([len(matchreflexes(df_PMP,language)) for language in languages])

nr_of_reflexes = pd.DataFrame(data = counts, index=PMP_C_AlexSmith, columns=languages, dtype=int)
display(nr_of_reflexes)
nr_of_reflexes.to_csv('number of reflexes consonants.csv')

In [None]:
#Finds the PMP items containing a given sequence of PMP characters,
#and returns the words in the modern languages with the same word_id.
#These are candidates for cognate sets, but the cognacy and loan checks must be
#done manually! I only used this to generate cognacy sets, not in my analysis.

#here goes the list of PMP sequences for which to return candidate cognate sets
#(raw string: "\*" in a normal string is an invalid escape sequence)
PMPstrings = [r"^\*z"]
df_with_phonemes = findsubstrings(PMPstrings,"Proto-Malayo-Polynesian")

display(df_with_phonemes[["word_id","word","item","annotation","cognacy"]])
#each word_id once, in order of first appearance, so that a word_id with several
#matching PMP items does not get its table shown more than once
ids = list(dict.fromkeys(df_with_phonemes["word_id"]))

#the analysed languages (all but the last three entries of lang_ids) plus PMP itself
modernlangs = mydata[mydata["language name"].isin(list(lang_ids.keys())[:-3]+["Proto-Malayo-Polynesian"])]
for wordid in ids:
    df_wrds = modernlangs[modernlangs['word_id']==wordid]
    print(df_wrds["word"].iloc[0]) #the "word" value of the first row serves as heading
    df = df_wrds[["item","language name","cognacy","loan","annotation"]].T #transposed: one column per entry
    display(HTML(df.to_html(header=False)))
    print()

In [None]:
#from https://oneadder.github.io/lingtypology/html/index.html
#Draws the six languages on a map.

#presumably the names under which lingtypology knows the languages - confirm
map_languages = ('Ot Danum', 'Ngaju', 'Yakan', "Ma'anyan", 'Tawoyan', 'Tunjung')
#the feature value attached to each language, in the same order
map_features = ['Kadorih', 'Ngaju', 'Yakan', "Ma'anyan", 'Taboyan', 'Tunjung']

m = lingtypology.LingMap(map_languages)
m.start_location, m.start_zoom = (0,115), 5
m.add_features(map_features)
m.create_map()

In [None]:
#I tried reordering Smith's step-ladder, but didn't find anything interesting
#and didn't use this in the end.
#For every language, the other languages are listed from most to least similar,
#where similarity is difflib's SequenceMatcher ratio between the +/- patterns.

nwb="+--------"
swb="++---+---"
yak="+++------"
seb="-+++---++"
ceb="-++++--++"
neb="--+++++--"
tun="+---++++-"
names = ["nwb","swb","yak","seb","ceb","neb","tun"]
smith = [nwb,swb,yak,seb,ceb,neb,tun]
d = dict(zip(names,smith))

closest = {} #name -> names of the other languages, most similar first
for name,pattern in d.items():
    #leave the language itself out by name; dropping the first sorted entry would
    #remove the wrong language if another one had an identical pattern
    others = [x for x in d.items() if x[0] != name]
    others.sort(key=lambda x: difflib.SequenceMatcher(None,x[1],pattern).ratio(), reverse=True)
    closest[name] = [x[0] for x in others]
    print(name)
    print(closest[name])