In [1]:
%matplotlib inline
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
def get_id2string(infile, to_int=False):
    """Map the first column of a CSV file to its second column.

    Rows are parsed with the ``csv`` module, so quoted values that contain
    commas stay intact and no line terminator ends up in the mapped strings.

    Parameters
    ----------
    infile : str
        Path to a UTF-8 encoded, comma-separated file.
    to_int : bool, default False
        If True, convert every key with ``int``.

    Returns
    -------
    dict
        ``{first cell: second cell}`` for every data row. A later row
        overwrites an earlier one that has the same key.

    Raises
    ------
    ValueError
        If ``to_int`` is True and a key cannot be converted by ``int``.
    IndexError
        If a non-empty data row has fewer than two cells.
    """
    id2string = {}
    # newline='' lets the csv module handle line endings itself.
    with open(infile, encoding='utf8', newline='') as f:
        for cells in csv.reader(f):
            # Skip blank lines and header rows (first cell is exactly 'ID').
            if not cells or cells[0] == 'ID':
                continue
            entry_id = int(cells[0]) if to_int else cells[0]
            id2string[entry_id] = cells[1]
    return id2string

# Language IDs are converted to int because they are later used as lookup
# keys with the `Language_ID` values of the forms table, which pandas reads
# as integers. Parameter IDs such as '1-1' are not numeric and stay strings.
id2lang = get_id2string('./data/languages.csv', to_int=True)
id2param = get_id2string('./data/parameters.csv')

`BorrowedScore` values and the borrowing status each one encodes:

- 0.0 - 5. no evidence for borrowing
- 0.25 - 4. very little evidence for borrowing
- 0.5 - 3. perhaps borrowed
- 0.75 - 2. probably borrowed
- 1.0 - 1. clearly borrowed

In [2]:
# Load every form, then keep a working copy of the columns used in this notebook.
df_complete = pd.read_csv('./data/forms.csv')
# .copy() detaches `df` from `df_complete`, so adding columns to `df` later
# does not trigger pandas' SettingWithCopyWarning.
df = df_complete[['ID', 'Language_ID', 'Parameter_ID', 'Form', 'BorrowedScore']].copy()
df.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ID,Language_ID,Parameter_ID,Form,BorrowedScore
0,51602,1,1-1,dunia,1.0
1,51603,1,1-1,ulimwengu,0.25
2,18425,2,1-1,yaamu,0.0
3,8475,3,1-1,ʔalame,1.0
4,10666,4,1-1,dúuníyàa,1.0
5,23191,5,1-1,dúnyâ,1.0
6,1254,6,1-1,ddənya,1.0
7,49513,7,1-1,lemonn,0.0
8,42643,8,1-1,lume,0.0
9,47780,9,1-1,világo,1.0


In [4]:
# df['category'] = df['Parameter_ID'].apply(lambda x: x.split('-')[0])
# df.head()

In [4]:
# Peek at the forms rated "clearly borrowed" (score 1.0).
clearly_borrowed = df['BorrowedScore'].eq(1.0)
df.loc[clearly_borrowed].head(10)

Unnamed: 0,ID,Language_ID,Parameter_ID,Form,BorrowedScore
0,51602,1,1-1,dunia,1.0
3,8475,3,1-1,ʔalame,1.0
4,10666,4,1-1,dúuníyàa,1.0
5,23191,5,1-1,dúnyâ,1.0
6,1254,6,1-1,ddənya,1.0
9,47780,9,1-1,világo,1.0
15,26045,14,1-1,mɨrr,1.0
16,2946,15,1-1,dunnal,1.0
17,0,16,1-1,dunil,1.0
21,19687,21,1-1,sekai,1.0


**NOTE**: Some languages have multiple words with different borrowing statuses for one concept. For example, language 1 has both *dunia* (score 1.0) and *ulimwengu* (score 0.25) for concept 1-1.

In [12]:
# For every concept (Parameter_ID), collect the names of the languages that
# have a probably or clearly borrowed word for it (score above 0.7).
concept = {}
borrowed = df[df['BorrowedScore'] > 0.7]
for lang_id, param_id in zip(borrowed['Language_ID'], borrowed['Parameter_ID']):
    concept.setdefault(param_id, set()).add(id2lang[lang_id])

In [16]:
# Sanity check: the name of concept '1-1' and the languages collected for it.
print(id2param['1-1'], concept['1-1'], sep='\n')

the world
{'Gawwada', 'Kanuri', 'Japanese', 'Swahili', 'Kildin Saami', 'Selice Romani', 'Archi', 'Vietnamese', 'Indonesian', 'Hausa', 'Thai', 'Bezhta', 'Tarifiyt Berber'}


In [19]:
# A concept takes part in the pairwise comparison only if at least THRESHOLD
# languages were collected for it in `concept`.
THRESHOLD = 5

concepts = list(concept.keys())
# Apply the threshold once per concept instead of once per pair; the order of
# the surviving concepts (and therefore of the generated pairs) is unchanged.
frequent = [c for c in concepts if len(concept[c]) >= THRESHOLD]

c1_ids = []
c1_counts = []
c2_ids = []
c2_counts = []
coocc_counts = []

for i, c1 in enumerate(frequent):
    c1_count = len(concept[c1])
    # Pair c1 only with later concepts so each unordered pair appears once.
    for c2 in frequent[i + 1:]:
        c2_count = len(concept[c2])
        c1_ids.append(id2param[c1])
        c2_ids.append(id2param[c2])
        c1_counts.append(c1_count)
        c2_counts.append(c2_count)
        # Number of languages collected for both concepts.
        coocc_counts.append(len(concept[c1] & concept[c2]))

In [20]:
# One row per concept pair: both concept names, the per-concept language
# counts and the number of languages shared by the two concepts.
pair_columns = {
    'c1': c1_ids,
    'c2': c2_ids,
    'c1_count': c1_counts,
    'c2_count': c2_counts,
    'cooccurrence': coocc_counts,
}
df_pmi = pd.DataFrame(pair_columns)
df_pmi.head()

Unnamed: 0,c1,c2,c1_count,c2_count,cooccurrence
0,the world,the land,13,6,4
1,the world,the dust,13,9,4
2,the world,the mud,13,5,1
3,the world,the mountain or hill,13,9,3
4,the world,the cliff or precipice,13,5,2


In [21]:
# Number of distinct Language_ID values in the forms table.
n_langs = df['Language_ID'].nunique(dropna=False)
n_langs

41

In [22]:
# Turn the counts into proportions of all languages.
df_pmi['c1_prop'] = df_pmi['c1_count'] / n_langs
df_pmi['c2_prop'] = df_pmi['c2_count'] / n_langs
df_pmi['coocc_prop'] = df_pmi['cooccurrence'] / n_langs

# PMI  = log(p(c1, c2) / (p(c1) * p(c2)))
# NPMI = PMI / -log(p(c1, c2))
# Pairs that never co-occur give log(0) = -inf. That is expected here, so the
# NumPy warning is silenced and the undefined NPMI values are patched below.
with np.errstate(divide='ignore', invalid='ignore'):
    df_pmi['pmi'] = np.log(df_pmi['coocc_prop'] / df_pmi['c1_prop'] / df_pmi['c2_prop'])
    df_pmi['npmi'] = df_pmi['pmi'] / - np.log(df_pmi['coocc_prop'])

# -inf / inf and 0 / 0 evaluate to NaN; replace them with the limiting values
# of NPMI: -1 when the concepts never co-occur, 1 when they co-occur in
# every language.
df_pmi.loc[df_pmi['cooccurrence'] == 0, 'npmi'] = -1.0
df_pmi.loc[df_pmi['cooccurrence'] == n_langs, 'npmi'] = 1.0

df_pmi.sort_values(by=['cooccurrence'], ascending=False).head()

  """
  


Unnamed: 0,c1,c2,c1_count,c2_count,cooccurrence,c1_prop,c2_prop,coocc_prop,pmi,npmi
585045,the motor,the coffee,34,34,30,0.829268,0.829268,0.731707,0.062048,0.198635
342985,the soap,the coffee,33,34,29,0.804878,0.829268,0.707317,0.058,0.167496
299709,the sugar,the hour,33,33,29,0.804878,0.804878,0.707317,0.087853,0.253707
517235,the hour,the clock,33,32,29,0.804878,0.780488,0.707317,0.118624,0.342572
323437,the sock or stocking,the hour,32,33,29,0.780488,0.804878,0.707317,0.118624,0.342572


In [57]:
# Rank the concept pairs by NPMI, breaking ties by how often they co-occur.
ranking_keys = ['npmi', 'coocc_prop', 'cooccurrence']
df_pmi.sort_values(by=ranking_keys, ascending=False)

Unnamed: 0,c1,c2,c1_count,c2_count,cooccurrence,c1_prop,c2_prop,coocc_prop,pmi,npmi
723240,12-45,12-46,10,10,10,0.243902,0.243902,0.243902,1.410987,1.000000
140952,2-64,2-641,6,6,6,0.146341,0.146341,0.146341,1.921813,1.000000
741691,13-07,13-09,12,12,12,0.292683,0.292683,0.292683,1.228665,1.000000
295764,4-28,12-24,4,4,4,0.097561,0.097561,0.097561,2.327278,1.000000
301628,4-31,4-33,4,4,4,0.097561,0.097561,0.097561,2.327278,1.000000
609480,9-32,10-471,4,4,4,0.097561,0.097561,0.097561,2.327278,1.000000
646593,10-35,12-212,4,4,4,0.097561,0.097561,0.097561,2.327278,1.000000
67721,1-75,12-27,3,3,3,0.073171,0.073171,0.073171,2.614960,1.000000
67976,1-75,18-12,3,3,3,0.073171,0.073171,0.073171,2.614960,1.000000
117924,2-48,2-49,3,3,3,0.073171,0.073171,0.073171,2.614960,1.000000
