In [42]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [43]:
# na_filter=False keeps every cell as the raw text from the file, so a
# language whose WALS code is literally "nan" is not parsed as a missing value.
# Empty cells therefore arrive as empty strings.
df = pd.read_csv(
    'data/language.tsv',
    sep='\t',
    na_filter=False,
)

In [44]:
df

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
0,aab,,,Arapesh (Abu),-3.450000,142.950000,Kombio-Arapesh,Torricelli,,PG,...,,,,,,,,,,
1,aar,aiw,aari1239,Aari,6.000000,36.583333,South Omotic,Afro-Asiatic,Africa,ET,...,,,,,,,,,,
2,aba,aau,abau1245,Abau,-4.000000,141.250000,Upper Sepik,Sepik,Papunesia,PG,...,,,,,,,,,,
3,abb,shu,chad1249,Arabic (Abbéché Chad),13.833333,20.833333,Semitic,Afro-Asiatic,Africa,TD,...,,,,,,,,,,
4,abd,abi,abid1235,Abidji,5.666667,-4.583333,Kwa,Niger-Congo,Africa,CI,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2674,zte,zpz,texm1235,Zapotec (Texmelucan),16.500000,-97.166667,Zapotecan,Oto-Manguean,North America,MX,...,,,,,,,,,,
2675,zul,zul,zulu1248,Zulu,-30.000000,30.000000,Bantoid,Niger-Congo,Africa,ZA,...,1 No m in second person singular,2 m in first person singular,1 Instrument,,2 Non-zero marking,1 monoexponential TAM,4 no antipassive,,1 None reported,5 None (= no suppletive imperatives reported i...
2676,zun,zun,zuni1245,Zuni,35.083333,-108.833333,Zuni,Zuni,North America,US,...,1 No m in second person singular,1 No m in first person singular,,,2 Non-zero marking,,,,,5 None (= no suppletive imperatives reported i...
2677,zya,zav,yatz1235,Zapotec (Yatzachi),17.200000,-96.200000,Zapotecan,Oto-Manguean,North America,MX,...,,,,,,,,,,


In [45]:
# The file was read with na_filter=False, so missing values are empty strings;
# turn them into real NaN so pd.isnull / notna can detect them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace("", np.nan)

In [46]:
# Sanity check: the row whose WALS code is the literal string 'nan' must still
# be addressable by that code after loading and NaN replacement.
df.loc[df['wals_code'].eq('nan')]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
1666,,kln,nand1266,Nandi,0.25,35.0,Nilotic,Eastern Sudanic,Africa,KE,...,,,,,,1 monoexponential TAM,,,1 None reported,


# Task 1: Calculating similarity scores

In [47]:
def get_matching_features_number(df, language_X_code, language_Y_code, columns_to_ignore):
    """Count the columns on which two languages carry the same non-null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``wals_code`` column. Every column that is not listed in
        ``columns_to_ignore`` is compared.
    language_X_code, language_Y_code
        ``wals_code`` values identifying the two rows to compare. If a code
        occurs more than once, the first matching row is used.
    columns_to_ignore : collection
        Column labels that must not take part in the comparison.

    Returns
    -------
    tuple of (int, pandas.Series)
        The number of compared columns where both rows are non-null and equal,
        and the row of language Y.

    Raises
    ------
    IndexError
        If either code is not present in ``df['wals_code']``.
    """
    def _first_row_with_code(code):
        # Same lookup as a bare `.iloc[0]`, but with an error that names the code.
        rows = df.loc[df['wals_code'] == code]
        if rows.empty:
            raise IndexError(f"no row with wals_code {code!r} in df")
        return rows.iloc[0]

    x_row = _first_row_with_code(language_X_code)
    y_row = _first_row_with_code(language_Y_code)

    compared_columns = [column for column in df.columns
                        if column not in columns_to_ignore]
    x_values = x_row[compared_columns]
    y_values = y_row[compared_columns]

    # A column counts only when both languages have a value for it; the
    # explicit mask also keeps None/NaN pairs from ever counting as equal.
    both_known = x_values.notna() & y_values.notna()
    features_matching = int(((x_values == y_values) & both_known).sum())

    return features_matching, y_row

In [48]:
def calculate_similarities(df, chosen_language_code):
    """Rank every other language by its similarity to the chosen one.

    The similarity score of a language is the number of compared columns on
    which it and the chosen language share the same non-null value, divided by
    the total number of compared columns.

    NOTE(review): ``genus``, ``family`` and ``macroarea`` are not in the ignore
    list, so they are counted as compared columns alongside the typological
    features -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Language table with at least ``wals_code``, ``Name``, ``genus``,
        ``family`` and ``macroarea`` columns.
    chosen_language_code
        ``wals_code`` of the reference language.

    Returns
    -------
    pandas.DataFrame
        One row per language other than the chosen one, with columns
        ``wals_code``, ``name``, ``genus``, ``family``, ``macroarea``,
        ``matching_features`` and ``similarity_score``, sorted by
        ``similarity_score`` in descending order.

    Raises
    ------
    IndexError
        If ``chosen_language_code`` is not present in ``df['wals_code']``.
    """
    columns_to_ignore = ['wals_code', 'iso_code',
                         'glottocode', 'Name',
                         'latitude', 'longitude',
                         'countrycodes']
    # Count the columns that are actually compared, so the denominator stays
    # right even if some of the ignored columns are absent from df.
    total_number_of_features = sum(column not in columns_to_ignore
                                   for column in df.columns)

    language_codes = df['wals_code']

    # Fail before the loop starts, with a message that names the code.
    if not (language_codes == chosen_language_code).any():
        raise IndexError(f"no row with wals_code {chosen_language_code!r} in df")

    results = []

    for lang_code in tqdm(language_codes):
        # skipping the language itself
        if lang_code == chosen_language_code:
            continue

        matching_features, lang = get_matching_features_number(df,
                                                               chosen_language_code,
                                                               lang_code,
                                                               columns_to_ignore)

        results.append({
            'wals_code': lang['wals_code'],
            'name': lang['Name'],
            'genus': lang['genus'],
            'family': lang['family'],
            'macroarea': lang['macroarea'],
            'matching_features': matching_features,
            'similarity_score': matching_features / total_number_of_features
        })

    # Explicit columns keep sort_values working when there are no other languages.
    df_results = pd.DataFrame(results, columns=['wals_code', 'name', 'genus',
                                                'family', 'macroarea',
                                                'matching_features',
                                                'similarity_score'])
    return df_results.sort_values('similarity_score', ascending=False)

In [49]:
results = calculate_similarities(df, "scr")

  0%|          | 0/2679 [00:00<?, ?it/s]

In [50]:
results.head(10)

Unnamed: 0,wals_code,name,genus,family,macroarea,matching_features,similarity_score
2038,rus,Russian,Slavic,Indo-European,Eurasia,50,0.25641
783,grk,Greek (Modern),Greek,Indo-European,Eurasia,43,0.220513
1931,pol,Polish,Slavic,Indo-European,Eurasia,43,0.220513
1321,lit,Lithuanian,Baltic,Indo-European,Eurasia,41,0.210256
650,eng,English,Germanic,Indo-European,Eurasia,39,0.2
2123,slo,Slovene,Slavic,Indo-European,Eurasia,38,0.194872
385,bul,Bulgarian,Slavic,Indo-European,Eurasia,38,0.194872
2434,ukr,Ukrainian,Slavic,Indo-European,Eurasia,37,0.189744
2154,spa,Spanish,Romance,Indo-European,Eurasia,37,0.189744
543,cze,Czech,Slavic,Indo-European,Eurasia,36,0.184615


In [51]:
results.tail(10)

Unnamed: 0,wals_code,name,genus,family,macroarea,matching_features,similarity_score
545,dab,Daba,Biu-Mandara,Afro-Asiatic,Africa,0,0.0
2316,tmp,Tampulma,Gur,Niger-Congo,Africa,0,0.0
1873,pad,Padoe,Celebic,Austronesian,Papunesia,0,0.0
242,bfd,Biafada,Northern Atlantic,Niger-Congo,Africa,0,0.0
544,daa,Da'a,Celebic,Austronesian,Papunesia,0,0.0
1875,pag,Pagu,North Halmaheran,West Papuan,Papunesia,0,0.0
2311,tmk,Tumak,East Chadic,Afro-Asiatic,Africa,0,0.0
1877,pak,Pakanha,Northern Pama-Nyungan,Pama-Nyungan,Australia,0,0.0
538,cwe,Columbia-Wenatchi,Interior Salish,Salishan,North America,0,0.0
1099,kkr,Kirikiri,Lakes Plain,Lakes Plain,Papunesia,0,0.0


# Task 2: Finding centroid language for a given genus

In [52]:
def find_centroid_language(df, genus):
    """Find the language of a genus that is most similar to the rest of the genus.

    Pairwise similarity is the number of matching compared columns (see
    ``get_matching_features_number``) divided by the total number of compared
    columns. The centroid is the language whose similarities to all other
    languages of the genus add up to the largest total; on ties the first such
    language in ``df`` order is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Language table with ``wals_code`` and ``genus`` columns.
    genus
        Value of the ``genus`` column selecting the languages to consider.

    Returns
    -------
    tuple
        ``(wals_code_of_centroid, summed_similarity_of_centroid)``. For a genus
        with a single language the sum is ``0.0``.

    Raises
    ------
    ValueError
        If no language in ``df`` has the requested genus.
    """
    columns_to_ignore = ['wals_code', 'iso_code',
                         'glottocode', 'Name',
                         'latitude', 'longitude',
                         'countrycodes']
    # Count the columns that are actually compared, so the denominator stays
    # right even if some of the ignored columns are absent from df.
    total_number_of_features = sum(column not in columns_to_ignore
                                   for column in df.columns)

    # Extract the codes once instead of going through .iloc for every pair.
    codes_in_genus = df.loc[df['genus'] == genus, 'wals_code'].tolist()
    if not codes_in_genus:
        raise ValueError(f"no languages with genus {genus!r} in df")

    genus_size = len(codes_in_genus)
    matrix_of_similarities = np.zeros((genus_size, genus_size))

    # Similarity is symmetric: compute the upper triangle and mirror it.
    for i in range(genus_size - 1):
        i_code = codes_in_genus[i]
        for j in range(i + 1, genus_size):
            matching, _ = get_matching_features_number(df, i_code,
                                                       codes_in_genus[j],
                                                       columns_to_ignore)
            similarity = matching / total_number_of_features

            matrix_of_similarities[i, j] = similarity
            matrix_of_similarities[j, i] = similarity

    vector_total_similarities = matrix_of_similarities.sum(axis=0)
    idx_of_centroid_lang = int(np.argmax(vector_total_similarities))

    return (codes_in_genus[idx_of_centroid_lang],
            vector_total_similarities[idx_of_centroid_lang])

In [53]:
centroid_lang, max_similarity = find_centroid_language(df, 'Slavic')

In [54]:
centroid_lang, max_similarity

('rus', 2.2256410256410257)

In [55]:
df[df['wals_code'] == centroid_lang]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
2038,rus,rus,russ1263,Russian,56.0,38.0,Slavic,Indo-European,Eurasia,RU,...,1 No m in second person singular,2 m in first person singular,5 No applicative construction,,2 Non-zero marking,1 monoexponential TAM,4 no antipassive,,1 None reported,2 Imperative


In [56]:
centroid_lang, max_similarity = find_centroid_language(df, 'Romance')

In [57]:
centroid_lang, max_similarity

('spa', 2.4358974358974352)

In [58]:
df[df['wals_code'] == centroid_lang]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
2155,spa,spa,stan1288,Spanish,40.0,-4.0,Romance,Indo-European,Eurasia,ES,...,1 No m in second person singular,2 m in first person singular,5 No applicative construction,,2 Non-zero marking,2 TAM+agreement,4 no antipassive,,1 None reported,2 Imperative


In [59]:
centroid_lang, max_similarity = find_centroid_language(df, 'Semitic')

In [60]:
centroid_lang, max_similarity 

('aeg', 3.128205128205128)

In [61]:
df[df['wals_code'] == centroid_lang]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
34,aeg,arz,egyp1253,Arabic (Egyptian),30.0,31.0,Semitic,Afro-Asiatic,Africa,EG,...,1 No m in second person singular,1 No m in first person singular,5 No applicative construction,,2 Non-zero marking,3 TAM+agreement+diathesis,4 no antipassive,,1 None reported,2 Imperative


# Task 3: Finding the "weirdest" language

In [83]:
def find_weirdest_language(df, genus=None, family=None):
    """Find the language with the lowest mean similarity to its peers.

    The peers are the rows of ``df`` that match ``genus`` and/or ``family``
    (all rows when both are ``None``). Pairwise similarity is the number of
    matching compared columns (see ``get_matching_features_number``) divided by
    the total number of compared columns.

    Caveats that follow from the computation:

    * Columns that are null for either language never count as a match while
      the denominator is fixed, so a language with few documented features
      scores low no matter what those features are.
    * The mean is taken over a full column of the similarity matrix, whose
      diagonal is left at zero, so it is divided by the number of selected
      languages rather than by the number of *other* languages. This rescales
      every language's score by the same factor and does not change which one
      is lowest.

    Parameters
    ----------
    df : pandas.DataFrame
        Language table with ``wals_code``, ``genus`` and ``family`` columns.
    genus, family : optional
        When given, only rows with that ``genus`` / ``family`` are considered;
        both filters are applied if both are given.

    Returns
    -------
    tuple
        ``(row_of_weirdest_language, its_mean_similarity)``, where the row is a
        ``pandas.Series``. On ties the first such language in ``df`` order is
        returned.

    Raises
    ------
    ValueError
        If the filters select no language.
    """
    columns_to_ignore = ['wals_code', 'iso_code',
                         'glottocode', 'Name',
                         'latitude', 'longitude',
                         'countrycodes']
    # Count the columns that are actually compared, so the denominator stays
    # right even if some of the ignored columns are absent from df.
    total_number_of_features = sum(column not in columns_to_ignore
                                   for column in df.columns)

    langs = df
    if genus is not None:
        langs = langs[langs['genus'] == genus]
    if family is not None:
        langs = langs[langs['family'] == family]

    total_langs = len(langs)
    if total_langs == 0:
        raise ValueError(
            f"no languages match genus={genus!r}, family={family!r}")

    # Extract the codes once; building a full row per pair inside the inner
    # loop is needlessly expensive.
    codes = langs['wals_code'].tolist()
    matrix_of_similarities = np.zeros((total_langs, total_langs))

    # Similarity is symmetric: compute the upper triangle and mirror it.
    for i in tqdm(range(total_langs - 1)):
        i_code = codes[i]
        for j in range(i + 1, total_langs):
            matching, _ = get_matching_features_number(langs, i_code, codes[j],
                                                       columns_to_ignore)
            similarity = matching / total_number_of_features

            matrix_of_similarities[i, j] = similarity
            matrix_of_similarities[j, i] = similarity

    vector_mean_similarities = matrix_of_similarities.mean(axis=0)
    idx_of_weirdest_lang = int(np.argmin(vector_mean_similarities))

    return (langs.iloc[idx_of_weirdest_lang],
            vector_mean_similarities[idx_of_weirdest_lang])

### Family: Indo-European

In [101]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, family='Indo-European')

  0%|          | 0/175 [00:00<?, ?it/s]

In [102]:
weirdest_lang_sim

0.007575757575757556

In [103]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
2028,rse,,,Romani (Sepecides),38.25,27.0,Indic,Indo-European,,TR,...,,,,,,,,,,


### Family: Dravidian

In [104]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, family='Dravidian')

  0%|          | 0/22 [00:00<?, ?it/s]

In [105]:
weirdest_lang_sim

0.010702341137123744

In [106]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
726,gdk,gdb,pott1240,Gadaba (Kondekor),18.75,83.5,Central Dravidian,Dravidian,Eurasia,IN,...,,,,,,,,,,


### Family: Niger-Congo

In [107]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, family='Niger-Congo')

  0%|          | 0/326 [00:00<?, ?it/s]

In [108]:
weirdest_lang_sim

0.010381870932329618

In [109]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
2449,urh,urh,urho1239,Urhobo,5.583333,6.0,Edoid,Niger-Congo,Africa,NG,...,,,,,,,,,,


### Genus: Slavic

In [110]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, genus='Slavic')

  0%|          | 0/16 [00:00<?, ?it/s]

In [111]:
weirdest_lang_sim

0.015082956259426846

In [112]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
329,bos,bos,bosn1245,Bosnian,43.0,18.0,Slavic,Indo-European,Eurasia,BA,...,,,,,,,,,,


### Genus: Romance

In [113]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, genus='Romance')

  0%|          | 0/23 [00:00<?, ?it/s]

In [114]:
weirdest_lang_sim

0.014957264957264958

In [115]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
1547,mol,ron,roma1327,Moldavian,47.0,29.0,Romance,Indo-European,Eurasia,MD,...,,,,,,,,,,


### Genus: Turkic

In [116]:
weirdest_lang, weirdest_lang_sim = find_weirdest_language(df, genus='Turkic')

  0%|          | 0/40 [00:00<?, ?it/s]

In [117]:
weirdest_lang_sim

0.01500938086303939

In [118]:
df[df['wals_code'] == weirdest_lang['wals_code']]

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,137B M in Second Person Singular,136B M in First Person Singular,109B Other Roles of Applied Objects,10B Nasal Vowels in West Africa,25B Zero Marking of A and P Arguments,21B Exponence of Tense-Aspect-Mood Inflection,108B Productivity of the Antipassive Construction,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand',58B Number of Possessive Nouns,79B Suppletion in Imperatives and Hortatives
611,dol,dlg,dolg1241,Dolgan,71.25,98.0,Turkic,Altaic,Eurasia,RU,...,,,,,,,,,,
