In [1]:
import pandas as pd

# Read the two tab-separated feature tables downloaded from WALS.
# Judging by how the columns are used further down, 81A carries the word-order
# description and 26A carries the affixing description.
df_81A = pd.read_csv("81A.txt", sep="\t")
df_26A = pd.read_csv("26A.txt", sep="\t")

# Drop unnecessary columns (`columns=` already selects the axis, so no `axis=` needed)
cols_to_drop = ["value", "latitude", "longitude", "area"]
df_81A = df_81A.drop(columns=cols_to_drop)
df_26A = df_26A.drop(columns=cols_to_drop)

# Rename columns of interest so each feature's description survives the merge
# under its own name
df_81A = df_81A.rename(columns={"wals code": "wals_code", "description": "description_81A"})
df_26A = df_26A.rename(columns={"wals code": "wals_code", "description": "description_26A"})

# Merge the two dataframes. The default inner join keeps only languages that
# appear in both tables with identical identifying columns.
merge_columns = ["wals_code", "name", "genus", "family"]  # Define merge condition
df_merged = df_81A.merge(df_26A, on=merge_columns)

# Combine strongly and weakly suffixing/prefixing into just suffixing/prefixing.
# The mapping is applied to the affixing column only, so a coincidental match in
# a name/genus/family cell can never be rewritten.
# NOTE: "Strong prefixing" (not "Strongly") is intentional -- it is the spelling
# that matches the data, as the four remaining categories printed below confirm.
affix_collapse = {
    "Strongly suffixing": "Suffixing",
    "Weakly suffixing": "Suffixing",
    "Strong prefixing": "Prefixing",
    "Weakly prefixing": "Prefixing",
}
df_merged = df_merged.assign(
    description_26A=df_merged["description_26A"].replace(affix_collapse)
)

In [2]:
# Report the basic shape of the merged dataframe and its category labels
cols_merged = list(df_merged.columns)
print(f"Columns in dataframe (Available data per languages): {cols_merged}")
print(f"Length of dataframe (Number of languages): {len(df_merged)}")
print(f"Possible Affixings: {df_merged['description_26A'].unique()}")
print(f"Possible word orders: {df_merged['description_81A'].unique()}")

# Format according to the Step 2 chart, then write it out as CSV text:
# drop the identifier columns, give the rest presentation names, fix the order.
step2_headers = {
    "name": "Language",
    "family": "Language Family",
    "description_81A": "Basic Order",
    "description_26A": "Affixes",
}
df_step2 = (
    df_merged
    .drop(columns=["wals_code", "genus"])
    .rename(columns=step2_headers)
    .reindex(columns=["Language", "Language Family", "Affixes", "Basic Order"])
)
df_step2.to_csv("step2_chart", index=False)

Columns in dataframe (Available data per languages): ['wals_code', 'name', 'description_81A', 'genus', 'family', 'description_26A']
Length of dataframe (Number of languages): 876
Possible Affixings: ['Suffixing' 'Equal prefixing and suffixing' 'Little affixation'
 'Prefixing']
Possible word orders: ['SVO' 'SOV' 'No dominant order' 'VSO' 'VOS' 'OVS']


In [6]:
"""Sort languages according to Word Order + Affixing Combination"""
affix_word_order = {}

# Bucket each language's WALS code under its "<affixing>/<word order>" label
for code, affix_label, order_label in zip(
    df_merged["wals_code"], df_merged["description_26A"], df_merged["description_81A"]
):
    affix_word_order.setdefault(f"{affix_label}/{order_label}", []).append(code)

# Rebuild the dictionary from the most populated combination to the least;
# the sort is stable, so equally sized combinations keep their first-seen order
ranked_combinations = sorted(
    affix_word_order.items(), key=lambda pair: len(pair[1]), reverse=True
)
affix_word_order = dict(ranked_combinations)
for label, codes in affix_word_order.items():
    print(f"{label} : {len(codes)}")


Suffixing/SOV : 276
Suffixing/SVO : 92
Prefixing/SVO : 81
Little affixation/SVO : 72
Suffixing/No dominant order : 69
Equal prefixing and suffixing/SVO : 43
Equal prefixing and suffixing/SOV : 42
Little affixation/SOV : 29
Equal prefixing and suffixing/No dominant order : 27
Prefixing/No dominant order : 22
Suffixing/VSO : 19
Prefixing/SOV : 19
Equal prefixing and suffixing/VSO : 16
Prefixing/VSO : 15
Little affixation/VSO : 15
Little affixation/No dominant order : 12
Equal prefixing and suffixing/VOS : 7
Prefixing/VOS : 5
Little affixation/VOS : 5
Suffixing/VOS : 3
Suffixing/OVS : 3
Equal prefixing and suffixing/OVS : 3
Little affixation/OVS : 1


In [15]:
"""Find the prior probabilities of word order and affixing"""

def count_to_probs(count_dict, total, round_digits=3):
    """Turn the counts stored in a dict into rounded proportions, in place.

    Each value of ``count_dict`` is overwritten with ``value / total`` rounded
    to ``round_digits`` decimal places. The dictionary is modified directly and
    nothing is returned.
    """
    # Snapshot the keys first; only values are reassigned, keys never change.
    for label in list(count_dict):
        count_dict[label] = round(count_dict[label] / total, round_digits)

# Number of languages
NUM_LANGS = len(df_merged)
print(f"Total number of langauges in database: {NUM_LANGS}")

# Tally how many languages carry each word-order label and each affixing label
wo_priors = {}
affixing_priors = {}
for affix_label, order_label in zip(
    df_merged["description_26A"], df_merged["description_81A"]
):
    wo_priors[order_label] = wo_priors.get(order_label, 0) + 1
    affixing_priors[affix_label] = affixing_priors.get(affix_label, 0) + 1

# Word order: convert counts to proportions, then list them highest first
print("\nPriors for Word Order:")
count_to_probs(wo_priors, NUM_LANGS)
wo_ranked = sorted(wo_priors, key=wo_priors.get, reverse=True)
wo_priors = {label: wo_priors[label] for label in wo_ranked}
for label, prob in wo_priors.items():
    print(f"{label} : {prob}")

# Affixing: same treatment
print("\nPriors for Affixing:")
count_to_probs(affixing_priors, NUM_LANGS)
affixing_ranked = sorted(affixing_priors, key=affixing_priors.get, reverse=True)
affixing_priors = {label: affixing_priors[label] for label in affixing_ranked}
for label, prob in affixing_priors.items():
    print(f"{label} : {prob}")

Total number of langauges in database: 876

Priors for Word Order:
SOV : 0.418
SVO : 0.329
No dominant order : 0.148
VSO : 0.074
VOS : 0.023
OVS : 0.008

Priors for Affixing:
Suffixing : 0.527
Prefixing : 0.162
Equal prefixing and suffixing : 0.158
Little affixation : 0.153


In [17]:
""" 
Group data by language family, sort languages in each family by word order + affixing combination 
# {"language family": {("Suffixing", "SOV"): ["code1", "code2"], ("Prefixing", "SVO"): ["code3", "code4"], ...}, ...}
"""

lang_families = {}

# Nested grouping: family -> (affixing, word order) tuple -> list of WALS codes
for code, affix_label, order_label, family_name in zip(
    df_merged["wals_code"],
    df_merged["description_26A"],
    df_merged["description_81A"],
    df_merged["family"],
):
    by_combination = lang_families.setdefault(family_name, {})
    by_combination.setdefault((affix_label, order_label), []).append(code)


def _write_family_report(path, describe):
    """Write one section per language family to ``path``.

    ``describe`` receives the list of WALS codes of a combination and returns
    whatever should be printed after that combination's label.
    """
    with open(path, "w", encoding="utf-8") as out:
        for family_name, by_combination in lang_families.items():
            out.write(f"Language Family: {family_name}\n")
            for combo, codes in by_combination.items():
                out.write(f"\t{combo}: {describe(codes)}\n")
            out.write("\n")


# First report: number of languages per combination
_write_family_report("num_combinations_grouped_by_family.txt", len)

# Second report: the WALS codes themselves
_write_family_report("langs_combinations_grouped_by_family.txt", lambda codes: codes)

In [41]:
"""Calculates joint probability of Word Order and Affix per family"""
from copy import deepcopy

# Number of languages in each family: sum the sizes of all its combination lists
family_counts_dict = {
    family_name: sum(len(codes) for codes in by_combination.values())
    for family_name, by_combination in lang_families.items()
}
family_counts = list(family_counts_dict.items())
family_counts.sort(key=lambda pair: pair[1], reverse=True)  # largest family first

# Only tabulate statistics for language families with at least 15 languages,
# listed in alphabetical order of family name
interested_families = sorted(
    (pair for pair in family_counts if pair[1] >= 15),
    key=lambda pair: pair[0],
)
print(interested_families)
print("\n")

# Calculate joint probabilities of word order and affixing per language family.
# Work on a deep copy so lang_families keeps its lists of WALS codes; for each
# selected family the lists are replaced by (proportion, count) tuples.
joint_probs_lang_families = deepcopy(lang_families)
for family_name, family_size in interested_families:
    joint_probs_lang_families[family_name] = {
        combo: (round(len(codes) / family_size, 3), len(codes))
        for combo, codes in joint_probs_lang_families[family_name].items()
    }

# Write results to file, most frequent combination first within each family
with open("joint_statistics_per_family_cleaned.txt", "w", encoding="utf-8") as out:
    for family_name, family_size in interested_families:
        out.write(f"Family name: {family_name} ({family_size})\n")
        ranked_stats = sorted(
            joint_probs_lang_families[family_name].items(),
            key=lambda pair: pair[1][1],
            reverse=True,
        )
        for combo, (prob, n_langs) in ranked_stats:
            out.write(f"\t{combo}: {prob} ({n_langs})\n")
        out.write("\n")

[('Afro-Asiatic', 50), ('Altaic', 23), ('Austronesian', 104), ('Central Sudanic', 19), ('Eastern Sudanic', 32), ('Indo-European', 51), ('Niger-Congo', 73), ('Oto-Manguean', 17), ('Pama-Nyungan', 38), ('Sino-Tibetan', 50), ('Trans-New Guinea', 41), ('Uto-Aztecan', 24)]


