In [1]:
import pandas as pd

# Columns present in both feature exports that this analysis never uses.
cols_to_drop = ["value", "latitude", "longitude", "area"]
# Identifying columns shared by both exports; together they form the join key.
merge_columns = ["wals_code", "name", "genus", "family"]


def _load_wals_feature(path, feature_id):
    """Read one tab-separated feature export and normalise its column names.

    The unused columns are dropped, "wals code" becomes "wals_code", and the
    generic "description" column is renamed to "description_<feature_id>" so
    the two features stay distinguishable after merging.
    """
    frame = pd.read_csv(path, sep="\t")
    frame = frame.drop(columns=cols_to_drop)
    return frame.rename(
        columns={"wals code": "wals_code", "description": f"description_{feature_id}"}
    )


df_81A = _load_wals_feature("81A.txt", "81A")
df_26A = _load_wals_feature("26A.txt", "26A")

# merge() defaults to an inner join, so only languages present in both
# exports (with identical key columns) survive.
df_merged = df_81A.merge(df_26A, on=merge_columns)

# Collapse the strong/weak distinction into plain "Suffixing" / "Prefixing".
# NOTE(review): "Strong prefixing" (not "Strongly") presumably mirrors the
# label used in the 26A export -- the recorded output lists no leftover
# prefixing category. Confirm against the data before "fixing" the spelling.
affix_collapse = {
    "Strongly suffixing": "Suffixing",
    "Weakly suffixing": "Suffixing",
    "Strong prefixing": "Prefixing",
    "Weakly prefixing": "Prefixing",
}
# The replacement is applied to every cell of the frame that equals the
# label exactly, not just to the description_26A column.
for fine_label, coarse_label in affix_collapse.items():
    df_merged = df_merged.replace(fine_label, coarse_label)

In [13]:
# Sanity checks on the merged frame: column names, row count, and the
# distinct affixation categories that remain.
cols_merged = df_merged.columns.tolist()
print(cols_merged)
print(len(df_merged))
print(df_merged["description_26A"].unique())

# Format according to the Step 2 chart and write it out as CSV.
step2_names = {
    "name": "Language",
    "family": "Language Family",
    "description_81A": "Basic Order",
    "description_26A": "Affixes",
}
step2_order = ["Language", "Language Family", "Affixes", "Basic Order"]
df_step2 = (
    df_merged
    .drop(columns=["wals_code", "genus"])
    .rename(columns=step2_names)
    .reindex(columns=step2_order)
)
# The output path has no file extension; the content is comma-separated text.
df_step2.to_csv("step2_chart", index=False)

['wals_code', 'name', 'description_81A', 'genus', 'family', 'description_26A']
876
['Suffixing' 'Equal prefixing and suffixing' 'Little affixation'
 'Prefixing']


In [7]:
# Reproduces the table shown on the WALS website when tabulating the two
# features: maps each (affixing, word order) pair to the WALS codes of the
# languages that have it.
suffix_word_order = {}
for row in df_merged.itertuples():
    pair = (row.description_26A, row.description_81A)
    suffix_word_order.setdefault(pair, []).append(row.wals_code)

# Most frequent pair first. sorted() is stable, so pairs with equal counts
# keep the order in which they were first encountered.
ranked_pairs = sorted(
    suffix_word_order.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)
suffix_word_order = dict(ranked_pairs)
for pair, codes in suffix_word_order.items():
    print(f"{pair} : {len(codes)}")


('Suffixing', 'SOV') : 276
('Suffixing', 'SVO') : 92
('Prefixing', 'SVO') : 81
('Little affixation', 'SVO') : 72
('Suffixing', 'No dominant order') : 69
('Equal prefixing and suffixing', 'SVO') : 43
('Equal prefixing and suffixing', 'SOV') : 42
('Little affixation', 'SOV') : 29
('Equal prefixing and suffixing', 'No dominant order') : 27
('Prefixing', 'No dominant order') : 22
('Suffixing', 'VSO') : 19
('Prefixing', 'SOV') : 19
('Equal prefixing and suffixing', 'VSO') : 16
('Prefixing', 'VSO') : 15
('Little affixation', 'VSO') : 15
('Little affixation', 'No dominant order') : 12
('Equal prefixing and suffixing', 'VOS') : 7
('Prefixing', 'VOS') : 5
('Little affixation', 'VOS') : 5
('Suffixing', 'VOS') : 3
('Suffixing', 'OVS') : 3
('Equal prefixing and suffixing', 'OVS') : 3
('Little affixation', 'OVS') : 1


In [9]:
# Count how many languages have each basic word order.
word_order_dict = {}
for order in df_merged["description_81A"]:
    word_order_dict[order] = word_order_dict.get(order, 0) + 1

# Rebuild the dict with the most common word order first, then print it.
ranked_orders = sorted(word_order_dict.items(), key=lambda item: item[1], reverse=True)
word_order_dict = dict(ranked_orders)
for order, n_languages in word_order_dict.items():
    print(f"{order} : {n_languages}")

SOV : 366
SVO : 288
No dominant order : 130
VSO : 65
VOS : 20
OVS : 7


In [4]:
# Columns of df_merged:
# ['wals_code', 'name', 'description_81A', 'genus', 'family', 'description_26A']
# Group the languages by family:
# {"language family": {(affixing, word order): ["code1", "code2"], ...}, ...}

lang_families = {}
for row in df_merged.itertuples():
    affixing_word_order = (row.description_26A, row.description_81A)
    family_combos = lang_families.setdefault(row.family, {})
    family_combos.setdefault(affixing_word_order, []).append(row.wals_code)

# The earlier version re-sorted the family's dict after every row but bound
# the result to a local name, so the ordering was thrown away and the report
# came out in first-seen order. The combinations are now ranked once per
# family, at write time. lang_families itself is left in insertion order.
with open("analysis2.txt", "w", encoding="utf-8") as f:
    for family, family_combos in lang_families.items():
        f.write(f"Language Family: {family}\n")
        # Largest combination first; ties keep first-seen order (stable sort).
        ranked_combos = sorted(
            family_combos.items(),
            key=lambda dict_tup: len(dict_tup[1]),
            reverse=True,
        )
        for tup, codes in ranked_combos:
            f.write(f"\t{tup}: {len(codes)}\n")
        f.write("\n")

In [32]:
# P(suffixing | SOV) 
# There are much faster and more efficient ways to calculate this...

def format_joint_dict(joint_probs):
    """Print a two-level mapping as an indented outline.

    Every outer key is printed on its own line, followed by one tab-indented
    "inner_key : value" line for each entry of its inner mapping. Nothing is
    returned; the function only writes to stdout.
    """
    for outer_key, inner in joint_probs.items():
        print(outer_key)
        for inner_key, value in inner.items():
            print(f"\t{inner_key} : {value}")

def calculate_affix_given_wordorder(tups):
    """Estimate P(affix | word order) from per-language observations.

    Input tups: [(code, affix, word order), ...]; the code is not used.

    Prints the word-order priors (each word order's share of all tuples,
    rounded to 3 decimals) and returns {word_order: {affix: probability}},
    where each probability is the affix's share within that word order,
    rounded to 3 decimals. Keys appear in first-seen order.
    """
    # Co-occurrence counts: word order -> affix -> number of languages.
    counts = {}
    for _code, affix, word_order in tups:
        per_order = counts.setdefault(word_order, {})
        per_order[affix] = per_order.get(affix, 0) + 1

    # Number of languages per word order; the denominator for both steps below.
    totals = {word_order: sum(per_order.values()) for word_order, per_order in counts.items()}

    priors = {word_order: round(total / len(tups), 3) for word_order, total in totals.items()}
    print(priors)

    # Normalise each row of counts into conditional probabilities.
    return {
        word_order: {
            affix: round(count / totals[word_order], 3)
            for affix, count in per_order.items()
        }
        for word_order, per_order in counts.items()
    }

# One (code, affix, word order) triple per row of df_merged, in row order.
tups = list(
    zip(
        df_merged["wals_code"],
        df_merged["description_26A"],
        df_merged["description_81A"],
    )
)
affix_given_wordorder = calculate_affix_given_wordorder(tups)
format_joint_dict(affix_given_wordorder)

def calculate_wordorder_given_affix(tups):
    """Estimate P(word order | affix) from per-language observations.

    Input tups: [(code, affix, word order), ...]; the code is not used.

    Prints the affix priors (each affix type's share of all tuples, rounded
    to 3 decimals) and returns {affix: {word_order: probability}}, where each
    probability is the word order's share within that affix type, rounded to
    3 decimals. Keys appear in first-seen order.
    """
    # Co-occurrence counts: affix -> word order -> number of languages.
    counts = {}
    for _code, affix, word_order in tups:
        per_affix = counts.setdefault(affix, {})
        per_affix[word_order] = per_affix.get(word_order, 0) + 1

    # Number of languages per affix type; the denominator for both steps below.
    totals = {affix: sum(per_affix.values()) for affix, per_affix in counts.items()}

    priors = {affix: round(total / len(tups), 3) for affix, total in totals.items()}
    print(priors)

    # Normalise each row of counts into conditional probabilities.
    return {
        affix: {
            word_order: round(count / totals[affix], 3)
            for word_order, count in per_affix.items()
        }
        for affix, per_affix in counts.items()
    }

# Visual separator in the cell output: the "\n" string plus print()'s own
# line terminator yields two line breaks.
print("\n")
# Reuses the module-level `tups` triples, this time conditioning on the
# affix type: the call prints the affix priors and returns
# {affix: {word order: probability}}, which is then printed as an outline.
wordorder_given_affix = calculate_wordorder_given_affix(tups)
format_joint_dict(wordorder_given_affix)



{'SVO': 0.329, 'SOV': 0.418, 'No dominant order': 0.148, 'VSO': 0.074, 'VOS': 0.023, 'OVS': 0.008}
SVO
	Suffixing : 0.319
	Little affixation : 0.25
	Equal prefixing and suffixing : 0.149
	Prefixing : 0.281
SOV
	Equal prefixing and suffixing : 0.115
	Little affixation : 0.079
	Suffixing : 0.754
	Prefixing : 0.052
No dominant order
	Little affixation : 0.092
	Equal prefixing and suffixing : 0.208
	Suffixing : 0.531
	Prefixing : 0.169
VSO
	Suffixing : 0.292
	Prefixing : 0.231
	Equal prefixing and suffixing : 0.246
	Little affixation : 0.231
VOS
	Equal prefixing and suffixing : 0.35
	Suffixing : 0.15
	Prefixing : 0.25
	Little affixation : 0.25
OVS
	Suffixing : 0.429
	Equal prefixing and suffixing : 0.429
	Little affixation : 0.143


{'Suffixing': 0.527, 'Equal prefixing and suffixing': 0.158, 'Little affixation': 0.153, 'Prefixing': 0.162}
Suffixing
	SVO : 0.199
	No dominant order : 0.149
	SOV : 0.597
	VSO : 0.041
	VOS : 0.006
	OVS : 0.006
Equal prefixing and suffixing
	SOV : 0.304
	SVO : 

In [61]:
# Calculates conditional probabilities for each language family.

# How many languages are in each language family? Largest family first.
family_counts = [
    (family, sum(len(langs) for langs in combos.values()))
    for family, combos in lang_families.items()
]
family_counts.sort(key=lambda tup: tup[1], reverse=True)

# Only tabulate statistics for language families with at least 10 languages.
interested_families = [
    (family, count) for family, count in family_counts if count >= 10
]
print(interested_families)
print("\n")

def format_prob_wordorder_given_affix(tups, probs_threshold=0.0):
    """Format P(word order | affix) as tab-indented text lines.

    Args:
        tups: iterable of (code, affix, word order) tuples; the code is not
            used. Affix values must be strings because they are concatenated
            into the header lines.
        probs_threshold: minimum probability a word order needs in order to
            be listed. The comparison uses the value after rounding to three
            decimals.

    Returns:
        A list of newline-terminated strings. For every affix type, in
        first-seen order, there is a header line indented by one tab, followed
        by one "<word order> : <probability>" line indented by two tabs for
        each word order that meets the threshold. The header is emitted even
        when no word order qualifies. The list always ends with a bare
        newline entry that separates this table from whatever follows.
    """
    # Co-occurrence counts: affix -> word order -> number of languages.
    counts = {}
    for _code, affix, word_order in tups:
        per_affix = counts.setdefault(affix, {})
        per_affix[word_order] = per_affix.get(word_order, 0) + 1

    # The unused `priors` table computed here previously has been removed;
    # it was never read and was the only reason `tups` needed a len().
    lines = []
    for affix, per_affix in counts.items():
        lines.append("\t" + affix + "\n")
        affix_total = sum(per_affix.values())
        for word_order, count in per_affix.items():
            prob = round(count / affix_total, 3)
            if prob >= probs_threshold:
                lines.append(f"\t\t{word_order} : {prob}\n")

    return lines + ["\n"]
    
# Build the per-family report: one header line per family followed by its
# P(word order | affix) table, then write everything in a single call.
lines = []
for family_name, family_count in interested_families:
    # NOTE(review): "langauges" is misspelled in the emitted header; it is
    # runtime output and is deliberately left unchanged here.
    lines.append(f"{family_name}: {family_count} langauges\n")
    # Expand the family's {(affix, word order): [codes]} mapping back into
    # one (code, affix, word order) tuple per language.
    tups = [
        (wals_code, affix, word_order)
        for (affix, word_order), codes in lang_families[family_name].items()
        for wals_code in codes
    ]
    # Only word orders with probability of at least 0.3 are listed.
    lines.extend(format_prob_wordorder_given_affix(tups, probs_threshold=0.3))

with open("statistics_per_family2.txt", "w", encoding="utf-8") as f:
    f.writelines(lines)

[('Austronesian', 104), ('Niger-Congo', 73), ('Indo-European', 51), ('Sino-Tibetan', 50), ('Afro-Asiatic', 50), ('Trans-New Guinea', 41), ('Pama-Nyungan', 38), ('Eastern Sudanic', 32), ('Uto-Aztecan', 24), ('Altaic', 23), ('Central Sudanic', 19), ('Oto-Manguean', 17), ('Austro-Asiatic', 14), ('Arawakan', 13), ('Uralic', 13), ('Penutian', 12), ('Hokan', 10)]


