In [1]:
import pandas as pd

# Columns present in the raw WALS exports that this analysis never uses.
cols_to_drop = ["value", "latitude", "longitude", "area"]

# Identifier columns shared by both feature tables; used as the join key.
merge_columns = ["wals_code", "name", "genus", "family"]


def _load_wals_feature(path, feature_id):
    """Read one tab-separated WALS feature export and tidy its columns.

    Parameters
    ----------
    path : str
        Path to the tab-separated export (e.g. ``"81A.txt"``).
    feature_id : str
        Feature identifier appended to the ``description`` column name so the
        two feature tables do not collide when merged.

    Returns
    -------
    pandas.DataFrame
        The export without the columns listed in ``cols_to_drop``, with
        ``"wals code"`` renamed to ``"wals_code"`` and ``"description"``
        renamed to ``"description_<feature_id>"``.
    """
    column_names = {
        "wals code": "wals_code",
        "description": f"description_{feature_id}",
    }
    return (
        pd.read_csv(path, sep="\t")
        .drop(columns=cols_to_drop)
        .rename(columns=column_names)
    )


df_81A = _load_wals_feature("81A.txt", "81A")
df_26A = _load_wals_feature("26A.txt", "26A")

# Default (inner) merge: keeps only languages present in both feature tables.
df_merged = df_81A.merge(df_26A, on=merge_columns)

# Collapse the strong/weak distinction in the 26A labels.  The mapping is
# applied to the description_26A column only, so an identical string in any
# other column (e.g. a language name) is left alone.
affix_label_map = {
    "Strongly suffixing": "Suffixing",
    "Weakly suffixing": "Suffixing",
    "Strong prefixing": "Prefixing",
    "Weakly prefixing": "Prefixing",
}
df_merged = df_merged.assign(
    description_26A=df_merged["description_26A"].replace(affix_label_map)
)

In [13]:
# Quick look at the merged frame: column names, row count, and the distinct
# values found in description_26A.
cols_merged = list(df_merged.columns)
print(cols_merged)
print(len(df_merged))
print(df_merged["description_26A"].unique())

# Format according to the Step 2 chart and write the result to CSV.
step2_headers = {
    "name": "Language",
    "family": "Language Family",
    "description_81A": "Basic Order",
    "description_26A": "Affixes",
}
step2_column_order = ["Language", "Language Family", "Affixes", "Basic Order"]

df_step2 = (
    df_merged
    .drop(columns=["wals_code", "genus"])
    .rename(columns=step2_headers)
    .reindex(columns=step2_column_order)
)
df_step2.to_csv("step2_chart", index=False)

['wals_code', 'name', 'description_81A', 'genus', 'family', 'description_26A']
876
['Suffixing' 'Equal prefixing and suffixing' 'Little affixation'
 'Prefixing']


In [7]:
# Tabulate languages by (affixing, word order) pair; per the original note this
# produces the graph shown on the WALS website when tabulating features.
suffix_word_order = {}

# Walk the three relevant columns in row order and collect the WALS codes
# that fall under each (affixing, word order) combination.
feature_columns = zip(
    df_merged["description_26A"],
    df_merged["description_81A"],
    df_merged["wals_code"],
)
for affixing, word_order, wals_code in feature_columns:
    suffix_word_order.setdefault((affixing, word_order), []).append(wals_code)

# Largest groups first; the sort is stable, so ties keep first-seen order.
ranked_combos = sorted(
    suffix_word_order.items(),
    key=lambda pair: len(pair[1]),
    reverse=True,
)
suffix_word_order = dict(ranked_combos)

for combo, codes in suffix_word_order.items():
    print(f"{combo} : {len(codes)}")


('Suffixing', 'SOV') : 276
('Suffixing', 'SVO') : 92
('Prefixing', 'SVO') : 81
('Little affixation', 'SVO') : 72
('Suffixing', 'No dominant order') : 69
('Equal prefixing and suffixing', 'SVO') : 43
('Equal prefixing and suffixing', 'SOV') : 42
('Little affixation', 'SOV') : 29
('Equal prefixing and suffixing', 'No dominant order') : 27
('Prefixing', 'No dominant order') : 22
('Suffixing', 'VSO') : 19
('Prefixing', 'SOV') : 19
('Equal prefixing and suffixing', 'VSO') : 16
('Prefixing', 'VSO') : 15
('Little affixation', 'VSO') : 15
('Little affixation', 'No dominant order') : 12
('Equal prefixing and suffixing', 'VOS') : 7
('Prefixing', 'VOS') : 5
('Little affixation', 'VOS') : 5
('Suffixing', 'VOS') : 3
('Suffixing', 'OVS') : 3
('Equal prefixing and suffixing', 'OVS') : 3
('Little affixation', 'OVS') : 1


In [9]:
# Count how many merged languages fall under each description_81A value.
word_order_dict = {}

for word_order in df_merged["description_81A"]:
    word_order_dict[word_order] = word_order_dict.get(word_order, 0) + 1

# Most frequent first; the sort is stable, so ties keep first-seen order.
ranked_orders = sorted(
    word_order_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
word_order_dict = dict(ranked_orders)

for order_label, count in word_order_dict.items():
    print(f"{order_label} : {count}")

SOV : 366
SVO : 288
No dominant order : 130
VSO : 65
VOS : 20
OVS : 7


In [4]:
# df_merged columns:
# ['wals_code', 'name', 'description_81A', 'genus', 'family', 'description_26A']
#
# Group by language family.  Resulting structure:
# {family: {(affixing, word_order): [wals_code, ...], ...}, ...}

lang_families = {}
for row in df_merged.itertuples():
    affixing_word_order = (row.description_26A, row.description_81A)
    (
        lang_families
        .setdefault(row.family, {})
        .setdefault(affixing_word_order, [])
        .append(row.wals_code)
    )

# Within each family, put the most common (affixing, word_order) combination
# first.  This is done once, after all rows are grouped, and the sorted dict is
# stored back into lang_families; sorting a temporary inside the loop would
# leave the stored dicts in insertion order.  Families themselves stay in
# first-seen order.
lang_families = {
    family: dict(
        sorted(combos.items(), key=lambda item: len(item[1]), reverse=True)
    )
    for family, combos in lang_families.items()
}

# One section per family: each combination with its number of languages.
with open("analysis2.txt", "w", encoding="utf-8") as f:
    for family, combos in lang_families.items():
        f.write(f"Language Family: {family}\n")
        for tup, codes in combos.items():
            f.write(f"\t{tup}: {len(codes)}\n")
        f.write("\n")

In [5]:
# Find the top language families: collect the WALS codes per family, then
# list families from most to fewest languages.
langs_by_family = {}

for family, wals_code in zip(df_merged["family"], df_merged["wals_code"]):
    langs_by_family.setdefault(family, []).append(wals_code)

# Stable sort: families with equal counts keep first-seen order.
ranked_families = sorted(
    langs_by_family.items(),
    key=lambda pair: len(pair[1]),
    reverse=True,
)
langs_by_family = dict(ranked_families)

for family, langs in langs_by_family.items():
    print(f"{family}: {len(langs)}")


Austronesian: 104
Niger-Congo: 73
Indo-European: 51
Sino-Tibetan: 50
Afro-Asiatic: 50
Trans-New Guinea: 41
Pama-Nyungan: 38
Eastern Sudanic: 32
Uto-Aztecan: 24
Altaic: 23
Central Sudanic: 19
Oto-Manguean: 17
Austro-Asiatic: 14
Arawakan: 13
Uralic: 13
Penutian: 12
Hokan: 10
Nakh-Daghestanian: 9
Tupian: 9
Torricelli: 8
Dravidian: 8
Mayan: 8
Pano-Tacanan: 7
Na-Dene: 7
Cariban: 6
Mande: 6
Gunwinyguan: 5
Siouan: 5
Algic: 5
Chibchan: 5
Kordofanian: 5
Guaicuruan: 4
Sepik: 4
Mangarrayi-Maran: 4
Salishan: 4
Saharan: 4
Iroquoian: 4
Muskogean: 4
Tai-Kadai: 4
North Halmaheran: 4
Northwest Caucasian: 3
Greater West Bomberai: 3
Eskimo-Aleut: 3
Tucanoan: 3
Skou: 3
Solomons East Papuan: 3
Hmong-Mien: 3
Chukotko-Kamchatkan: 3
Maban: 3
Mixe-Zoque: 3
Totonacan: 3
Keresan: 2
Asmat-Kamrau Bay: 2
Barbacoan: 2
Mangrida: 2
Oregon Coast: 2
Mirndi: 2
Dogon: 2
Teberan-Pawaian: 2
Worrorran: 2
Witotoan: 2
Zaparoan: 2
Kadu: 2
Tangkic: 2
Kiowa-Tanoan: 2
Khoe-Kwadi: 2
Songhay: 2
Tsimshianic: 2
Quechuan: 2
Yanomam: 2


In [6]:
# Show the merged row(s) for every language in the Sepik family, one
# printed frame per WALS code.
langs = langs_by_family["Sepik"]
for lang in langs:
    is_this_language = df_merged["wals_code"].eq(lang)
    print(df_merged.loc[is_this_language])

   wals_code      name description_81A       genus family description_26A
17       ala  Alamblak             SOV  Sepik Hill  Sepik       Suffixing
   wals_code   name description_81A genus family description_26A
61       awt  Awtuw             SOV   Ram  Sepik       Suffixing
    wals_code   name description_81A   genus family description_26A
394       kwo  Kwoma             SOV  Nukuma  Sepik       Suffixing
    wals_code   name description_81A         genus family  \
542       nam  Namia             SOV  Yellow River  Sepik   

                   description_26A  
542  Equal prefixing and suffixing  
