# WALS Greenberg Crosstables

## Imports

In [3]:
import pandas as pd
from collections import defaultdict
import lang2vec.lang2vec as l2v

## Available Languages

In [25]:
# Fetch the language identifiers once and reuse the result for both the
# count below and the later cells that iterate over `languages`.
languages = l2v.available_languages()
print(f"Number of available languages: {len(languages)}")

Number of available languages: 4005


In [26]:
# List the feature-set names lang2vec can serve; "syntax_wals" is the one
# requested further down.
feature_sets = l2v.available_feature_sets()
print(f"Available feature sets: {feature_sets}")

Available feature sets: ['syntax_wals', 'phonology_wals', 'syntax_sswl', 'syntax_ethnologue', 'phonology_ethnologue', 'inventory_ethnologue', 'inventory_phoible_aa', 'inventory_phoible_gm', 'inventory_phoible_saphon', 'inventory_phoible_spa', 'inventory_phoible_ph', 'inventory_phoible_ra', 'inventory_phoible_upsid', 'syntax_knn', 'phonology_knn', 'inventory_knn', 'syntax_average', 'phonology_average', 'inventory_average', 'fam', 'id', 'geo', 'learned']


In [27]:
# Request the "syntax_wals" feature set for every available language in one
# call. The language codes are passed as a single space-separated string.
# With header=True the result also carries a "CODE" entry, which is used
# below to translate feature names into positions within each vector.
syntax_wals_features = l2v.get_features(
    " ".join(languages),
    "syntax_wals",
    header=True,
)
codes = syntax_wals_features["CODE"]

def idx(name):
    """Return the position of feature code *name* within ``codes``.

    Args:
        name: Feature code to look up, e.g. ``"S_SVO"``.

    Returns:
        The index of the first occurrence of *name* in the module-level
        ``codes`` sequence.

    Raises:
        ValueError: If *name* is not present in ``codes``. The message
            names the missing code so a typo is easy to spot.
    """
    try:
        return codes.index(name)
    except ValueError:
        # Same exception type as list.index, but with a message that says
        # which code was requested.
        raise ValueError(
            f"Unknown feature code {name!r}: not present in the loaded 'CODE' header"
        ) from None

# Positions of the six subject/verb/object order codes inside each
# language's feature vector, looked up once so the helpers below can index
# vectors directly.
(
    idx_S_SVO,
    idx_S_SOV,
    idx_S_VSO,
    idx_S_VOS,
    idx_S_OVS,
    idx_S_OSV,
) = map(idx, ("S_SVO", "S_SOV", "S_VSO", "S_VOS", "S_OVS", "S_OSV"))

# Positions of the two adposition-placement codes.
idx_ADP_BEFORE, idx_ADP_AFTER = (
    idx("S_ADPOSITION_BEFORE_NOUN"),
    idx("S_ADPOSITION_AFTER_NOUN"),
)

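## Greenberg's Universal 3
Languages with dominant VSO order are always prepositional.

The cells below cross-tabulate word order against adposition placement in the WALS syntax features.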

In [28]:
def word_order_category(vec):
    """Reduce a language's word-order features to "VO", "OV" or "other".

    Args:
        vec: Feature vector indexed by the module-level ``idx_S_*``
            positions. Entries are expected to be numeric (0.0 / 1.0) or
            the string '--' for a missing value; missing counts as 0.0.

    Returns:
        "VO" when at least one verb-before-object order (SVO, VSO, VOS) is
        set and no object-before-verb order is; "OV" for the mirror case
        (SOV, OVS, OSV); "other" when both or neither side is set.
    """
    def as_number(position):
        cell = vec[position]
        if cell == '--':
            return 0.0
        return float(cell)

    # Total evidence for each side of the verb/object split.
    verb_first = sum(
        as_number(position) for position in (idx_S_SVO, idx_S_VSO, idx_S_VOS)
    )
    object_first = sum(
        as_number(position) for position in (idx_S_SOV, idx_S_OVS, idx_S_OSV)
    )

    if verb_first > 0 and object_first == 0:
        return "VO"
    if object_first > 0 and verb_first == 0:
        return "OV"
    # Mixed evidence, or no word-order information at all.
    return "other"


In [29]:
def adposition_category(vec):
    """Reduce a language's adposition features to a single label.

    Args:
        vec: Feature vector indexed by the module-level ``idx_ADP_BEFORE``
            and ``idx_ADP_AFTER`` positions. Entries are expected to be
            numeric or the string '--' for a missing value; missing counts
            as 0.0.

    Returns:
        "preposition" when only the adposition-before-noun feature is set,
        "postposition" when only the adposition-after-noun feature is set,
        and "other" when both or neither is set.
    """
    def as_number(position):
        cell = vec[position]
        if cell == '--':
            return 0.0
        return float(cell)

    pre_score = as_number(idx_ADP_BEFORE)   # adposition precedes the noun
    post_score = as_number(idx_ADP_AFTER)   # adposition follows the noun

    if pre_score > 0 and post_score == 0:
        return "preposition"
    if post_score > 0 and pre_score == 0:
        return "postposition"
    return "other"

In [30]:
# Build one row per language whose word order AND adposition placement are
# both unambiguous; everything labelled "other" on either axis is dropped.
records = []

for lang in languages:
    vec = syntax_wals_features[lang]
    wo = word_order_category(vec)
    adp = adposition_category(vec)

    if wo not in ("VO", "OV") or adp not in ("preposition", "postposition"):
        continue

    records.append(dict(lang=lang, word_order=wo, adposition=adp))

df = pd.DataFrame(records)
print(df)

# Contingency table: rows are word-order classes, columns are adposition
# classes, cells are language counts.
crosstab = pd.crosstab(index=df["word_order"], columns=df["adposition"])
print("\nCross-table VO/OV vs pre/post:")
print(crosstab)


    lang word_order    adposition
0    cco         VO   preposition
1    mag         OV  postposition
2    jei         OV  postposition
3    hop         OV  postposition
4    jbu         VO   preposition
..   ...        ...           ...
849  had         VO   preposition
850  aqc         OV  postposition
851  kzj         VO   preposition
852  tat         OV  postposition
853  sur         VO   preposition

[854 rows x 3 columns]

Cross-table VO/OV vs pre/post:
adposition  postposition  preposition
word_order                           
OV                   377           15
VO                    37          425


In [36]:
def get_val(vec, i):
    """Return entry *i* of *vec* as a float, mapping '--' to 0.0.

    Args:
        vec: Indexable feature vector.
        i: Position to read.

    Returns:
        0.0 when the entry equals the string '--' (the missing-value
        marker), otherwise ``float()`` of the entry.
    """
    cell = vec[i]
    if cell == '--':
        return 0.0
    return float(cell)

# Position of each of the six subject/verb/object order codes, keyed by a
# short label that doubles as the column name in the final table.
IDX = {
    label: idx(f"S_{label}")
    for label in ("SVO", "SOV", "VSO", "VOS", "OVS", "OSV")
}

# counts[adposition class][word-order label] -> number of languages.
counts = {
    "preposition": dict.fromkeys(IDX, 0),
    "postposition": dict.fromkeys(IDX, 0),
}

for lang in languages:
    vec = syntax_wals_features[lang]

    # Reuse the shared classifier instead of repeating its before/after
    # comparison here; languages it labels "other" are skipped.
    row = adposition_category(vec)
    if row not in counts:
        continue

    # A language is counted once for every word-order feature that is set,
    # so a language with more than one order set contributes to several
    # columns.
    for wo, i in IDX.items():
        if get_val(vec, i) > 0:
            counts[row][wo] += 1

# One row per adposition class, one column per word order.
df_wo_vs_adp = pd.DataFrame.from_dict(counts, orient="index")
print(df_wo_vs_adp)

              SVO  SOV  VSO  VOS  OVS  OSV
preposition   326   17   96   39    3    0
postposition   46  381    6    0   10    3
