In [2]:
import lang2vec.lang2vec as l2v
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


  import pkg_resources


### 1. Fetch WALS syntax features

In [60]:
languages = [
    "eng",
    "deu",
    "spa",
    "rus",
    "jpn",
    "hin",
    "tur",
    "ara",
    "por",
    "ita",
  ]
languages = list(l2v.available_languages())

# iso_code -> feature_vector
features_dict = l2v.get_features(languages, "syntax_wals", header=True)

# extract feature names
feature_names = features_dict["CODE"]

# build a matrix for all languages
# missing values are encoded as the string '--'
rows = []
valid_langs = []

for lang in languages:
    if lang not in features_dict:
        print(f"[warning] language {lang} not available, skipping.")
        continue

    rows.append(features_dict[lang])
    valid_langs.append(lang)

# each row is a language, each column is a wals feature
all_syntax_features_df = pd.DataFrame(rows, index=valid_langs, columns=feature_names)

# replace the missing marker '--' with np.nan so pandas can handle it
all_syntax_features_df = all_syntax_features_df.replace("--", np.nan)

print("shape:", all_syntax_features_df.shape)
display(all_syntax_features_df.head())


shape: (4005, 103)


Unnamed: 0,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,S_OSV,S_SUBJECT_BEFORE_VERB,S_SUBJECT_AFTER_VERB,S_OBJECT_AFTER_VERB,S_OBJECT_BEFORE_VERB,...,S_XVO,S_XOV,S_OXV,S_OVX,S_OBLIQUE_AFTER_VERB,S_OBLIQUE_AFTER_OBJECT,S_OBLIQUE_BEFORE_VERB,S_OBLIQUE_BEFORE_OBJECT,S_ARTICLE_WORD_BEFORE_NOUN,S_ARTICLE_WORD_AFTER_NOUN
hye,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
moq,,,,,,,,,,,...,,,,,,,,,,
omb,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,
dit,,,,,,,,,,,...,,,,,,,,,,
crc,,,,,,,,,,,...,,,,,,,,,,


Selecting only the features related to 49A, 50A, 81A, 85A-90A

In [None]:
# keep columns whose name contains any of the given substrings
def select_cols(df, keywords):
    return [c for c in df.columns if any(k in c for k in keywords)]

cols_49A = select_cols(all_syntax_features_df, ["CASE", "CASES"]) # 49A number of cases
cols_50A = select_cols(all_syntax_features_df, ["ASYMMETRIC"]) # 50A asymmetrical case marking
cols_81A = select_cols(all_syntax_features_df, ["SVO", "SOV", "VSO", "VOS", "OVS", "OSV"]) # 81A basic order
cols_85A = select_cols(all_syntax_features_df, ["ADPOSITION", "ADP","S_ADPOSITION_BEFORE_NOUN", "S_ADPOSITION_AFTER_NOUN"]) # 85A adposition + np
cols_86A = select_cols(all_syntax_features_df, ["S_POSSESSOR_BEFORE_NOUN", "S_POSSESSOR_AFTER_NOUN"]) # 86A genitive + noun
cols_87A = select_cols(all_syntax_features_df, ["ADJECTIVE","S_ADJECTIVE_BEFORE_NOUN","S_ADJECTIVE_AFTER_NOUN"]) # 87A adjective + noun
cols_88A = select_cols(all_syntax_features_df, ["DEMONSTRATIVE"]) # 88A demonstrative + noun
cols_89A = select_cols(all_syntax_features_df, ["NUMERAL"]) # 89A numeral + noun
cols_90A = select_cols(all_syntax_features_df, ["RELATIVE"]) # 90A relative clause + noun

# flatten everything into one list of interesting columns
interesting_cols = (
    cols_49A + cols_50A + cols_81A +
    cols_85A + cols_86A + cols_87A +
    cols_88A + cols_89A + cols_90A
)

# keep only these columns in a new dataframe
focus_syntax_features_df = all_syntax_features_df[interesting_cols].copy()

print("shape (focused on 49A, 50A, 81A, 85A–90A):", focus_syntax_features_df.shape)
print("first few columns:", focus_syntax_features_df.columns[:10].tolist())
display(focus_syntax_features_df.head())


shape (focused on 49A, 50A, 81A, 85A–90A): (4005, 28)
first few columns: [np.str_('S_CASE_PREFIX'), np.str_('S_CASE_SUFFIX'), np.str_('S_CASE_PROCLITIC'), np.str_('S_CASE_ENCLITIC'), np.str_('S_CASE_MARK'), np.str_('S_SVO'), np.str_('S_SOV'), np.str_('S_VSO'), np.str_('S_VOS'), np.str_('S_OVS')]


Unnamed: 0,S_CASE_PREFIX,S_CASE_SUFFIX,S_CASE_PROCLITIC,S_CASE_ENCLITIC,S_CASE_MARK,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,...,S_ANY_AGREEMENT_ON_ADJECTIVES,S_DEMONSTRATIVE_WORD_BEFORE_NOUN,S_DEMONSTRATIVE_WORD_AFTER_NOUN,S_DEMONSTRATIVE_PREFIX,S_DEMONSTRATIVE_SUFFIX,S_NUMERAL_BEFORE_NOUN,S_NUMERAL_AFTER_NOUN,S_RELATIVE_BEFORE_NOUN,S_RELATIVE_AFTER_NOUN,S_RELATIVE_AROUND_NOUN
hye,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
moq,,,,,,,,,,,...,,,,,,,,,,
omb,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
dit,,,,,,,,,,,...,,,,,,,,,,
crc,,,,,,,,,,,...,,,,,,,,,,


# Helper Functions

In [16]:
def get_val(row, col):
    v = row[col]
    # Handle missing/placeholder values
    if pd.isna(v) or v == '--':
        return 0.0
    return float(v)

# Greenberg Rule 3
Languages with dominant VSO order are always prepositional

In [None]:
# map each word order to its column name
wo_cols = {}
for wo in ["SVO", "SOV", "VSO", "VOS", "OVS", "OSV"]:
    for c in cols_81A:
        if wo in c:
            wo_cols[wo] = c
            break
# pick the specific columns for adposition before/after the noun
adp_before_col = "S_ADPOSITION_BEFORE_NOUN"
adp_after_col  = "S_ADPOSITION_AFTER_NOUN"

# initialize counts dict for each word order and adposition type

counts = {
    "preposition":  {wo: 0 for wo in wo_cols.keys()},
    "postposition": {wo: 0 for wo in wo_cols.keys()},
}

# iterate over languages

for lang, row in focus_syntax_features_df.iterrows():
    # check if preposition or postposition
    before = get_val(row, adp_before_col)
    after  = get_val(row, adp_after_col)

    if before > 0 and after == 0:
        row_type = "preposition"
    elif after > 0 and before == 0:
        row_type = "postposition"
    else:
        continue  # skip mixed cases

    # Check each word-order column
    for wo, col_name in wo_cols.items():
        if get_val(row, col_name) > 0:
            counts[row_type][wo] += 1

# convert to dataframe
df_wo_vs_adp = pd.DataFrame.from_dict(counts, orient="index")
display(df_wo_vs_adp)

Unnamed: 0,SVO,SOV,VSO,VOS,OVS,OSV
preposition,326,17,96,39,3,0
postposition,46,381,6,0,10,3


According to WALS data 6 languages with VSO order use postpositions and thus violate greenberg #3.

# Greenberg Rule 4
With overwhelmingly more than chance frequency, languages with normal SOV order are postpositional.

In [20]:

sov_col = [c for c in cols_81A if "SOV" in c][0]
adp_before_col = "S_ADPOSITION_BEFORE_NOUN"
adp_after_col  = "S_ADPOSITION_AFTER_NOUN"

# build a cross table beteween SOV/non-SOV and pre/postposition
records = []

for lang, row in focus_syntax_features_df.iterrows():
    # word order group
    is_SOV = get_val(row, sov_col) > 0
    wo_group = "SOV" if is_SOV else "non-SOV"

    # adposition type
    before = get_val(row, adp_before_col)
    after  = get_val(row, adp_after_col)

    if before > 0 and after == 0:
        adp_type = "preposition"
    elif after > 0 and before == 0:
        adp_type = "postposition"
    else:
        adp_type = "mixed/other"  # ambiguous cases

    records.append({
        "language": lang,
        "word_order_group": wo_group,
        "adp_type": adp_type
    })

df_rule4 = pd.DataFrame.from_records(records).set_index("language")

# create dataframe
mask = df_rule4["adp_type"].isin(["preposition", "postposition"])
df_rule4_clear = df_rule4[mask]

table_rule4 = pd.crosstab(
    df_rule4_clear["word_order_group"],
    df_rule4_clear["adp_type"]
)

display(table_rule4)


# Greenberg Rule 4: SOV vs (pre/post)position
adp_type          postposition  preposition
word_order_group                           
SOV                        381           17
non-SOV                    172          487


adp_type,postposition,preposition
word_order_group,Unnamed: 1_level_1,Unnamed: 2_level_1
SOV,381,17
non-SOV,172,487


381 out of 398 SOV languages are postpositional. 
This confirms that SOV languages have a strong tendency to be postpositional.

## Greenberg Rule 5
If a language has dominant SOV order and the genitive follows the governing noun, then the adjective likewise follows the noun.

In [64]:
df = all_syntax_features_df

df_filtered = df[
    (df["S_SOV"] == 1) &
    (df["S_POSSESSOR_AFTER_NOUN"] == 1)
]

count_adj_after  = (df_filtered["S_ADJECTIVE_AFTER_NOUN"] == 1).sum()
count_adj_before = (df_filtered["S_ADJECTIVE_BEFORE_NOUN"] == 1).sum()

print("Languages with SOV=1 and POSSESSOR_AFTER_NOUN=1:", len(df_filtered))
print("… of which ADJECTIVE_AFTER=1:", count_adj_after)
print("… of which ADJECTIVE_BEFORE=1:", count_adj_before)


Languages with SOV=1 and POSSESSOR_AFTER_NOUN=1: 55
… of which ADJECTIVE_AFTER=1: 43
… of which ADJECTIVE_BEFORE=1: 8


In 43 out of 55 languages with SOV and genetive follows the governing noun greenberg rule 5 holds.