In [None]:
import lang2vec.lang2vec as l2v
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


  import pkg_resources


### 1. Fetch WALS syntax features

In [None]:
# Analyse every language lang2vec offers. The codes are sorted so that the row
# order of the resulting table (and the displayed head) is the same on every run.
# For a quick experiment, replace this with an explicit list of ISO codes, e.g.
# ["eng", "deu", "spa", "rus", "jpn", "hin", "tur", "ara", "por", "ita"].
languages = sorted(l2v.available_languages())

# iso_code -> feature_vector; the "CODE" entry (read below) holds the feature names
features_dict = l2v.get_features(languages, "syntax_wals", header=True)

# extract feature names
feature_names = features_dict["CODE"]

# build a matrix for all languages
# missing values are encoded as the string '--'
rows = []
valid_langs = []

for lang in languages:
    if lang not in features_dict:
        print(f"[warning] language {lang} not available, skipping.")
        continue

    rows.append(features_dict[lang])
    valid_langs.append(lang)

# each row is a language, each column is a wals feature
all_syntax_features_df = pd.DataFrame(rows, index=valid_langs, columns=feature_names)

# replace the missing marker '--' with np.nan so pandas can handle it, then
# cast to float explicitly so the dtype does not depend on pandas' implicit
# downcasting in replace(); an unexpected non-numeric value raises here
all_syntax_features_df = all_syntax_features_df.replace("--", np.nan).astype(float)

print("shape:", all_syntax_features_df.shape)
display(all_syntax_features_df.head())


shape: (4005, 103)


Unnamed: 0,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,S_OSV,S_SUBJECT_BEFORE_VERB,S_SUBJECT_AFTER_VERB,S_OBJECT_AFTER_VERB,S_OBJECT_BEFORE_VERB,...,S_XVO,S_XOV,S_OXV,S_OVX,S_OBLIQUE_AFTER_VERB,S_OBLIQUE_AFTER_OBJECT,S_OBLIQUE_BEFORE_VERB,S_OBLIQUE_BEFORE_OBJECT,S_ARTICLE_WORD_BEFORE_NOUN,S_ARTICLE_WORD_AFTER_NOUN
kqq,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,,,,,,,,,,
csr,,,,,,,,,,,...,,,,,,,,,,
hni,,,,,,,1.0,,,1.0,...,,,,,,,,,,
aeb,,,,,,,,,,,...,,,,,,,,,,
nsq,,,,,,,,,,,...,,,,,,,,,,


Selecting only the features related to 49A, 50A, 81A, 85A-90A

In [None]:
# collect the column names that contain at least one of the given substrings
def select_cols(df, keywords):
    """Return the names of df's columns that contain any keyword as a substring.

    Columns are returned in their original order; a column is listed once
    even when several keywords match it.
    """
    matched = []
    for column in df.columns:
        for keyword in keywords:
            if keyword in column:
                matched.append(column)
                break
    return matched

# one column list per WALS chapter, selected by substring match on the column name
cols_49A = select_cols(all_syntax_features_df, ["CASE", "CASES"]) # 49A number of cases
cols_50A = select_cols(all_syntax_features_df, ["ASYMMETRIC"]) # 50A asymmetrical case marking
cols_81A = select_cols(all_syntax_features_df, ["SVO", "SOV", "VSO", "VOS", "OVS", "OSV"]) # 81A basic order
cols_85A = select_cols(all_syntax_features_df, ["ADPOSITION", "ADP","S_ADPOSITION_BEFORE_NOUN", "S_ADPOSITION_AFTER_NOUN"]) # 85A adposition + np
cols_86A = select_cols(all_syntax_features_df, ["S_POSSESSOR_BEFORE_NOUN", "S_POSSESSOR_AFTER_NOUN"]) # 86A genitive + noun
cols_87A = select_cols(all_syntax_features_df, ["ADJECTIVE","S_ADJECTIVE_BEFORE_NOUN","S_ADJECTIVE_AFTER_NOUN"]) # 87A adjective + noun
cols_88A = select_cols(all_syntax_features_df, ["DEMONSTRATIVE"]) # 88A demonstrative + noun
cols_89A = select_cols(all_syntax_features_df, ["NUMERAL"]) # 89A numeral + noun
cols_90A = select_cols(all_syntax_features_df, ["RELATIVE"]) # 90A relative clause + noun

# flatten everything into one list of interesting columns;
# dict.fromkeys drops repeats while keeping first-seen order, so a column that
# matches keywords of two groups cannot end up duplicated in the dataframe
# (a duplicated label would make row[col] return a Series instead of a scalar)
interesting_cols = list(dict.fromkeys(
    cols_49A + cols_50A + cols_81A +
    cols_85A + cols_86A + cols_87A +
    cols_88A + cols_89A + cols_90A
))

# keep only these columns in a new dataframe
focus_syntax_features_df = all_syntax_features_df[interesting_cols].copy()

print("shape (focused on 49A, 50A, 81A, 85A–90A):", focus_syntax_features_df.shape)
# convert labels to plain str so the list prints without np.str_(...) wrappers
print("first few columns:", [str(c) for c in focus_syntax_features_df.columns[:10]])
display(focus_syntax_features_df.head())


shape (focused on 49A, 50A, 81A, 85A–90A): (4005, 30)
first few columns: [np.str_('S_CASE_PREFIX'), np.str_('S_CASE_SUFFIX'), np.str_('S_CASE_PROCLITIC'), np.str_('S_CASE_ENCLITIC'), np.str_('S_CASE_MARK'), np.str_('S_SVO'), np.str_('S_SOV'), np.str_('S_VSO'), np.str_('S_VOS'), np.str_('S_OVS')]


Unnamed: 0,S_CASE_PREFIX,S_CASE_SUFFIX,S_CASE_PROCLITIC,S_CASE_ENCLITIC,S_CASE_MARK,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,...,S_ANY_AGREEMENT_ON_ADJECTIVES,S_DEMONSTRATIVE_WORD_BEFORE_NOUN,S_DEMONSTRATIVE_WORD_AFTER_NOUN,S_DEMONSTRATIVE_PREFIX,S_DEMONSTRATIVE_SUFFIX,S_NUMERAL_BEFORE_NOUN,S_NUMERAL_AFTER_NOUN,S_RELATIVE_BEFORE_NOUN,S_RELATIVE_AFTER_NOUN,S_RELATIVE_AROUND_NOUN
kqq,,,,,,0.0,1.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,0.0,0.0,1.0,,,
csr,,,,,,,,,,,...,,,,,,,,,,
hni,,,,,,,,,,,...,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
aeb,,,,,,,,,,,...,,,,,,,,,,
nsq,,,,,,,,,,,...,,,,,,,,,,


# Helper Functions

In [None]:
def get_val(row, col):
    """Return row[col] as a float; NaN/None and the '--' placeholder become 0.0."""
    raw = row[col]
    # missing values and the placeholder marker are treated as "feature absent"
    is_missing = pd.isna(raw) or raw == '--'
    return 0.0 if is_missing else float(raw)

# Greenberg Rule 3
Languages with dominant VSO order are always prepositional

In [None]:
# map each word order to the first 81A column whose name contains it
wo_cols = {}
for order in ("SVO", "SOV", "VSO", "VOS", "OVS", "OSV"):
    first_match = next((name for name in cols_81A if order in name), None)
    if first_match is not None:
        wo_cols[order] = first_match

# columns that say whether the adposition stands before/after the noun
adp_before_col = "S_ADPOSITION_BEFORE_NOUN"
adp_after_col  = "S_ADPOSITION_AFTER_NOUN"

# one zero-initialised counter per (adposition type, word order) pair
counts = {
    adp_kind: dict.fromkeys(wo_cols, 0)
    for adp_kind in ("preposition", "postposition")
}

# tally every language that is clearly prepositional or clearly postpositional
for _, feats in focus_syntax_features_df.iterrows():
    before = get_val(feats, adp_before_col)
    after  = get_val(feats, adp_after_col)

    is_prep = before > 0 and after == 0
    is_post = after > 0 and before == 0
    if not (is_prep or is_post):
        continue  # mixed or missing adposition information

    adp_kind = "preposition" if is_prep else "postposition"

    # a language is counted under every word order whose column is set
    for order, column in wo_cols.items():
        if get_val(feats, column) > 0:
            counts[adp_kind][order] += 1

# rows = adposition type, columns = word order
df_wo_vs_adp = pd.DataFrame.from_dict(counts, orient="index")
display(df_wo_vs_adp)

Unnamed: 0,SVO,SOV,VSO,VOS,OVS,OSV
preposition,326,17,96,39,3,0
postposition,46,381,6,0,10,3


According to the WALS data, 6 languages with VSO order use postpositions and thus violate Greenberg's Rule 3.

**Interpretation of Greenberg Rule 3**

This rule says that VSO languages should always be prepositional. Our results show that most VSO languages are indeed prepositional, but six exceptions exist. This means the rule is a strong tendency, not an absolute law. Some languages may have mixed adposition systems or incomplete data in WALS.


# Greenberg Rule 4
With overwhelmingly more than chance frequency, languages with normal SOV order are postpositional.

In [None]:

sov_col = [c for c in cols_81A if "SOV" in c][0]
adp_before_col = "S_ADPOSITION_BEFORE_NOUN"
adp_after_col  = "S_ADPOSITION_AFTER_NOUN"

# build a cross table between SOV/non-SOV and pre/postposition
records = []

for lang, row in focus_syntax_features_df.iterrows():
    # A missing SOV value means the word order is unknown. Such languages are
    # skipped: get_val() would map the gap to 0.0 and silently file them under
    # "non-SOV", inflating that row of the table.
    sov_raw = row[sov_col]
    if pd.isna(sov_raw) or sov_raw == '--':
        continue

    # word order group
    is_SOV = get_val(row, sov_col) > 0
    wo_group = "SOV" if is_SOV else "non-SOV"

    # adposition type
    before = get_val(row, adp_before_col)
    after  = get_val(row, adp_after_col)

    if before > 0 and after == 0:
        adp_type = "preposition"
    elif after > 0 and before == 0:
        adp_type = "postposition"
    else:
        adp_type = "mixed/other"  # ambiguous or missing adposition data

    records.append({
        "language": lang,
        "word_order_group": wo_group,
        "adp_type": adp_type
    })

# explicit columns keep set_index() working even when no language qualified
df_rule4 = pd.DataFrame.from_records(
    records, columns=["language", "word_order_group", "adp_type"]
).set_index("language")

# keep only the languages with an unambiguous adposition type
mask = df_rule4["adp_type"].isin(["preposition", "postposition"])
df_rule4_clear = df_rule4[mask]

table_rule4 = pd.crosstab(
    df_rule4_clear["word_order_group"],
    df_rule4_clear["adp_type"]
)

display(table_rule4)


adp_type,postposition,preposition
word_order_group,Unnamed: 1_level_1,Unnamed: 2_level_1
SOV,381,17
non-SOV,172,487


381 out of 398 SOV languages are postpositional.
This confirms that SOV languages have a strong tendency to be postpositional.

**Interpretation of Greenberg Rule 4**

We find that almost all SOV languages are postpositional. This supports the universal very strongly. The few exceptions might be the result of language contact or special historical developments.


## Greenberg Rule 5
If a language has dominant SOV order and the genitive follows the governing noun, then the adjective likewise follows the noun.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: dominant SOV order and genitive after the noun
is_sov = df["S_SOV"].eq(1)
genitive_follows = df["S_POSSESSOR_AFTER_NOUN"].eq(1)
df_filtered = df.loc[is_sov & genitive_follows]

# consequent: where does the adjective stand in those languages?
count_adj_after  = df_filtered["S_ADJECTIVE_AFTER_NOUN"].eq(1).sum()
count_adj_before = df_filtered["S_ADJECTIVE_BEFORE_NOUN"].eq(1).sum()

print("Languages with SOV=1 and POSSESSOR_AFTER_NOUN=1:", df_filtered.shape[0])
print("… of which ADJECTIVE_AFTER=1:", count_adj_after)
print("… of which ADJECTIVE_BEFORE=1:", count_adj_before)


Languages with SOV=1 and POSSESSOR_AFTER_NOUN=1: 55
… of which ADJECTIVE_AFTER=1: 43
… of which ADJECTIVE_BEFORE=1: 8


Greenberg Rule 5 holds in 43 of the 55 languages that have SOV order and a genitive following the governing noun.

**Interpretation of Greenberg Rule 5**

Most SOV languages with a post-nominal possessor also place adjectives after the noun. This fits the idea that many word order features align consistently. The exceptions show that languages sometimes break expected patterns, but the general trend is clear.


## Greenberg Rule 6
All languages with dominant VSO order have the adjective after the noun.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: dominant VSO order
is_vso = df["S_VSO"].eq(1)
df_filtered = df.loc[is_vso]

# consequent: adjective position in the VSO languages
count_adj_after  = df_filtered["S_ADJECTIVE_AFTER_NOUN"].eq(1).sum()
count_adj_before = df_filtered["S_ADJECTIVE_BEFORE_NOUN"].eq(1).sum()

print("Languages dominant VSO order:", df_filtered.shape[0])
print("… of which ADJECTIVE_AFTER=1:", count_adj_after)
print("… of which ADJECTIVE_BEFORE=1:", count_adj_before)

Languages dominant VSO order: 118
… of which ADJECTIVE_AFTER=1: 83
… of which ADJECTIVE_BEFORE=1: 38


The rule holds for about 70% of the VSO languages (83 of 118).

**Interpretation of Greenberg Rule 6**

Around 70% of VSO languages put the adjective after the noun. This means the universal works well, but not perfectly. Some languages might follow independent adjective-placement rules that are not tied to VSO order.


## Greenberg Rule 16
If a language has dominant order VSO in declarative sentences, it always puts prepositions before the noun.

The table presented under "Greenberg Rule 3" largely supports this universal: among the VSO languages with an unambiguous adposition type, 96 are prepositional and only 6 are postpositional.

**Interpretation of Greenberg Rule 16**

Like rule 3, this rule says that VSO languages should use prepositions. Our earlier results show that this is true for most cases. A few exceptions indicate that the universal describes a strong preference, not a strict rule.


## Greenberg Rule 17
With overwhelmingly more than chance frequency, languages with dominant order SOV are postpositional.

Again the table presented under "Greenberg Rule 3" supports this universal, depicting that only 17 out of 398 SOV languages are also prepositional and 381 are postpositional.

**Interpretation of Greenberg Rule 17**

The data confirms this rule: SOV languages are almost always postpositional. This is one of the strongest universals in our analysis.


## Greenberg Rule 18
When the descriptive adjective precedes the noun, the demonstrative and the numeral likewise precede.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: descriptive adjective before the noun
adjective_first = df["S_ADJECTIVE_BEFORE_NOUN"].eq(1)
df_filtered = df.loc[adjective_first]

# consequent: position of demonstrative and numeral in those languages
count_demo_before = df_filtered["S_DEMONSTRATIVE_WORD_BEFORE_NOUN"].eq(1).sum()
count_demo_after  = df_filtered["S_DEMONSTRATIVE_WORD_AFTER_NOUN"].eq(1).sum()
count_num_before  = df_filtered["S_NUMERAL_BEFORE_NOUN"].eq(1).sum()
count_num_after   = df_filtered["S_NUMERAL_AFTER_NOUN"].eq(1).sum()

print("Languages with ADJECTIVE_BEFORE_NOUN=1:", df_filtered.shape[0])
print("… of which DEMONSTRATIVE_WORD_BEFORE_NOUN=1:", count_demo_before)
print("… of which DEMONSTRATIVE_WORD_AFTER_NOUN=1:", count_demo_after)
print("… of which NUMERAL_BEFORE_NOUN=1:", count_num_before)
print("… of which NUMERAL_AFTER_NOUN=1:", count_num_after)

Languages with ADJECTIVE_BEFORE_NOUN=1: 464
… of which DEMONSTRATIVE_WORD_BEFORE_NOUN=1: 317
… of which DEMONSTRATIVE_WORD_AFTER_NOUN=1: 49
… of which NUMERAL_BEFORE_NOUN=1: 305
… of which NUMERAL_AFTER_NOUN=1: 81


Among the 464 languages where the adjective precedes the noun, 317 place the demonstrative before the noun and 305 place the numeral before the noun.
A minority of the languages (49 for demonstratives, 81 for numerals) do not follow this universal.

**Interpretation of Greenberg Rule 18**

Languages with the adjective before the noun also often place demonstratives and numerals before the noun. The pattern is strong but not perfect.

## Greenberg Rule 19
When the descriptive adjective follows the noun, the demonstrative and the numeral likewise follow.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: descriptive adjective AFTER the noun
df_filtered = df[
    (df["S_ADJECTIVE_AFTER_NOUN"] == 1)
]

# consequent: position of demonstrative and numeral in those languages
count_demo_before = (df_filtered["S_DEMONSTRATIVE_WORD_BEFORE_NOUN"] == 1).sum()
count_demo_after  = (df_filtered["S_DEMONSTRATIVE_WORD_AFTER_NOUN"] == 1).sum()
count_num_before  = (df_filtered["S_NUMERAL_BEFORE_NOUN"] == 1).sum()
count_num_after   = (df_filtered["S_NUMERAL_AFTER_NOUN"] == 1).sum()

# the label names the column that was actually filtered on (AFTER, not BEFORE)
print("Languages with ADJECTIVE_AFTER_NOUN=1:", len(df_filtered))
print("… of which DEMONSTRATIVE_WORD_BEFORE_NOUN=1:", count_demo_before)
print("… of which DEMONSTRATIVE_WORD_AFTER_NOUN=1:", count_demo_after)
print("… of which NUMERAL_BEFORE_NOUN=1:", count_num_before)
print("… of which NUMERAL_AFTER_NOUN=1:", count_num_after)

Languages with ADJECTIVE_BEFORE_NOUN=1: 952
… of which DEMONSTRATIVE_WORD_BEFORE_NOUN=1: 240
… of which DEMONSTRATIVE_WORD_AFTER_NOUN=1: 492
… of which NUMERAL_BEFORE_NOUN=1: 246
… of which NUMERAL_AFTER_NOUN=1: 563


Only about half of the languages follow this universal.

**Interpretation of Greenberg Rule 19**

When adjectives follow the noun, demonstratives and numerals follow only about half of the time. This shows weak support for the universal. Modifier order appears to be less stable when the adjective is post-nominal.


## Greenberg Rule 20
When any or all of the modifiers precede the noun, the genitive almost always precedes.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: at least one modifier (Adj/Dem/Num) before the noun
prenominal_cols = [
    "S_ADJECTIVE_BEFORE_NOUN",
    "S_DEMONSTRATIVE_WORD_BEFORE_NOUN",
    "S_NUMERAL_BEFORE_NOUN",
]
any_modifier_before = df[prenominal_cols].eq(1).any(axis=1)
df_filtered = df.loc[any_modifier_before]

# consequent: genitive (possessor) position in those languages
count_gen_before = df_filtered["S_POSSESSOR_BEFORE_NOUN"].eq(1).sum()
count_gen_after  = df_filtered["S_POSSESSOR_AFTER_NOUN"].eq(1).sum()

print("Languages with ANY of the modifiers before the noun (Adj/Dem/Num):", df_filtered.shape[0])
print("… of which POSSESSOR_BEFORE_NOUN=1:", count_gen_before)
print("… of which POSSESSOR_AFTER_NOUN=1:", count_gen_after)

Languages with ANY of the modifiers before the noun (Adj/Dem/Num): 826
… of which POSSESSOR_BEFORE_NOUN=1: 469
… of which POSSESSOR_AFTER_NOUN=1: 288


Only about 56 percent follow this universal.

**Interpretation of Greenberg Rule 20**

Only about half of the languages with modifiers before the noun also place the genitive before the noun. This means the universal works poorly. Genitive placement may be influenced by other grammatical factors that do not affect adjectives or numerals.


## Greenberg Rule 21
When any or all of the modifiers follow the noun, the genitive almost always follows.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: at least one modifier (Adj/Dem/Num) AFTER the noun
df_filtered = df[(
    (df["S_ADJECTIVE_AFTER_NOUN"] == 1) |
    (df["S_DEMONSTRATIVE_WORD_AFTER_NOUN"] == 1) |
    (df["S_NUMERAL_AFTER_NOUN"] == 1)
)]

# consequent: genitive (possessor) position in those languages
count_gen_before = (df_filtered["S_POSSESSOR_BEFORE_NOUN"] == 1).sum()
count_gen_after  = (df_filtered["S_POSSESSOR_AFTER_NOUN"] == 1).sum()

# the label describes the filter that was actually applied (after, not before)
print("Languages with ANY of the modifiers after the noun (Adj/Dem/Num):", len(df_filtered))
print("… of which POSSESSOR_BEFORE_NOUN=1:", count_gen_before)
print("… of which POSSESSOR_AFTER_NOUN=1:", count_gen_after)

Languages with ANY of the modifiers before the noun (Adj/Dem/Num): 1079
… of which POSSESSOR_BEFORE_NOUN=1: 483
… of which POSSESSOR_AFTER_NOUN=1: 475


Only about 44 percent follow this universal.

**Interpretation of Greenberg Rule 21**

This rule also shows weak support. Genitives follow the noun only in about 44% of the relevant languages. Genitive order seems to be more flexible across languages than modifier order.


## Greenberg Rule 22
If in a language the relative clause precedes the noun, the language is postpositional; if it follows, the language is prepositional.

In [None]:
df = all_syntax_features_df

# adposition columns shared by both halves of the rule
prep_col = "S_ADPOSITION_BEFORE_NOUN"
post_col = "S_ADPOSITION_AFTER_NOUN"

# first half: relative clause precedes the noun
df_rel_before = df.loc[df["S_RELATIVE_BEFORE_NOUN"].eq(1)]

count_before_pre  = df_rel_before[prep_col].eq(1).sum()
count_before_post = df_rel_before[post_col].eq(1).sum()

print("Languages with RELATIVE_BEFORE_NOUN=1:", df_rel_before.shape[0])
print("… of which ADPOSITION_BEFORE_NOUN=1 (prepositional):", count_before_pre)
print("… of which ADPOSITION_AFTER_NOUN=1  (postpositional):", count_before_post)


# second half: relative clause follows the noun
df_rel_after = df.loc[df["S_RELATIVE_AFTER_NOUN"].eq(1)]

count_after_pre  = df_rel_after[prep_col].eq(1).sum()
count_after_post = df_rel_after[post_col].eq(1).sum()

print("\nLanguages with RELATIVE_AFTER_NOUN=1:", df_rel_after.shape[0])
print("… of which ADPOSITION_BEFORE_NOUN=1 (prepositional):", count_after_pre)
print("… of which ADPOSITION_AFTER_NOUN=1  (postpositional):", count_after_post)


Languages with RELATIVE_BEFORE_NOUN=1: 179
… of which ADPOSITION_BEFORE_NOUN=1 (prepositional): 13
… of which ADPOSITION_AFTER_NOUN=1  (postpositional): 141

Languages with RELATIVE_AFTER_NOUN=1: 607
… of which ADPOSITION_BEFORE_NOUN=1 (prepositional): 388
… of which ADPOSITION_AFTER_NOUN=1  (postpositional): 169


About 78 % of languages where the relative clause precedes the noun and about 63 % where the relative clause follows the noun follow this rule.

**Interpretation of Greenberg Rule 22**

The rule shows good support: most languages with relative clauses before the noun are postpositional, and most with relative clauses after the noun are prepositional. This suggests that clause–noun order and adposition type interact strongly.


## Greenberg Rule 23
If in a language the verb precedes the object, the adjective likewise precedes the noun.

In [None]:
df = all_syntax_features_df

# verb-before-object (VO) languages: any of the orders SVO, VSO or VOS is set
vo_order_cols = ["S_SVO", "S_VSO", "S_VOS"]
mask_vo = df[vo_order_cols].eq(1).any(axis=1)

df_vo = df.loc[mask_vo]

# adjective position within the VO languages
count_adj_before = df_vo["S_ADJECTIVE_BEFORE_NOUN"].eq(1).sum()
count_adj_after  = df_vo["S_ADJECTIVE_AFTER_NOUN"].eq(1).sum()

print("Languages where VERB precedes OBJECT (VO):", df_vo.shape[0])
print("… of which ADJECTIVE_BEFORE_NOUN=1:", count_adj_before)
print("… of which ADJECTIVE_AFTER_NOUN=1:", count_adj_after)

Languages where VERB precedes OBJECT (VO): 661
… of which ADJECTIVE_BEFORE_NOUN=1: 146
… of which ADJECTIVE_AFTER_NOUN=1: 483


Only 22 % of languages follow this rule.

**Interpretation of Greenberg Rule 23**

Only about 22% of the languages place adjectives before nouns. This means the universal does not hold well. Adjective placement in the languages seems to follow different patterns that are not tied to verb–object order.


## Greenberg Rule 24
If in a language the verb follows the object, the adjective likewise follows the noun.

In [None]:
df = all_syntax_features_df

# object-before-verb (OV) languages: any of the orders SOV, OSV or OVS is set
mask_ov = (
    (df["S_SOV"] == 1) |
    (df["S_OSV"] == 1) |
    (df["S_OVS"] == 1)
)

df_ov = df[mask_ov]

# Both counts must come from the OV subset. Counting "before" on df_vo (the VO
# subset built in the Rule 23 cell) reported the wrong languages and made this
# cell depend on that cell having been run first.
count_adj_before = (df_ov["S_ADJECTIVE_BEFORE_NOUN"] == 1).sum()
count_adj_after  = (df_ov["S_ADJECTIVE_AFTER_NOUN"] == 1).sum()

print("Languages where VERB follows the OBJECT (OV):", len(df_ov))
print("… of which ADJECTIVE_BEFORE_NOUN=1:", count_adj_before)
print("… of which ADJECTIVE_AFTER_NOUN=1:", count_adj_after)

Languages where VERB follows the OBJECT (OV): 587
… of which ADJECTIVE_BEFORE_NOUN=1: 146
… of which ADJECTIVE_AFTER_NOUN=1: 308


52 % of languages follow this rule.

**Interpretation of Greenberg Rule 24**

About half of the OV languages put adjectives after nouns. The universal shows moderate support but not a strong tendency. OV structure alone is not enough to predict adjective placement reliably.


## Greenberg Rule 25
If a language has dominant order VSO, it always has prepositions.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: dominant VSO order
df_vso = df[df["S_VSO"] == 1]

count_vso = len(df_vso)
count_pre  = (df_vso["S_ADPOSITION_BEFORE_NOUN"] == 1).sum()
count_post = (df_vso["S_ADPOSITION_AFTER_NOUN"] == 1).sum()

# A violation needs positive evidence: the preposition feature is recorded as 0.
# Using count_vso - count_pre would also count languages whose adposition data
# is missing (NaN), which says nothing about whether the rule holds for them.
count_not_pre = (df_vso["S_ADPOSITION_BEFORE_NOUN"] == 0).sum()

print("Languages with VSO=1:", count_vso)
print("… of which ADPOSITION_BEFORE_NOUN=1 (prepositional):", count_pre)
print("… of which ADPOSITION_AFTER_NOUN=1  (postpositional):", count_post)
print("… potential violations (VSO but NOT prepositional):", count_not_pre)


Languages with VSO=1: 118
… of which ADPOSITION_BEFORE_NOUN=1 (prepositional): 99
… of which ADPOSITION_AFTER_NOUN=1  (postpositional): 9
… potential violations (VSO but NOT prepositional): 19


**Interpretation of Greenberg Rule 25**

Most VSO languages use prepositions, but there are also some postpositional VSO languages. Again, this shows a strong preference but not an absolute rule. The mismatches show how diverse language structures can be.


## Greenberg Rule 26
If a language has dominant order SOV, it generally has postpositions.

In [None]:
df = all_syntax_features_df

# antecedent of the rule: dominant SOV order
is_sov = df["S_SOV"].eq(1)
df_sov = df.loc[is_sov]

count_sov = df_sov.shape[0]
count_pre  = df_sov["S_ADPOSITION_BEFORE_NOUN"].eq(1).sum()
count_post = df_sov["S_ADPOSITION_AFTER_NOUN"].eq(1).sum()

print("Languages with SOV=1:", count_sov)
print("… of which ADPOSITION_BEFORE_NOUN=1 (prepositional):", count_pre)
print("… of which ADPOSITION_AFTER_NOUN=1  (postpositional):", count_post)
# every prepositional SOV language is reported as a potential violation
print("… potential violations (SOV but prepositional):", count_pre)


Languages with SOV=1: 572
… of which ADPOSITION_BEFORE_NOUN=1 (prepositional): 34
… of which ADPOSITION_AFTER_NOUN=1  (postpositional): 398
… potential violations (SOV but prepositional): 34


**Interpretation of Greenberg Rule 26**

SOV languages strongly prefer postpositions, which fits with the results from Rule 4. This is one of the clearest patterns in the dataset. The few exceptions may be due to language contact or categorization differences.


## Greenberg Rule 41
If in a language the verb follows both the nominal subject and nominal object as the dominant order, the language almost always has a case system.

In [None]:
df = all_syntax_features_df

# Languages where the verb follows both nominal subject and object, i.e. SOV or OSV
mask_verb_final = (
    (df["S_SOV"] == 1) |
    (df["S_OSV"] == 1)
)

df_vf = df[mask_verb_final]

# Approximate "has a case system" using multiple possible WALS indicators:
case_cols = [
    "S_CASE_MARK",
    "S_CASE_PREFIX",
    "S_CASE_SUFFIX",
    "S_CASE_PROCLITIC",
    "S_CASE_ENCLITIC",
]
case_values = df_vf[case_cols]

# has case: at least one indicator is set
mask_has_case = case_values.eq(1).any(axis=1)
# case data available: at least one indicator is not missing; languages with
# no case data at all must not be counted as "no case system"
mask_case_known = case_values.notna().any(axis=1)

count_total = len(df_vf)
count_known = int(mask_case_known.sum())
count_case = int(mask_has_case.sum())
count_no_case = count_known - count_case

print("Languages with SOV or OSV (verb follows both NP-subject and NP-object):", count_total)
print("… of which have case data:", count_known)
print("… of which HAVE a case system:", count_case)
print("… of which DO NOT have a case system:", count_no_case)
# avoid ZeroDivisionError when no verb-final language has case data
if count_known:
    print("→ proportion with case =", round(count_case / count_known, 3))
else:
    print("→ proportion with case = n/a (no verb-final language has case data)")


KeyError: 'CASES'