In [2]:
import lang2vec.lang2vec as l2v

from greenberg_pipeline import (
    features_to_df,
    run_rule_from_lang2vec,
    plot_macro_summary_bar,
    plot_rule_geomap
)
from rules_greenberg import (
    RULE20_FEATURES_NEEDED, evaluate_rule20_row,
    RULE19_FEATURES_NEEDED, evaluate_rule19_row,
    RULE21_FEATURES_NEEDED, evaluate_rule21_row,
    RULE23_FEATURES_NEEDED, evaluate_rule23_row,
    RULE24_FEATURES_NEEDED, evaluate_rule24_row,
    RULE41_FEATURES_NEEDED, evaluate_rule41_row
)

  import pkg_resources


## Load WALS features

In [3]:
# Collect every language that lang2vec reports as available.
languages = list(l2v.available_languages())
# Request the "syntax_wals" feature set for all of those languages.
# NOTE(review): header=True presumably includes the feature names in the result — confirm against lang2vec docs.
all_syntax_features_dict = l2v.get_features(languages, "syntax_wals", header=True)
# Convert the lang2vec result into the table that each rule cell passes to
# run_rule_from_lang2vec as `all_syntax_features_df`.
all_syntax_features_df = features_to_df(all_syntax_features_dict)

  df = df.replace("--", np.nan)


## Greenberg Rule 19
When the descriptive adjective follows the noun, the demonstrative and the numeral likewise follow.

In [4]:
# Evaluate Greenberg Rule 19 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results19 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 19",
    rule_fn=evaluate_rule19_row,
    rule_features_needed=RULE19_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results19["coverage"],
    results19["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                648.000000
 testable_fraction                            0.161798
 languages_with_geo_metadata_after_join     610.000000
 geo_join_fraction_of_testable                0.941358
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 2        Eurasia           95            85        0.685484
 4      Papunesia          189            72        0.375000
 3  North America           54            52        0.881356
 5  South America           43            41        0.953488
 0         Africa          199            27        0.129808
 1      Australia           30            14        0.437500)

In [5]:
# Bar charts of the Rule 19 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results19["macro_summary"], "Greenberg Rule 19", y_col="n_violations")
plot_macro_summary_bar(results19["macro_summary"], "Greenberg Rule 19", y_col="violation_rate")

In [6]:
# Geographic view of the Rule 19 results. Hover text combines the basic
# language identifiers with the columns listed in RULE19_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 19",
    df_geo=results19["rule_geo"],
    extra_hover_cols=RULE19_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)

## Greenberg Rule 20
When any or all of the modifiers precede the noun, the genitive almost always precedes.

In [7]:
# Evaluate Greenberg Rule 20 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results20 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 20",
    rule_fn=evaluate_rule20_row,
    rule_features_needed=RULE20_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results20["coverage"],
    results20["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                699.000000
 testable_fraction                            0.174532
 languages_with_geo_metadata_after_join     658.000000
 geo_join_fraction_of_testable                0.941345
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 4      Papunesia          125            72        0.558140
 2        Eurasia          216            67        0.248148
 0         Africa           82            52        0.590909
 3  North America          107            42        0.358974
 5  South America           88            10        0.113636
 1      Australia           40             1        0.023810)

In [8]:
# Bar charts of the Rule 20 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results20["macro_summary"], "Greenberg Rule 20", y_col="n_violations")
plot_macro_summary_bar(results20["macro_summary"], "Greenberg Rule 20", y_col="violation_rate")

In [9]:
# Geographic view of the Rule 20 results. Hover text combines the basic
# language identifiers with the columns listed in RULE20_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 20",
    df_geo=results20["rule_geo"],
    extra_hover_cols=RULE20_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)

## Greenberg Rule 21
When any or all of the modifiers follow the noun, the genitive almost always follows.

In [10]:
# Evaluate Greenberg Rule 21 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results21 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 21",
    rule_fn=evaluate_rule21_row,
    rule_features_needed=RULE21_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results21["coverage"],
    results21["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                888.000000
 testable_fraction                            0.221723
 languages_with_geo_metadata_after_join     840.000000
 geo_join_fraction_of_testable                0.945946
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 4      Papunesia          255           136        0.523077
 2        Eurasia          111            95        0.616883
 0         Africa          275            68        0.236111
 5  South America           78            65        0.833333
 3  North America           75            50        0.609756
 1      Australia           46            21        0.428571)

In [11]:
# Bar charts of the Rule 21 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results21["macro_summary"], "Greenberg Rule 21", y_col="n_violations")
plot_macro_summary_bar(results21["macro_summary"], "Greenberg Rule 21", y_col="violation_rate")

In [12]:
# Geographic view of the Rule 21 results. Hover text combines the basic
# language identifiers with the columns listed in RULE21_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 21",
    df_geo=results21["rule_geo"],
    extra_hover_cols=RULE21_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)

## Greenberg Rule 23
If in a language the verb precedes the object, the adjective likewise precedes the noun.

In [13]:
# Evaluate Greenberg Rule 23 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results23 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 23",
    rule_fn=evaluate_rule23_row,
    rule_features_needed=RULE23_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results23["coverage"],
    results23["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                594.000000
 testable_fraction                            0.148315
 languages_with_geo_metadata_after_join     553.000000
 geo_join_fraction_of_testable                0.930976
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 0         Africa          236           215        0.870445
 4      Papunesia          150           130        0.855263
 2        Eurasia           79            63        0.594340
 3  North America           47            24        0.444444
 5  South America           30            19        0.633333
 1      Australia           11             1        0.090909)

In [14]:
# Bar charts of the Rule 23 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results23["macro_summary"], "Greenberg Rule 23", y_col="n_violations")
plot_macro_summary_bar(results23["macro_summary"], "Greenberg Rule 23", y_col="violation_rate")

In [15]:
# Geographic view of the Rule 23 results. Hover text combines the basic
# language identifiers with the columns listed in RULE23_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 23",
    df_geo=results23["rule_geo"],
    extra_hover_cols=RULE23_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)

## Greenberg Rule 24
If in a language the verb follows the object, the adjective likewise follows the noun.

In [16]:
# Evaluate Greenberg Rule 24 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results24 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 24",
    rule_fn=evaluate_rule24_row,
    rule_features_needed=RULE24_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results24["coverage"],
    results24["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                494.000000
 testable_fraction                            0.123346
 languages_with_geo_metadata_after_join     468.000000
 geo_join_fraction_of_testable                0.947368
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 2        Eurasia          162           131        0.642157
 4      Papunesia          106            19        0.174312
 0         Africa           68            18        0.246575
 5  South America           58            16        0.275862
 3  North America           44            14        0.285714
 1      Australia           30             2        0.062500)

In [17]:
# Bar charts of the Rule 24 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results24["macro_summary"], "Greenberg Rule 24", y_col="n_violations")
plot_macro_summary_bar(results24["macro_summary"], "Greenberg Rule 24", y_col="violation_rate")

In [18]:
# Geographic view of the Rule 24 results. Hover text combines the basic
# language identifiers with the columns listed in RULE24_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 24",
    df_geo=results24["rule_geo"],
    extra_hover_cols=RULE24_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)

## Greenberg Rule 41
If in a language the verb follows both the nominal subject and nominal object as the dominant order, the language almost always has a case system.

In [19]:
# Evaluate Greenberg Rule 41 over the lang2vec syntax table. The returned
# mapping is indexed below by "coverage" and "macro_summary" (and by
# "rule_geo" in the map cell).
results41 = run_rule_from_lang2vec(
    rule_id="Greenberg Rule 41",
    rule_fn=evaluate_rule41_row,
    rule_features_needed=RULE41_FEATURES_NEEDED,
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
)

# Show the coverage statistics next to the per-macro-area summary.
(
    results41["coverage"],
    results41["macro_summary"],
)

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                422.000000
 testable_fraction                            0.105368
 languages_with_geo_metadata_after_join     396.000000
 geo_join_fraction_of_testable                0.938389
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 2        Eurasia          140            26        0.146893
 4      Papunesia           85            24        0.279070
 0         Africa           48            21        0.420000
 3  North America           52            10        0.172414
 5  South America           46            10        0.217391
 1      Australia           25             1        0.038462)

In [20]:
# Bar charts of the Rule 41 macro-area summary: first the raw violation
# counts ("n_violations"), then the per-area rate ("violation_rate").
plot_macro_summary_bar(results41["macro_summary"], "Greenberg Rule 41", y_col="n_violations")
plot_macro_summary_bar(results41["macro_summary"], "Greenberg Rule 41", y_col="violation_rate")

In [21]:
# Geographic view of the Rule 41 results. Hover text combines the basic
# language identifiers with the columns listed in RULE41_FEATURES_NEEDED.
plot_rule_geomap(
    rule_id="Greenberg Rule 41",
    df_geo=results41["rule_geo"],
    extra_hover_cols=RULE41_FEATURES_NEEDED,
    base_hover_cols=["lang_code", "macro_area", "family"],
)