In [7]:
import lang2vec.lang2vec as l2v

from greenberg_pipeline import (
    features_to_df,
    run_rule_from_lang2vec,
    plot_macro_summary_bar,
    plot_rule_geomap,
)
from rules_greenberg import RULE20_FEATURES_NEEDED, evaluate_rule20_row

## Load WALS features

In [8]:
# Ask lang2vec for every language it exposes, fetch the "syntax_wals" feature
# set for all of them (header=True is forwarded to lang2vec unchanged), and hand
# the raw result to the project helper `features_to_df`.
# NOTE(review): the recorded output of this cell shows a pandas FutureWarning
# about silent downcasting in `replace`; it presumably originates inside
# `features_to_df` -- confirm and address it there.
languages = [*l2v.available_languages()]
all_syntax_features_dict = l2v.get_features(
    languages,
    "syntax_wals",
    header=True,
)
all_syntax_features_df = features_to_df(all_syntax_features_dict)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



## Greenberg Rule 20

In [9]:
# Run the Greenberg Rule 20 check over the lang2vec syntax feature table.
# `run_rule_from_lang2vec` is a project helper; the keys read below
# ("coverage", "macro_summary") and in later cells ("rule_geo") are the parts
# of its result this notebook relies on.
results20 = run_rule_from_lang2vec(
    all_syntax_features_df=all_syntax_features_df,
    wals_languages_csv_path="wals_languages.csv",
    rule_id="Greenberg Rule 20",
    rule_features_needed=RULE20_FEATURES_NEEDED,
    rule_fn=evaluate_rule20_row,
)

# Show both summaries with Jupyter's rich display. Ending the cell with a bare
# tuple makes the notebook fall back to the tuple's plain-text repr, so neither
# object gets its normal rendering. `display` is provided by the IPython kernel
# without an explicit import.
# NOTE(review): in the previously recorded output, `violation_rate` matches
# n_violations / n_languages only for South America (10 / 88); e.g. Papunesia
# lists 72 violations over 125 languages (~0.576) but a rate of 0.558. Confirm
# which denominator the pipeline uses before interpreting the rates.
display(results20["coverage"], results20["macro_summary"])

(total_languages_in_lang2vec_syntax_df     4005.000000
 languages_testable_for_rule                699.000000
 testable_fraction                            0.174532
 languages_with_geo_metadata_after_join     658.000000
 geo_join_fraction_of_testable                0.941345
 dtype: float64,
       macro_area  n_languages  n_violations  violation_rate
 4      Papunesia          125            72        0.558140
 2        Eurasia          216            67        0.248148
 0         Africa           82            52        0.590909
 3  North America          107            42        0.358974
 5  South America           88            10        0.113636
 1      Australia           40             1        0.023810)

In [10]:
# Plot the per-macro-area summary twice: first the absolute number of
# violations, then the violation rate. The two calls stay as separate
# statements so the final call remains the cell's last expression.
rule20_label = "Greenberg Rule 20"
rule20_macro_summary = results20["macro_summary"]

plot_macro_summary_bar(rule20_macro_summary, rule20_label, y_col="n_violations")
plot_macro_summary_bar(rule20_macro_summary, rule20_label, y_col="violation_rate")

In [11]:
# Map view of the Rule 20 results. Hover text combines three fixed metadata
# columns with the feature columns listed in RULE20_FEATURES_NEEDED.
rule20_base_hover_cols = ["lang_code", "macro_area", "family"]

plot_rule_geomap(
    rule_id="Greenberg Rule 20",
    df_geo=results20["rule_geo"],
    extra_hover_cols=RULE20_FEATURES_NEEDED,
    base_hover_cols=rule20_base_hover_cols,
)