In [7]:
import lang2vec.lang2vec as l2v
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


### 1. Fetch WALS syntax features

In [8]:
# ISO codes of the languages to compare
languages = [
    "eng",
    "deu",
    "spa",
    "rus",
    "jpn",
    "hin",
    "tur",
    "ara",
    "por",
    "ita",
]

# iso_code -> feature_vector; the feature names are read from the "CODE" entry
features_dict = l2v.get_features(languages, "syntax_wals", header=True)

# extract feature names as plain Python strings so that column labels print
# as 'S_SVO' rather than np.str_('S_SVO')
feature_names = [str(name) for name in features_dict["CODE"]]

# collect one feature vector per available language, in the order of `languages`
rows = []
valid_langs = []

for lang in languages:
    if lang not in features_dict:
        print(f"[warning] language {lang} not available, skipping.")
        continue

    rows.append(features_dict[lang])
    valid_langs.append(lang)

# each row is a language, each column is a wals feature
all_syntax_features_df = (
    pd.DataFrame(rows, index=valid_langs, columns=feature_names)
    # missing values are encoded as the string '--'; turn them into NaN
    .replace("--", np.nan)
    # Make the numeric dtype explicit. Columns that contained '--' start out as
    # object dtype, and relying on replace() to downcast them to float is
    # deprecated pandas behaviour (the warning is hidden by the global
    # warnings filter), so the columns could otherwise stay object-typed.
    .astype(float)
)

print("shape:", all_syntax_features_df.shape)
display(all_syntax_features_df.head())


shape: (10, 103)


Unnamed: 0,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,S_OSV,S_SUBJECT_BEFORE_VERB,S_SUBJECT_AFTER_VERB,S_OBJECT_AFTER_VERB,S_OBJECT_BEFORE_VERB,...,S_XVO,S_XOV,S_OXV,S_OVX,S_OBLIQUE_AFTER_VERB,S_OBLIQUE_AFTER_OBJECT,S_OBLIQUE_BEFORE_VERB,S_OBLIQUE_BEFORE_OBJECT,S_ARTICLE_WORD_BEFORE_NOUN,S_ARTICLE_WORD_AFTER_NOUN
eng,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,
deu,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
spa,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,
rus,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,,,,,,,,,,
jpn,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,,,,,,,,,,


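Select only the columns related to WALS features 49A (number of cases), 50A (asymmetrical case marking), 81A (basic word order), and 85A–90A (order of adposition, genitive, adjective, demonstrative, numeral, and relative clause with respect to the noun).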

In [9]:
def select_cols(df, keywords):
    """Return the column labels of `df` that contain at least one keyword.

    Args:
        df: DataFrame whose column labels are searched.
        keywords: substrings to look for in each column label.

    Returns:
        A list of matching column labels, in the order they appear in
        `df.columns`. A label is listed once even if several keywords match.
    """
    matched = []
    for column in df.columns:
        for keyword in keywords:
            if keyword in column:
                matched.append(column)
                # one hit is enough; move on to the next column
                break
    return matched

# one column group per WALS feature: every column whose name contains one of
# the listed substrings
cols_49A = select_cols(all_syntax_features_df, ["CASE", "CASES"]) # 49A number of cases
cols_50A = select_cols(all_syntax_features_df, ["ASYMMETRIC"]) # 50A asymmetrical case marking
cols_81A = select_cols(all_syntax_features_df, ["SVO", "SOV", "VSO", "VOS", "OVS", "OSV"]) # 81A basic order
cols_85A = select_cols(all_syntax_features_df, ["ADPOSITION", "ADP"]) # 85A adposition + np
cols_86A = select_cols(all_syntax_features_df, ["GENITIVE"]) # 86A genitive + noun
cols_87A = select_cols(all_syntax_features_df, ["ADJECTIVE"]) # 87A adjective + noun
cols_88A = select_cols(all_syntax_features_df, ["DEMONSTRATIVE"]) # 88A demonstrative + noun
cols_89A = select_cols(all_syntax_features_df, ["NUMERAL"]) # 89A numeral + noun
cols_90A = select_cols(all_syntax_features_df, ["RELATIVE"]) # 90A relative clause + noun

feature_groups = {
    "49A": cols_49A,
    "50A": cols_50A,
    "81A": cols_81A,
    "85A": cols_85A,
    "86A": cols_86A,
    "87A": cols_87A,
    "88A": cols_88A,
    "89A": cols_89A,
    "90A": cols_90A,
}

# a group whose substrings match no column would otherwise vanish from the
# selection without any trace, so report it
for feature_id, group_cols in feature_groups.items():
    if not group_cols:
        print(f"[warning] no columns matched for WALS feature {feature_id}.")

# flatten everything into one list of interesting columns; dict.fromkeys drops
# repeats while keeping first-seen order, so a column that matches more than
# one group does not end up as a duplicated column label
interesting_cols = list(dict.fromkeys(
    col for group_cols in feature_groups.values() for col in group_cols
))

# keep only these columns in a new dataframe
focus_syntax_features_df = all_syntax_features_df[interesting_cols].copy()

print("shape (focused on 49A, 50A, 81A, 85A–90A):", focus_syntax_features_df.shape)
print("first few columns:", focus_syntax_features_df.columns[:10].tolist())
display(focus_syntax_features_df.head())


shape (focused on 49A, 50A, 81A, 85A–90A): (10, 28)
first few columns: [np.str_('S_CASE_PREFIX'), np.str_('S_CASE_SUFFIX'), np.str_('S_CASE_PROCLITIC'), np.str_('S_CASE_ENCLITIC'), np.str_('S_CASE_MARK'), np.str_('S_SVO'), np.str_('S_SOV'), np.str_('S_VSO'), np.str_('S_VOS'), np.str_('S_OVS')]


Unnamed: 0,S_CASE_PREFIX,S_CASE_SUFFIX,S_CASE_PROCLITIC,S_CASE_ENCLITIC,S_CASE_MARK,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,...,S_ANY_AGREEMENT_ON_ADJECTIVES,S_DEMONSTRATIVE_WORD_BEFORE_NOUN,S_DEMONSTRATIVE_WORD_AFTER_NOUN,S_DEMONSTRATIVE_PREFIX,S_DEMONSTRATIVE_SUFFIX,S_NUMERAL_BEFORE_NOUN,S_NUMERAL_AFTER_NOUN,S_RELATIVE_BEFORE_NOUN,S_RELATIVE_AFTER_NOUN,S_RELATIVE_AROUND_NOUN
eng,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
deu,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
spa,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
rus,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
jpn,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
