In [13]:
import lang2vec.lang2vec as l2v
import pandas as pd
import numpy as np

In [None]:
languages = [
    "eng",
    "deu",
    "spa",
    "rus",
    "jpn",
    "hin",
    "tur",
    "ara",
    "por",
    "ita",
  ]

# iso_code -> feature_vector
features_dict = l2v.get_features(languages, "syntax_wals", header=True)

# extract feature names from the special 'CODE' entry
feature_names = features_dict["CODE"]

# build a matrix for all languages
# missing values are encoded as the string '--'
rows = []
valid_langs = []

for lang in languages:
    # some languages might not be available; skip if not present
    if lang not in features_dict:
        print(f"[warning] language {lang} not available, skipping.")
        continue

    rows.append(features_dict[lang])
    valid_langs.append(lang)

# each row is a language, each column is a wals feature
df = pd.DataFrame(rows, index=valid_langs, columns=feature_names)

# replace the missing marker '--' with np.nan so pandas can handle it
df = df.replace("--", np.nan)

# convert numeric strings to floats
df = df.apply(pd.to_numeric, errors="ignore")

# load wals syntax features into a dataframe
print("shape:", df.shape)
print("first few rows:")
print(df.head())


shape: (10, 103)
first few rows:
     S_SVO  S_SOV  S_VSO  S_VOS  S_OVS  S_OSV  S_SUBJECT_BEFORE_VERB  \
eng    1.0    0.0    0.0    0.0    0.0    0.0                    1.0   
deu    1.0    1.0    0.0    0.0    0.0    0.0                    1.0   
spa    1.0    0.0    0.0    0.0    0.0    0.0                    1.0   
rus    1.0    0.0    0.0    0.0    0.0    0.0                    1.0   
jpn    0.0    1.0    0.0    0.0    0.0    0.0                    1.0   

     S_SUBJECT_AFTER_VERB  S_OBJECT_AFTER_VERB  S_OBJECT_BEFORE_VERB  ...  \
eng                   0.0                  1.0                   0.0  ...   
deu                   0.0                  1.0                   1.0  ...   
spa                   1.0                  1.0                   0.0  ...   
rus                   0.0                  1.0                   0.0  ...   
jpn                   0.0                  0.0                   1.0  ...   

     S_XVO  S_XOV  S_OXV  S_OVX  S_OBLIQUE_AFTER_VERB  S_OBLIQUE_AFTER_

  df = df.replace("--", np.nan)
  df = df.apply(pd.to_numeric, errors="ignore")
