In [None]:
import json
from itertools import chain, combinations

import pandas as pd

In [None]:
def powerset(iterable):
    """Return every subset of the distinct elements of ``iterable``.

    Duplicates are dropped first. Each subset is a tuple in ascending
    order, and subsets are listed by increasing size and, within one size,
    in lexicographic order. The result is therefore identical on every run;
    iterating a raw ``set`` of strings is not, because string hashing is
    randomized per process.

    Args:
        iterable: Hashable, mutually orderable items.

    Returns:
        list[tuple]: All ``2 ** k`` subsets of the ``k`` distinct items,
        starting with the empty tuple ``()``.
    """
    # Sorting once up front means every combination is emitted already
    # sorted, so no per-tuple sort is needed.
    items = sorted(set(iterable))
    return list(
        chain.from_iterable(combinations(items, size) for size in range(len(items) + 1))
    )

In [None]:
def get_natural_classes(p, natural_classes):
    """Collect the natural classes that contain ``p``.

    Membership is decided with the ``in`` operator on each class. When a
    class is a ``str``, that is a substring test rather than an element test.

    Args:
        p: The phoneme to look for.
        natural_classes: Iterable of hashable containers, one per class.

    Returns:
        set: The classes from ``natural_classes`` for which ``p in cls`` holds.
    """
    return {candidate for candidate in natural_classes if p in candidate}

In [None]:
# Read the tab-separated feature table, then keep only the data region:
# the first row, the last two rows and the first column are discarded.
# NOTE(review): presumably those are non-phoneme header/footer rows and an
# index-like column in "table.fea" — confirm against the file.
feature_table = (
    pd.read_csv("table.fea", sep="\t")
    .iloc[1:-2, 1:]
    .reset_index(drop=True)  # renumber rows from 0 after the slice
)

In [None]:
# Map each phoneme (column 0 of its row) to a sorted tuple of
# "<feature name><value>" strings built from the remaining columns.
# Values are cast to int; features with a negative value are left out.
# NOTE(review): negative values presumably mark unspecified features — confirm.
phoneme2features = {}
for row_number in range(len(feature_table)):
    phoneme = feature_table.iloc[row_number, 0]
    feature_values = feature_table.iloc[row_number, 1:].astype(int)
    specified = [
        f"{feature}{value}" for feature, value in feature_values.items() if value >= 0
    ]
    specified.sort()
    phoneme2features[phoneme] = tuple(specified)

In [None]:
# Every subset of every phoneme's feature tuple, de-duplicated across phonemes.
feature_combinations = {
    combination
    for phoneme_features in phoneme2features.values()
    for combination in powerset(phoneme_features)
}

In [None]:
# A natural class is the set of phonemes that carry every feature in one
# feature combination. Each class is stored as a sorted tuple of phoneme
# symbols, not a concatenated string: with multi-character symbols a joined
# string is ambiguous (("t", "ʃ") and ("tʃ",) would both become "tʃ") and a
# later `phoneme in natural_class` check would turn into a substring match.
# With single-character symbols the resulting classes are equivalent.

# Convert each phoneme's feature tuple to a set once, not once per combination.
phoneme_feature_sets = {
    phoneme: set(features) for phoneme, features in phoneme2features.items()
}

natural_classes = set()
for feature_combination in feature_combinations:
    required_features = set(feature_combination)
    members = sorted(
        phoneme
        for phoneme, features in phoneme_feature_sets.items()
        if required_features <= features
    )
    natural_classes.add(tuple(members))

In [None]:
# Distance between two phonemes = Jaccard distance between the sets of
# natural classes each belongs to: 1 - |shared classes| / |all classes of either|.
# Every ordered pair of distinct phonemes gets an entry keyed "<p_i>_<p_j>".

# The classes of a phoneme do not depend on its partner, so look them up once
# per phoneme instead of twice per pair.
phoneme_natural_classes = {
    phoneme: get_natural_classes(phoneme, natural_classes) for phoneme in phoneme2features
}

distances = {}
for p_i, p_i_ncs in phoneme_natural_classes.items():
    for p_j, p_j_ncs in phoneme_natural_classes.items():
        if p_i == p_j:
            continue
        shared_count = len(p_i_ncs & p_j_ncs)
        total_count = len(p_i_ncs | p_j_ncs)
        distances[f"{p_i}_{p_j}"] = 1 - (shared_count / total_count)

In [None]:
# Persist the pairwise distances as pretty-printed JSON with keys in sorted order.
serialized_distances = json.dumps(distances, indent=4, sort_keys=True)
with open("feature_distances.json", "w") as output_file:
    output_file.write(serialized_distances)