<a href="https://colab.research.google.com/github/xh313/natural-class-selector/blob/main/natural_class_selector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Run this before anything
import pandas as pd
import matplotlib.pyplot as plt
import regex as re

def selector(df, features=None):
  """
  Narrow a feature chart down to the phonemes matching the given features.

  The features are applied one at a time, in the dictionary's order, and a
  progress line is printed for each of them.

  Parameters:
    df: feature chart, a DataFrame whose index holds the phonemes and whose
      columns are the features.
    features: dictionary mapping a feature (column) name to the value a
      phoneme must have for it, e.g. {'consonantal': '+'}. None or an
      empty dictionary applies no filter at all.

  Returns:
    The list of index labels left once every feature has been applied, or
    None as soon as one feature matches none of the remaining phonemes.

  Raises:
    KeyError: if a feature name is not a column of df.
  """
  if features is None:
    features = {}

  for key, value in features.items():
    # Build the filtered chart once and reuse it for both checks below.
    matching = df[df[key] == value]

    if len(matching) == len(df):
      # Every remaining phoneme already has this value: nothing to narrow.
      print(f'The feature {value}{key} is redundant.')
    elif matching.empty:
      print(f'The feature {value}{key} returned no phonemes.')
      return None
    else:
      df = matching
      print(f'Applied {value}{key}: {list(df.index)}')

  print('\n')
  return list(df.index)


# Minimally sufficient features to select
def minimally_sufficient(df, phones='s'):
  """
  Find, for each phoneme, a set of feature values that singles it out.

  The columns of df are visited once, in order. A feature is kept when the
  phoneme's value for it is not 0 and filtering on that value removes at
  least one of the remaining candidates (a greedy selection, so the result
  depends on the column order).

  Parameters:
    df: feature chart, a DataFrame whose index holds the phonemes and whose
      columns are the features.
    phones: a single phoneme label or a list of labels.

  Returns:
    A dictionary mapping each phoneme to a {feature: value} dictionary.
    Phonemes that cannot be isolated are reported with a printed message
    and left out of the result.

  Raises:
    KeyError: if a phoneme is not in the index of df (unless df has
      exactly one row, in which case nothing is looked up).
  """
  selectors = {}
  if not isinstance(phones, list):
    phones = [phones]  # listify a single phoneme

  for phone in phones:
    remaining = df
    chosen = {}
    try:
      # One pass is enough: after it, every feature is either skipped (0)
      # or shared by all remaining rows, so repeating the pass could never
      # narrow further (the former `while` loop spun forever in that case).
      if len(remaining) != 1:
        for feature in df.columns:
          presence = remaining.loc[phone, feature]
          # NOTE(review): this compares against the integer 0; a chart that
          # stores the unspecified value as the string '0' is not skipped
          # here - confirm against the data.
          # A duplicated index label makes `presence` a Series, whose truth
          # value raises the ValueError handled below.
          if presence != 0:
            narrowed = remaining[remaining[feature] == presence]
            if len(narrowed) != len(remaining):
              chosen[feature] = presence
              remaining = narrowed
    except ValueError:
      print('An ambiguity is present in the dataframe.')
      continue

    if len(remaining) != 1:
      # No combination of features separates this phoneme from the rest.
      lookalikes = [label for label in remaining.index if label != phone]
      print(f'The phoneme {phone} cannot be uniquely selected; '
            f'it is indistinguishable from {lookalikes}.')
      continue

    selectors[phone] = chosen
  return selectors

def view_difference(df, phones):
  """
  Show the features on which phonemes differ from a reference phoneme.

  Parameters:
    df: feature chart, a DataFrame whose index holds the phonemes and whose
      columns are the features.
    phones: a 2-tuple. The first element is the reference phoneme; the
      second is either one phoneme or a list of phonemes to compare with
      it. A list passed in is not modified.

  Returns:
    A DataFrame with one row per feature on which at least one of the
    compared phonemes differs from the reference, and one column per
    phoneme (the reference first, then the others in the given order).
    None, after printing a message, when the comparison cannot be made
    (for instance an unknown phoneme).
  """
  try:
    reference, others = phones[0], phones[1]
    if not isinstance(others, list):
      others = [others]

    chart = df.transpose()  # features as rows, phonemes as columns
    # Compare each phoneme column with the reference row by row and keep
    # the features where any of them disagrees.
    differs = chart[others].ne(chart[reference], axis=0).any(axis=1)
    return chart.loc[differs, [reference, *others]]
  except (IndexError, KeyError, TypeError, ValueError):
    print(f"Unable to detect feature differences for {phones}.")

In [10]:
# Data: read the feature chart and index its rows by the 'phoneme' column.
df = pd.read_csv(
  'https://github.com/xh313/natural-class-selector/raw/main/Features.csv'
).set_index('phoneme')


# Selector example

# Option 1: spell out the wanted feature values by hand.
selector(
  df,
  dict(
    # syllabic='-',
    consonantal='+',
    continuant='-',
    dorsal='+',
    high='-',
  ),
)

# Option 2: reuse a feature dict produced by minimally_sufficient.
#selector(df, minimally_sufficient(df, ['ʃ','s',])['ʃ'])

Applied +consonantal: ['ŋ+', 'ʟ', 'ɫ', 'ɴ', 'ʀ', 'ɲ', 'ʎ', 'ŋ', 'ŋ˗', 'ʟ', 'ʟ̠', 'ɳ', 'ʙ', 'ɭ', 'ɺ', 'ɻ', 'ɽ', 'r', 'n', 'm', 'l', 'ɾ', 'ɱ', 'ʔ', 'ɣ+', 'x+', 'k+', 'ɡ+', 'k+͡x+', 'ɡ+͡ɣ+', 'ħ', 'ʕ', 'ʁ', 'q', 'χ', 'ɢ', 'ɕ', 'ɉ', 'ʝ', 'c', 'ç', 'd͡ʑ', 't͡ɕ', 'ɣ', 'ɣ̠ ', 'x', 'x̠', 'k', 'k̠', 'ɡ', 'ɡ̠', 'ʑ', 'ʈ', 'ɖ', 'ɬ', 'ʐ', 'ɸ', 'ʂ', 'ʒ', 'z', 'v', 't', 'ʃ', 's', 'p', 'f', 'd', 'b', 'θ', 'ɮ', 'ð', 'β', 'd͡ʒ', 'd͡z', 'd͡ɮ', 'd̠͡ɮ̠', 't͡ʃ', 't̠͡ɬ̠', 't͡s', 't͡ɬ', 't̪͡s̪', 't̪͡ɬ̪', 'd̪͡z̪', 'd̪͡ɮ̪', 'ʈ͡ʂ', 'ɖ͡ʐ', 'p͡f', 'b͡v', 'p͡ɸ', 'b͡β', 't̪͡θ', 'd̪͡ð', 'c͡ç', 'ɉ͡ʝ', 'k͡x', 'k̠͡x̠', 'ɡ͡ɣ', 'ɡ̠̠͡ɣ̠', 'q͡χ', 'ɢ͡ʁ', 'ɧ', 'k͡p', 'g͡b', 'p͡t', 'b͡d']
Applied -continuant: ['ŋ+', 'ɴ', 'ɲ', 'ŋ', 'ŋ˗', 'ɳ', 'n', 'm', 'ɱ', 'ʔ', 'k+', 'ɡ+', 'k+͡x+', 'ɡ+͡ɣ+', 'q', 'ɢ', 'ɉ', 'c', 'd͡ʑ', 't͡ɕ', 'k', 'k̠', 'ɡ', 'ɡ̠', 'ʈ', 'ɖ', 't', 'p', 'd', 'b', 'd͡ʒ', 'd͡z', 'd͡ɮ', 'd̠͡ɮ̠', 't͡ʃ', 't̠͡ɬ̠', 't͡s', 't͡ɬ', 't̪͡s̪', 't̪͡ɬ̪', 'd̪͡z̪', 'd̪͡ɮ̪', 'ʈ͡ʂ', 'ɖ͡ʐ', 'p͡f', 'b͡v', 'p͡ɸ', 'b͡β', 't̪͡θ', 'd̪͡ð', '

['ɴ', 'q', 'ɢ', 'q͡χ', 'ɢ͡ʁ']

In [4]:
# Minimally sufficient selectors: for each listed phoneme, the feature
# values that single it out in the chart.
minimally_sufficient(df, ['q','ɴ',])

{'q': {'syllabic': '-',
  'consonantal': '+',
  'sonorant': '-',
  'continuant': '-',
  'delayed release': '-',
  'voice': '-',
  'constr gl': '-',
  'labial': '-',
  'coronal': '-',
  'high': '-'},
 'ɴ': {'syllabic': '-',
  'consonantal': '+',
  'sonorant': '+',
  'continuant': '-',
  'labial': '-',
  'coronal': '-',
  'high': '-'}}

In [23]:
# Difference viewer: list the features on which the two phonemes differ.
view_difference(df, ('t͡s', 't'))

phoneme,t͡s,t
delayed release,+,-
strident,+,-


In [24]:
# Spreadsheet viewer: show every feature value of a single phoneme.
df.loc['t͡s']#.loc[['a', 'ħ', 'χ', 'ɕ', 'ç', 'x', 'x̠', 'ɬ', 'ɸ', 'ʂ', 'ʃ', 's', 'o', 'e', 'y']]

syllabic           -
stress             -
long               -
consonantal        +
sonorant           -
continuant         -
delayed release    +
approximant        -
tap                -
trill              -
nasal              -
voice              -
spread gl          -
constr gl          -
labial             -
round              -
labiodental        -
coronal            +
anterior           +
distributed        -
strident           +
lateral            -
dorsal             -
high               0
low                0
front              0
back               0
tense              0
Name: t͡s, dtype: object