In [None]:
import json
import tqdm

import numpy as np
import pandas as pd

import helpers

In [None]:
# Load feature distances
# The parsed JSON is handed unchanged to helpers.predict_suffix in the GCM cells.
# UTF-8 is requested explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open("feature_distances.json", "r", encoding="utf-8") as f:
    distances = json.load(f)

In [None]:
# Load IPA mappings
# Every non-empty line of nonce.txt is split on tabs; the resulting dict maps
# the second field of a line to its first field.
# NOTE(review): presumably the first field is the written base and the second
# its IPA transcription — confirm against nonce.txt.
# UTF-8 is requested explicitly so that reading does not depend on the
# platform's default locale encoding.
with open("nonce.txt", "r", encoding="utf-8") as f:
    nonce_rows = [
        line.split("\t")
        for line in f.read().strip().split("\n")
        if line  # an empty file yields one empty string; skip it
    ]
ipa_bases = {row[1]: row[0] for row in nonce_rows}

In [None]:
# Define GCM hyperparameters
# The three values are forwarded positionally, in the order c, s, p, to
# helpers.predict_suffix by the GCM cells.
# NOTE(review): the role of each value is defined inside helpers.predict_suffix
# and is not visible in this notebook.
c = 0.6
s = 0.3
p = 1

In [None]:
# Define suffixes
# One exemplar file, exemplars/<suffix>.in, is read per entry by the GCM cells,
# in the order given here.
suffixes = [
    "able",
    "ish",
    "ive",
    "ous",
]

In [None]:
# Run type-based GCM
# For every suffix, read its exemplar file, split it into training and test
# forms, and score each test form with helpers.predict_suffix. The results of
# all suffixes are collected in one dict and written to preds_type.json.

# Column layout of the tab-separated training lines. It is the same for every
# suffix, so it is built once rather than on each loop iteration.
columns = [
    "ipa_base",
    "ipa_derivative",
    "frequency",
    "base",
    "derivative",
    "suffix",
]

preds = {}
for suffix in suffixes:

    # Load exemplars (explicit UTF-8 so that reading does not depend on the
    # platform's default locale encoding)
    with open(f"exemplars/{suffix}.in", "r", encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    # The "Test forms:" line separates training forms from test forms; a
    # missing marker raises ValueError. The first line of the file is skipped
    # (presumably a header for the training section — confirm).
    test_marker = lines.index("Test forms:")
    train_forms = lines[1:test_marker]
    test_forms = lines[test_marker + 1:]

    # Prepare training data (every column holds strings; nothing is cast here)
    train_data = pd.DataFrame(
        [line.split("\t") for line in train_forms],
        columns=columns,
    )

    # Make predictions
    for test_form in tqdm.tqdm(test_forms, desc=suffix):
        suffix_pred, score = helpers.predict_suffix(
            test_form,
            train_data,
            distances,
            c,
            s,
            p,
        )
        # Results are keyed by the nonce.txt counterpart of the test form, so
        # test forms that map to the same key keep only the last result.
        preds[ipa_bases[test_form]] = (suffix_pred, score)

# Store predictions
with open("preds_type.json", "w", encoding="utf-8") as f:
    json.dump(preds, f, sort_keys=True, indent=4)

In [None]:
# Run token-based GCM
# For every suffix, read its exemplar file, split it into training and test
# forms, and score each test form with helpers.predict_suffix. The results of
# all suffixes are collected in one dict and written to preds_token.json.

# Column layout of the tab-separated training lines. It is the same for every
# suffix, so it is built once rather than on each loop iteration.
columns = [
    "ipa_base",
    "ipa_derivative",
    "frequency",
    "base",
    "derivative",
    "suffix",
]

preds = {}
for suffix in suffixes:

    # Load exemplars (explicit UTF-8 so that reading does not depend on the
    # platform's default locale encoding)
    with open(f"exemplars/{suffix}.in", "r", encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    # The "Test forms:" line separates training forms from test forms; a
    # missing marker raises ValueError. The first line of the file is skipped
    # (presumably a header for the training section — confirm).
    test_marker = lines.index("Test forms:")
    train_forms = lines[1:test_marker]
    test_forms = lines[test_marker + 1:]

    # Prepare training data (every column holds strings; nothing is cast here)
    train_data = pd.DataFrame(
        [line.split("\t") for line in train_forms],
        columns=columns,
    )

    # Make predictions
    for test_form in tqdm.tqdm(test_forms, desc=suffix):
        # The trailing positional True is an extra argument to predict_suffix.
        # NOTE(review): presumably it switches on token (frequency-weighted)
        # scoring — confirm against helpers.predict_suffix.
        suffix_pred, score = helpers.predict_suffix(
            test_form,
            train_data,
            distances,
            c,
            s,
            p,
            True,
        )
        # Results are keyed by the nonce.txt counterpart of the test form, so
        # test forms that map to the same key keep only the last result.
        preds[ipa_bases[test_form]] = (suffix_pred, score)

# Store predictions
with open("preds_token.json", "w", encoding="utf-8") as f:
    json.dump(preds, f, sort_keys=True, indent=4)