In [1]:
import random
import grammar_induction
from gitta.context_free_grammar import ContextFreeGrammar

# Pin the RNG state up front so the random steps in this notebook are repeatable.
random.seed(123)

# Toy grammar: one sentence template whose two slots each expand to one of
# four literal alternatives (4 x 4 = 16 sentences in total).
rules = dict(
    origin='<hello>, <location>!',
    hello=['Hello', 'Greetings', 'Howdy', 'Hey'],
    location=['world', 'solar system', 'galaxy', 'universe'],
)

grammar = ContextFreeGrammar.from_string(rules)

# Reference corpus: every string the hand-written grammar generates.
original_dataset = grammar.generate_all_string()

original_dataset

['Hey, world!',
 'Hello, universe!',
 'Howdy, world!',
 'Hey, galaxy!',
 'Hey, universe!',
 'Greetings, galaxy!',
 'Howdy, universe!',
 'Greetings, universe!',
 'Hey, solar system!',
 'Howdy, solar system!',
 'Howdy, galaxy!',
 'Hello, solar system!',
 'Greetings, world!',
 'Greetings, solar system!',
 'Hello, world!',
 'Hello, galaxy!']

In [2]:
# Train on a random subset only (9 of the 16 sentences, slightly more than
# half) to show that the induction generalises to sentences it never saw.
# To learn from every example instead, skip the shuffle-and-slice below and
# use `dataset = list(original_dataset)`.
number_of_training_instances = 9

# Shuffle a copy so `original_dataset` keeps its order for the final comparison.
shuffled_dataset = list(original_dataset)
random.shuffle(shuffled_dataset)
dataset = shuffled_dataset[:number_of_training_instances]

dataset

['Howdy, solar system!',
 'Hey, galaxy!',
 'Hey, solar system!',
 'Greetings, galaxy!',
 'Howdy, world!',
 'Greetings, universe!',
 'Howdy, galaxy!',
 'Hello, solar system!',
 'Hey, world!']

In [3]:
# Induction settings, gathered in one place so they are easy to tweak.
induction_settings = {
    'words_per_slot': 2,
    # This value decides when to join value lists
    'relative_similarity_threshold': 0.2,
}

# Learn a grammar from the training sentences alone.
reconstructed_grammar = grammar_induction.induce_grammar_using_template_trees(
    dataset,
    **induction_settings,
)

In [4]:
# Serialise the induced grammar and print it, so the JSON text is written to
# the cell output as-is instead of being shown as a quoted repr.
grammar_json = reconstructed_grammar.to_json()
print(grammar_json)


{
    "origin": [
        "<A>, <C>!"
    ],
    "A": [
        "Greetings",
        "Hello",
        "Hey",
        "Howdy"
    ],
    "C": [
        "galaxy",
        "solar system",
        "universe",
        "world"
    ]
}


In [5]:
# Enumerate everything the induced grammar can produce. The entries are later
# flattened with `.to_flat_string()`, so they are presumably template objects
# rather than plain `str` — TODO confirm against the gitta API.
all_generations = reconstructed_grammar.generate_all()
all_generations

{"Greetings, galaxy!",
 "Greetings, solar system!",
 "Greetings, universe!",
 "Greetings, world!",
 "Hello, galaxy!",
 "Hello, solar system!",
 "Hello, universe!",
 "Hello, world!",
 "Hey, galaxy!",
 "Hey, solar system!",
 "Hey, universe!",
 "Hey, world!",
 "Howdy, galaxy!",
 "Howdy, solar system!",
 "Howdy, universe!",
 "Howdy, world!"}

In [6]:
# Structural check: is the induced grammar isomorphic with the hand-written one?
grammars_match = reconstructed_grammar.is_isomorphic_with(grammar)
print("Same grammar:", grammars_match)

# Output check: flatten every generation to a plain string and compare the
# resulting set with the full original corpus (order is irrelevant).
generated_strings = set()
for generation in all_generations:
    generated_strings.add(generation.to_flat_string())
print("Same grammar output:", generated_strings == set(original_dataset))

Same grammar: True
Same grammar output: True
