In [1]:
import re
from itertools import chain, product
from helpers import data

In [2]:
def _split_lines(section):
    """Break one blank-line-delimited section of the input into its lines."""
    return section.split("\n")


# Two sections come back: the rule definitions, then the messages to check.
rules_arr, messages = data(19, sep="\n\n", parser=_split_lines)

In [3]:
# Peek at the first three messages to sanity-check the load
messages[:3]

['bbbbbbbbbaaaabbaababbabbaaabbbabbbbaaabb',
 'babaabbaabbbbaababbaabbabaababba',
 'baabbaaabbabbbbaaabababb']

In [4]:
# Parse every rule line into a lookup table keyed by rule number.
#
# Each rule is either a single quoted character, or one or more alternatives
# separated by a pipe "|", where every alternative is a sequence of rule
# numbers. So each value in parsed_rules is either
#   - a single-character str (terminal rule), or
#   - a list of alternatives, each of which is a list of int rule numbers.
parsed_rules = {}
for rule in rules_arr:
    # Lines look like "<number>: <body>"; split on the first ": " only
    number, body = rule.split(": ", 1)
    key = int(number)
    parsed_rules[key] = []
    # Get each allowed alternative
    for sub_rule in body.split(" | "):
        # Raw string: "\d" in a plain string literal is an invalid escape
        rule_numbers = re.findall(r"\d+", sub_rule)
        if not rule_numbers:
            # No digits, so this must be a quoted character such as "a";
            # index 1 skips the opening quote
            parsed_rules[key] = sub_rule[1]
        else:
            parsed_rules[key].append([int(d) for d in rule_numbers])

In [5]:
# Spot-check one parsed rule: a list of alternatives, each a list of rule numbers
parsed_rules[5]

[[45, 47], [44, 18]]

In [6]:
# Expand every rule into the literal strings it can produce.
# Start from the rules that are a single character, then repeatedly resolve
# any rule whose referenced rules have all been resolved already.
# rules is a dictionary of int: str (terminal) or list[str] (all expansions).
rules = {}

# Seed with the rules that are just one character
for k, v in parsed_rules.items():
    if isinstance(v, str):
        rules[k] = v

# Resolve rules that are based on already-processed rules
while len(rules) != len(parsed_rules):
    resolved_before = len(rules)
    for k, v in parsed_rules.items():
        # Already expanded (or terminal): don't redo the expensive product
        if k in rules:
            continue
        # Check that every referenced rule number is already processed
        if all((r in rules) for r in chain.from_iterable(v)):
            # Each alternative is a sequence of rules, so its strings are the
            # Cartesian product of the strings of the rules in that sequence.
            # All alternatives are flattened into one 1-deep list.
            rules[k] = [
                "".join(p)
                for sub_rule in v
                for p in product(*[rules[sub_rule_number]
                                   for sub_rule_number in sub_rule])
            ]
    if len(rules) == resolved_before:
        # A full pass resolved nothing new: the remaining rules depend on each
        # other (a loop) or on a rule that doesn't exist. Fail instead of
        # spinning forever.
        unresolved = sorted(set(parsed_rules) - set(rules))
        raise ValueError(f"Cannot resolve rules (cycle or missing reference): {unresolved}")

**Task 1:** Find messages that match rule 0.

In [7]:
# Rule 0 fully expanded; show how many entries the expansion holds
rule = rules[0]
len(rule) # LOTS of subrules...

2097152

In [8]:
# rule is either a single literal string or a list of every string rule 0 can
# produce. Put the candidates in a set so each message is one hash lookup
# instead of a linear scan over the whole expansion.
rule_strings = {rule} if isinstance(rule, str) else set(rule)

# A message is valid when it is exactly one of the strings rule 0 produces
valid_messages = [message for message in messages if message in rule_strings]

len(valid_messages)

120

Okay, that was really slow. I can speed this up a lot by tracking which rules pass and only checking the rules that contain them if at least one passes. 

**Part 2:** Change rule 8 to "42 | 42 8" and rule 11 to "42 31 | 42 11 31". This means we add loops, but they're very confined. How many messages match rule 0 now? 

In [9]:
# Rule 0 as originally parsed
parsed_rules[0]

[[8, 11]]

In [10]:
# Rule 8 as originally parsed (before the part-2 replacement)
parsed_rules[8]

[[42]]

In [11]:
# Rule 11 as originally parsed (before the part-2 replacement)
parsed_rules[11]

[[42, 31]]

Okay, so since there weren't any loops before and rule 0 consists of exactly rule 8 followed by rule 11, we know **no other rules** reference rule 8 or rule 11. Looking at the change to rules 8 and 11 makes it clear that: 
- Rule 8 becomes: 42 | 42 42 | 42 42 42 | 42 42 42 42 | ... 
- Rule 11 becomes: 42 31 | 42 42 31 31 | 42 42 42 31 31 31 | ...

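Since rule 0 is rule 8 followed by rule 11, we know it matches strings of the form: x\*42 + y\*42 + y\*31, where x,y $\geq$ 1 — that is, a run of rule-42 blocks followed by a strictly shorter (but non-empty) run of rule-31 blocks. Our strategy will be to check whether rule 31 matches as a suffix and rule 42 matches as a prefix, and then shave both off. We repeat this until rule 31 no longer matches the end of the string, and then ensure that everything remaining is made up of one or more rule-42 blocks. 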

In [12]:
# We only need to check messages that weren't valid before 
invalid_messages = [message for message in messages 
                    if message not in valid_messages]

assert len(invalid_messages) + len(valid_messages) == len(messages)

In [13]:
def prefix_42(message):
    """Strip one leading rule-42 string from message.

    Returns what is left of message after the first entry of rules[42]
    that it starts with; raises ValueError when no entry is a prefix.
    """
    matched = next((candidate for candidate in rules[42]
                    if message.startswith(candidate)), None)
    if matched is None:
        raise ValueError("42 wasn't prefix")
    return message[len(matched):]

def suffix_31(message):
    """Strip one trailing rule-31 string from message.

    Returns what is left of message after removing the first entry of
    rules[31] that it ends with; raises ValueError when no entry is a suffix.
    """
    matched = next((candidate for candidate in rules[31]
                    if message.endswith(candidate)), None)
    if matched is None:
        raise ValueError("31 wasn't suffix")
    return message[:-len(matched)]

In [15]:
def matches_looped_rule_0(message):
    """Return True if message matches rule 0 with the looping rules 8 and 11.

    With the part-2 rules, rule 0 matches x*42 + y*42 + y*31 with x, y >= 1.
    We peel matched (prefix 42, suffix 31) pairs off the outside until rule 31
    no longer matches the end, then require the remainder to consist of one
    or more rule-42 blocks and nothing else.
    """
    remaining = message

    # Phase 1: strip (42 ... 31) pairs; each pair is one application of rule 11
    pairs = 0
    while True:
        try:
            without_suffix = suffix_31(remaining)
        except ValueError:
            # No more 31s on the back; what's left must be all 42s
            break
        try:
            remaining = prefix_42(without_suffix)
        except ValueError:
            # A trailing 31 with no 42 in front to pair it with
            return False
        pairs += 1

    # Phase 2: the rest must be made up entirely of rule-42 blocks (rule 8)
    extra_42 = 0
    while remaining:
        try:
            remaining = prefix_42(remaining)
        except ValueError:
            return False
        extra_42 += 1

    # Need at least one rule-11 pair and at least one rule-8 block
    return pairs >= 1 and extra_42 >= 1


# Rebuild the list (part-1 matches first, then the newly valid messages)
# instead of appending in place, so re-running this cell can't add duplicates.
rejected_in_part_1 = set(invalid_messages)
valid_messages = (
    [message for message in messages if message not in rejected_in_part_1]
    + [message for message in invalid_messages if matches_looped_rule_0(message)]
)

len(valid_messages)

350

**Norvig's solution is amazing.** I'll copy it and try to think about how I could have come up with something like that...