# Parser

Parsers are one of the core techniques in fuzzing. You need parsers to take a structured input apart, and reuse the parts in other inputs without affecting the validity of the input.

## Synopsis

```python
import parser as P
my_grammar = {'<start>': [['1', '<A>'],
                          ['2']
                         ],
              '<A>'    : [['a']]}
my_parser = P.LL1Parser(my_grammar)
for tree in my_parser.parse_on(text='1a', start_symbol='<start>'):
    print(P.format_parsetree(tree))
```



Secondly, as per traditional implementations,
there can only be one expansion rule for the `<start>` symbol. We work around
this restriction by simply constructing as many charts as there are expansion
rules, and returning all parse trees.

In [None]:
import ipynb.fs.full.x0_1_Grammars as grammars
import ipynb.fs.full.Railroads as diagrams

In [None]:
# diagrams.syntax_diagram(grammars.EXPR_GRAMMAR)

In [None]:
# Arithmetic-expression grammar. Every nonterminal key maps to a list of
# alternative expansions, and every expansion is a list of tokens.
grammar = {
    '<start>': [['<expr>']],
    '<expr>': [
        ['<term>', '+', '<expr>'],
        ['<term>', '-', '<expr>'],
        ['<term>'],
    ],
    '<term>': [
        ['<fact>', '*', '<term>'],
        ['<fact>', '/', '<term>'],
        ['<fact>'],
    ],
    '<fact>': [
        ['<digits>'],
        ['(', '<expr>', ')'],
    ],
    '<digits>': [
        ['<digit>', '<digits>'],
        ['<digit>'],
    ],
    # One single-token expansion per decimal digit '0'..'9'.
    '<digit>': [[str(i)] for i in range(10)],
}
# Key of the grammar entry at which parsing begins.
START = '<start>'

In [None]:
# diagrams.syntax_diagram(grammars.BEXPR_GRAMMAR)

In [None]:
import src.utils as utils

## Summary

An LL(1) parser executes the following steps for parsing:

The idea behind a simple $LL(1)$ recognizer is that you try to unify the string you want to match with the corresponding key in the grammar. If the key is not present in the grammar, it is a literal, which needs to be matched with string equality. If the key is present in the grammar, get the corresponding productions (rules) for that key, and start unifying each rule one by one on the string to be matched.

In [None]:
import sys
import functools

class LL1Parser:
    """Recognizer that unifies grammar keys against a text, rule by rule.

    `grammar` maps each key to a list of rules; each rule is a list of
    tokens. `unify_key` relies on `self.unify_rule`, which this class body
    does not define itself.
    """

    def __init__(self, grammar):
        """Store the grammar whose rules `unify_key` looks up."""
        self.grammar = grammar

    # NOTE(review): lru_cache on a method keys on `self` as well, so the
    # cache keeps parser instances alive, and every call with the same
    # arguments returns the *same* tree object -- mutating a returned tree
    # also changes what later identical calls get back.
    @functools.lru_cache(maxsize=None)
    def unify_key(self, key, text, at=0):
        """Try to match `key` against `text` beginning at offset `at`.

        Returns `(end, (key, children))` on success, where `end` is the
        offset just past the match and `children` is `[]` for a literal.
        Returns `(at, None)` when nothing matches.
        """
        if not utils.is_nt(key):
            # Literal token: compare in place instead of copying text[at:].
            if text.startswith(key, at):
                return (at + len(key), (key, []))
            return (at, None)
        # Rules are tried in order; the first one that unifies wins.
        for rule in self.grammar[key]:
            end, children = self.unify_rule(rule, text, at)
            if children is not None:
                return end, (key, children)
        # Failure leaves the offset where it was, matching the literal branch
        # (this used to report offset 0).
        return (at, None)

For unifying rules, the idea is similar. We take each token in the rule, and try to unify that token with the string to be matched. We rely on `unify_key` for doing the unification of the token. If the unification fails, we return empty-handed.

In [None]:
class LL1Parser(LL1Parser):
    """Adds rule unification and the `parse_on` entry point."""

    def unify_rule(self, parts, text, tfrom):
        """Unify every token of a rule in sequence, starting at `tfrom`.

        Returns `(end, children)` when all tokens match. As soon as one token
        fails, returns `(offset, None)` with the offset that `unify_key`
        reported for the failing token.
        """
        cursor = tfrom
        matched = []
        for token in parts:
            cursor, node = self.unify_key(token, text, cursor)
            if node is None:
                return cursor, None
            matched.append(node)
        return cursor, matched

    def parse_on(self, text, start_symbol):
        """Yield the single result of unifying `start_symbol` with `text`.

        The yielded value is `None` when unification fails. The end offset is
        discarded, so text left over after the match is not reported.
        """
        _, result = self.unify_key(start_symbol, text, 0)
        yield result

In [None]:
if __name__ == '__main__':
    # Smoke test on a two-rule grammar: '<start>' expands to '1' followed by
    # '<A>', or to '2'; '<A>' expands to 'a'.
    small_grammar = {'<start>': [['1', '<A>'],
                              ['2']
                             ],
                  '<A>'    : [['a']]}
    my_parser = LL1Parser(small_grammar)
    # parse_on is a generator, so iterate over it to obtain the result.
    for tree in my_parser.parse_on(text='1a', start_symbol='<start>'):
        utils.display_tree(tree)

In [None]:
if __name__ == '__main__':
    # Rebind my_parser to the expression grammar. The cells further down
    # reuse both my_parser and the module-level `tree` assigned here.
    my_parser = LL1Parser(grammars.EXPR_GRAMMAR)
    # parse_on yields exactly one value, so take the first list element.
    tree = list(my_parser.parse_on(text='(8/3)*49', start_symbol='<start>'))[0]
    utils.display_tree(tree)

## Rule Stats

In [None]:
class InputStats:
    """Counts how often each grammar rule is used in a parse tree.

    `vars` maps a key to a dict from rule string (the rule's tokens
    concatenated) to the number of times that rule was seen.
    """

    def __init__(self, grammar):
        """Keep the grammar and start with no counters."""
        self.grammar = grammar
        self.vars = {}

    def rule_str(self, rule):
        """Return the tokens of a grammar rule concatenated into one string."""
        return ''.join(rule)

    def to_rule_str(self, tree):
        """Return the concatenated keys of the direct children of `tree`."""
        _, children = tree
        return ''.join(child[0] for child in children)

    def process_parsetree(self, parse_tree):
        """Recursively count the rule used at every nonterminal node.

        Returns `self.vars`, or `None` when `parse_tree` itself is a node for
        which `utils.is_nt` is false.
        """
        key, children = parse_tree
        if not utils.is_nt(key):
            return
        used = self.to_rule_str(parse_tree)
        if key not in self.vars:
            # Seed every rule of this key with zero so unused rules show up.
            self.vars[key] = dict.fromkeys(
                (self.rule_str(r) for r in self.grammar[key]), 0)
        counts = self.vars[key]
        # An expansion that is not among the grammar's rules starts from 0.
        counts[used] = counts.get(used, 0) + 1
        for child in children:
            self.process_parsetree(child)
        return self.vars

In [None]:
if __name__ == '__main__':
    # Tally rule usage over the previously parsed `tree` and list it per key.
    stats = InputStats(grammars.EXPR_GRAMMAR)
    stats.process_parsetree(tree)
    for key, counts in stats.vars.items():
        print(key, counts)

## Depth Based Rule Stats

In [None]:
class InputStats(InputStats):
    """Adds per-rule recording of the tree depth at which a rule is used.

    `vars_with_depth` maps a key to a dict from rule string to the list of
    depths (root = 0 by default) at which that rule was seen.
    """

    def __init__(self, grammar):
        """Keep the grammar and start with empty count and depth tables."""
        self.grammar = grammar
        self.vars = {}  # Original counts without depth
        self.vars_with_depth = {}  # Counts with depth

    def process_parsetree_with_depth(self, parse_tree, depth=0):
        """Recursively record the depth of every nonterminal node's rule.

        `depth` is the depth assigned to `parse_tree`; children get
        `depth + 1`. Returns `self.vars_with_depth`, or `None` when
        `parse_tree` itself is a node for which `utils.is_nt` is false.
        """
        key, children = parse_tree
        if not utils.is_nt(key):
            return

        rule_str = self.to_rule_str(parse_tree)
        if key not in self.vars_with_depth:
            # Seed every rule of this key so unused rules appear as [].
            self.vars_with_depth[key] = {
                self.rule_str(r): [] for r in self.grammar[key]}

        # setdefault keeps an expansion that is not among the grammar's rules
        # from raising KeyError, the same tolerance process_parsetree has.
        self.vars_with_depth[key].setdefault(rule_str, []).append(depth)

        for child in children:
            self.process_parsetree_with_depth(child, depth + 1)
        return self.vars_with_depth

In [None]:
if __name__ == '__main__':
    # Imported here; the position-statistics cell below reuses the name.
    import statistics
    stats = InputStats(grammars.EXPR_GRAMMAR)
    stats.process_parsetree_with_depth(tree)
    for key, by_rule in stats.vars_with_depth.items():
        print(key)
        for rule, depths in by_rule.items():
            print("- ", rule, depths)
            if depths:
                print('  mean depth:', statistics.mean(depths))
            # stdev needs at least two samples.
            if len(depths) > 1:
                print('  stdev (%0.2f)' % statistics.stdev(depths))
        print()

## Position Based Rule Stats

In [None]:
class InputStats(InputStats):
    """Adds per-rule recording of the token position at which a rule is used.

    `vars_with_position` maps a key to a dict from rule string to the list
    of positions at which that rule was seen.
    """

    def __init__(self, grammar):
        """Keep the grammar and start with empty count/depth/position tables."""
        self.grammar = grammar
        self.vars = {}  # Original counts without depth
        self.vars_with_depth = {}  # Counts with depth
        self.vars_with_position = {}

    def process_parsetree_with_position(self, parse_tree, position=None):
        """Recursively record the position of every nonterminal node's rule.

        `position` is a one-element list used as a counter shared across the
        recursion; it is incremented by one for each node for which
        `utils.is_nt` is false. Returns `self.vars_with_position`, or `None`
        when `parse_tree` itself is such a node.
        """
        key, children = parse_tree
        # Position advances when a terminal is consumed.
        if position is None:
            position = [0]
        if not utils.is_nt(key):
            position[0] += 1
            return

        rule_str = self.to_rule_str(parse_tree)
        if key not in self.vars_with_position:
            # Seed every rule of this key so unused rules appear as [].
            self.vars_with_position[key] = {
                self.rule_str(r): [] for r in self.grammar[key]}

        # setdefault keeps an expansion that is not among the grammar's rules
        # from raising KeyError, the same tolerance process_parsetree has.
        self.vars_with_position[key].setdefault(rule_str, []).append(position[0])

        for child in children:
            self.process_parsetree_with_position(child, position)
        return self.vars_with_position

In [None]:
if __name__ == '__main__':
    # Uses the `statistics` module imported by the depth-statistics cell.
    stats = InputStats(grammars.EXPR_GRAMMAR)
    stats.process_parsetree_with_position(tree)
    for key, by_rule in stats.vars_with_position.items():
        print(key)
        for rule, positions in by_rule.items():
            print("- ", rule, positions)
            if positions:
                print('  mean pos:', statistics.mean(positions))
            # stdev needs at least two samples.
            if len(positions) > 1:
                print('  stdev (%0.2f)' % statistics.stdev(positions))
        print()

## Arborist

In [None]:
class Tree:
    """Thin wrapper around a `(key, children)` parse tree."""

    def __init__(self, tree):
        """Wrap `tree` without copying it."""
        self.tree = tree

    def to_str(self):
        """Return `utils.tree_to_str` applied to the wrapped tree."""
        return utils.tree_to_str(self.tree)

    def __repr__(self):
        return self.to_str()

    def count_tokens(self, tree, pos=None):
        """Count the nodes in `tree` for which `utils.is_nt` is false.

        `pos` is a one-element list used as the running counter; when a
        caller supplies it, counting continues from its current value and the
        updated total is returned.
        """
        key, children = tree
        if pos is None:
            pos = [0]
        if not utils.is_nt(key):
            pos[0] += 1
        for child in children:
            self.count_tokens(child, pos)
        return pos[0]

    def path(self, path, tree=None, pos=0):
        """Follow a sequence of child indices and return `(pos, Tree)`.

        Starting from `tree` (the wrapped tree when omitted), each index in
        `path` selects a child. `pos` is increased by the token count of every
        sibling that precedes the selected child at each level. An empty
        `path` returns the starting node with `pos` unchanged.
        """
        node = self.tree if tree is None else tree
        for index in path:
            children = node[1]
            # Tokens in the siblings to the left come before this child.
            for i in range(index):
                pos += self.count_tokens(children[i])
            node = children[index]
        return pos, Tree(node)

In [None]:
if __name__ == '__main__':
    # In a (key, children) node, index 1 is the children list and index 0 of
    # that list is the first child. This descends through three first
    # children and ends on the children list of the third one.
    target = tree[1][0][1][0][1][0][1]
    # Show the second entry of that children list.
    utils.display_tree(target[1])

In [None]:
if __name__ == '__main__':
    t = Tree(tree)
    print(t.to_str())
    print('Tokens:', t.count_tokens(tree))
    # Follow child indices 0, 0, 0, 1 from the root. pos is the sum of
    # count_tokens over the siblings skipped on the way down.
    pos, t_t = t.path([0, 0, 0, 1])
    print('pos:', pos)
    utils.display_tree(t_t.tree)

In [None]:
if __name__ == '__main__':
    # Parse a replacement fragment, starting from '<expr>' rather than
    # '<start>'; parse_on yields one value, so take the first list element.
    subtree = list(my_parser.parse_on(text='2+1', start_symbol='<expr>'))[0]
    utils.display_tree(subtree)

In [None]:
if __name__ == '__main__':
    # target is a children list that lives inside `tree`, so this assignment
    # replaces that child of `tree` in place.
    # NOTE(review): unify_key results are lru_cached, so the parser's cached
    # tree objects are presumably modified by this too -- confirm.
    target[1] = subtree

In [None]:
if __name__ == '__main__':
    # Display `tree` again now that one of its children has been replaced.
    utils.display_tree(tree)

In [None]:
if __name__ == '__main__':
    # Print the string form of the modified tree.
    print(utils.tree_to_str(tree))

In [None]:
if __name__ == '__main__':
    # Parse an input that consists of digits only.
    newtree = list(my_parser.parse_on(text='18439249', start_symbol='<start>'))[0]
    utils.display_tree(newtree)

What if you want to parse more grammar varieties? For example, the following grammar describing the same language will not be parsable by `LL1Parser`.
```
grammar = {
    '<start>': [['<expr>']],
    '<expr>': [
        ['<expr>', '+', '<expr>'],
        ['<expr>', '-', '<expr>'],
        ['<expr>', '*', '<expr>'],
        ['<expr>', '/', '<expr>'],
        ['(','<expr>',')'],
        ['<digits>']],
    '<digits>': [
        ['<digit>','<digits>'],
        ['<digit>']],
    '<digit>': [["%s" % str(i)] for i in range(10)],
}
START = '<start>'
```

In the case of such grammars, we can use one of the general context-free parsers. These include
* Earley parser (in this repository)
* GLL parser
* GLR parser
* CYK parser
* Valiant parser
  and so on.

The tradeoff is that each of these parsers is costly when compared to the simple `LL1Parser` ($O(N^3)$ or beyond, compared to $O(N)$ for `LL1Parser`).

# Done

In [None]:
#%tb