# Building Fast Grammar Fuzzers

## System configuration

In [1]:
!uname -v

Darwin Kernel Version 18.5.0: Mon Mar 11 22:26:31 PDT 2019; root:xnu-4903.251.3~5/RELEASE_X86_64


In [2]:
!sw_vers

ProductName:	Mac OS X
ProductVersion:	10.14.4
BuildVersion:	18E2035


In [3]:
!system_profiler SPHardwareDataType

Hardware:

    Hardware Overview:

      Model Name: MacBook Pro
      Model Identifier: MacBookPro15,2
      Processor Name: Intel Core i5
      Processor Speed: 2,4 GHz
      Number of Processors: 1
      Total Number of Cores: 4
      L2 Cache (per Core): 256 KB
      L3 Cache: 6 MB
      Memory: 16 GB
      Boot ROM Version: 220.250.366.0.0 (iBridge: 16.16.4756.0.0,0)
      Serial Number (system): C02YT7PQLVDL
      Hardware UUID: C3D7AD26-A92B-5FDD-9EB0-7B1D6CA0AE2E



In [4]:
import pkg_resources
import json
def ipkg(pkg, repo):
    try:
        distinfo = pkg_resources.get_distribution(pkg)
    except pkg_resources.DistributionNotFound:
        !pip install {repo}
    else:
        print(pkg, 'found')

In [5]:
ipkg('fuzzingbook', 'fuzzingbook')

fuzzingbook found


In [6]:
from fuzzingbook.Timer import Timer
from fuzzingbook.ExpectError import ExpectTimeout

In [7]:
!rm -rf /tmp/stop.ffg
!rm -rf testers tests
!mkdir -p testers tests

In [8]:
!rm -rf results

**important** We rely on a relatively high recursion limit : 20900 which is only available on MacOSX (Not in Linux).

## Why focus on faster grammar fuzzing?
* Guided fuzzing of uninstrumented code (aka. memory fuzzing).
* Grammar Mining advances

## Building a simple fuzzer

In [9]:
import random
import string
import statistics

In [10]:
def producer(chars, l, n=1):
    return [''.join([random.choice(chars) for i in range(l)]) for i in range(n)]

In [11]:
producer(string.printable, 100)

["7`<A)m+s=8Sz'!>0(6>5C?IM8IYMc/vL4rz,%\x0bF6*$*H\tC%c[9-'6:3m,\tJ\t@e Yg1o#A4|vw\\ehd*39L0c9#{.T\r.|E59F0@>l<"]

In [12]:
import os, subprocess
from datetime import datetime

## A better tester

In [13]:
import os, subprocess
from datetime import datetime
from resource import getrusage as resource_usage, RUSAGE_CHILDREN
from time import time as timestamp
START_TIME = datetime.now()

In [14]:
class timeit():
    def __init__(self):
        pass
    def __enter__(self):
        self.start_time, self.start_resources = timestamp(), resource_usage(RUSAGE_CHILDREN)
        return self
    def __exit__(self, *args, **kwargs):
        end_time, end_resources = timestamp(), resource_usage(RUSAGE_CHILDREN)
        self._runtime = end_time - self.start_time
        self.sys_runtime = end_resources.ru_stime - self.start_resources.ru_stime 
        self.usr_runtime = end_resources.ru_utime - self.start_resources.ru_utime
        self.runtime = self.sys_runtime + self.usr_runtime

In [15]:
TX = {}

In [16]:
# II
class Tester:
    def __init__(self, name=None, max_num=10000, start_depth=3, limit_depth=9, timeout=3600, iterations=2):
        global TX
        if name is not None:
            self.tname = name
        else:
            self.tname = self.__class__.__name__
        self.tx = TX
        self.max_num, self.start_depth, self.limit_depth, self.timeout, self.iterations = \
            max_num, start_depth, limit_depth, timeout, iterations
        self.tst = {}
        self.tx[self.tname] = self.tst
        self.WARMUP_TIMES = 10
        self.timedout = None
        
    def write_t(self, cmd):
        self.t = "testers/%s-t.sh" % self.tname
        with open(self.t, 'w') as f:
            print('''\
#!/usr/bin/env bash
TIMEFORMAT="%%U %%S";
time %(cmd)s''' % {'cmd':cmd}, file=f)
        !chmod +x {self.t}
        
        
    def init_run(self):
        !rm -rf testers/{self.tname}
        !mkdir -p testers/{self.tname}

    def pre_time(self):
        !rm -rf tests
        !mkdir -p tests
        
    def pre_exec(self, t):
        pass
        
    def exec_program(self, seed, max_depth, t):
        raise NotImplementedError()

    def post_exec(self, t):
        pass
    
    def post_time(self):
        if not self._runtime: return
        if self.file is not None and os.path.exists(self.file):
            lines_cmd = ("cat %s| wc -l" % self.file)
            self.lines = subprocess.getoutput(lines_cmd).strip()
            #unique_cmd = ("cat %s| sort -u| wc -l" % self.file)
            #self.unique_lines = subprocess.getoutput(unique_cmd).strip()
            self.size = os.stat(self.file).st_size
        else:
            self.unique_lines = ''
            self.lines = ''
            self.size = 0
        self.throughput = (self.size/1024/self._runtime, self._runtime)
    
    def timed_exec(self, seed, max_depth, verbose):
        self.pre_time()
        self._runtime = None
        self.timedout = True
        with ExpectTimeout(self.timeout): #, print_traceback=False, mute=True):
            #with timeit() as t:
            t = None
            self.pre_exec(t)
            cmdline = self.exec_program(seed, max_depth, t)
            self.write_t(cmdline)
            !{self.t} 2>./testers/time.out
            self.post_exec(t)
            with open('testers/time.out') as f:
                usr, sys = f.read().strip().split(' ')
            self._runtime = float(usr)+ float(sys)
            self._sys_runtime = float(sys)
            self._usr_runtime = float(usr)
            self.timedout = False
        self.post_time()

    def ofile(self, max_depth, seed):
        fn = 'testers/%s/%d_%d.x' % (self.tname, max_depth, seed)
        return fn
    
    def check_continue(self):
        if os.path.exists('/tmp/stop.ffg'):
            raise Exception('/tmp/stop.ffg -- abort tests')

    def run_test(self, verbose=False):
        def warmup(seed):
            # for warming up, we simply run it a few times before in the
            # same seed as the first, and discount it in computation.
            return [seed]*self.WARMUP_TIMES
        current_time = datetime.now()
        self.init_run()
        # depth is for later when we deal with grammars.
        
        # warmup loop
        for md in [self.start_depth]:
            max_depth = 2**md
            for seed in warmup(0):
                self.file = self.ofile(max_depth, seed)
                self.timed_exec(seed, max_depth, verbose)
                if os.path.exists(self.file): os.remove(self.file)
                
        for md in range(self.start_depth, self.limit_depth):
            max_depth = 2**md
            v = {}
            res = {'detail': v}
            self.tst[max_depth] = res
            seeds = list(range(self.iterations))
            for seed in seeds:
                if self.timedout: break
                self.file = self.ofile(max_depth, seed)
                if verbose: print('depth:', max_depth, 'seed:', seed, 'file:', self.file)
                self.timed_exec(seed, max_depth, verbose)
                if self._runtime:
                    v[seed] = {
                        'runtime':self._runtime,
                        'sys_runtime':self._sys_runtime,
                        'usr_runtime':self._usr_runtime,
                        'size': self.size,
                        #'uniq': self.unique_lines,
                        'lines': self.lines,
                        # in kbytes
                        'throughput': self.size/self._runtime/(1024)}
                if verbose:
                    print(v[seed])
                if os.path.exists(self.file): os.remove(self.file)
                self.check_continue()
            if self.timedout:
                print('Timeout')
                break # we do not expect larger depths to work.
            size = [t['size'] for t in v.values()]
            res['avgsize'] = statistics.mean(size)
                
            sec = [t['runtime'] for t in v.values()]
            res['avgruntime'] = statistics.mean(sec)
            res['stdevruntime'] = statistics.stdev(sec)
                
            tp = [t['throughput'] for t in v.values()]
            res['avgthroughput'] = statistics.mean(tp)
            res['stdevthroughput'] = statistics.stdev(tp)
            print('depth=', max_depth, "size=", res['avgsize'], 'time=', round(res['avgruntime'],3), "stdev(%s)" % str(round(res['stdevruntime'],3)), 'throughput=',res['avgthroughput'], "stdev(%s)" % str(round(res['stdevthroughput'])))
        self.total_test_time = datetime.now() - current_time
        self.dump()
        return self
    
    def dump(self):
        curtime = datetime.now().isoformat()
        name = 'results/%s-tx.json' % (self.tname)
        !mkdir -p results
        with open(name, 'w+') as f:
            print(json.dumps(TX), file=f)
    
    def show(self):
        max_throughput = 0
        best_depth = None
        for depth in self.tst.keys():
            res = self.tst[depth]
            if res.get('avgthroughput',0) > max_throughput:
                max_throughput = res['avgthroughput'] 
                best_depth = depth
        print('Throughput of ', max_throughput, ' kilobytes per second at depth = ', best_depth)
        print("Total time:",str(self.total_test_time))

In [17]:
class RandomTester(Tester):
    def pre_time(self):
        with open('testers/RandomTester/r.py', 'w+') as f:
            print('''
import string,random,sys
random.seed(int(sys.argv[1]))
def producer(chars, l, n=1):
    return [''.join([random.choice(chars) for i in range(l)]) for i in range(n)]
print(producer(string.printable, int(sys.argv[2])))''', file=f)
            
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"python testers/RandomTester/r.py {seed} {max_depth} > {fn}"

In [18]:
RandomTester().run_test().show()

depth= 8 size= 14.5 time= 0.038 stdev(0.001) throughput= 0.3732323068260568 stdev(0)
depth= 16 size= 22.5 time= 0.037 stdev(0.001) throughput= 0.5939327485380117 stdev(0)
depth= 32 size= 41 time= 0.037 stdev(0.001) throughput= 1.09789965746997 stdev(0)
depth= 64 size= 75 time= 0.037 stdev(0.0) throughput= 1.9795185810810807 stdev(0)
depth= 128 size= 146 time= 0.038 stdev(0.001) throughput= 3.80553765113798 stdev(0)
depth= 256 size= 279.5 time= 0.039 stdev(0.003) throughput= 7.006208285266974 stdev(0)
Throughput of  7.006208285266974  kilobytes per second at depth =  256
Total time: 0:00:06.503435


In [19]:
TX

{'RandomTester': {8: {'detail': {0: {'runtime': 0.039,
     'sys_runtime': 0.009,
     'usr_runtime': 0.03,
     'size': 14,
     'lines': '1',
     'throughput': 0.3505608974358974},
    1: {'runtime': 0.037000000000000005,
     'sys_runtime': 0.008,
     'usr_runtime': 0.029,
     'size': 15,
     'lines': '1',
     'throughput': 0.3959037162162162}},
   'avgsize': 14.5,
   'avgruntime': 0.038000000000000006,
   'stdevruntime': 0.0014142135623730913,
   'avgthroughput': 0.3732323068260568,
   'stdevthroughput': 0.03206221463767613},
  16: {'detail': {0: {'runtime': 0.036000000000000004,
     'sys_runtime': 0.008,
     'usr_runtime': 0.028,
     'size': 22,
     'lines': '1',
     'throughput': 0.5967881944444444},
    1: {'runtime': 0.038,
     'sys_runtime': 0.009,
     'usr_runtime': 0.029,
     'size': 23,
     'lines': '1',
     'throughput': 0.591077302631579}},
   'avgsize': 22.5,
   'avgruntime': 0.037000000000000005,
   'stdevruntime': 0.0014142135623730913,
   'avgthroughput

## Grammars

In [20]:
# from fuzzingbook.Parser import make_grammar
# We need a bit of modification make the grammars more varied

In [21]:
def prod_line_grammar(nonterminals, terminals):
    g = {
        '<start>': ['<symbols>'],
        '<symbols>': ['<symbol><symbols>', '<symbol>'],
        '<symbol>': ['<nonterminals>', '<terminals>'],
        '<nonterminals>': ['<lt><alpha><gt>'],
        '<lt>': ['<'],
        '<gt>': ['>'],
        '<alpha>': nonterminals,
        '<terminals>': terminals
    }

    if not nonterminals:
        g['<nonterminals>'] = ['']
        del g['<lt>']
        del g['<alpha>']
        del g['<gt>']

    return g


In [22]:
from fuzzingbook.GrammarFuzzer import GrammarFuzzer
from fuzzingbook.Parser import canonical
from fuzzingbook.Grammars import unreachable_nonterminals, RE_NONTERMINAL
import random, string, re

In [23]:
def make_rule(nonterminals, terminals, num_alts):
    prod_grammar = prod_line_grammar(nonterminals, terminals)

    gf = GrammarFuzzer(prod_grammar, min_nonterminals=3, max_nonterminals=5)
    name = "<%s>" % ''.join(random.choices(string.ascii_uppercase, k=3))

    return (name, [gf.fuzz() for _ in range(num_alts)])

In [24]:
make_rule(["A", "B", "C"], ["1", "2", "3"], 3)

('<PJB>', ['122', '2<A><C>2', '323<A>'])

In [25]:
def make_grammar(num_symbols=3, num_alts=3):
    terminals = list(string.ascii_lowercase)
    grammar = {}
    name = None
    for _ in range(num_symbols):
        nonterminals = [k[1:-1] for k in grammar.keys()]
        name, expansions = \
            make_rule(nonterminals, terminals, num_alts)
        grammar[name] = expansions

    grammar['<start>'] = [name]

    # Remove unused parts
    for nonterminal in unreachable_nonterminals(grammar):
        del grammar[nonterminal]
        
    return grammar

In [26]:
canonical(make_grammar())

{'<DAZ>': [['jm'], ['w'], ['ws']],
 '<FZM>': [['<DAZ>', 't', '<DAZ>', 'p'], ['<DAZ>', 'be'], ['<DAZ>', 'a']],
 '<HZS>': [['<FZM>', 'ay'],
  ['<DAZ>', '<DAZ>', '<DAZ>'],
  ['kd', '<DAZ>', '<DAZ>']],
 '<start>': [['<HZS>']]}

### A CSS grammar

In [27]:
css_grammar = {
    '<start>': [['<stylesheet>']],
    '<stylesheet>': [[
        '<[CHARSET_SYM_STRING_SEMI]-1>', ' ', '<[S_OR_CDO_OR_CDC]-1>', ' ',
        '<[import_CDO_S_OR_CDC_S]-1>', ' ', '<[stylesheet_closing_GROUPING]-1>'
    ]],
    '<[CHARSET_SYM_STRING_SEMI]>': [['<CHARSET_SYM>', ' ', '<STRING>', ' ;']],
    '<[S_OR_CDO_OR_CDC]>': [['<Sp>'], ['<CDO>'], ['<CDC>']],
    '<[import_CDO_S_OR_CDC_S]>': [['<import>', ' ', '<[CDO_S_OR_CDC_S]-1>']],
    '<[CDO_S_OR_CDC_S]>': [['<CDO>', ' ', '<Ss>'], ['<CDC>', ' ', '<Ss>']],
    '<[ruleset_OR_media_OR_page]>': [['<ruleset>'], ['<media>'], ['<page>']],
    '<[stylesheet_closing_GROUPING]>':
    [['<[ruleset_OR_media_OR_page]>', ' ', '<[CDO_S_OR_CDC_S]-2>']],
    '<import>': [[
        '<IMPORT_SYM>', ' ', '<Ss>', ' ', '<[STRING_OR_URI]>', ' ', '<Ss>',
        ' ', '<media_list-1>', ' ; ', '<Ss>'
    ]],
    '<[STRING_OR_URI]>': [['<STRING>'], ['<URI>']],
    '<media>': [[
        '<MEDIA_SYM>', ' ', '<Ss>', ' ', '<media_list>', ' { ', '<Ss>', ' ',
        '<ruleset-1>', ' } ', '<Ss>'
    ]],
    '<media_list>': [['<medium>', ' ', '<[COMMA_S_medium]-1>']],
    '<[COMMA_S_medium]>': [[', ', '<Ss>', ' ', '<medium>']],
    '<medium>': [['<IDENT>', ' ', '<Ss>']],
    '<page>': [[
        '<PAGE_SYM>', ' ', '<Ss>', ' ', '<pseudo_page-1>', ' { ', '<Ss>', ' ',
        '<declaration-1>', ' ', '<[SEMI_S_declaration]-1>', ' } ', '<Ss>'
    ]],
    '<[SEMI_S_declaration]>': [['; ', '<Ss>', ' ', '<declaration-2>']],
    '<pseudo_page>': [[': ', '<IDENT>', ' ', '<Ss>']],
    '<operator>': [['/ ', '<Ss>'], [', ', '<Ss>']],
    '<combinator>': [['+ ', '<Ss>'], ['> ', '<Ss>']],
    '<unary_operator>': [['-'], ['+']],
    '<property>': [['<IDENT>', ' ', '<Ss>']],
    '<ruleset>': [[
        '<selector>', ' ', '<COMMA_S_selector-1>', ' { ', '<Ss>', ' ',
        '<declaration-3>', ' ', '<[SEMI_S_declaration]-2>', ' } ', '<Ss>'
    ]],
    '<COMMA_S_selector>': [[', ', '<Ss>', ' selector']],
    '<selector>':
    [['<simple_selector>', ' ', '<[combinator_selector_OR_S]-1>']],
    '<[combinator_selector]>': [['<combinator-1>', ' ', '<selector>']],
    '<[combinator_selector_OR_S]>':
    [['<combinator>', ' ', '<selector>'],
     ['<Sp>', ' ', '<[combinator_selector]-1>']],
    '<simple_selector>':
    [['<element_name>', ' ', '<[HASH_OR_class_OR_attrib_OR_pseudo]-1>'],
     ['<[HASH_OR_class_OR_attrib_OR_pseudo]-2>']],
    '<[HASH_OR_class_OR_attrib_OR_pseudo]>': [['<HASH>'], ['<class>'],
                                              ['<attrib>'], ['<pseudo>']],
    '<class>': [['.', '<IDENT>']],
    '<element_name>': [['<IDENT>'], ['*']],
    '<attrib>': [[
        '[ ', '<Ss>', ' ', '<IDENT>', ' ', '<Ss>', ' ',
        '<[attrib_GROUPING]-1>', ' ]'
    ]],
    '<[EQUAL_OR_INCLUDES_OR_DASHMATCH]>': [['='], ['<INCLUDES>'],
                                           ['<DASHMATCH>']],
    '<[IDENT_OR_STRING]>': [['<IDENT>'], ['<STRING>']],
    '<[attrib_GROUPING]>': [[
        '<[EQUAL_OR_INCLUDES_OR_DASHMATCH]>', ' ', '<Ss>', ' ',
        '<[IDENT_OR_STRING]>', ' ', '<Ss>'
    ]],
    '<pseudo>': [[': ', '<[IDENT_OR_FUNCTION]>']],
    '<[IDENT_OR_FUNCTION]>':
    [['<IDENT>'], ['<FUNCTION>', ' ', '<Ss>', ' ', '<[IDENT_S]-1>', ' )']],
    '<[IDENT_S]>': [['<IDENT>', ' ', '<Ss>']],
    '<declaration>':
    [['<property>', ' : ', '<Ss>', ' ', '<expr>', ' ', '<prio-1>']],
    '<prio>': [['<IMPORT_SYM>', ' ', '<Ss>']],
    '<expr>': [['<term>', ' ', '<[operator_term]-1>']],
    '<[operator_term]>': [['<operator-1>', ' ', '<term>']],
    '<term>': [['<unary_operator-1>', ' ', '<[term_GROUPING]>'],
               ['<STRING>', ' ', '<Ss>'], ['<IDENT>', ' ', '<Ss>'],
               ['<URI>', ' ', '<Ss>'], ['<hexcolor>'], ['<function>']],
    '<[term_GROUPING]>': [['<NUMBER>', ' ', '<Ss>'],
                          ['<PERCENTAGE>', ' ', '<Ss>'],
                          ['<LENGTH>', ' ', '<Ss>'], ['<EMS>', ' ', '<Ss>'],
                          ['<EXS>', ' ', '<Ss>'], ['<ANGLE>', ' ', '<Ss>'],
                          ['<TIME>', ' ', '<Ss>'], ['<FREQ>', ' ', '<Ss>']],
    '<function>': [['<FUNCTION>', ' ', '<Ss>', ' ', '<expr>', ' ) ', '<Ss>']],
    '<hexcolor>': [['#', '<[three_char_HEX]>', ' ', '<Ss>'],
                   ['#', '<[six_char_HEX]>', ' ', '<Ss>']],
    '<[three_char_HEX]>': [['<HEX_CHAR>', '<HEX_CHAR>', '<HEX_CHAR>']],
    '<[six_char_HEX]>': [[
        '<HEX_CHAR>', '<HEX_CHAR>', '<HEX_CHAR>', '<HEX_CHAR>', '<HEX_CHAR>',
        '<HEX_CHAR>'
    ]],
    '<HEX_CHAR>': [['0'], ['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'],
                   ['8'], ['9'], ['a'], ['b'], ['c'], ['d'], ['e'], ['f']],
    '<Sp>': [['<WHITESPACE-1>']],
    '<Ss>': [['<WHITESPACE-2>']],
    '<So>': [[], ['<WHITESPACE>']],
    '<CDO>': [['<!--']],
    '<CDC>': [['-->']],
    '<INCLUDES>': [['~=']],
    '<DASHMATCH>': [['|=']],
    '<STRING>': [['<string1>'], ['<string2>']],
    '<IDENT>': [['<{ident}>']],
    '<HASH>': [['#', '<{name}>']],
    '<IMPORT_SYM>': [['@import']],
    '<PAGE_SYM>': [['@page']],
    '<MEDIA_SYM>': [['@media']],
    '<CHARSET_SYM>': [['@charset']],
    '<EMS>': [['<{num}>', 'em']],
    '<EXS>': [['<{num}>', 'ex']],
    '<LENGTH>': [['<{num}>', 'px'], ['<{num}>', 'cm'], ['<{num}>', 'mm'],
                 ['<{num}>', 'in'], ['<{num}>', 'pt'], ['<{num}>', 'pc']],
    '<ANGLE>': [['<{num}>', 'deg'], ['<{num}>', 'rad'], ['<{num}>', 'grad']],
    '<TIME>': [['<{num}>', 'ms'], ['<{num}>', 's']],
    '<FREQ>': [['<{num}>', 'hz'], ['<{num}>', 'khz']],
    '<PERCENTAGE>': [['<{num}>', '%']],
    '<NUMBER>': [['<{num}>']],
    '<URI>': [['url("', '<So>', '<url>', '<So>', '")'],
              ['url("', '<So>', '<STRING>', '<So>', '")']],
    '<url>': [['<url_-1>']],
    '<url_>': [['!'], ['#'], ['$'], ['%'], ['&'], ['*'], ['-'], ['~'],
               ['<escape>']],
    '<FUNCTION>': [['<{ident}>', '(']],
    '<string1>': [['"', '<qmychars1-1>', '"']],
    '<string2>': [["'", '<qmychars2-1>', "'"]],
    '<qnonl1>': [['7'], ['Q'], ['J'], ['@'], ['2'], ['g'], ['\t'], ['X'],
                 ['`'], ['G'], ['e'], ['['], ['?'], ['v'], ['$'], ['j'], ['K'],
                 ['d'], ['A'], ['n'], ['h'], ['l'], ['4'], ['D'], ['a'], ['#'],
                 ['f'], ['y'], ['B'], ['U'], ['P'], ['3'], ['O'], ['S'], [')'],
                 [' '], ['W'], ['o'], ['b'], ['|'], ['q'], ['L'], [']'], ['V'],
                 ['*'], ['z'], ['}'], ['6'], ['u'], ['^'], [','], ['N'], ['>'],
                 ['+'], ['Y'], ['t'], ['k'], ['!'], ['p'], ['Z'], ['E'], ['('],
                 ['\\'], ['<'], ['F'], ['%'], ['9'], ['0'], ['s'], [';'],
                 ['&'], ['C'], ['T'], ['r'], ['5'], ['R'], ["'"], ['_'], ['.'],
                 ['8'], ['H'], ['i'], ['/'], ['M'], ['~'], ['{'], [':'], ['c'],
                 ['I'], ['-'], ['\x0b'], ['1'], ['w'], ['x'], ['m'], ['=']],
    '<qnonl2>': [['7'], ['Q'], ['J'], ['@'], ['2'], ['g'], ['\t'], ['X'],
                 ['`'], ['G'], ['e'], ['['], ['?'], ['v'], ['$'], ['j'], ['K'],
                 ['d'], ['A'], ['n'], ['h'], ['l'], ['4'], ['D'], ['a'], ['#'],
                 ['f'], ['y'], ['B'], ['U'], ['P'], ['3'], ['O'], ['S'], [')'],
                 [' '], ['W'], ['o'], ['b'], ['|'], ['q'], ['L'], [']'], ['V'],
                 ['*'], ['z'], ['}'], ['6'], ['u'], ['^'], [','], ['N'], ['>'],
                 ['+'], ['Y'], ['t'], ['k'], ['!'], ['p'], ['Z'], ['E'], ['('],
                 ['\\'], ['<'], ['F'], ['%'], ['9'], ['0'], ['s'], [';'],
                 ['&'], ['C'], ['T'], ['r'], ['5'], ['"'], ['R'], ['_'], ['.'],
                 ['8'], ['H'], ['i'], ['/'], ['M'], ['~'], ['{'], [':'], ['c'],
                 ['I'], ['-'], ['\x0b'], ['1'], ['w'], ['x'], ['m'], ['=']],
    '<qmychars1>': [['<qnonl1>'], ['\\', '<nl>'], ['<escape>']],
    '<qmychars2>': [['<qnonl2>'], ['\\', '<nl>'], ['<escape>']],
    '<nl>': [['\r'], ['\n'], ['\x0c'], ['\r\n']],
    '<escape>': [['\\', '<echar>']],
    '<echar>': [['Q'], ['J'], ['@'], ['g'], ['\t'], ['X'], ['`'], ['G'], ['['],
                ['?'], ['v'], ['$'], ['j'], ['K'], ['A'], ['n'], ['h'], ['l'],
                ['D'], ['#'], ['y'], ['B'], ['U'], ['P'], ['O'], ['S'], [')'],
                [' '], ['W'], ['o'], ['|'], ['q'], ['L'], [']'], ['V'], ['*'],
                ['z'], ['}'], ['u'], ['^'], [','], ['N'], ['>'], ['+'], ['Y'],
                ['t'], ['k'], ['!'], ['p'], ['Z'], ['E'], ['('], ['\\'], ['<'],
                ['F'], ['%'], ['s'], [';'], ['&'], ['C'], ['T'], ['r'], ['"'],
                ['R'], ["'"], ['_'], ['.'], ['H'], ['i'], ['/'], ['M'], ['~'],
                ['{'], [':'], ['I'], ['-'], ['\x0b'], ['w'], ['x'], ['m'],
                ['=']],
    '<{ident}>': [['<minus-1>', '<nmstart>', '<nmchar-1>']],
    '<nmstart>': [['a'], ['b'], ['c'], ['d'], ['e'], ['f'], ['g'], ['h'],
                  ['i'], ['j'], ['k'], ['l'], ['m'], ['n'], ['o'], ['p'],
                  ['q'], ['r'], ['s'], ['t'], ['u'], ['v'], ['w'], ['x'],
                  ['y'], ['z'], ['<escape>'], ['_']],
    '<nmchar>': [['a'], ['b'], ['c'], ['d'], ['e'], ['f'], ['g'], ['h'], ['i'],
                 ['j'], ['k'], ['l'], ['m'], ['n'], ['o'], ['p'], ['q'], ['r'],
                 ['s'], ['t'], ['u'], ['v'], ['w'], ['x'], ['y'], ['z'], ['0'],
                 ['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'], ['8'], ['9'],
                 ['<escape>'], ['_'], ['-']],
    '<minus>': [['-']],
    '<{name}>': [['<nmchar-2>']],
    '<{num}>': [['<INTEGER>']],
    '<WHITESPACE>': [[' '], ['\t']],
    '<INTEGER>': [['<DIGIT>', '<INTEGER>'], ['<DIGIT>']],
    '<DIGIT>': [['0'], ['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'], ['8'],
                ['9']],
    '<[CHARSET_SYM_STRING_SEMI]-1>': [[], ['<[CHARSET_SYM_STRING_SEMI]>']],
    '<[S_OR_CDO_OR_CDC]-1>': [[],
                              ['<[S_OR_CDO_OR_CDC]>',
                               '<[S_OR_CDO_OR_CDC]-1>']],
    '<[import_CDO_S_OR_CDC_S]-1>':
    [[], ['<[import_CDO_S_OR_CDC_S]>', '<[import_CDO_S_OR_CDC_S]-1>']],
    '<[stylesheet_closing_GROUPING]-1>':
    [[],
     ['<[stylesheet_closing_GROUPING]>', '<[stylesheet_closing_GROUPING]-1>']],
    '<[CDO_S_OR_CDC_S]-1>': [[],
                             ['<[CDO_S_OR_CDC_S]>', '<[CDO_S_OR_CDC_S]-1>']],
    '<[CDO_S_OR_CDC_S]-2>': [[],
                             ['<[CDO_S_OR_CDC_S]>', '<[CDO_S_OR_CDC_S]-2>']],
    '<media_list-1>': [[], ['<media_list>']],
    '<ruleset-1>': [[], ['<ruleset>', '<ruleset-1>']],
    '<[COMMA_S_medium]-1>': [[],
                             ['<[COMMA_S_medium]>', '<[COMMA_S_medium]-1>']],
    '<pseudo_page-1>': [[], ['<pseudo_page>']],
    '<declaration-1>': [[], ['<declaration>']],
    '<[SEMI_S_declaration]-1>':
    [[], ['<[SEMI_S_declaration]>', '<[SEMI_S_declaration]-1>']],
    '<declaration-2>': [[], ['<declaration>']],
    '<COMMA_S_selector-1>': [[],
                             ['<COMMA_S_selector>', '<COMMA_S_selector-1>']],
    '<declaration-3>': [[], ['<declaration>']],
    '<[SEMI_S_declaration]-2>':
    [[], ['<[SEMI_S_declaration]>', '<[SEMI_S_declaration]-2>']],
    '<[combinator_selector_OR_S]-1>': [[], ['<[combinator_selector_OR_S]>']],
    '<combinator-1>': [[], ['<combinator>']],
    '<[combinator_selector]-1>': [[], ['<[combinator_selector]>']],
    '<[HASH_OR_class_OR_attrib_OR_pseudo]-1>':
    [[],
     [
         '<[HASH_OR_class_OR_attrib_OR_pseudo]>',
         '<[HASH_OR_class_OR_attrib_OR_pseudo]-1>'
     ]],
    '<[HASH_OR_class_OR_attrib_OR_pseudo]-2>':
    [['<[HASH_OR_class_OR_attrib_OR_pseudo]>'],
     [
         '<[HASH_OR_class_OR_attrib_OR_pseudo]>',
         '<[HASH_OR_class_OR_attrib_OR_pseudo]-2>'
     ]],
    '<[attrib_GROUPING]-1>': [[], ['<[attrib_GROUPING]>']],
    '<[IDENT_S]-1>': [[], ['<[IDENT_S]>']],
    '<prio-1>': [[], ['<prio>']],
    '<[operator_term]-1>': [[], ['<[operator_term]>', '<[operator_term]-1>']],
    '<operator-1>': [[], ['<operator>']],
    '<unary_operator-1>': [[], ['<unary_operator>']],
    '<WHITESPACE-1>': [['<WHITESPACE>'], ['<WHITESPACE>', '<WHITESPACE-1>']],
    '<WHITESPACE-2>': [[], ['<WHITESPACE>', '<WHITESPACE-2>']],
    '<url_-1>': [[], ['<url_>', '<url_-1>']],
    '<qmychars1-1>': [[], ['<qmychars1>', '<qmychars1-1>']],
    '<qmychars2-1>': [[], ['<qmychars2>', '<qmychars2-1>']],
    '<minus-1>': [[], ['<minus>']],
    '<nmchar-1>': [[], ['<nmchar>', '<nmchar-1>']],
    '<nmchar-2>': [['<nmchar>'], ['<nmchar>', '<nmchar-2>']]
}

### A JSON grammar

In [28]:
json_grammar = {
    '<start>': [['<json>']],
    '<json>': [['<element>']],
    '<element>': [['<ws>', '<value>', '<ws>']],
    '<value>': [['<object>'], ['<array>'], ['<string>'], ['<number>'],
                ['true'], ['false'],
                ['null']],
    '<object>': [['{', '<ws>', '}'], ['{', '<members>', '}']],
    '<members>': [['<member>', '<symbol-2>']],
    '<member>': [['<ws>', '<string>', '<ws>', ':', '<element>']],
    '<array>': [['[', '<ws>', ']'], ['[', '<elements>', ']']],
    '<elements>': [['<element>', '<symbol-1-1>']],
    '<string>': [['"', '<characters>', '"']],
    '<characters>': [['<character-1>']],
    '<character>': [['0'], ['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'],
                    ['8'], ['9'], ['a'], ['b'], ['c'], ['d'], ['e'], ['f'],
                    ['g'], ['h'], ['i'], ['j'], ['k'], ['l'], ['m'], ['n'],
                    ['o'], ['p'], ['q'], ['r'], ['s'], ['t'], ['u'], ['v'],
                    ['w'], ['x'], ['y'], ['z'], ['A'], ['B'], ['C'], ['D'],
                    ['E'], ['F'], ['G'], ['H'], ['I'], ['J'], ['K'], ['L'],
                    ['M'], ['N'], ['O'], ['P'], ['Q'], ['R'], ['S'], ['T'],
                    ['U'], ['V'], ['W'], ['X'], ['Y'], ['Z'], ['!'], ['#'],
                    ['$'], ['%'], ['&'], ['\''], ['('], [')'], ['*'], ['+'],
                    [','], ['-'], ['.'], ['/'], [':'], [';'], ['<'], ['='],
                    ['>'], ['?'], ['@'], ['['], [']'], ['^'], ['_'], ['`'],
                    ['{'], ['|'], ['}'], ['~'], [' '], ['<esc>']],
    '<esc>': [['\\','<escc>']],
    '<escc>': [['\\'],['b'],['f'], ['n'], ['r'],['t'],['"']],
    '<number>': [['<int>', '<frac>', '<exp>']],
    '<int>': [['<digit>'], ['<onenine>', '<digits>'], ['-', '<digits>'],
              ['-', '<onenine>', '<digits>']],
    '<digits>': [['<digit-1>']],
    '<digit>': [['0'], ['<onenine>']],
    '<onenine>': [['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'], ['8'],
                  ['9']],
    '<frac>': [[], ['.', '<digits>']],
    '<exp>': [[], ['E', '<sign>', '<digits>'], ['e', '<sign>', '<digits>']],
    '<sign>': [[], ['+'], ['-']],
    '<ws>': [['<sp1>', '<ws>'], []],
    '<sp1>': [[' '],['\n'],['\t'],['\r']],
    '<symbol>': [[',', '<members>']],
    '<symbol-1>': [[',', '<elements>']],
    '<symbol-2>': [[], ['<symbol>', '<symbol-2>']],
    '<symbol-1-1>': [[], ['<symbol-1>', '<symbol-1-1>']],
    '<character-1>': [[], ['<character>', '<character-1>']],
    '<digit-1>': [['<digit>'], ['<digit>', '<digit-1>']]
}

### An EXPR grammar

In [29]:
expr_grammar = {
    "<start>": [["<expr>"]],
    "<expr>": [["<term>", "+", "<expr>"], ["<term>", "-", "<expr>"],
               ["<term>"]],
    "<term>": [["<factor>", "*", "<term>"], ["<factor>", "/", "<term>"],
               ["<factor>"]],
    "<factor>": [["+", "<factor>"], ["-", "<factor>"], ["(", "<expr>", ")"],
                 ["<integer>", ".", "<integer>"], ["<integer>"]],
    "<integer>": [["<digit>", "<integer>"], ["<digit>"]],
    "<digit>": [["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"],
                ["9"]]
}

### An HTML grammar

In [30]:
# NOTE: HTML grammar requires precomputing key and rule recursion
html_grammar = {
    '<start>': [['<_l_>', '!DOCTYPE html', '<_r_>', '<html_document>']],
    '<_l_>': [['<']],
    '<_r_>': [['>']],
    '<_cl_>': [['</']],
    '<a_tag>':
    [['<_l_>', 'a', '<d>', '<_r_>', '<a_content-1>', '<_cl_>', 'a', '<_r_>']],
    '<a_content>': [['<heading>'], ['<text>']],
    '<abbr_tag>':
    [['<_l_>', 'abbr', '<d>', '<_r_>', '<text>', '<_cl_>', 'abbr', '<_r_>']],
    '<acronym_tag>': [[
        '<_l_>', 'acronym', '<d>', '<_r_>', '<text>', '<_cl_>', 'acronym',
        '<_r_>'
    ]],
    '<address_tag>': [[
        '<_l_>', 'address', '<d>', '<_r_>', '<address_content-1>', '<_cl_>',
        'address', '<_r_>'
    ]],
    '<address_content>': [['<p_tag>'], ['<text>']],
    '<applet_content>': [['<param-1>', '<body_content>']],
    '<area>': [['<_l_>', 'area', '<d>', '<_r_>']],
    '<applet_tag>': [[
        '<_l_>', 'applet', '<d>', '<_r_>', '<applet_content>', '<_cl_>',
        'applet', '<_r_>'
    ]],
    '<b_tag>':
    [['<_l_>', 'b', '<d>', '<_r_>', '<text>', '<_cl_>', 'b', '<_r_>']],
    '<basefont_tag>': [[
        '<_l_>', 'basefront', '<d>', '<_r_>', '<body_content>', '<_cl_>',
        'basefront', '<_r_>'
    ]],
    '<bdo_tag>':
    [['<_l_>', 'bdo', '<d>', '<_r_>', '<text>', '<_cl_>', 'bdo', '<_r_>']],
    '<big_tag>':
    [['<_l_>', 'big', '<d>', '<_r_>', '<text>', '<_cl_>', 'big', '<_r_>']],
    '<blink_tag>':
    [['<_l_>', 'blink', '<d>', '<_r_>', '<text>', '<_cl_>', 'blink', '<_r_>']],
    '<block>': [['<block_content-1>']],
    '<block_content>': [['<basefont_tag>'], ['<blockquote_tag>'],
                        ['<center_tag>'], ['<dir_tag>'], ['<div_tag>'],
                        ['<dl_tag>'], ['<form_tag>'], ['<listing_tag>'],
                        ['<menu_tag>'], ['<multicol_tag>'], ['<nobr_tag>'],
                        ['<ol_tag>'], ['<p_tag>'], ['<pre_tag>'],
                        ['<table_tag>'], ['<ul_tag>'], ['<xmp_tag>']],
    '<blockquote_tag>': [[
        '<_l_>', 'blockquote', '<d>', '<_r_>', '<body_content>', '<_cl_>',
        'blockquote', '<_r_>'
    ]],
    '<body_content>': [['<_l_>', 'bgsound', '<d>', '<_r_>'],
                       ['<_l_>', 'hr', '<_r_>'],
                       ['<address_tag>'], ['<block>'], ['<del_tag>'],
                       ['<heading>'], ['<ins_tag>'], ['<layer_tag>'],
                       ['<map_tag>'], ['<marquee_tag>'], ['<text>']],
    '<body_tag>': [[
        '<_l_>', 'body', '<d>', '<_r_>', '<body_content-1>', '<_cl_>', 'body',
        '<_r_>'
    ]],
    '<caption_tag>': [[
        '<_l_>', 'caption', '<d>', '<_r_>', '<body_content-2>', '<_cl_>',
        'caption', '<_r_>'
    ]],
    '<center_tag>': [[
        '<_l_>', 'center', '<d>', '<_r_>', '<body_content-3>', '<_cl_>',
        'center', '<_r_>'
    ]],
    '<cite_tag>':
    [['<_l_>', 'cite', '<d>', '<_r_>', '<text>', '<_cl_>', 'cite', '<_r_>']],
    '<code_tag>':
    [['<_l_>', 'code', '<d>', '<_r_>', '<text>', '<_cl_>', 'code', '<_r_>']],
    '<colgroup_content>': [['<_l_>', 'col', '<d>', '<_r_-1>']],
    '<colgroup_tag>':
    [['<_l_>', 'colgroup', '<d>', '<_r_>', '<colgroup_content>']],
    '<content_style>': [['<abbr_tag>'], ['<acronym_tag>'], ['<cite_tag>'],
                        ['<code_tag>'], ['<dfn_tag>'], ['<em_tag>'],
                        ['<kbd_tag>'], ['<q_tag>'], ['<strong_tag>'],
                        ['<var_tag>']],
    '<dd_tag>':
    [['<_l_>', 'dd', '<d>', '<_r_>', '<flow>', '<_cl_>', 'dd', '<_r_>']],
    '<del_tag>':
    [['<_l_>', 'del', '<d>', '<_r_>', '<flow>', '<_cl_>', 'del', '<_r_>']],
    '<dfn_tag>':
    [['<_l_>', 'dfn', '<d>', '<_r_>', '<text>', '<_cl_>', 'dfn', '<_r_>']],
    '<dir_tag>': [[
        '<_l_>', 'dir', '<d>', '<_r_>', '<li_tag-1>', '<_cl_>', 'dir', '<_r_>'
    ]],
    '<div_tag>': [[
        '<_l_>', 'div', '<d>', '<_r_>', '<body_content>', '<_cl_>', 'div',
        '<_r_>'
    ]],
    '<dl_content>': [['<dt_tag>', '<dd_tag>']],
    '<dl_tag>': [[
        '<_l_>', 'dl', '<d>', '<_r_>', '<dl_content-1>', '<_cl_>', 'dl',
        '<_r_>'
    ]],
    '<dt_tag>': [[
        '<_l_>', 'dt', '<d>', '<_r_>', '<text>', '<_cl_>', 'dt', '<_r_>'
    ]],
    '<em_tag>': [[
        '<_l_>', 'em', '<d>', '<_r_>', '<text>', '<_cl_>', 'em', '<_r_>'
    ]],
    '<fieldset_tag>': [[
        '<_l_>', 'fieldset', '<d>', '<_r_>', '<legend_tag-1>',
        '<form_content-1>', '<_cl_>', 'fieldset', '<_r_>'
    ]],
    '<flow>': [['<flow_content-1>']],
    '<flow_content>': [['<block>'], ['<text>']],
    '<font_tag>': [[
        '<_l_>', 'font', '<d>', '<_r_>', '<style_text>', '<_cl_>', 'font',
        '<_r_>'
    ]],
    '<form_content>': [['<_l_>', 'input', '<d>', '<_r_>'],
                       ['<_l_>', 'keygen', '<d>', '<_r_>'], ['<body_content>'],
                       ['<fieldset_tag>'], ['<label_tag>'], ['<select_tag>'],
                       ['<textarea_tag>']],
    '<form_tag>': [[
        '<_l_>', 'form', '<d>', '<_r_>', '<form_content-2>', '<_cl_>', 'form',
        '<_r_>'
    ]],
    '<frameset_content>': [['<_l_>', 'frame', '<d>', '<_r_>'],
                           ['<noframes_tag>']],
    '<frameset_tag>': [[
        '<_l_>', 'frameset', '<d>', '<_r_>', '<frameset_content-1>', '<_cl_>',
        'frameset', '<_r_>'
    ]],
    '<h1_tag>': [[
        '<_l_>', 'h1', '<d>', '<_r_>', '<text>', '<_cl_>', 'h1', '<_r_>'
    ]],
    '<h2_tag>': [[
        '<_l_>', 'h2', '<d>', '<_r_>', '<text>', '<_cl_>', 'h2', '<_r_>'
    ]],
    '<h3_tag>': [[
        '<_l_>', 'h3', '<d>', '<_r_>', '<text>', '<_cl_>', 'h3', '<_r_>'
    ]],
    '<h4_tag>': [[
        '<_l_>', 'h4', '<d>', '<_r_>', '<text>', '<_cl_>', 'h4', '<_r_>'
    ]],
    '<h5_tag>': [[
        '<_l_>', 'h5', '<d>', '<_r_>', '<text>', '<_cl_>', 'h5', '<_r_>'
    ]],
    '<h6_tag>': [[
        '<_l_>', 'h6', '<d>', '<_r_>', '<text>', '<_cl_>', 'h6', '<_r_>'
    ]],
    '<head_content>': [['<_l_>', 'base', '<d>', '<_r_>'],
                       ['<_l_>', 'link', '<d>', '<_r_>'],
                       ['<_l_>', 'meta', '<d>', '<_r_>'], ['<style_tag>'],
                       ['<title_tag>'], ['<script_tag>']],
    '<head_tag>': [[
        '<_l_>', 'head', '<d>', '<_r_>', '<head_content-1>', '<_cl_>', 'head',
        '<_r_>'
    ]],
    '<heading>': [['<h1_tag>'], ['<h2_tag>'], ['<h3_tag>'], ['<h4_tag>'],
                  ['<h5_tag>'], ['<h6_tag>']],
    '<html_content>': [['<head_tag>', '<body_tag>'],
                       ['<head_tag>', '<frameset_tag>']],
    '<html_document>': [['<html_tag>']],
    '<html_tag>': [[
        '<_l_>', 'html', '<_r_>', '<html_content>', '<_cl_>', 'html', '<_r_>'
    ]],
    '<i_tag>': [[
        '<_l_>', 'i', '<d>', '<_r_>', '<text>', '<_cl_>', 'i', '<_r_>'
    ]],
    '<ilayer_tag>': [[
        '<_l_>', 'ilayer', '<d>', '<_r_>', '<body_content>', '<_cl_>',
        'ilayer', '<_r_>'
    ]],
    '<ins_tag>': [[
        '<_l_>', 'ins', '<d>', '<_r_>', '<flow>', '<_cl_>', 'ins', '<_r_>'
    ]],
    '<kbd_tag>': [[
        '<_l_>', 'kbd', '<d>', '<_r_>', '<text>', '<_cl_>', 'kbd', '<_r_>'
    ]],
    '<label_content>': [['<_l_>', 'input', '<d>', '<_r_>'], ['<body_content>'],
                        ['<select_tag>'], ['<textarea_tag>']],
    '<label_tag>': [[
        '<_l_>', 'label', '<d>', '<_r_>', '<label_content-1>', '<_cl_>',
        'label', '<_r_>'
    ]],
    '<layer_tag>': [[
        '<_l_>', 'layer', '<d>', '<_r_>', '<body_content>', '<_cl_>', 'layer',
        '<_r_>'
    ]],
    '<legend_tag>': [[
        '<_l_>', 'legend', '<d>', '<_r_>', '<text>', '<_cl_>', 'legend',
        '<_r_>'
    ]],
    '<li_tag>': [[
        '<_l_>', 'li', '<d>', '<_r_>', '<flow>', '<_cl_>', 'li', '<_r_>'
    ]],
    '<literal_text>': [['<plain_text>']],
    '<listing_tag>': [[
        '<_l_>', 'listing', '<d>', '<_r_>', '<literal_text>', '<_cl_>',
        'listing', '<_r_>'
    ]],
    '<map_content>': [['<area-1>']],
    '<map_tag>': [[
        '<_l_>', 'map', '<d>', '<_r_>', '<map_content>', '<_cl_>', 'map',
        '<_r_>'
    ]],
    '<marquee_tag>': [[
        '<_l_>', 'marquee', '<d>', '<_r_>', '<style_text>', '<_cl_>',
        'marquee', '<_r_>'
    ]],
    '<menu_tag>': [[
        '<_l_>', 'menu', '<d>', '<_r_>', '<li_tag-2>', '<_cl_>', 'menu',
        '<_r_>'
    ]],
    '<multicol_tag>': [[
        '<_l_>', 'multicol', '<d>', '<_r_>', '<body_content>', '<_cl_>',
        'multicol', '<_r_>'
    ]],
    '<nobr_tag>': [[
        '<_l_>', 'nobr', '<d>', '<_r_>', '<text>', '<_cl_>', 'nobr', '<_r_>'
    ]],
    '<noembed_tag>': [[
        '<_l_>', 'noembed', '<d>', '<_r_>', '<text>', '<_cl_>', 'noembed',
        '<_r_>'
    ]],
    '<noframes_tag>': [[
        '<_l_>', 'noframes', '<d>', '<_r_>', '<body_content-4>', '<_cl_>',
        'noframes', '<_r_>'
    ]],
    '<noscript_tag>': [[
        '<_l_>', 'noscript', '<d>', '<_r_>', '<text>', '<_cl_>', 'noscript',
        '<_r_>'
    ]],
    '<object_content>': [['<param-2>', '<body_content>']],
    '<object_tag>': [[
        '<_l_>', 'object', '<d>', '<_r_>', '<object_content>', '<_cl_>',
        'object', '<_r_>'
    ]],
    '<ol_tag>': [[
        '<_l_>', 'ol', '<d>', '<_r_>', '<li_tag-3>', '<_cl_>', 'ol', '<_r_>'
    ]],
    '<optgroup_tag>': [[
        '<_l_>', 'optgroup', '<d>', '<_r_>', '<option_tag-1>', '<_cl_>',
        'optgroup', '<_r_>'
    ]],
    '<option_tag>': [[
        '<_l_>', 'option', '<d>', '<_r_>', '<plain_text-1>', '<_cl_>',
        'option', '<_r_>'
    ]],
    '<p_tag>': [['<_l_>', 'p', '<_r_>', '<text>', '<_cl_>', 'p', '<_r_>']],
    '<param>': [['<_l_>', 'param', '<_r_>']],
    '<plain_text>': [['<entity-1>']],
    '<entity>': [['<char>'], ['<ampersand>']],
    '<char>': [['7'], ['*'], [':'], [']'], ['n'], ['m'], ['N'], ['/'], ['.'],
               ['K'], ['T'], ['I'], ['f'], ['o'], [','], ['l'], ['W'], ['-'],
               ['?'], ['\\'], ['%'], ['1'], ['c'], ['H'], ['!'], ['A'], ['$'],
               ['9'], ['q'], ['['], [')'], [' '], [';'], ['b'], ['i'], ['L'],
               ["'"], ['Y'], ['\t'], ['3'], ['g'], ['F'], ['E'], ['D'], ['C'],
               ['@'], ['t'], ['R'], ['"'], ['2'], ['}'], ['~'], ['5'], ['4'],
               ['z'], ['X'], ['S'], ['O'], ['v'], ['J'], ['`'], ['B'], ['\n'],
               ['y'], ['p'], ['6'], ['0'], ['k'], ['w'], ['\r'], ['V'], ['_'],
               ['s'], ['x'], ['{'], ['d'], ['a'], ['#'], ['Q'], ['<'], ['u'],
               ['r'], ['U'], ['h'], ['>'], ['('], ['P'], ['G'], ['\x0c'],
               ['Z'], ['j'], ['|'], ['e'], ['^'], ['='], ['8'], ['+'], ['M']],
    '<ampersand>': [['&nbsp;']],
    '<physical_style>': [['<b_tag>'], ['<bdo_tag>'], ['<big_tag>'],
                         ['<blink_tag>'], ['<font_tag>'], ['<i_tag>'],
                         ['<s_tag>'], ['<small_tag>'], ['<span_tag>'],
                         ['<strike_tag>'], ['<sub_tag>'], ['<sup_tag>'],
                         ['<tt_tag>'], ['<u_tag>']],
    '<pre_content>': [['<_l_>', 'br', '<_r_>'], ['<_l_>', 'hr', '<_r_>'],
                      ['<a_tag>'], ['<style_text>']],
    '<pre_tag>': [[
        '<_l_>', 'pre', '<_r_>', '<pre_content-1>', '<_cl_>', 'pre', '<_r_>'
    ]],
    '<q_tag>': [['<_l_>', 'q', '<_r_>', '<text>', '<_cl_>', 'q', '<_r_>']],
    '<s_tag>': [['<_l_>', 's', '<_r_>', '<text>', '<_cl_>', 's', '<_r_>']],
    '<script_tag>': [[
        '<_l_>', 'script', '<d>', '<_r_>', '<plain_text>', '<_cl_>', 'script',
        '<_r_>'
    ]],
    '<select_content>': [['<optgroup_tag>'], ['<option_tag>']],
    '<select_tag>': [[
        '<_l_>', 'select', '<d>', '<_r_>', '<select_content-1>', '<_cl_>',
        'select', '<_r_>'
    ]],
    '<small_tag>': [[
        '<_l_>', 'small', '<d>', '<_r_>', '<text>', '<_cl_>', 'small', '<_r_>'
    ]],
    '<span_tag>': [[
        '<_l_>', 'span', '<d>', '<_r_>', '<text>', '<_cl_>', 'span', '<_r_>'
    ]],
    '<strike_tag>': [[
        '<_l_>', 'strike', '<d>', '<_r_>', '<text>', '<_cl_>', 'strike',
        '<_r_>'
    ]],
    '<strong_tag>': [[
        '<_l_>', 'strong', '<d>', '<_r_>', '<text>', '<_cl_>', 'strong',
        '<_r_>'
    ]],
    '<style_tag>': [[
        '<_l_>', 'style', '<d>', '<_r_>', '<plain_text>', '<_cl_>', 'style',
        '<_r_>'
    ]],
    '<style_text>': [['<plain_text>']],
    '<sub_tag>': [[
        '<_l_>', 'sub', '<d>', '<_r_>', '<text>', '<_cl_>', 'sub', '<_r_>'
    ]],
    '<sup_tag>': [[
        '<_l_>', 'sup', '<d>', '<_r_>', '<text>', '<_cl_>', 'sup', '<_r_>'
    ]],
    '<table_cell>': [['<td_tag>'], ['<th_tag>']],
    '<table_content>': [['<_l_>', 'tbody', '<d>', '<_r_>'],
                        ['<_l_>', 'tfoot', '<d>', '<_r_>'],
                        ['<_l_>', 'thead', '<d>', '<_r_>'], ['<tr_tag>']],
    '<table_tag>': [[
        '<_l_>', 'table', '<d>', '<_r_>', '<caption_tag-1>',
        '<colgroup_tag-1>', '<table_content-1>', '<_cl_>', 'table', '<_r_>'
    ]],
    '<td_tag>': [[
        '<_l_>', 'td', '<d>', '<_r_>', '<body_content>', '<_cl_>', 'td',
        '<_r_>'
    ]],
    '<text>': [['<text_content-1>']],
    '<text_content>': [['<_l_>', 'br', '<d>', '<_r_>'],
                       ['<_l_>', 'embed', '<d>', '<_r_>'],
                       ['<_l_>', 'iframe', '<d>', '<_r_>'],
                       ['<_l_>', 'img', '<d>', '<_r_>'],
                       ['<_l_>', 'spacer', '<d>', '<_r_>'],
                       ['<_l_>', 'wbr', '<d>', '<_r_>'], ['<a_tag>'],
                       ['<applet_tag>'], ['<content_style>'], ['<ilayer_tag>'],
                       ['<noembed_tag>'], ['<noscript_tag>'], ['<object_tag>'],
                       ['<plain_text>'], ['<physical_style>']],
    '<textarea_tag>': [[
        '<_l_>', 'textarea', '<d>', '<_r_>', '<plain_text>', '<_cl_>',
        'textarea', '<_r_>'
    ]],
    '<th_tag>': [[
        '<_l_>', 'th', '<d>', '<_r_>', '<body_content>', '<_cl_>', 'th',
        '<_r_>'
    ]],
    '<title_tag>': [[
        '<_l_>', 'title', '<d>', '<_r_>', '<plain_text>', '<_cl_>', 'title',
        '<_r_>'
    ]],
    '<tr_tag>': [[
        '<_l_>', 'tr', '<d>', '<_r_>', '<table_cell-1>', '<_cl_>', 'tr',
        '<_r_>'
    ]],
    '<tt_tag>': [[
        '<_l_>', 'tt', '<d>', '<_r_>', '<text>', '<_cl_>', 'tt', '<_r_>'
    ]],
    '<u_tag>': [[
        '<_l_>', 'u', '<d>', '<_r_>', '<text>', '<_cl_>', 'u', '<_r_>'
    ]],
    '<ul_tag>': [[
        '<_l_>', 'ul', '<d>', '<_r_>', '<li_tag-4>', '<_cl_>', 'ul', '<_r_>'
    ]],
    '<var_tag>': [[
        '<_l_>', 'var', '<d>', '<_r_>', '<text>', '<_cl_>', 'var', '<_r_>'
    ]],
    '<xmp_tag>': [[
        '<_l_>', 'xmp', '<d>', '<_r_>', '<literal_text>', '<_cl_>', 'xmp',
        '<_r_>'
    ]],
    '<d>': [['<space-1>', '<attributes-1>', '<space-2>'], []],
    '<attribute>': [['<key>'], ['<key>', '="', '<value>', '"'],
                    ['<key>', "='", '<value>', "'"],
                    ['<key>', '=', '<uqvalue>']],
    '<key>': [['<allchars>']],
    '<allchars>': [
        ['7'], ['*'], [':'], ['&'], [']'], ['n'], ['m'], ['N'], ['.'], ['K'],
        ['T'], ['I'], ['f'], ['o'], [','], ['l'], ['W'], ['-'], ['?'], ['\\'],
        ['%'], ['1'], ['c'], ['H'], ['!'], ['A'], ['$'], ['9'], ['q'], ['['],
        [')'], [';'], ['b'], ['i'], ['L'], ['Y'], ['3'], ['g'], ['F'], ['E'],
        ['D'], ['C'], ['@'], ['t'], ['R'], ['2'], ['}'], ['~'], ['5'], ['4'],
        ['z'], ['X'], ['S'], ['O'], ['v'], ['J'], ['`'], ['B'], ['y'], ['p'],
        ['6'], ['0'], ['k'], ['w'], ['\r'], ['V'], ['_'], ['s'], ['x'], ['{'],
        ['d'], ['a'], ['#'], ['Q'], ['u'], ['r'], ['U'], ['h'], ['('], ['P'],
        ['G'], ['\x0c'], ['Z'], ['j'], ['|'], ['e'], ['^'], ['8'], ['+'],
        ['M']
    ],
    '<value>': [['<anychars>']],
    '<anychar>': [['0'], ['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'],
                  ['8'], ['9'], ['a'], ['b'], ['c'], ['d'], ['e'], ['f'
                                                                    ], ['g'],
                  ['h'], ['i'], ['j'], ['k'], ['l'], ['m'], ['n'], ['o'
                                                                    ], ['p'],
                  ['q'], ['r'], ['s'], ['t'], ['u'], ['v'], ['w'], ['x'
                                                                    ], ['y'],
                  ['z'], ['A'], ['B'], ['C'], ['D'], ['E'], ['F'], ['G'
                                                                    ], ['H'],
                  ['I'], ['J'], ['K'], ['L'], ['M'], ['N'], ['O'], ['P'
                                                                    ], ['Q'],
                  ['R'], ['S'], ['T'], ['U'], ['V'], ['W'], ['X'], ['Y'
                                                                    ], ['Z'],
                  ['!'], ['"'], ['#'], ['$'], ['%'], ['&'], ["'"], ['('
                                                                    ], [')'],
                  ['*'], ['+'], [','], ['-'], ['.'], ['/'], [':'], [';'
                                                                    ], ['<'],
                  ['='], ['>'], ['?'], ['@'], ['['], ['\\'], [']'], ['^'],
                  ['_'], ['`'], ['{'], ['|'], ['}'], ['~'], [' '], ['\t'],
                  ['\n'], ['\r'], ['\x0b'], ['\x0c']],
    '<anychars>': [['<anychar-1>']],
    '<uqvalue>': [['<uqchars>']],
    '<uqchar>': [['7'], ['*'], [':'], ['&'], [']'], ['n'], ['m'], ['N'], ['.'],
                 ['K'], ['T'], ['I'], ['f'], ['o'], [','], ['l'], ['W'], ['-'],
                 ['?'], ['\\'], ['%'], ['1'], ['c'], ['H'], ['!'], ['A'],
                 ['$'], ['9'], ['q'], ['['], [')'], [';'], ['b'], ['i'], ['L'],
                 ['Y'], ['3'], ['g'], ['F'], ['E'], ['D'], ['C'], ['@'], ['t'],
                 ['R'], ['2'], ['}'], ['~'], ['5'], ['4'], ['z'], ['X'], ['S'],
                 ['O'], ['v'], ['J'], ['B'], ['y'], ['p'], ['6'], ['0'], ['k'],
                 ['w'], ['\r'], ['V'], ['_'], ['s'], ['x'], ['{'], ['d'],
                 ['a'], ['#'], ['Q'], ['u'], ['r'], ['U'], ['h'], ['('], ['P'],
                 ['G'], ['\x0c'], ['Z'], ['j'], ['|'], ['e'], ['^'], ['8'],
                 ['+'], ['M']],
    '<uqchars>': [['<uqchar-1>']],
    '<attributes>': [['<attribute>'],
                     ['<attribute>', '<space-3>', '<attributes>']],
    '<space>': [[' '], ['\t'], ['\n']],
    '<a_content-1>': [[], ['<a_content>', '<a_content-1>']],
    '<address_content-1>': [[], ['<address_content>', '<address_content-1>']],
    '<param-1>': [[], ['<param>', '<param-1>']],
    '<block_content-1>': [[], ['<block_content>', '<block_content-1>']],
    '<body_content-1>': [[], ['<body_content>', '<body_content-1>']],
    '<body_content-2>': [[], ['<body_content>', '<body_content-2>']],
    '<body_content-3>': [[], ['<body_content>', '<body_content-3>']],
    '<_r_-1>': [[], ['<_r_>', '<_r_-1>']],
    '<li_tag-1>': [['<li_tag>'], ['<li_tag>', '<li_tag-1>']],
    '<dl_content-1>': [['<dl_content>'], ['<dl_content>', '<dl_content-1>']],
    '<legend_tag-1>': [[], ['<legend_tag>', '<legend_tag-1>']],
    '<form_content-1>': [[], ['<form_content>', '<form_content-1>']],
    '<flow_content-1>': [[], ['<flow_content>', '<flow_content-1>']],
    '<form_content-2>': [[], ['<form_content>', '<form_content-2>']],
    '<frameset_content-1>': [[],
                             ['<frameset_content>', '<frameset_content-1>']],
    '<head_content-1>': [[], ['<head_content>', '<head_content-1>']],
    '<label_content-1>': [[], ['<label_content>', '<label_content-1>']],
    '<area-1>': [[], ['<area>', '<area-1>']],
    '<li_tag-2>': [[], ['<li_tag>', '<li_tag-2>']],
    '<body_content-4>': [[], ['<body_content>', '<body_content-4>']],
    '<param-2>': [[], ['<param>', '<param-2>']],
    '<li_tag-3>': [['<li_tag>'], ['<li_tag>', '<li_tag-3>']],
    '<option_tag-1>': [[], ['<option_tag>', '<option_tag-1>']],
    '<plain_text-1>': [['<plain_text>'], ['<plain_text>', '<plain_text-1>']],
    '<entity-1>': [[], ['<entity>', '<entity-1>']],
    '<pre_content-1>': [[], ['<pre_content>', '<pre_content-1>']],
    '<select_content-1>': [[], ['<select_content>', '<select_content-1>']],
    '<caption_tag-1>': [[], ['<caption_tag>', '<caption_tag-1>']],
    '<colgroup_tag-1>': [[], ['<colgroup_tag>', '<colgroup_tag-1>']],
    '<table_content-1>': [[], ['<table_content>', '<table_content-1>']],
    '<text_content-1>': [[], ['<text_content>', '<text_content-1>']],
    '<table_cell-1>': [[], ['<table_cell>', '<table_cell-1>']],
    '<li_tag-4>': [[], ['<li_tag>', '<li_tag-4>']],
    '<space-1>': [['<space>'], ['<space>', '<space-1>']],
    '<attributes-1>': [[], ['<attributes>', '<attributes-1>']],
    '<space-2>': [[], ['<space>', '<space-2>']],
    '<anychar-1>': [[], ['<anychar>', '<anychar-1>']],
    '<uqchar-1>': [['<uqchar>'], ['<uqchar>', '<uqchar-1>']],
    
    '<space-3>': [['<space>'], ['<space>', '<space-3>']]
}

In [31]:
# Warning -- set to False if not HTML
IS_HTML = False

In [32]:
HTML_KEY_RECURSION = {'<start>': False,
 '<_l_>': False,
 '<_r_>': False,
 '<_cl_>': False,
 '<a_tag>': False,
 '<a_content>': False,
 '<abbr_tag>': False,
 '<acronym_tag>': False,
 '<address_tag>': False,
 '<address_content>': False,
 '<applet_content>': False,
 '<area>': False,
 '<applet_tag>': False,
 '<b_tag>': False,
 '<basefont_tag>': False,
 '<bdo_tag>': False,
 '<big_tag>': False,
 '<blink_tag>': False,
 '<block>': False,
 '<block_content>': False,
 '<blockquote_tag>': False,
 '<body_content>': False,
 '<body_tag>': False,
 '<caption_tag>': False,
 '<center_tag>': False,
 '<cite_tag>': False,
 '<code_tag>': False,
 '<colgroup_content>': False,
 '<colgroup_tag>': False,
 '<content_style>': False,
 '<dd_tag>': False,
 '<del_tag>': False,
 '<dfn_tag>': False,
 '<dir_tag>': False,
 '<div_tag>': False,
 '<dl_content>': False,
 '<dl_tag>': False,
 '<dt_tag>': False,
 '<em_tag>': False,
 '<fieldset_tag>': False,
 '<flow>': False,
 '<flow_content>': False,
 '<font_tag>': False,
 '<form_content>': False,
 '<form_tag>': False,
 '<frameset_content>': False,
 '<frameset_tag>': False,
 '<h1_tag>': False,
 '<h2_tag>': False,
 '<h3_tag>': False,
 '<h4_tag>': False,
 '<h5_tag>': False,
 '<h6_tag>': False,
 '<head_content>': False,
 '<head_tag>': False,
 '<heading>': False,
 '<html_content>': False,
 '<html_document>': False,
 '<html_tag>': False,
 '<i_tag>': False,
 '<ilayer_tag>': False,
 '<ins_tag>': False,
 '<kbd_tag>': False,
 '<label_content>': False,
 '<label_tag>': False,
 '<layer_tag>': False,
 '<legend_tag>': False,
 '<li_tag>': False,
 '<literal_text>': False,
 '<listing_tag>': False,
 '<map_content>': False,
 '<map_tag>': False,
 '<marquee_tag>': False,
 '<menu_tag>': False,
 '<multicol_tag>': False,
 '<nobr_tag>': False,
 '<noembed_tag>': False,
 '<noframes_tag>': False,
 '<noscript_tag>': False,
 '<object_content>': False,
 '<object_tag>': False,
 '<ol_tag>': False,
 '<optgroup_tag>': False,
 '<option_tag>': False,
 '<p_tag>': False,
 '<param>': False,
 '<plain_text>': False,
 '<entity>': False,
 '<char>': False,
 '<ampersand>': False,
 '<physical_style>': False,
 '<pre_content>': False,
 '<pre_tag>': False,
 '<q_tag>': False,
 '<s_tag>': False,
 '<script_tag>': False,
 '<select_content>': False,
 '<select_tag>': False,
 '<small_tag>': False,
 '<span_tag>': False,
 '<strike_tag>': False,
 '<strong_tag>': False,
 '<style_tag>': False,
 '<style_text>': False,
 '<sub_tag>': False,
 '<sup_tag>': False,
 '<table_cell>': False,
 '<table_content>': False,
 '<table_tag>': False,
 '<td_tag>': False,
 '<text>': False,
 '<text_content>': False,
 '<textarea_tag>': False,
 '<th_tag>': False,
 '<title_tag>': False,
 '<tr_tag>': False,
 '<tt_tag>': False,
 '<u_tag>': False,
 '<ul_tag>': False,
 '<var_tag>': False,
 '<xmp_tag>': False,
 '<d>': False,
 '<attribute>': False,
 '<key>': False,
 '<allchars>': False,
 '<value>': False,
 '<anychar>': False,
 '<anychars>': False,
 '<uqvalue>': False,
 '<uqchar>': False,
 '<uqchars>': False,
 '<attributes>': True,
 '<space>': False,
 '<a_content_1>': True,
 '<address_content_1>': True,
 '<param_1>': True,
 '<block_content_1>': True,
 '<body_content_1>': True,
 '<body_content_2>': True,
 '<body_content_3>': True,
 '<_r__1>': True,
 '<li_tag_1>': True,
 '<dl_content_1>': True,
 '<legend_tag_1>': True,
 '<form_content_1>': True,
 '<flow_content_1>': True,
 '<form_content_2>': True,
 '<frameset_content_1>': True,
 '<head_content_1>': True,
 '<label_content_1>': True,
 '<area_1>': True,
 '<li_tag_2>': True,
 '<body_content_4>': True,
 '<param_2>': True,
 '<li_tag_3>': True,
 '<option_tag_1>': True,
 '<plain_text_1>': True,
 '<entity_1>': True,
 '<pre_content_1>': True,
 '<select_content_1>': True,
 '<caption_tag_1>': True,
 '<colgroup_tag_1>': True,
 '<table_content_1>': True,
 '<text_content_1>': True,
 '<table_cell_1>': True,
 '<li_tag_4>': True,
 '<space_1>': True,
 '<attributes_1>': True,
 '<space_2>': True,
 '<anychar_1>': True,
 '<uqchar_1>': True,
 '<space_3>': True}

HTML_RULE_RECURSION = {'gen_start_0': False,
 'gen__l__0': False,
 'gen__r__0': False,
 'gen__cl__0': False,
 'gen_a_tag_0': True,
 'gen_a_content_0': True,
 'gen_a_content_1': True,
 'gen_abbr_tag_0': True,
 'gen_acronym_tag_0': True,
 'gen_address_tag_0': True,
 'gen_address_content_0': True,
 'gen_address_content_1': True,
 'gen_applet_content_0': True,
 'gen_area_0': False,
 'gen_applet_tag_0': True,
 'gen_b_tag_0': True,
 'gen_basefont_tag_0': True,
 'gen_bdo_tag_0': True,
 'gen_big_tag_0': True,
 'gen_blink_tag_0': True,
 'gen_block_0': True,
 'gen_block_content_0': True,
 'gen_block_content_1': True,
 'gen_block_content_2': True,
 'gen_block_content_3': True,
 'gen_block_content_4': True,
 'gen_block_content_5': True,
 'gen_block_content_6': True,
 'gen_block_content_7': True,
 'gen_block_content_8': False,
 'gen_block_content_9': False,
 'gen_block_content_10': True,
 'gen_block_content_11': True,
 'gen_block_content_12': True,
 'gen_block_content_13': True,
 'gen_block_content_14': True,
 'gen_block_content_15': True,
 'gen_block_content_16': True,
 'gen_blockquote_tag_0': True,
 'gen_body_content_0': False,
 'gen_body_content_1': False,
 'gen_body_content_2': True,
 'gen_body_content_3': True,
 'gen_body_content_4': True,
 'gen_body_content_5': True,
 'gen_body_content_6': True,
 'gen_body_content_7': False,
 'gen_body_content_8': False,
 'gen_body_content_9': True,
 'gen_body_content_10': True,
 'gen_body_tag_0': False,
 'gen_caption_tag_0': True,
 'gen_center_tag_0': True,
 'gen_cite_tag_0': True,
 'gen_code_tag_0': True,
 'gen_colgroup_content_0': False,
 'gen_colgroup_tag_0': False,
 'gen_content_style_0': True,
 'gen_content_style_1': True,
 'gen_content_style_2': True,
 'gen_content_style_3': True,
 'gen_content_style_4': True,
 'gen_content_style_5': True,
 'gen_content_style_6': True,
 'gen_content_style_7': True,
 'gen_content_style_8': True,
 'gen_content_style_9': True,
 'gen_dd_tag_0': True,
 'gen_del_tag_0': True,
 'gen_dfn_tag_0': True,
 'gen_dir_tag_0': True,
 'gen_div_tag_0': True,
 'gen_dl_content_0': True,
 'gen_dl_tag_0': True,
 'gen_dt_tag_0': True,
 'gen_em_tag_0': True,
 'gen_fieldset_tag_0': True,
 'gen_flow_0': True,
 'gen_flow_content_0': True,
 'gen_flow_content_1': True,
 'gen_font_tag_0': False,
 'gen_form_content_0': False,
 'gen_form_content_1': False,
 'gen_form_content_2': True,
 'gen_form_content_3': True,
 'gen_form_content_4': True,
 'gen_form_content_5': False,
 'gen_form_content_6': False,
 'gen_form_tag_0': True,
 'gen_frameset_content_0': False,
 'gen_frameset_content_1': False,
 'gen_frameset_tag_0': False,
 'gen_h1_tag_0': True,
 'gen_h2_tag_0': True,
 'gen_h3_tag_0': True,
 'gen_h4_tag_0': True,
 'gen_h5_tag_0': True,
 'gen_h6_tag_0': True,
 'gen_head_content_0': False,
 'gen_head_content_1': False,
 'gen_head_content_2': False,
 'gen_head_content_3': False,
 'gen_head_content_4': False,
 'gen_head_content_5': False,
 'gen_head_tag_0': False,
 'gen_heading_0': True,
 'gen_heading_1': True,
 'gen_heading_2': True,
 'gen_heading_3': True,
 'gen_heading_4': True,
 'gen_heading_5': True,
 'gen_html_content_0': False,
 'gen_html_content_1': False,
 'gen_html_document_0': False,
 'gen_html_tag_0': False,
 'gen_i_tag_0': True,
 'gen_ilayer_tag_0': True,
 'gen_ins_tag_0': True,
 'gen_kbd_tag_0': True,
 'gen_label_content_0': False,
 'gen_label_content_1': True,
 'gen_label_content_2': False,
 'gen_label_content_3': False,
 'gen_label_tag_0': True,
 'gen_layer_tag_0': True,
 'gen_legend_tag_0': True,
 'gen_li_tag_0': True,
 'gen_literal_text_0': False,
 'gen_listing_tag_0': False,
 'gen_map_content_0': False,
 'gen_map_tag_0': False,
 'gen_marquee_tag_0': False,
 'gen_menu_tag_0': True,
 'gen_multicol_tag_0': True,
 'gen_nobr_tag_0': True,
 'gen_noembed_tag_0': True,
 'gen_noframes_tag_0': False,
 'gen_noscript_tag_0': True,
 'gen_object_content_0': True,
 'gen_object_tag_0': True,
 'gen_ol_tag_0': True,
 'gen_optgroup_tag_0': False,
 'gen_option_tag_0': False,
 'gen_p_tag_0': True,
 'gen_param_0': False,
 'gen_plain_text_0': False,
 'gen_entity_0': False,
 'gen_entity_1': False,
 'gen_char_0': False,
 'gen_char_1': False,
 'gen_char_2': False,
 'gen_char_3': False,
 'gen_char_4': False,
 'gen_char_5': False,
 'gen_char_6': False,
 'gen_char_7': False,
 'gen_char_8': False,
 'gen_char_9': False,
 'gen_char_10': False,
 'gen_char_11': False,
 'gen_char_12': False,
 'gen_char_13': False,
 'gen_char_14': False,
 'gen_char_15': False,
 'gen_char_16': False,
 'gen_char_17': False,
 'gen_char_18': False,
 'gen_char_19': False,
 'gen_char_20': False,
 'gen_char_21': False,
 'gen_char_22': False,
 'gen_char_23': False,
 'gen_char_24': False,
 'gen_char_25': False,
 'gen_char_26': False,
 'gen_char_27': False,
 'gen_char_28': False,
 'gen_char_29': False,
 'gen_char_30': False,
 'gen_char_31': False,
 'gen_char_32': False,
 'gen_char_33': False,
 'gen_char_34': False,
 'gen_char_35': False,
 'gen_char_36': False,
 'gen_char_37': False,
 'gen_char_38': False,
 'gen_char_39': False,
 'gen_char_40': False,
 'gen_char_41': False,
 'gen_char_42': False,
 'gen_char_43': False,
 'gen_char_44': False,
 'gen_char_45': False,
 'gen_char_46': False,
 'gen_char_47': False,
 'gen_char_48': False,
 'gen_char_49': False,
 'gen_char_50': False,
 'gen_char_51': False,
 'gen_char_52': False,
 'gen_char_53': False,
 'gen_char_54': False,
 'gen_char_55': False,
 'gen_char_56': False,
 'gen_char_57': False,
 'gen_char_58': False,
 'gen_char_59': False,
 'gen_char_60': False,
 'gen_char_61': False,
 'gen_char_62': False,
 'gen_char_63': False,
 'gen_char_64': False,
 'gen_char_65': False,
 'gen_char_66': False,
 'gen_char_67': False,
 'gen_char_68': False,
 'gen_char_69': False,
 'gen_char_70': False,
 'gen_char_71': False,
 'gen_char_72': False,
 'gen_char_73': False,
 'gen_char_74': False,
 'gen_char_75': False,
 'gen_char_76': False,
 'gen_char_77': False,
 'gen_char_78': False,
 'gen_char_79': False,
 'gen_char_80': False,
 'gen_char_81': False,
 'gen_char_82': False,
 'gen_char_83': False,
 'gen_char_84': False,
 'gen_char_85': False,
 'gen_char_86': False,
 'gen_char_87': False,
 'gen_char_88': False,
 'gen_char_89': False,
 'gen_char_90': False,
 'gen_char_91': False,
 'gen_char_92': False,
 'gen_char_93': False,
 'gen_char_94': False,
 'gen_char_95': False,
 'gen_char_96': False,
 'gen_char_97': False,
 'gen_ampersand_0': False,
 'gen_physical_style_0': False,
 'gen_physical_style_1': True,
 'gen_physical_style_2': True,
 'gen_physical_style_3': True,
 'gen_physical_style_4': True,
 'gen_physical_style_5': True,
 'gen_physical_style_6': True,
 'gen_physical_style_7': True,
 'gen_physical_style_8': True,
 'gen_physical_style_9': True,
 'gen_physical_style_10': True,
 'gen_physical_style_11': True,
 'gen_physical_style_12': True,
 'gen_physical_style_13': True,
 'gen_pre_content_0': False,
 'gen_pre_content_1': False,
 'gen_pre_content_2': True,
 'gen_pre_content_3': False,
 'gen_pre_tag_0': True,
 'gen_q_tag_0': True,
 'gen_s_tag_0': True,
 'gen_script_tag_0': False,
 'gen_select_content_0': False,
 'gen_select_content_1': False,
 'gen_select_tag_0': False,
 'gen_small_tag_0': True,
 'gen_span_tag_0': True,
 'gen_strike_tag_0': True,
 'gen_strong_tag_0': True,
 'gen_style_tag_0': False,
 'gen_style_text_0': False,
 'gen_sub_tag_0': True,
 'gen_sup_tag_0': True,
 'gen_table_cell_0': True,
 'gen_table_cell_1': True,
 'gen_table_content_0': False,
 'gen_table_content_1': False,
 'gen_table_content_2': False,
 'gen_table_content_3': True,
 'gen_table_tag_0': True,
 'gen_td_tag_0': True,
 'gen_text_0': True,
 'gen_text_content_0': False,
 'gen_text_content_1': False,
 'gen_text_content_2': False,
 'gen_text_content_3': False,
 'gen_text_content_4': False,
 'gen_text_content_5': False,
 'gen_text_content_6': True,
 'gen_text_content_7': False,
 'gen_text_content_8': True,
 'gen_text_content_9': True,
 'gen_text_content_10': True,
 'gen_text_content_11': True,
 'gen_text_content_12': True,
 'gen_text_content_13': True,
 'gen_text_content_14': True,
 'gen_textarea_tag_0': False,
 'gen_th_tag_0': True,
 'gen_title_tag_0': False,
 'gen_tr_tag_0': True,
 'gen_tt_tag_0': True,
 'gen_u_tag_0': True,
 'gen_ul_tag_0': True,
 'gen_var_tag_0': True,
 'gen_xmp_tag_0': False,
 'gen_d_0': False,
 'gen_d_1': False,
 'gen_attribute_0': False,
 'gen_attribute_1': False,
 'gen_attribute_2': False,
 'gen_attribute_3': False,
 'gen_key_0': False,
 'gen_allchars_0': False,
 'gen_allchars_1': False,
 'gen_allchars_2': False,
 'gen_allchars_3': False,
 'gen_allchars_4': False,
 'gen_allchars_5': False,
 'gen_allchars_6': False,
 'gen_allchars_7': False,
 'gen_allchars_8': False,
 'gen_allchars_9': False,
 'gen_allchars_10': False,
 'gen_allchars_11': False,
 'gen_allchars_12': False,
 'gen_allchars_13': False,
 'gen_allchars_14': False,
 'gen_allchars_15': False,
 'gen_allchars_16': False,
 'gen_allchars_17': False,
 'gen_allchars_18': False,
 'gen_allchars_19': False,
 'gen_allchars_20': False,
 'gen_allchars_21': False,
 'gen_allchars_22': False,
 'gen_allchars_23': False,
 'gen_allchars_24': False,
 'gen_allchars_25': False,
 'gen_allchars_26': False,
 'gen_allchars_27': False,
 'gen_allchars_28': False,
 'gen_allchars_29': False,
 'gen_allchars_30': False,
 'gen_allchars_31': False,
 'gen_allchars_32': False,
 'gen_allchars_33': False,
 'gen_allchars_34': False,
 'gen_allchars_35': False,
 'gen_allchars_36': False,
 'gen_allchars_37': False,
 'gen_allchars_38': False,
 'gen_allchars_39': False,
 'gen_allchars_40': False,
 'gen_allchars_41': False,
 'gen_allchars_42': False,
 'gen_allchars_43': False,
 'gen_allchars_44': False,
 'gen_allchars_45': False,
 'gen_allchars_46': False,
 'gen_allchars_47': False,
 'gen_allchars_48': False,
 'gen_allchars_49': False,
 'gen_allchars_50': False,
 'gen_allchars_51': False,
 'gen_allchars_52': False,
 'gen_allchars_53': False,
 'gen_allchars_54': False,
 'gen_allchars_55': False,
 'gen_allchars_56': False,
 'gen_allchars_57': False,
 'gen_allchars_58': False,
 'gen_allchars_59': False,
 'gen_allchars_60': False,
 'gen_allchars_61': False,
 'gen_allchars_62': False,
 'gen_allchars_63': False,
 'gen_allchars_64': False,
 'gen_allchars_65': False,
 'gen_allchars_66': False,
 'gen_allchars_67': False,
 'gen_allchars_68': False,
 'gen_allchars_69': False,
 'gen_allchars_70': False,
 'gen_allchars_71': False,
 'gen_allchars_72': False,
 'gen_allchars_73': False,
 'gen_allchars_74': False,
 'gen_allchars_75': False,
 'gen_allchars_76': False,
 'gen_allchars_77': False,
 'gen_allchars_78': False,
 'gen_allchars_79': False,
 'gen_allchars_80': False,
 'gen_allchars_81': False,
 'gen_allchars_82': False,
 'gen_allchars_83': False,
 'gen_allchars_84': False,
 'gen_allchars_85': False,
 'gen_allchars_86': False,
 'gen_allchars_87': False,
 'gen_allchars_88': False,
 'gen_allchars_89': False,
 'gen_value_0': False,
 'gen_anychar_0': False,
 'gen_anychar_1': False,
 'gen_anychar_2': False,
 'gen_anychar_3': False,
 'gen_anychar_4': False,
 'gen_anychar_5': False,
 'gen_anychar_6': False,
 'gen_anychar_7': False,
 'gen_anychar_8': False,
 'gen_anychar_9': False,
 'gen_anychar_10': False,
 'gen_anychar_11': False,
 'gen_anychar_12': False,
 'gen_anychar_13': False,
 'gen_anychar_14': False,
 'gen_anychar_15': False,
 'gen_anychar_16': False,
 'gen_anychar_17': False,
 'gen_anychar_18': False,
 'gen_anychar_19': False,
 'gen_anychar_20': False,
 'gen_anychar_21': False,
 'gen_anychar_22': False,
 'gen_anychar_23': False,
 'gen_anychar_24': False,
 'gen_anychar_25': False,
 'gen_anychar_26': False,
 'gen_anychar_27': False,
 'gen_anychar_28': False,
 'gen_anychar_29': False,
 'gen_anychar_30': False,
 'gen_anychar_31': False,
 'gen_anychar_32': False,
 'gen_anychar_33': False,
 'gen_anychar_34': False,
 'gen_anychar_35': False,
 'gen_anychar_36': False,
 'gen_anychar_37': False,
 'gen_anychar_38': False,
 'gen_anychar_39': False,
 'gen_anychar_40': False,
 'gen_anychar_41': False,
 'gen_anychar_42': False,
 'gen_anychar_43': False,
 'gen_anychar_44': False,
 'gen_anychar_45': False,
 'gen_anychar_46': False,
 'gen_anychar_47': False,
 'gen_anychar_48': False,
 'gen_anychar_49': False,
 'gen_anychar_50': False,
 'gen_anychar_51': False,
 'gen_anychar_52': False,
 'gen_anychar_53': False,
 'gen_anychar_54': False,
 'gen_anychar_55': False,
 'gen_anychar_56': False,
 'gen_anychar_57': False,
 'gen_anychar_58': False,
 'gen_anychar_59': False,
 'gen_anychar_60': False,
 'gen_anychar_61': False,
 'gen_anychar_62': False,
 'gen_anychar_63': False,
 'gen_anychar_64': False,
 'gen_anychar_65': False,
 'gen_anychar_66': False,
 'gen_anychar_67': False,
 'gen_anychar_68': False,
 'gen_anychar_69': False,
 'gen_anychar_70': False,
 'gen_anychar_71': False,
 'gen_anychar_72': False,
 'gen_anychar_73': False,
 'gen_anychar_74': False,
 'gen_anychar_75': False,
 'gen_anychar_76': False,
 'gen_anychar_77': False,
 'gen_anychar_78': False,
 'gen_anychar_79': False,
 'gen_anychar_80': False,
 'gen_anychar_81': False,
 'gen_anychar_82': False,
 'gen_anychar_83': False,
 'gen_anychar_84': False,
 'gen_anychar_85': False,
 'gen_anychar_86': False,
 'gen_anychar_87': False,
 'gen_anychar_88': False,
 'gen_anychar_89': False,
 'gen_anychar_90': False,
 'gen_anychar_91': False,
 'gen_anychar_92': False,
 'gen_anychar_93': False,
 'gen_anychar_94': False,
 'gen_anychar_95': False,
 'gen_anychar_96': False,
 'gen_anychar_97': False,
 'gen_anychar_98': False,
 'gen_anychar_99': False,
 'gen_anychars_0': False,
 'gen_uqvalue_0': False,
 'gen_uqchar_0': False,
 'gen_uqchar_1': False,
 'gen_uqchar_2': False,
 'gen_uqchar_3': False,
 'gen_uqchar_4': False,
 'gen_uqchar_5': False,
 'gen_uqchar_6': False,
 'gen_uqchar_7': False,
 'gen_uqchar_8': False,
 'gen_uqchar_9': False,
 'gen_uqchar_10': False,
 'gen_uqchar_11': False,
 'gen_uqchar_12': False,
 'gen_uqchar_13': False,
 'gen_uqchar_14': False,
 'gen_uqchar_15': False,
 'gen_uqchar_16': False,
 'gen_uqchar_17': False,
 'gen_uqchar_18': False,
 'gen_uqchar_19': False,
 'gen_uqchar_20': False,
 'gen_uqchar_21': False,
 'gen_uqchar_22': False,
 'gen_uqchar_23': False,
 'gen_uqchar_24': False,
 'gen_uqchar_25': False,
 'gen_uqchar_26': False,
 'gen_uqchar_27': False,
 'gen_uqchar_28': False,
 'gen_uqchar_29': False,
 'gen_uqchar_30': False,
 'gen_uqchar_31': False,
 'gen_uqchar_32': False,
 'gen_uqchar_33': False,
 'gen_uqchar_34': False,
 'gen_uqchar_35': False,
 'gen_uqchar_36': False,
 'gen_uqchar_37': False,
 'gen_uqchar_38': False,
 'gen_uqchar_39': False,
 'gen_uqchar_40': False,
 'gen_uqchar_41': False,
 'gen_uqchar_42': False,
 'gen_uqchar_43': False,
 'gen_uqchar_44': False,
 'gen_uqchar_45': False,
 'gen_uqchar_46': False,
 'gen_uqchar_47': False,
 'gen_uqchar_48': False,
 'gen_uqchar_49': False,
 'gen_uqchar_50': False,
 'gen_uqchar_51': False,
 'gen_uqchar_52': False,
 'gen_uqchar_53': False,
 'gen_uqchar_54': False,
 'gen_uqchar_55': False,
 'gen_uqchar_56': False,
 'gen_uqchar_57': False,
 'gen_uqchar_58': False,
 'gen_uqchar_59': False,
 'gen_uqchar_60': False,
 'gen_uqchar_61': False,
 'gen_uqchar_62': False,
 'gen_uqchar_63': False,
 'gen_uqchar_64': False,
 'gen_uqchar_65': False,
 'gen_uqchar_66': False,
 'gen_uqchar_67': False,
 'gen_uqchar_68': False,
 'gen_uqchar_69': False,
 'gen_uqchar_70': False,
 'gen_uqchar_71': False,
 'gen_uqchar_72': False,
 'gen_uqchar_73': False,
 'gen_uqchar_74': False,
 'gen_uqchar_75': False,
 'gen_uqchar_76': False,
 'gen_uqchar_77': False,
 'gen_uqchar_78': False,
 'gen_uqchar_79': False,
 'gen_uqchar_80': False,
 'gen_uqchar_81': False,
 'gen_uqchar_82': False,
 'gen_uqchar_83': False,
 'gen_uqchar_84': False,
 'gen_uqchar_85': False,
 'gen_uqchar_86': False,
 'gen_uqchar_87': False,
 'gen_uqchar_88': False,
 'gen_uqchars_0': False,
 'gen_attributes_0': False,
 'gen_attributes_1': True,
 'gen_space_0': False,
 'gen_space_1': False,
 'gen_space_2': False,
 'gen_a_content_1_0': False,
 'gen_a_content_1_1': True,
 'gen_address_content_1_0': False,
 'gen_address_content_1_1': True,
 'gen_param_1_0': False,
 'gen_param_1_1': True,
 'gen_block_content_1_0': False,
 'gen_block_content_1_1': True,
 'gen_body_content_1_0': False,
 'gen_body_content_1_1': True,
 'gen_body_content_2_0': False,
 'gen_body_content_2_1': True,
 'gen_body_content_3_0': False,
 'gen_body_content_3_1': True,
 'gen__r__1_0': False,
 'gen__r__1_1': True,
 'gen_li_tag_1_0': True,
 'gen_li_tag_1_1': True,
 'gen_dl_content_1_0': True,
 'gen_dl_content_1_1': True,
 'gen_legend_tag_1_0': False,
 'gen_legend_tag_1_1': True,
 'gen_form_content_1_0': False,
 'gen_form_content_1_1': True,
 'gen_flow_content_1_0': False,
 'gen_flow_content_1_1': True,
 'gen_form_content_2_0': False,
 'gen_form_content_2_1': True,
 'gen_frameset_content_1_0': False,
 'gen_frameset_content_1_1': True,
 'gen_head_content_1_0': False,
 'gen_head_content_1_1': True,
 'gen_label_content_1_0': False,
 'gen_label_content_1_1': True,
 'gen_area_1_0': False,
 'gen_area_1_1': True,
 'gen_li_tag_2_0': False,
 'gen_li_tag_2_1': True,
 'gen_body_content_4_0': False,
 'gen_body_content_4_1': True,
 'gen_param_2_0': False,
 'gen_param_2_1': True,
 'gen_li_tag_3_0': True,
 'gen_li_tag_3_1': True,
 'gen_option_tag_1_0': False,
 'gen_option_tag_1_1': True,
 'gen_plain_text_1_0': False,
 'gen_plain_text_1_1': True,
 'gen_entity_1_0': False,
 'gen_entity_1_1': True,
 'gen_pre_content_1_0': False,
 'gen_pre_content_1_1': True,
 'gen_select_content_1_0': False,
 'gen_select_content_1_1': True,
 'gen_caption_tag_1_0': False,
 'gen_caption_tag_1_1': True,
 'gen_colgroup_tag_1_0': False,
 'gen_colgroup_tag_1_1': True,
 'gen_table_content_1_0': False,
 'gen_table_content_1_1': True,
 'gen_text_content_1_0': False,
 'gen_text_content_1_1': True,
 'gen_table_cell_1_0': False,
 'gen_table_cell_1_1': True,
 'gen_li_tag_4_0': False,
 'gen_li_tag_4_1': True,
 'gen_space_1_0': False,
 'gen_space_1_1': True,
 'gen_attributes_1_0': False,
 'gen_attributes_1_1': True,
 'gen_space_2_0': False,
 'gen_space_2_1': True,
 'gen_anychar_1_0': False,
 'gen_anychar_1_1': True,
 'gen_uqchar_1_0': False,
 'gen_uqchar_1_1': True,
 'gen_space_3_0': False,
 'gen_space_3_1': True}

In [33]:
my_grammar = css_grammar

Maximum depth is 7; beyond that the size goes > 50G

## Existing tools
* Grammarinator (Python based)
* Gramfuzz (Python based)
* Dharma (Python based)

In [34]:
class Sanitize:
    def __init__(self, g):
        self.g = g
  
    def to_key(self, k):
        s = k.replace('-', '_')
        s = s.replace('[', 'Osq').replace(']','Csq')
        s = s.replace('{','Obr').replace('}','Cbr')
        s = s.replace('import','XimportX')
        s = s.replace('class', 'XclassX')
        s = s.replace('def', 'XdefX')
        return s

    def to_token(self, t):
        return t
    
    def split_tokens(self, t, grammar):
        if t in grammar: return [t]
        my_tokens = []
        # these should not matter for performance comparisons,
        # and makes my life simpler
        esc = {'\r': '\r', '\n': '\n',
             '\\': '\\',
             '"':'"',
             "'":"'"}
        for i in t:
            if i in esc:
                my_tokens.append(esc[i])
            else:
                my_tokens.append(i)
        return my_tokens
            
        return list(t)

    def to_rule(self, rule, grammar):
        tokens = [k for t in rule for k in self.split_tokens(t, grammar)]
        return [self.to_token(t) if t not in grammar else self.to_key(t)
                for t in tokens]

    def translate(self):
        new_grammar = {}
        for k in self.g:
            rules = self.g[k]
            new_grammar[self.to_key(k)] = [self.to_rule(rule, self.g) for rule in rules]
        return new_grammar

### Grammarinator

Be sure to install the latest from git. The --random-seed argument is only in git HEAD.

In [35]:
ipkg('grammarinator', 'git+https://github.com/renatahodovan/grammarinator.git')

grammarinator found


In [36]:
class AntlrG(Sanitize):
    def to_key(self, k):
        return super().to_key(k)[1:-1]

    def esc_token(self, t):
        # these are multi-char tokens
        t = t.replace('\\','\\\\')
        t = t.replace("'","\\\'")
        t = t.replace('\n','\\n')
        t = t.replace('\r','\\r')
        t = t.replace('\t','\\t')
        return t

    def rule_to_s(self, rule, grammar):
        return ' '.join(["'%s'" % self.esc_token(t)
                         if t not in grammar else self.to_key(t)
                         for t in rule])

    def translate(self):
        lines = ['grammar Grammar;']
        for k in self.g:
            rules = self.g[k]
            v = '\n    |'.join([self.rule_to_s(rule, self.g)
                                for rule in rules])
            lines.append('''\
%s
    : %s
    ;''' % (self.to_key(k), v))
        return '\n'.join(lines)

In [37]:
g4 = AntlrG(my_grammar).translate()

In [38]:
with open('testers/grammar.g4', 'w+') as f:
    print(g4, file=f)

In [39]:
!grammarinator-process testers/grammar.g4 -o testers

In [40]:
glexer = 'testers/GrammarUnlexer.py'
gparser = 'testers/GrammarUnparser.py'

In [41]:
!rm -rf tests

In [42]:
!grammarinator-generate -l testers/GrammarUnlexer.py -p testers/GrammarUnparser.py -r start -n 10 -o tests/ -j 1 --sys-recursion-limit 20900 -d 10 --random-seed 0

In [43]:
import pathlib

In [44]:
class GrammarinatorTester(Tester):
    def show_files(self, path):
        tests = pathlib.Path(path)
        for tf in tests.glob('*'):
            with open(tf) as f:
                print(repr(f.read()))

In [45]:
GrammarinatorTester().show_files('tests')

'@charset  ;  @import \t  \t   ;     --> --> <!--   -->   <!--  '
'    '
'   '
'   '
'@charset  ;  @import \t       ;   '
'    '
'@charset  ;  @import   \t  ;  @import   \t  ;  <!-- -->   <!-- '
'@charset  ; --> @import     \t  ;  --> @import  url("\t")   ; \t   --> -->  <!-- <!--   '
'  @import     ;  -->   @import   \t  ;  <!--    '
' <!--<!--<!--  @import      ; \t\t  '


In [46]:
class GrammarinatorTester(GrammarinatorTester):
    def folder_size(self, path='.'):
        def cksum(fn):
            v = !cksum {fn}
            return (v.fields(0)[0])
            #with open(fn) as f: return f.read()
        total = 0
        num = 0
        ufiles = set()
        for entry in os.scandir(path):
            if entry.is_file():
                total += entry.stat().st_size
                num += 1
                #ufiles.add(cksum(entry.path))
            elif entry.is_dir():
                raise Exception('Only flat directories expected now.') 
                # total += self.folder_size(entry.path)
        return total, num

In [47]:
GrammarinatorTester().folder_size('tests')

(335, 10)

In [48]:
class GrammarinatorTester(GrammarinatorTester):
    def init_run(self):
        super().init_run()
        !grammarinator-process testers/grammar.g4 -o testers/
        
    def pre_time(self):
        super().pre_time()
    
    def exec_program(self, seed, max_depth, t):
        glexer = 'testers/GrammarUnlexer.py'
        gparser = 'testers/GrammarUnparser.py'
        # seed, maxnum, max_depth
        return f"grammarinator-generate -l {glexer} -p {gparser}  --random-seed {seed} -n {self.max_num} -d {max_depth}  -r start  -o tests/ -j 1 --sys-recursion-limit 20900"

    def post_time(self):
        super().post_time()
        self.size, self.lines = self.folder_size(path='tests/')
        !rm -rf tests

In [49]:
GrammarinatorTester().run_test().show()

depth= 8 size= 283535.5 time= 4.965 stdev(0.08) throughput= 55.77911743717848 stdev(1)
depth= 16 size= 450632 time= 7.357 stdev(0.012) throughput= 59.81331993314168 stdev(1)
depth= 32 size= 454815 time= 7.081 stdev(0.177) throughput= 62.743195759392094 stdev(1)
depth= 64 size= 456442 time= 6.952 stdev(0.037) throughput= 64.11886914896179 stdev(1)
depth= 128 size= 457099.5 time= 7.054 stdev(0.003) throughput= 63.2813654472841 stdev(0)
depth= 256 size= 456886 time= 7.005 stdev(0.057) throughput= 63.69052207146876 stdev(0)
Throughput of  64.11886914896179  kilobytes per second at depth =  64
Total time: 0:02:47.586745


### GramFuzz

The fork at `vrthra-forks` contains a few fixes for python 2to3 which have not made to the main repo yet.

In [50]:
ipkg('gramfuzz', 'git+https://github.com/vrthra-forks/gramfuzz.git')

gramfuzz found


In [51]:
class RDefG(AntlrG):
    def to_key(self, k):
        return 'RRef("%s")' % super().to_key(k)
 
    def esc_token(self, t):
        # these are multi-char tokens
        t = t.replace('\\','\\\\')
        t = t.replace('"','\\\"') # main difference from Antlr -- Quote char.
        t = t.replace('\n','\\n')
        t = t.replace('\r','\\r')
        t = t.replace('\t','\\t')
        return t

    def rule_to_s(self, rule, grammar):
        if len(rule) == 0: return '""'
        s =['"%s"' % self.esc_token(t)
                         if t not in grammar else 'RRef("%s")' % t
                         for t in rule]
        if len(s) == 1: return s[0]
        return 'And(%s)' % ', '.join(s)

    def translate(self):
        lines = ['''\
from gramfuzz.fields import*
TOP_CAT = "grammar"
class RDef(Def): cat="grammar-def"
class RRef(Ref): cat="grammar-def"

# top-level rule
Def("grammar", RRef("<start>", cat="grammar-def") )
''']
        for key in self.g:
            rules = self.g[key]
            if len(rules) == 1:
                srules = self.rule_to_s(rules[0], self.g)
            else:
                srules = "Or(%s)" % ' ,'.join(sorted([self.rule_to_s(rule, self.g) for rule in rules], reverse=True))
            lines.append('''\
RDef("%(key)s",
%(rules)s
)
''' % {'key':key, 'rules':srules})
        return '\n'.join(lines)

In [52]:
roman_grammar = {
    '<start>' : [['<roman>']],
    '<roman>' : [['<hundreds>','<tens>','<units>']],
    '<hundreds>' : [['<lowhundreds>'],['CD'],['D','<lowhundreds>'],['CM']],
    '<lowhundreds>' : [[],['<lowhundreds>','C']],
    '<tens>' : [['<lowtens>'],['XL'], ['<lowtens>'], ['XC']],
    '<lowtens>' : [[],['<lowtens>','X']],
    '<units>' : [['<lowunits>'],['IV'], ['V', '<lowunits>'],['IX']],
    '<lowunits>' : [[],['<lowunits>', 'I']]
}

In [53]:
with open('testers/roman_rdef.py', 'w+') as f:
    print(RDefG(roman_grammar).translate(), file=f)
!cat testers/roman_rdef.py

from gramfuzz.fields import*
TOP_CAT = "grammar"
class RDef(Def): cat="grammar-def"
class RRef(Ref): cat="grammar-def"

# top-level rule
Def("grammar", RRef("<start>", cat="grammar-def") )

RDef("<start>",
RRef("<roman>")
)

RDef("<roman>",
And(RRef("<hundreds>"), RRef("<tens>"), RRef("<units>"))
)

RDef("<hundreds>",
Or(RRef("<lowhundreds>") ,And("D", RRef("<lowhundreds>")) ,"CM" ,"CD")
)

RDef("<lowhundreds>",
Or(And(RRef("<lowhundreds>"), "C") ,"")
)

RDef("<tens>",
Or(RRef("<lowtens>") ,RRef("<lowtens>") ,"XL" ,"XC")
)

RDef("<lowtens>",
Or(And(RRef("<lowtens>"), "X") ,"")
)

RDef("<units>",
Or(RRef("<lowunits>") ,And("V", RRef("<lowunits>")) ,"IX" ,"IV")
)

RDef("<lowunits>",
Or(And(RRef("<lowunits>"), "I") ,"")
)



In [54]:
import gramfuzz
fuzzer = gramfuzz.GramFuzzer()
fuzzer.load_grammar("testers/roman_rdef.py")
names = fuzzer.gen(cat="default", num=10)
print("\n".join(names))

CMXL
DCCCCCIV
CDI
CMXLIV
CDXLIV
CMXLIX
CDIV
CXLIX
DCCXIX
CDXLIV


In [55]:
with open('testers/grammar_rdef.py', 'w+') as f:
    print(RDefG(my_grammar).translate(), file=f)
!cat testers/grammar_rdef.py

from gramfuzz.fields import*
TOP_CAT = "grammar"
class RDef(Def): cat="grammar-def"
class RRef(Ref): cat="grammar-def"

# top-level rule
Def("grammar", RRef("<start>", cat="grammar-def") )

RDef("<start>",
RRef("<stylesheet>")
)

RDef("<stylesheet>",
And(RRef("<[CHARSET_SYM_STRING_SEMI]-1>"), " ", RRef("<[S_OR_CDO_OR_CDC]-1>"), " ", RRef("<[import_CDO_S_OR_CDC_S]-1>"), " ", RRef("<[stylesheet_closing_GROUPING]-1>"))
)

RDef("<[CHARSET_SYM_STRING_SEMI]>",
And(RRef("<CHARSET_SYM>"), " ", RRef("<STRING>"), " ;")
)

RDef("<[S_OR_CDO_OR_CDC]>",
Or(RRef("<Sp>") ,RRef("<CDO>") ,RRef("<CDC>"))
)

RDef("<[import_CDO_S_OR_CDC_S]>",
And(RRef("<import>"), " ", RRef("<[CDO_S_OR_CDC_S]-1>"))
)

RDef("<[CDO_S_OR_CDC_S]>",
Or(And(RRef("<CDO>"), " ", RRef("<Ss>")) ,And(RRef("<CDC>"), " ", RRef("<Ss>")))
)

RDef("<[ruleset_OR_media_OR_page]>",
Or(RRef("<ruleset>") ,RRef("<page>") ,RRef("<media>"))
)

RDef("<[stylesheet_closing_GROUPING]>",
And(RRef("<[ruleset_OR_medi

In [56]:
with open('testers/gram_fuzz.py', 'w+') as f:
    print('''\
import gramfuzz, sys
gramfuzz.rand.seed(int(sys.argv[2]))
fuzzer = gramfuzz.GramFuzzer()
fuzzer.load_grammar(sys.argv[1])
names = fuzzer.gen(cat="default",
                   num=int(sys.argv[3]),
                   max_recursion=int(sys.argv[4]))
for n in names:
   print(repr(n))''', file=f)

In [57]:
!python testers/gram_fuzz.py testers/grammar_rdef.py 0 10 10

'  @import \t url(" \'\'")  -w   ;  @import \t \'\' \t -_   ; \t <!--  @page    { \t  ;  ;   }  '
'@charset "" ; \t \t<!--\t @import  ""    ;  --> @import   url("")   c   ;  -->  '
' <!----><!-- \t--> @import  url("\'\'") \t  ;   @page   {  z  :  -c    ;   }   --> --> '
"@charset '' ;   \t\t <!--<!--  @media  u  ,  k  {   }  @media  -x   {  *    {    }  }  @page   {    }  "
'  @import  url("\t\'\'\t")  -y  ,  g  ;  <!--  [  -_   ]   {    } \t -->  @media  -v   {  *    {    }  }  --> <!-- --> '
' <!--<!--\t<!--<!---->  '
'   '
' <!--  '
"@charset '' ;   "
'@charset "" ;    \t  '


In [58]:
class GramFuzzTester(Tester):
    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"python testers/gram_fuzz.py testers/grammar_rdef.py {seed} {self.max_num} {max_depth} > {fn}"

In [59]:
GramFuzzTester().run_test().show()

depth= 8 size= 559219.5 time= 3.245 stdev(0.078) throughput= 168.310429884367 stdev(4)
depth= 16 size= 1358548 time= 10.083 stdev(0.023) throughput= 131.57696207804727 stdev(1)
depth= 32 size= 1752249.5 time= 13.627 stdev(0.421) throughput= 125.60605114460381 stdev(2)
depth= 64 size= 1772606 time= 13.966 stdev(0.053) throughput= 123.94662908691615 stdev(2)
depth= 128 size= 1772606 time= 14.129 stdev(0.203) throughput= 122.52785825872101 stdev(1)
depth= 256 size= 1772606 time= 14.02 stdev(0.126) throughput= 123.47112168824071 stdev(0)
Throughput of  168.310429884367  kilobytes per second at depth =  8
Total time: 0:03:03.205191


### Dharma

In [60]:
ipkg('dharma', 'dharma')

dharma found


In [61]:
class DharmaG(AntlrG):
    def esc_token(self, t):
        if t.strip() == '':
            return '%range%( - )'
        # these are multi-char tokens
        #t = t.replace('\\','\\\\')
        #t = t.replace('"','\\\"')
        t = t.replace('\n','') # dont know what to do
        t = t.replace('\r','') # dont know what to do
        t = t.replace('\t','') # dont know what to do
        return t
    
    def rule_to_s(self, rule, grammar):
        if len(rule) == 0: return '""'
        s =['"%s"' % self.esc_token(t)
                         if t not in grammar else 'RRef("%s")' % t
                         for t in rule]
        if len(s) == 1: return s[0]
        return 'And(%s)' % ', '.join(s)

    def rule_to_s(self, rule, grammar):
        if len(rule) == 0:
            # *note* Dharma does not let us define epsilon rules. So we have to make do with
            # generating whitespace.
            return '%range%( - )'
        return ''.join([self.esc_token(t)
                        if t not in grammar else '+%s+' % self.to_key(t)
                         for t in rule])

    def translate(self):
        lines = ['''%%% Dharma Grammar
        ''']
        for k in self.g:
            rules = self.g[k]
            v = '\n    '.join([s for s in [self.rule_to_s(rule, self.g)
                                for rule in rules] if s.strip() != ''])
            lines.append('''\
%(key)s :=
    %(rules)s
'''% {'key':self.to_key(k), 'rules':v})
        lines.append(''' 
%section% := variance
main :=
    +start+
''')
        return '\n'.join(lines)

In [62]:
with open('testers/grammar.dg', 'w+') as f:
    print(DharmaG(my_grammar).translate(), file=f)
!cat testers/grammar.dg

%%% Dharma Grammar
        
start :=
    +stylesheet+

stylesheet :=
    +OsqCHARSET_SYM_STRING_SEMICsq_1+%range%( - )+OsqS_OR_CDO_OR_CDCCsq_1+%range%( - )+OsqXimportX_CDO_S_OR_CDC_SCsq_1+%range%( - )+Osqstylesheet_closing_GROUPINGCsq_1+

OsqCHARSET_SYM_STRING_SEMICsq :=
    +CHARSET_SYM+%range%( - )+STRING+ ;

OsqS_OR_CDO_OR_CDCCsq :=
    +Sp+
    +CDO+
    +CDC+

OsqXimportX_CDO_S_OR_CDC_SCsq :=
    +XimportX+%range%( - )+OsqCDO_S_OR_CDC_SCsq_1+

OsqCDO_S_OR_CDC_SCsq :=
    +CDO+%range%( - )+Ss+
    +CDC+%range%( - )+Ss+

Osqruleset_OR_media_OR_pageCsq :=
    +ruleset+
    +media+
    +page+

Osqstylesheet_closing_GROUPINGCsq :=
    +Osqruleset_OR_media_OR_pageCsq+%range%( - )+OsqCDO_S_OR_CDC_SCsq_2+

XimportX :=
    +IMPORT_SYM+%range%( - )+Ss+%range%( - )+OsqSTRING_OR_URICsq+%range%( - )+Ss+%range%( - )+media_list_1+ ; +Ss+

OsqSTRING_OR_URICsq :=
    +STRING+
    +URI+

media :=
    +MEDIA_SYM+%range%( - )+Ss+%range%( - )+media_list+ { +Ss+%r

In [63]:
!rm -rf tests
!python -m dharma -grammars testers/grammar.dg  -count 10 -seed 200 -storage tests/ -format txt

[Dharma] 2019-08-23 10:22:35,578 INFO: Machine random seed: 200
[Dharma] 2019-08-23 10:22:35,578 DEBUG: Using configuration from: /Users/rahul/Research/fastgrammarfuzzing/lib/python3.7/site-packages/dharma/settings.py
[Dharma] 2019-08-23 10:22:35,580 DEBUG: Processing grammar content of ../../lib/python3.7/site-packages/dharma/grammars/common.dg
[Dharma] 2019-08-23 10:22:35,586 DEBUG: Processing grammar content of testers/grammar.dg


In [64]:
!cat tests/*.txt

    @import      url("   ")     -e       ,     zdj   ,     g     ;   -->  <!--      
       
  -->-->-->    @media       h    ,     x      {      }   -->   @page   : -lo    {     d    :      #e3b         ;     ;    hs8     :    ' '            ;       }   -->    @page     {        }    @media   -x     ,   -i      {     :  a #-     {       }    }     
@charset '\z\F\q ' ;     @media   -v      {    tp  . omq [   -bt      ]    ,   selector  {    -rb    :     3287em   /         #f3649c     tv              }    -hk      ,    selector,   selector,   selector,     selector  {    -m     :      url("   ")         }    }     
@charset '\G ' ;       @media    -nr     ,   -d    ,    _k   ,    -kz     {     }     
@charset ' ' ; -->-->    @media       -ziiv      ,    f   ,     -g3z      {   : -nh      {      ;       }    }     @page      {        odr      :       url(" *&  ")        ;    iz    :   #3d3      @import  ;         }   -->      
@charset '\ \# ' ; <!---->     
@charset '\  ' ;   @i

In [65]:
class DharmaTester(GrammarinatorTester):
    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"python -m dharma -grammars testers/grammar.dg -seed {seed} -count {self.max_num} -logging 30 -storage tests/"

In [66]:
DharmaTester().run_test().show()

depth= 8 size= 5930322.5 time= 23.846 stdev(0.111) throughput= 242.86970489526271 stdev(0)
depth= 16 size= 5930322.5 time= 24.125 stdev(0.211) throughput= 240.0659313323822 stdev(1)
depth= 32 size= 5930322.5 time= 24.019 stdev(0.1) throughput= 241.1232799600179 stdev(2)
depth= 64 size= 5930322.5 time= 24.084 stdev(0.234) throughput= 240.46638502841608 stdev(2)
depth= 128 size= 5930322.5 time= 24.107 stdev(0.247) throughput= 240.2479732420464 stdev(2)
depth= 256 size= 5930322.5 time= 24.015 stdev(0.048) throughput= 241.1544178982025 stdev(0)
Throughput of  242.86970489526271  kilobytes per second at depth =  8
Total time: 0:09:26.048787


### The Fuzzing book approach

In [67]:
def pp_grammar(grammar): return json.dumps(grammar, indent=2, sort_keys=False)

In [68]:
class Trans(Sanitize):
    def split_tokens(self, t, grammar):
        if t in grammar: return [t]
        my_tokens = []
        esc = {'\r': '\\\r', '\n': '\\\n',
             '\\': '\\\\',
             '"':'\\\"',
             "'":"'"}
        for i in t:
            if i in esc:
                my_tokens.append(esc[i])
            else:
                my_tokens.append(i)
        return my_tokens
            
        return list(t)

    def to_rule(self, rule, grammar):
        tokens = [k for t in rule for k in self.split_tokens(t, grammar)]
        return [self.to_token(t) if t not in grammar else self.to_key(t)
                for t in tokens]

    def translate(self):
        new_grammar = {}
        for k in self.g:
            rules = self.g[k]
            new_grammar[self.to_key(k)] = [self.to_rule(rule, self.g) for rule in rules]
        return new_grammar

In [69]:
s_grammar = Sanitize(my_grammar).translate()

In [70]:
s_grammar

{'<start>': [['<stylesheet>']],
 '<stylesheet>': [['<OsqCHARSET_SYM_STRING_SEMICsq_1>',
   ' ',
   '<OsqS_OR_CDO_OR_CDCCsq_1>',
   ' ',
   '<OsqXimportX_CDO_S_OR_CDC_SCsq_1>',
   ' ',
   '<Osqstylesheet_closing_GROUPINGCsq_1>']],
 '<OsqCHARSET_SYM_STRING_SEMICsq>': [['<CHARSET_SYM>',
   ' ',
   '<STRING>',
   ' ',
   ';']],
 '<OsqS_OR_CDO_OR_CDCCsq>': [['<Sp>'], ['<CDO>'], ['<CDC>']],
 '<OsqXimportX_CDO_S_OR_CDC_SCsq>': [['<XimportX>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_1>']],
 '<OsqCDO_S_OR_CDC_SCsq>': [['<CDO>', ' ', '<Ss>'], ['<CDC>', ' ', '<Ss>']],
 '<Osqruleset_OR_media_OR_pageCsq>': [['<ruleset>'], ['<media>'], ['<page>']],
 '<Osqstylesheet_closing_GROUPINGCsq>': [['<Osqruleset_OR_media_OR_pageCsq>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_2>']],
 '<XimportX>': [['<IMPORT_SYM>',
   ' ',
   '<Ss>',
   ' ',
   '<OsqSTRING_OR_URICsq>',
   ' ',
   '<Ss>',
   ' ',
   '<media_list_1>',
   ' ',
   ';',
   ' ',
   '<Ss>']],
 '<OsqSTRING_OR_URICsq>': [['<STRING>'], ['<URI>']],
 '<media>': [['<ME

In [71]:
with open('testers/fuzzingbook_gfuzzer.py', 'w+') as f:
    print('''grammar = ''', pp_grammar(s_grammar), file=f)
    print("""
result = ''
from fuzzingbook.GrammarFuzzer import GrammarFuzzer
def canonical(grammar):
    new_g = {}
    for k in grammar:
        new_g[k] = []
        for rule in grammar[k]:
            new_g[k].append(''.join(rule))
    return new_g
import random
def main(args):
    random.seed(int(sys.argv[1]))
    global result
    max_num = int(args[2])
    max_depth = int(args[3])
    fuzzer = GrammarFuzzer(canonical(grammar), max_nonterminals=max_depth)
    global result
    for i in range(max_num):
        result = fuzzer.fuzz()
        print(result)
        result = ''
import sys
main(sys.argv)""", file=f)

In [72]:
!cat testers/fuzzingbook_gfuzzer.py

grammar =  {
  "<start>": [
    [
      "<stylesheet>"
    ]
  ],
  "<stylesheet>": [
    [
      "<OsqCHARSET_SYM_STRING_SEMICsq_1>",
      " ",
      "<OsqS_OR_CDO_OR_CDCCsq_1>",
      " ",
      "<OsqXimportX_CDO_S_OR_CDC_SCsq_1>",
      " ",
      "<Osqstylesheet_closing_GROUPINGCsq_1>"
    ]
  ],
  "<OsqCHARSET_SYM_STRING_SEMICsq>": [
    [
      "<CHARSET_SYM>",
      " ",
      "<STRING>",
      " ",
      ";"
    ]
  ],
  "<OsqS_OR_CDO_OR_CDCCsq>": [
    [
      "<Sp>"
    ],
    [
      "<CDO>"
    ],
    [
      "<CDC>"
    ]
  ],
  "<OsqXimportX_CDO_S_OR_CDC_SCsq>": [
    [
      "<XimportX>",
      " ",
      "<OsqCDO_S_OR_CDC_SCsq_1>"
    ]
  ],
  "<OsqCDO_S_OR_CDC_SCsq>": [
    [
      "<CDO>",
      " ",
      "<Ss>"
    ],
    [
      "<CDC>",
      " ",
      "<Ss>"
    ]
  ],
  "<Osqruleset_OR_media_OR_pageCsq>": [
    [
      "<ruleset>"
    ],
    [
      "<media>"
    ],
    [
      "<page>"
    ]
  

In [73]:
!python testers/fuzzingbook_gfuzzer.py 0 10 10

 <!--<!--  *    {    }  --> 
  --> @import  ""   ;   
@charset 'M	' ;   
@charset '\g' ;   
  @import  ''   ;   @page   {    }  
@charset '\
' ; <!--	<!--  
    @import  ''   ;  @import  ""   ;  -->  
  @import  ""    ;  --> <!--  
 --> @import  ''   ;   @media  v   {   }  <!-- <!-- 
   @media  _   {  *    {    }  }  


In [74]:
class FuzzingbookTester(Tester):
    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"python testers/fuzzingbook_gfuzzer.py {seed} {self.max_num} {max_depth} > {fn}"

In [75]:
# The fuzzing book fuzzer does not improve with more depth.
FuzzingbookTester(limit_depth=5).run_test().show()

depth= 8 size= 371946.5 time= 97.094 stdev(1.22) throughput= 3.74114231248298 stdev(0)
depth= 16 size= 547603 time= 151.442 stdev(0.231) throughput= 3.531163101793468 stdev(0)
Throughput of  3.74114231248298  kilobytes per second at depth =  8
Total time: 0:24:37.088306


## Building a grammar fuzzer

In [76]:
import sys
assert sys.platform == 'darwin'
sys.setrecursionlimit(20900) # for OSX only

In [77]:
class Fuzzer:
    def __init__(self, grammar):
        self.grammar = grammar
    
    def fuzz(self, key='<start>', max_num=None, max_depth=None):
        raise NotImplemented()

In [78]:
class NFuzzer(Fuzzer):
    def gen_key(self, key):
        return (self.gen_rule(random.choice(self.grammar[key]))
                if key in self.grammar else key)

    def gen_rule(self, rule):
        return ''.join(self.gen_key(token) for token in rule)

    def fuzz(self, key='<start>', max_depth=None):
        return self.gen_key(key)

In [79]:
my_fuzzer = NFuzzer(s_grammar)
try:
    for i in range(100):
        print(repr(my_fuzzer.fuzz()))
except:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    print(exc_type, exc_value)

"@charset '\\\x0c' ;   "
' <!--  @media \t -vxci   {  : -t( \t  )   {   ; \t ;   ;  -mk\\~a  :  ""    }   }  --> \t @media \t\t pkc9xf   { \t  }  --> [  -uqwf  |=  -wy \t  ]   {  u  : \t \'t\'   ,  url("%")  @import   }   -j .bp88  ,  selector {  -m \t :  url("\'\'")   @import  ; \t  -am  :  \'\'   @import  }  \t \t\t   \t @media \t \t q  , \t m \t  ,  -_  ,  -q-o   { \t\t m  +  -w .-iu#8  ,  selector {  -kcd  \t\t :  \t    \t j(  \t\t -c(  -k( \t\t \t  ""   )  \t\t  ) \t  -hby(  \'\\\n\x0b\' \t  )  url("\t")  ) \t  @import \t   }  }  eajm .ph4b.nb: zh.-mr2r \t   {    }  \t '
'  @import  \'\'   -c-o \t  , \t\t\t\t -gu ,  -_  ,  -k ,  -t \t\t\t\t\t\t, \t  pi , \t yg \t\t\t ;  --> --> @import \t url("\t&#")   -lz \t \t  ; \t -->  --> -->  <!--  '
" --> @import \t   ''  -k  , \t  \\`g ,  \t \t lgrr  \t  ,  \t w    ;  --> \t \t \t-->  "
"@charset 'N' ; <!--  @media  -l4   { \t\t *  \t\t\t   ,   \t selector { \t\t   }    } \t\t   <!-- <!-- --> -->  "
"@charset '\\x' ;   "
'@charset "" ; -->

'@charset "\\Z\\\r" ;   @media  x  , \t\t\t -a    \t\t\t {    }  \t '
' -->--> @import   url("$*&\\n-") \t\t -cd_ \t ,  e6qivj \t ,  -y ,  \t\t ib  ;   -->  '
'@charset \'\\\r\' ;   @import \t "" \t  ;  @import   url(""_\\\x0c"")   ;    -_  +  : d[  -ox   ]  ,  selector,   selector,  selector,  selector {    }   <!--  @media  \\H   {     }   <!-- \t\t  d : -pl(  cg   )   { \t\t\t  ;   \t -c \t :  #add \t\t\t  "\\\n\\\n"  @import     } \t <!-- \t   \t --> @media  -r \t  {  [ \t  y   ][  \t k  |=  "\\\x0cQ\\I"  \t ]#gu8e6-   {    }  } \t -->  @media \t -j \t\t  ,  \t z  {   }  -o : v(    -\\:w \t\t )#n#x \t  o0   ,   selector,  selector { \t   ;  ;  \\S7 \t :  -g(  url("%")   )   ;     }  \t #b: -p#1a  ,  selector,   selector {    -j   \t :  w \t    }  '
'@charset \'\' ;  @import   url("\'\'") \t \t \t\t  ;   @media  -q7 \t ,   _q   \t { \t * [  -z   ~=   -qn\\n  ] >  *  >  #c[  -o   ][ \t \\*_y \t = \t zw \t ]: h  ,  selector {  i  \t  :  url("  ") \t  #8a9bff   @import   } \t }  '
'   

In [80]:
import inspect
import json

In [81]:
def get_opts(args, log=False):
    seed = int(args[1])
    max_num = int(args[2])
    max_depth = int(args[3])
    random.seed(seed)
    sys.setrecursionlimit(20900)
    if log:
        print("seed=%d, num=%d, depth=%d" % (seed, max_num, max_depth), file=sys.stderr)
    return max_num, max_depth

In [82]:
def extract_class_definition(cls, log=False):
    eldest = [c for c in cls.mro()
                if c.__name__ == cls.__name__ and
                   cls.__name__ not in {i.__name__ for i in c.__bases__}]
    n_parents = sum([[j.__name__ for j in i.__bases__] for i in eldest], [])
    s_parents = '(%s)' % ', '.join(set(n_parents)) if n_parents else ''
    buf = ["class %s%s:" % (cls.__name__, s_parents)]
    seen = set()
    i = 0
    for curcls in cls.mro():
        i += 1
        if log: print('Parent: %d' % i, curcls.__name__)
        if curcls.__name__ != cls.__name__: continue
        for fn_name in dir(curcls):
            if log: print('\t:', fn_name)
            if fn_name in seen: continue
            if fn_name == '__new__':
                continue
            fn = curcls.__dict__.get(fn_name)
            if fn is None:
                continue
            if ('function' in str(type(fn))):
                seen.add(fn_name)
                buf.append(inspect.getsource(fn))
    return '\n'.join(buf)

In [83]:
def write_file(file_name, grammar, classes, fns=[get_opts], fuzzer=None):
    with open(file_name, 'w+') as f:
        print('''grammar = ''', pp_grammar(grammar), file=f)
        for cls in classes:
            print(extract_class_definition(cls), file=f)
        for fn in fns:
            print(inspect.getsource(fn), file=f)
        print("""
import itertools
import sys
import random
def main(args):
    max_num, max_depth = get_opts(args)
    my_fuzzer = %s(grammar)
    for i in range(max_num):
        print(my_fuzzer.fuzz(key='<start>', max_depth=max_depth))
try:
    main(sys.argv)
    sys.exit(0)
except RecursionError as e:
    print(e, file=sys.stderr)
    sys.exit(2)
""" % fuzzer.__name__, file=f)

In [84]:
write_file('testers/grammar_producer_naive.py', s_grammar, [Fuzzer, NFuzzer], fuzzer=NFuzzer)

In [85]:
!cat testers/grammar_producer_naive.py

grammar =  {
  "<start>": [
    [
      "<stylesheet>"
    ]
  ],
  "<stylesheet>": [
    [
      "<OsqCHARSET_SYM_STRING_SEMICsq_1>",
      " ",
      "<OsqS_OR_CDO_OR_CDCCsq_1>",
      " ",
      "<OsqXimportX_CDO_S_OR_CDC_SCsq_1>",
      " ",
      "<Osqstylesheet_closing_GROUPINGCsq_1>"
    ]
  ],
  "<OsqCHARSET_SYM_STRING_SEMICsq>": [
    [
      "<CHARSET_SYM>",
      " ",
      "<STRING>",
      " ",
      ";"
    ]
  ],
  "<OsqS_OR_CDO_OR_CDCCsq>": [
    [
      "<Sp>"
    ],
    [
      "<CDO>"
    ],
    [
      "<CDC>"
    ]
  ],
  "<OsqXimportX_CDO_S_OR_CDC_SCsq>": [
    [
      "<XimportX>",
      " ",
      "<OsqCDO_S_OR_CDC_SCsq_1>"
    ]
  ],
  "<OsqCDO_S_OR_CDC_SCsq>": [
    [
      "<CDO>",
      " ",
      "<Ss>"
    ],
    [
      "<CDC>",
      " ",
      "<Ss>"
    ]
  ],
  "<Osqruleset_OR_media_OR_pageCsq>": [
    [
      "<ruleset>"
    ],
    [
      "<media>"
    ],
    [
      "<page>"
    ]
  

In [86]:
# the seed and max_num is chosen to avoid recusion error.
!time python testers/grammar_producer_naive.py 1 96 0 > testers/fuzz.out


real	0m0.092s
user	0m0.079s
sys	0m0.009s


## Setting expansion limits

We can compute the least cost paths to take and use only those paths after a given depth is exceeded.

In [87]:
import sys
import functools

In [88]:
class LimitFuzzer(Fuzzer):
    def symbol_cost(self, grammar, symbol, seen):
        if symbol in self.key_cost: return self.key_cost[symbol]
        if symbol in seen:
            self.key_cost[symbol] = float('inf')
            return float('inf')
        v = min((self.expansion_cost(grammar, rule, seen | {symbol})
                    for rule in grammar.get(symbol, [])), default=0)
        self.key_cost[symbol] = v
        return v

    def expansion_cost(self, grammar, tokens, seen):
        return max((self.symbol_cost(grammar, token, seen)
                    for token in tokens if token in grammar), default=0) + 1

In [89]:
class LimitFuzzer(LimitFuzzer):
    def gen_key(self, key, depth, max_depth):
        if key not in self.grammar: return key
        if depth > max_depth:
            clst = sorted([(self.cost[key][str(rule)], rule) for rule in self.grammar[key]])
            rules = [r for c,r in clst if c == clst[0][0]]
        else:
            rules = self.grammar[key]
        return self.gen_rule(random.choice(rules), depth+1, max_depth)

    def gen_rule(self, rule, depth, max_depth):
        return ''.join(self.gen_key(token, depth, max_depth) for token in rule)

    def fuzz(self, key='<start>', max_depth=10):
        return self.gen_key(key=key, depth=0, max_depth=max_depth)

In [90]:
class LimitFuzzer(LimitFuzzer):
    def __init__(self, grammar):
        super().__init__(grammar)
        self.key_cost = {}
        self.cost = self.compute_cost(grammar)
 
    def compute_cost(self, grammar):
        cost = {}
        for k in grammar:
            cost[k] = {}
            for rule in grammar[k]:
                cost[k][str(rule)] = self.expansion_cost(grammar, rule, set())  
        return cost

In [91]:
my_fuzzer = LimitFuzzer(s_grammar)

In [92]:
my_fuzzer.fuzz()

'  @import  "\\\x0c]"  u   ; \t \t  <!-- @import  \t ""   ; \t\t -->  \t\t-->  '

In [93]:
write_file('testers/grammar_producer_limit.py', s_grammar, [Fuzzer, LimitFuzzer], fuzzer=LimitFuzzer)

In [94]:
!cat testers/grammar_producer_limit.py

grammar =  {
  "<start>": [
    [
      "<stylesheet>"
    ]
  ],
  "<stylesheet>": [
    [
      "<OsqCHARSET_SYM_STRING_SEMICsq_1>",
      " ",
      "<OsqS_OR_CDO_OR_CDCCsq_1>",
      " ",
      "<OsqXimportX_CDO_S_OR_CDC_SCsq_1>",
      " ",
      "<Osqstylesheet_closing_GROUPINGCsq_1>"
    ]
  ],
  "<OsqCHARSET_SYM_STRING_SEMICsq>": [
    [
      "<CHARSET_SYM>",
      " ",
      "<STRING>",
      " ",
      ";"
    ]
  ],
  "<OsqS_OR_CDO_OR_CDCCsq>": [
    [
      "<Sp>"
    ],
    [
      "<CDO>"
    ],
    [
      "<CDC>"
    ]
  ],
  "<OsqXimportX_CDO_S_OR_CDC_SCsq>": [
    [
      "<XimportX>",
      " ",
      "<OsqCDO_S_OR_CDC_SCsq_1>"
    ]
  ],
  "<OsqCDO_S_OR_CDC_SCsq>": [
    [
      "<CDO>",
      " ",
      "<Ss>"
    ],
    [
      "<CDC>",
      " ",
      "<Ss>"
    ]
  ],
  "<Osqruleset_OR_media_OR_pageCsq>": [
    [
      "<ruleset>"
    ],
    [
      "<media>"
    ],
    [
      "<page>"
    ]
  

In [95]:
class PyLimitTester(Tester):
    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"python ./testers/grammar_producer_limit.py {seed} {self.max_num} {max_depth} > {fn}"

In [96]:
!python testers/grammar_producer_limit.py 0 10 10

 --><!----> @import   url("'\t\'")  	 z4    , 	 f ,  z ,  o  ;  @import  ""   ;     
   @media 	  is   { 	 [  n   ]#k  ,  selector {    ;  h  :  url("")   ;   }  } 	 @page   	 : u 	 { 	  ;  ;  ;   } 	 <!-- <!-- 	 @media  w   {   }  <!-- 		@media 	 l   {   }   @page  : j  {   ;   }  
@charset "\H" ;   @media  -ce 	  {  * #k  ,  selector {    }  	[  p   ] 	   { 	   } 	*   ,  selector {  c  :  ''     }  }  @media  i   {  *    {     }  } 	   
 <!---->	  @import  url(""\U\" ")   -vc   ;  @import 	  "\-"  m    ;  @import 	   url("	!") 	  ;    @import  ''  p   ; 	   @page   {    ; 		 h  :  url("")   ;   b  :  url("")   ;  b  :  e   ;   } 		 
@charset 'j\n:}' ;  @import   ''    ;  <!-- --> @import  '\I'  j 	 ,  _  ;  @import  ''  		 e   ;   
 -->--> @import  	 url(" &-	")  -p    ,  e  ; 				  
@charset '' ; <!--  
@charset '\]\
7' ; <!-- @import  url(""T" ")   ;  				 @import 	 	 url("''") 	  ;  <!-- 	--> 	 
   @import 	 url(" ")      -ku  ,  m  ;   		  @import   ''  q   ; 	 <!-- @im

In [97]:
PyLimitTester().run_test().show()

depth= 8 size= 801791.5 time= 3.474 stdev(0.012) throughput= 225.35201779101067 stdev(2)
depth= 16 size= 1359521.5 time= 5.245 stdev(0.004) throughput= 253.1518766268312 stdev(2)
depth= 32 size= 1582628 time= 5.03 stdev(0.037) throughput= 307.2419734744696 stdev(2)
depth= 64 size= 1590165 time= 5.026 stdev(0.103) throughput= 309.02318996269133 stdev(5)
depth= 128 size= 1590165 time= 5.028 stdev(0.1) throughput= 308.8970987961536 stdev(5)
depth= 256 size= 1590165 time= 5.011 stdev(0.021) throughput= 309.8971180245778 stdev(0)
Throughput of  309.8971180245778  kilobytes per second at depth =  256
Total time: 0:01:43.700045


## Using precomputed string pools

**idea**: We can precompute the closing parts.

In [98]:
class PooledFuzzer(LimitFuzzer):
    def compute_cost(self, grammar, cost={}):
        return {k:sorted([(self.expansion_cost(grammar, rule, set()), rule)
                          for rule in grammar[k]])
                for k in self.grammar}

In [99]:
class PooledFuzzer(PooledFuzzer):
    def cheap_grammar(self):
        new_grammar = {}
        for k in self.cost:
            crules = self.cost[k]
            min_cost = crules[0][0]
            new_grammar[k] = [r for c,r in crules if c == min_cost]
            assert len(new_grammar[k]) > 0
        return new_grammar

In [100]:
PooledFuzzer(s_grammar).cheap_grammar()

{'<start>': [['<stylesheet>']],
 '<stylesheet>': [['<OsqCHARSET_SYM_STRING_SEMICsq_1>',
   ' ',
   '<OsqS_OR_CDO_OR_CDCCsq_1>',
   ' ',
   '<OsqXimportX_CDO_S_OR_CDC_SCsq_1>',
   ' ',
   '<Osqstylesheet_closing_GROUPINGCsq_1>']],
 '<OsqCHARSET_SYM_STRING_SEMICsq>': [['<CHARSET_SYM>',
   ' ',
   '<STRING>',
   ' ',
   ';']],
 '<OsqS_OR_CDO_OR_CDCCsq>': [['<CDC>'], ['<CDO>']],
 '<OsqXimportX_CDO_S_OR_CDC_SCsq>': [['<XimportX>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_1>']],
 '<OsqCDO_S_OR_CDC_SCsq>': [['<CDC>', ' ', '<Ss>'], ['<CDO>', ' ', '<Ss>']],
 '<Osqruleset_OR_media_OR_pageCsq>': [['<page>']],
 '<Osqstylesheet_closing_GROUPINGCsq>': [['<Osqruleset_OR_media_OR_pageCsq>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_2>']],
 '<XimportX>': [['<IMPORT_SYM>',
   ' ',
   '<Ss>',
   ' ',
   '<OsqSTRING_OR_URICsq>',
   ' ',
   '<Ss>',
   ' ',
   '<media_list_1>',
   ' ',
   ';',
   ' ',
   '<Ss>']],
 '<OsqSTRING_OR_URICsq>': [['<STRING>'], ['<URI>']],
 '<media>': [['<MEDIA_SYM>',
   ' ',
   '<Ss>',
   ' ',


In [101]:
import itertools
import random

In [102]:
class PooledFuzzer(PooledFuzzer):
    def get_strings_for_key(self, grammar, key='<start>'):
        if key not in grammar: return [key]
        v = sum([self.get_strings_for_rule(grammar, rule)
                 for rule in grammar[key]], [])
        return random.sample(v, min(self.MAX_SAMPLE, len(v)))

    def get_strings_for_rule(self, grammar, rule):
        my_strings_list = [self.get_strings_for_key(grammar, key) for key in rule]
        v = [''.join(l) for l in itertools.product(*my_strings_list)]
        return random.sample(v, min(self.MAX_SAMPLE, len(v)))

    def completion_strings(self):
        # we are being choosy
        return {k:self.get_strings_for_key(self.c_grammar, k)
                for k in self.c_grammar}

In [103]:
pf = PooledFuzzer(s_grammar)

In [104]:
pf.c_grammar = pf.cheap_grammar()

In [105]:
pf.MAX_SAMPLE = 255
strings = pf.completion_strings()

In [106]:
for k in strings:
    print(k, strings[k])

<start> ['   ']
<stylesheet> ['   ']
<OsqCHARSET_SYM_STRING_SEMICsq> ['@charset "" ;', "@charset '' ;"]
<OsqS_OR_CDO_OR_CDCCsq> ['<!--', '-->']
<OsqXimportX_CDO_S_OR_CDC_SCsq> ['@import  ""   ;  ', "@import  ''   ;  ", '@import  url("")   ;  ']
<OsqCDO_S_OR_CDC_SCsq> ['<!-- ', '--> ']
<Osqruleset_OR_media_OR_pageCsq> ['@page   {    } ']
<Osqstylesheet_closing_GROUPINGCsq> ['@page   {    }  ']
<XimportX> ["@import  ''   ; ", '@import  url("")   ; ', '@import  ""   ; ']
<OsqSTRING_OR_URICsq> ['url("")', '""', "''"]
<media> ['@media  g   {   } ', '@media  c   {   } ', '@media  q   {   } ', '@media  h   {   } ', '@media  l   {   } ', '@media  j   {   } ', '@media  x   {   } ', '@media  b   {   } ', '@media  f   {   } ', '@media  t   {   } ', '@media  d   {   } ', '@media  k   {   } ', '@media  w   {   } ', '@media  e   {   } ', '@media  i   {   } ', '@media  u   {   } ', '@media  z   {   } ', '@media  v   {   } ', '@media  r   {   } ', '@media  m   {   } ', '@media  n   {   } ', '@media  p

In [107]:
class PooledFuzzer(PooledFuzzer):
    def __init__(self, grammar):
        super().__init__(grammar)
        self.c_grammar = self.cheap_grammar()
        self.MAX_SAMPLE = 255
        self.pool_of_strings = self.completion_strings()
        # reorder our grammar rules by cost.
        for k in self.grammar:
            self.grammar[k] = [r for (i,r) in self.cost[k]]
        self.ordered_grammar = True
        
    def gen_key(self, key, depth, max_depth):
        if key not in self.grammar: return key
        if depth > max_depth:
            return random.choice(self.pool_of_strings[key])
        return self.gen_rule(random.choice(self.grammar[key]), depth+1, max_depth)

In [108]:
my_fuzzer = PooledFuzzer(s_grammar)
for i in range(10):
    print(repr(my_fuzzer.fuzz()))

'@charset "" ;   @page   { \t\t i  :   ""   #304688   ; \t ; \t  } \t '
' <!-- \t \t\t\t\t\t-->  @media  -w   { \t\t   #f#z +  *    {   i  :  #5ba   @import   }   [  _   ]  ,  selector { \t   }  }   @page   {  \t w  :  d(  #689fb1   )   @import  ;   ;   }  <!--  @page  : o  {  l  :  #816    ;   }  @media  h   {  *    {    } *    {    }  }  '
"@charset '' ;   "
'@charset \'%\' ;  @import \t ""   ;    : w   { \t  c  :  ""   #b9390c  #b351de   ; \t ; \t ;  o  :  #a36   ;  ;   } \t '
"@charset '' ;   @page  : -bc   { \t   }  <!-- \t  \t-->  \t\t<!-- "
' <!-- @import  "\\\n\\\r\n"   ; \t @import  url("\t"" ")   ;  --> \t .j     {    ;  k  :  #c33   @import ;   p  :  d    }   --> <!-- <!--  : t(   )   { \t   s  :  #420   #f7383e  @import  ;   }   <!-- --> @page \t\t : o  { \t\t   }   '
'@charset "" ;  @import  ""  -rt   ; \t\t    --> --> @import  url(" \'\' ") \t \t g   ; \t  <!-- --> \t\t-->  @import   ""  d  ,  _ ,  c  ; \t\t -->  * #0  , \t\t selector,  selector {    } \t  \t -->   @media

In [109]:
write_file('testers/grammar_producer_pool.py', s_grammar, [Fuzzer, LimitFuzzer, PooledFuzzer], fuzzer=PooledFuzzer)

In [110]:
!cat testers/grammar_producer_pool.py

grammar =  {
  "<start>": [
    [
      "<stylesheet>"
    ]
  ],
  "<stylesheet>": [
    [
      "<OsqCHARSET_SYM_STRING_SEMICsq_1>",
      " ",
      "<OsqS_OR_CDO_OR_CDCCsq_1>",
      " ",
      "<OsqXimportX_CDO_S_OR_CDC_SCsq_1>",
      " ",
      "<Osqstylesheet_closing_GROUPINGCsq_1>"
    ]
  ],
  "<OsqCHARSET_SYM_STRING_SEMICsq>": [
    [
      "<CHARSET_SYM>",
      " ",
      "<STRING>",
      " ",
      ";"
    ]
  ],
  "<OsqS_OR_CDO_OR_CDCCsq>": [
    [
      "<CDC>"
    ],
    [
      "<CDO>"
    ],
    [
      "<Sp>"
    ]
  ],
  "<OsqXimportX_CDO_S_OR_CDC_SCsq>": [
    [
      "<XimportX>",
      " ",
      "<OsqCDO_S_OR_CDC_SCsq_1>"
    ]
  ],
  "<OsqCDO_S_OR_CDC_SCsq>": [
    [
      "<CDC>",
      " ",
      "<Ss>"
    ],
    [
      "<CDO>",
      " ",
      "<Ss>"
    ]
  ],
  "<Osqruleset_OR_media_OR_pageCsq>": [
    [
      "<page>"
    ],
    [
      "<ruleset>"
    ],
    [
      "<media>"
    ]
  

In [111]:
class PyPooledTester(Tester):
    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"python testers/grammar_producer_pool.py {seed} {self.max_num} {max_depth} > {fn}"

In [112]:
PyPooledTester().run_test().show()

depth= 8 size= 809723.5 time= 25.96 stdev(0.02) throughput= 30.460146568086948 stdev(0)
depth= 16 size= 1365012 time= 27.666 stdev(0.362) throughput= 48.18345476648793 stdev(0)
depth= 32 size= 1579906 time= 28.696 stdev(0.103) throughput= 53.76722368872993 stdev(1)
depth= 64 size= 1571669.5 time= 28.66 stdev(0.046) throughput= 53.55367960728431 stdev(1)
depth= 128 size= 1571669.5 time= 28.538 stdev(0.066) throughput= 53.78199217329936 stdev(1)
depth= 256 size= 1571669.5 time= 28.425 stdev(0.009) throughput= 53.99485658379083 stdev(1)
Throughput of  53.99485658379083  kilobytes per second at depth =  256
Total time: 0:10:07.229713


Can we do better?

## Compile the Grammar

In [113]:
class PyTrans(Sanitize):
    def split_tokens(self, t, grammar):
        if t in grammar: return [t]
        my_tokens = []
        for i in t:
            my_tokens.append(i)
        return my_tokens

In [114]:
pyc_grammar = PyTrans(my_grammar).translate()

In [115]:
pyc_grammar

{'<start>': [['<stylesheet>']],
 '<stylesheet>': [['<OsqCHARSET_SYM_STRING_SEMICsq_1>',
   ' ',
   '<OsqS_OR_CDO_OR_CDCCsq_1>',
   ' ',
   '<OsqXimportX_CDO_S_OR_CDC_SCsq_1>',
   ' ',
   '<Osqstylesheet_closing_GROUPINGCsq_1>']],
 '<OsqCHARSET_SYM_STRING_SEMICsq>': [['<CHARSET_SYM>',
   ' ',
   '<STRING>',
   ' ',
   ';']],
 '<OsqS_OR_CDO_OR_CDCCsq>': [['<Sp>'], ['<CDO>'], ['<CDC>']],
 '<OsqXimportX_CDO_S_OR_CDC_SCsq>': [['<XimportX>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_1>']],
 '<OsqCDO_S_OR_CDC_SCsq>': [['<CDO>', ' ', '<Ss>'], ['<CDC>', ' ', '<Ss>']],
 '<Osqruleset_OR_media_OR_pageCsq>': [['<ruleset>'], ['<media>'], ['<page>']],
 '<Osqstylesheet_closing_GROUPINGCsq>': [['<Osqruleset_OR_media_OR_pageCsq>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_2>']],
 '<XimportX>': [['<IMPORT_SYM>',
   ' ',
   '<Ss>',
   ' ',
   '<OsqSTRING_OR_URICsq>',
   ' ',
   '<Ss>',
   ' ',
   '<media_list_1>',
   ' ',
   ';',
   ' ',
   '<Ss>']],
 '<OsqSTRING_OR_URICsq>': [['<STRING>'], ['<URI>']],
 '<media>': [['<ME

### Compile to Python

In [116]:
# not clear what is the fastest: + or ''.join
# https://stackoverflow.com/questions/1316887/what-is-the-most-efficient-string-concatenation-method-in-python
class PyCompiledFuzzer(PooledFuzzer):
    def add_indent(self, string, indent):
        return '\n'.join([indent + i for i in string.split('\n')])

    # used for escaping inside strings
    def esc(self, t):
        t = t.replace('\\', '\\\\')
        t = t.replace('\n', '\\n')
        t = t.replace('\r', '\\r')
        t = t.replace('\t', '\\t')
        t = t.replace('\b', '\\b')
        t = t.replace('\v', '\\v')
        t = t.replace('"', '\\"')
        return t
    
    def esc_char(self, t):
        assert len(t) == 1
        t = t.replace('\\', '\\\\')
        t = t.replace('\n', '\\n')
        t = t.replace('\r', '\\r')
        t = t.replace('\t', '\\t')
        t = t.replace('\b', '\\b')
        t = t.replace('\v', '\\v')
        t = t.replace("'", "\\'")
        return t

    def k_to_s(self, k): return k[1:-1].replace('-', '_')

    def gen_rule_src(self, rule, key, i):
        res = []
        for token in rule:
            if token in self.grammar:
                res.append('''\
gen_%s(next_depth, max_depth)''' % self.k_to_s(token))
            else:
                res.append('''\
result.append("%s")''' % self.esc(token))
        return '\n'.join(res)

    def string_pool_defs(self):
        result =[]
        for k in self.pool_of_strings:
            result.append('''\
pool_of_%(key)s = %(values)s''' % {
                'key':self.k_to_s(k),
                'values': self.pool_of_strings[k]})
        result.append('''
result = []''')
        return '\n'.join(result)

    def gen_main_src(self):
        result = []
        result.append('''
import random
import sys
def main(args):
    global result
    max_num, max_depth = get_opts(args)
    for i in range(max_num):
        gen_start(0, max_depth)
        print(''.join(result))
        result = []
 
main(sys.argv)''')
        return '\n'.join(result)

    def gen_alt_src(self, key):
        rules = self.grammar[key]
        result = []
        result.append('''
def gen_%(name)s(depth, max_depth):
    next_depth = depth + 1
    if depth > max_depth:
        result.append(random.choice(pool_of_%(name)s))
        return
    val = random.randrange(%(nrules)s)''' % {
            'name':self.k_to_s(key),
            'nrules':len(rules)})
        for i, rule in enumerate(rules):
            result.append('''\
    if val == %d:
%s
        return''' % (i, self.add_indent(self.gen_rule_src(rule, key, i),'        ')))
        return '\n'.join(result)

    def gen_fuzz_src(self):
        result = []
        result.append(self.string_pool_defs())
        for key in self.grammar:
            result.append(self.gen_alt_src(key))
        return '\n'.join(result)

    def fuzz_src(self, key='<start>'):
        result = [self.gen_fuzz_src(),
                  self.gen_main_src()]
        return ''.join(result)

In [117]:
with open('testers/grammar_producer_pycompiled.py', 'w+') as f:
    for fn in [get_opts]:
        print(inspect.getsource(fn), file=f)
    result = PyCompiledFuzzer(pyc_grammar).fuzz_src()
    print(result, file=f)

In [118]:
!cat testers/grammar_producer_pycompiled.py

def get_opts(args, log=False):
    seed = int(args[1])
    max_num = int(args[2])
    max_depth = int(args[3])
    random.seed(seed)
    sys.setrecursionlimit(20900)
    if log:
        print("seed=%d, num=%d, depth=%d" % (seed, max_num, max_depth), file=sys.stderr)
    return max_num, max_depth

pool_of_start = ['   ']
pool_of_stylesheet = ['   ']
pool_of_OsqCHARSET_SYM_STRING_SEMICsq = ["@charset '' ;", '@charset "" ;']
pool_of_OsqS_OR_CDO_OR_CDCCsq = ['-->', '<!--']
pool_of_OsqXimportX_CDO_S_OR_CDC_SCsq = ["@import  ''   ;  ", '@import  url("")   ;  ', '@import  ""   ;  ']
pool_of_OsqCDO_S_OR_CDC_SCsq = ['<!-- ', '--> ']
pool_of_Osqruleset_OR_media_OR_pageCsq = ['@page   {    } ']
pool_of_Osqstylesheet_closing_GROUPINGCsq = ['@page   {    }  ']
pool_of_XimportX = ['@import  ""   ; ', "@import  ''   ; ", '@import  url("")   ; ']
pool_of_OsqSTRING_OR_URICsq = ["''", '""', 'url("")']
pool_of_media = ['@media  u   {   } ', '@media  n   {   } ', '@media  s   {   } ', 

In [119]:
!python testers/grammar_producer_pycompiled.py 0 10 10

   	-->	  	    
@charset '' ;   
  	  		 @import  "\?\J0" 	   ; 	 -->  
   
@charset "" ;  @import  url("*$	") 	 	  ;  <!-- @import 	 url("	")  	 	  ;   @media 	 -q    ,  m  {   }  
  @import 			 url(" ")  lk    , 	 t  ;  @import  	 url(" "" ")    ;   
@charset '\N' ; 			 <!--  
@charset "g\p\J\G" ;  @import 	 	 "" 	   ;    #c >  [  p   ]     {   ;  w  :  #0ef   @import  } 	 
@charset '' ;   @import  	  "a"  -c3   ;   <!-- 	-->  @media  s  ,   j ,  v  {   } 	 k   ,  selector,  selector { 	  j  :  url("")   #c5177b   ;  u  :  #dcd0d6    }  --> @media 		 a  ,  h  {    }  <!-- 
@charset "\M\" ;   @media  	   s  	  {  b [  m   ] 	  ,  selector,  selector {  x  :  #768bf2   @import   }  }  


In [120]:
class PyCompiledTester(Tester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"python testers/grammar_producer_pycompiled.py {seed} {self.max_num} {max_depth} > {fn}"

In [121]:
PyCompiledTester().run_test().show()

depth= 8 size= 808118 time= 0.836 stdev(0.006) throughput= 944.0035306490386 stdev(3)
depth= 16 size= 1365633 time= 1.984 stdev(0.073) throughput= 672.2884089554645 stdev(15)
depth= 32 size= 1580559 time= 2.577 stdev(0.062) throughput= 599.1938695030162 stdev(20)
depth= 64 size= 1571791.5 time= 2.528 stdev(0.037) throughput= 607.3085722824931 stdev(18)
depth= 128 size= 1571791.5 time= 2.563 stdev(0.018) throughput= 598.935446078907 stdev(13)
depth= 256 size= 1571791.5 time= 2.502 stdev(0.005) throughput= 613.3601359756542 stdev(8)
Throughput of  944.0035306490386  kilobytes per second at depth =  8
Total time: 0:00:45.927991


In [122]:
class PyRecCompiledFuzzer(PyCompiledFuzzer):
    def __init__(self, grammar):
        super().__init__(grammar)
        assert self.ordered_grammar
        self.rec_cost = {}
        self.compute_rule_recursion()

    def kr_to_s(self, key, i): return 'gen_%s_%d' % (self.k_to_s(key), i)
    # the grammar needs to be ordered by the cost.
    # else the ordering will change at the end.
    
    def is_rule_recursive(self, rname, rule, seen):
        if not rule: return False
        if rname in seen:
            return False # reached another recursive rule without seeing this one
        for token in rule:
            if token not in self.grammar: continue
            for i,trule in enumerate(self.grammar[token]):
                rn = self.kr_to_s(token, i)
                if rn  == rname: return True
                if rn in seen: return False
                v = self.is_rule_recursive(rname, trule, seen | {rn})
                if v: return True
        return False
    
    def is_key_recursive(self, check, key, seen):
        if not key in self.grammar: return False
        if key in seen: return False
        for rule in self.grammar[key]:
            for token in rule:
                if token not in self.grammar: continue
                if token == check: return True
                v = self.is_key_recursive(check, token, seen | {token})
                if v: return True
        return False
    
    def compute_rule_recursion(self):
        if IS_HTML:   # TODO -- to much time -- only for HTML
            self.rule_recursion = HTML_RULE_RECURSION
            self.key_recursion = HTML_KEY_RECURSION
            return
        self.rule_recursion = {}
        for k in self.grammar:
            for i_rule,rule in enumerate(self.grammar[k]):
                n = self.kr_to_s(k, i_rule)
                self.rule_recursion[n] = self.is_rule_recursive(n, rule, set())
        self.key_recursion = {}
        for k in self.grammar:
            self.key_recursion[k] = self.is_key_recursive(k, k, set())

In [123]:
PyRecCompiledFuzzer(pyc_grammar).rule_recursion

{'gen_start_0': False,
 'gen_stylesheet_0': False,
 'gen_OsqCHARSET_SYM_STRING_SEMICsq_0': False,
 'gen_OsqS_OR_CDO_OR_CDCCsq_0': False,
 'gen_OsqS_OR_CDO_OR_CDCCsq_1': False,
 'gen_OsqS_OR_CDO_OR_CDCCsq_2': False,
 'gen_OsqXimportX_CDO_S_OR_CDC_SCsq_0': False,
 'gen_OsqCDO_S_OR_CDC_SCsq_0': False,
 'gen_OsqCDO_S_OR_CDC_SCsq_1': False,
 'gen_Osqruleset_OR_media_OR_pageCsq_0': False,
 'gen_Osqruleset_OR_media_OR_pageCsq_1': False,
 'gen_Osqruleset_OR_media_OR_pageCsq_2': False,
 'gen_Osqstylesheet_closing_GROUPINGCsq_0': False,
 'gen_XimportX_0': False,
 'gen_OsqSTRING_OR_URICsq_0': False,
 'gen_OsqSTRING_OR_URICsq_1': False,
 'gen_media_0': False,
 'gen_media_list_0': False,
 'gen_OsqCOMMA_S_mediumCsq_0': False,
 'gen_medium_0': False,
 'gen_page_0': False,
 'gen_OsqSEMI_S_declarationCsq_0': False,
 'gen_pseudo_page_0': False,
 'gen_operator_0': False,
 'gen_operator_1': False,
 'gen_combinator_0': False,
 'gen_combinator_1': False,
 'gen_unary_operator_0': False,
 'gen_unary_operator_

In [124]:
PyRecCompiledFuzzer(pyc_grammar).key_recursion

{'<start>': False,
 '<stylesheet>': False,
 '<OsqCHARSET_SYM_STRING_SEMICsq>': False,
 '<OsqS_OR_CDO_OR_CDCCsq>': False,
 '<OsqXimportX_CDO_S_OR_CDC_SCsq>': False,
 '<OsqCDO_S_OR_CDC_SCsq>': False,
 '<Osqruleset_OR_media_OR_pageCsq>': False,
 '<Osqstylesheet_closing_GROUPINGCsq>': False,
 '<XimportX>': False,
 '<OsqSTRING_OR_URICsq>': False,
 '<media>': False,
 '<media_list>': False,
 '<OsqCOMMA_S_mediumCsq>': False,
 '<medium>': False,
 '<page>': False,
 '<OsqSEMI_S_declarationCsq>': False,
 '<pseudo_page>': False,
 '<operator>': False,
 '<combinator>': False,
 '<unary_operator>': False,
 '<property>': False,
 '<ruleset>': False,
 '<COMMA_S_selector>': False,
 '<selector>': False,
 '<Osqcombinator_selectorCsq>': False,
 '<Osqcombinator_selector_OR_SCsq>': False,
 '<simple_selector>': False,
 '<OsqHASH_OR_XclassX_OR_attrib_OR_pseudoCsq>': False,
 '<XclassX>': False,
 '<element_name>': False,
 '<attrib>': False,
 '<OsqEQUAL_OR_INCLUDES_OR_DASHMATCHCsq>': False,
 '<OsqIDENT_OR_STRINGCsq>

### Partial Evaluation in Python

In [125]:
class PEFuzzer(PyRecCompiledFuzzer):
    def pe_rule(self, rule, depth):
        res = []
        res.append('''\
# rule=%(rule)s len[%(len)d]''' % {'rule':str(rule), 'len':len(rule)})
        if not rule:
            res.append('''\
pass''')
        for token in rule:
            res.append('''\
# token=%(token)s''' % {'token':token})
            if token in self.grammar:
                if self.key_recursion[token]:
                    res.append('''\
# sc
gen_%(key)s(depth+%(depth)s, max_depth)''' % {'key':self.k_to_s(token),'depth':depth+2})
                else:
                    res.append(self.pe_key(token, depth=depth+1))
            else:
                res.append('''\
result.append("%s")''' % self.esc(token))
        return '\n'.join(res)
 
    def pe_key(self, key, depth):
        if depth == self.MAX_PE_DEPTH:
            return 'gen_%(key)s(depth+%(depth)d, max_depth)' % {'key':self.k_to_s(key), 'depth':depth+1}
        rules = self.grammar[key]
        result = ['''\
#* %(key)s begins
if depth + %(depth)d > max_depth:
    result.append(random.choice(pool_of_strings['%(key)s']))
else:''' % {'name':self.k_to_s(key), 'key':key,  'depth':depth+1}]
        if len(rules) == 0:
            result.append('''\
    # indent dummy''')
        elif len(rules) == 1:
            result.append('''\
    # indent inline''')
            result.append(self.add_indent(self.pe_rule(rules[0], depth),'    '))
        else:
            result.append('''\
    val = random.randrange(%(nrules)s)''' % {'nrules':len(rules)})
            result.append('''\
    # indent here<
    if False:
        pass # dummy''')
            assert len(rules) > 1
            for i, rule in enumerate(rules):
                result.append('''\
    elif val == %d:''' % i)               
                result.append(self.add_indent(self.pe_rule(rule, depth),'        '))
            result.append('''\
    # indent here>''')
        result.append('''\
#* %(key)s ends''' % {'key': key})
        return '\n'.join(result)

    def gen_rule_src(self, rule, key, i):
        res = []
        for token in rule:
            if token in self.grammar:
                if token == key:
                    res.append('''\
# not unrolling
gen_%s(depth+1, max_depth)''' % self.k_to_s(token))
                else:
                    res.append('''\
#indent -<''')
                    res.append(self.pe_key(token, depth=0))
                    res.append('''\
#indent >-''')
                    
            else:
                res.append('''\
result.append("%s")''' % self.esc(token))
        return self.add_indent('\n'.join(res), '            ')

    def string_pool_defs(self):
        result =[]
        result.append('''\
pool_of_strings = %s''' % pp_grammar(self.pool_of_strings))
        result.append('''
result = [];''')
        return '\n'.join(result)

    def gen_main_src(self):
        result = []
        result.append('''
import random
import sys
def main(args):
    global result
    max_num, max_depth = get_opts(args)
    for i in range(max_num):
        gen_start(0, max_depth)
        print(''.join(result))
        result = []
main(sys.argv)
    ''')
        return '\n'.join(result)

    def gen_alt_src(self, key):
        rules = self.grammar[key]
        result = []
        result.append('''
def gen_%(name)s(depth, max_depth):
    # %(name)s begins
    if depth > max_depth:
        result.append(random.choice(pool_of_strings['%(key)s']))
    else:
        val = random.randrange(%(nrules)s)''' % {'name':self.k_to_s(key), 'key':key, 'nrules':len(rules)})
        for i, rule in enumerate(rules):
            result.append('''\
        if val == %d:
%s
            return''' % (i, self.gen_rule_src(rule, key, i)))
        result.append('''
    # %(name)s ends
        ''')
        return '\n'.join(result)
    
    def fuzz_src(self, key='<start>'):
        self.MAX_PE_DEPTH = 4
        result = [self.gen_fuzz_src(),
                  self.gen_main_src()]
        return ''.join(result)

In [126]:
with open('testers/grammar_producer_pe.py', 'w+') as f:
    for fn in [get_opts]:
        print(inspect.getsource(fn), file=f)
    result = PEFuzzer(pyc_grammar).fuzz_src()
    print(result, file=f)

In [127]:
!wc -l testers/grammar_producer_pe.py

   45655 testers/grammar_producer_pe.py


In [128]:
!python testers/grammar_producer_pe.py 0 10 10

@charset "\\" ; --> @import  url("\*-- ")   ;  -->  -->  
  @import 	  url(" *	")   ;  <!--  
 	 <!--		 	 @import  '\
'   ;   
 -->-->  
 -->  @media  g  	 ,  o ,  z ,  r  {   }  <!-- --> <!--   @media  x   {    #5   {  c  :  m    ;  ;   } *    {  i  :  #23e     } *    {    }  }  @media   c   {   }  <!-- *     ,  selector,  selector { 	   }   @media  e   {   }  <!-- 
@charset '' ; 	--> @import   url("	$ ")  	 -c9 	   ; 	  
@charset "\p\
" ; -->-->  @import  ""  -t   ;  @import   '\ 1'    ;  <!-- 	-->  	 
 --><!--  #k  ,  selector,   selector {    }  	   --> 
@charset "\\(" ; <!-- @import   ''   ; 		  @import 		  "\)"   ;    
@charset "\^" ;  @import  ''  	 		 -e  ,  o  ;   


In [129]:
class PyPETester(Tester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"python testers/grammar_producer_pe.py {seed} {self.max_num} {max_depth} > {fn}"

In [130]:
PyPETester().run_test().show()

depth= 8 size= 809020 time= 0.752 stdev(0.006) throughput= 1051.2966813309786 stdev(3)
depth= 16 size= 1380400 time= 1.636 stdev(0.054) throughput= 824.5325822462984 stdev(33)
depth= 32 size= 1567121.5 time= 2.032 stdev(0.091) throughput= 753.6236252881176 stdev(30)
depth= 64 size= 1571835.5 time= 2.018 stdev(0.023) throughput= 760.5058080644019 stdev(7)
depth= 128 size= 1571836 time= 2.021 stdev(0.015) throughput= 759.7261109518874 stdev(4)
depth= 256 size= 1571836 time= 2.054 stdev(0.018) throughput= 747.1599450366682 stdev(5)
Throughput of  1051.2966813309786  kilobytes per second at depth =  8
Total time: 0:00:40.129814


### Supercompile in Python

In [131]:
class PySuperCompiledFuzzer(PyRecCompiledFuzzer):    
    def supercompile_rule(self, key, rule, i_rule, depth):
        gen_name = self.kr_to_s(key, i_rule)
        if self.rule_recursion[gen_name]:
            self.current_lst.append(gen_name)
            return '''\
%(gen_name)s(depth_%(depth)d) # recursing''' % {'gen_name':gen_name, 'depth':depth}
        res = []
        if len(rule) == 0:
            res.append('pass')
        else:
            for token in rule:
                if token not in self.grammar:
                    res.append('''\
result.append("%s")''' % self.esc(token))
                else:
                    res.append(# no indent
                        self.supercompile_key(token,
                                              depth=(depth+1)))
        return '\n'.join(res)
    def supercompile_key_internal(self, key, trule, i_trule, depth):
        if depth > self.MAX_SUPERCOMPILE_DEPTH:
            self.current_lst.append(self.kr_to_s(key, i_trule))
            return '%(gen_name)s(depth_%(depth)d) #slimit*' % {
                        'gen_name':self.kr_to_s(key, i_trule), 'depth':depth}
        else:
            return self.supercompile_rule(key, trule, i_trule, depth=depth)
 
    def supercompile_key(self, key, depth):
        # Should check for MAX_SUPERCOMPILE_DEPTH
        # should first get the random number curresponding to
        # len(grammar[key]) then it should unroll that elif cond.
        if len(self.grammar[key]) == 0: return '' # no more jumping on the bed
        res = ['''\
if depth_%(depth)d > max_depth:
    result.append(random.choice(pool_of_%(key)s))
else:
    depth_%(d_1)d = depth + %(d_1)d'''%{
            'key':self.k_to_s(key), 'depth': depth, 'd_1': depth+1}]
        if len(self.grammar[key]) == 1:
            # we do not have to get the random number, and check for
            # equality first.
            i_trule, trule  = 0, self.grammar[key][0]
            res.append(self.add_indent(
                self.supercompile_key_internal(key, trule, i_trule, depth),
                '''\
    '''))
        else:
            # First get the random number, then compare and
            # unroll
            res.append('''\
    val = random.randrange(%(len_rules)d)
    if False: # dummy for elsif
        pass''' % {'len_rules': len(self.grammar[key])})
            for i_trule, trule in enumerate(self.grammar[key]):
                res.append('''\
    elif val == %(i_trule)d:''' % {'i_trule': i_trule})
                res.append(self.add_indent(
                    self.supercompile_key_internal(key, trule, i_trule, depth),
                    '''\
        '''))

        return '\n'.join(res)
   
    def gen_rule_src(self, rule, key, i_rule):
        res = ['''\
def %(gen_name)s(depth):
    if depth > max_depth:
        result.append(random.choice(pool_of_%(key)s))
    else:
        depth_%(d_1)d = depth + %(d_1)d''' % {
            'gen_name':self.kr_to_s(key,i_rule),
            'key':self.k_to_s(key),
            'depth':0, 'd_1': 1}]
        
        # These should be a sequence of getting randon numbers
        # and unrolling appropriately.
        for token in rule:
            if token not in self.grammar:
                res.append('''\
        result.append("%s")''' % self.esc(token))
            else:
                res.append(self.add_indent(
                    self.supercompile_key(token, depth=1), '''\
        '''))
        return '\n'.join(res)

    def gen_main_src(self):
        result = []
        result.append('''
import random
import sys
max_depth = 0
def main(args):
    global result, max_depth
    max_num, max_depth = get_opts(args)
    for i in range(max_num):
        gen_start_0(0)
        print(''.join(result))
        result = []
main(sys.argv)
    ''')
        return '\n'.join(result)
    
    def gen_fuzz_src(self):
        keys_used = {}
        result = [self.string_pool_defs()]
        key_defs = {}
        for key in self.grammar:
            for i,rule in enumerate(self.grammar[key]):
                self.current_lst = []
                ks = self.kr_to_s(key, i)
                keys_used[ks] = self.current_lst
                key_defs[ks] = self.gen_rule_src(rule, key, i)
        key_set = set(keys_used['gen_start_0']) | {'gen_start_0'}
        old_len = 0
        while old_len != len(key_set):
            old_len = len(key_set)
            key_set.update(k1 for k in list(key_set) for k1 in keys_used[k])
            
        for k in key_set:
            result.append(key_defs[k])
        return '\n'.join(result)

    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 100
        result = [self.gen_fuzz_src(),
                  self.gen_main_src()]
        return ''.join(result)

In [132]:
with open('testers/grammar_producer_pysupercompiled.py', 'w+') as f:
    for fn in [get_opts]:
        print(inspect.getsource(fn), file=f)
    result = PySuperCompiledFuzzer(pyc_grammar).fuzz_src()
    print(result, file=f)

In [133]:
!wc -l testers/grammar_producer_pysupercompiled.py

   19676 testers/grammar_producer_pysupercompiled.py


In [134]:
!python testers/grammar_producer_pysupercompiled.py 0 10 10

@charset '' ;   	   @media  -w   ,   j ,  z  { 	  *    { 	 _  :  #be9e32    ;  d  :  #e86316    }  *   ,  selector {    }  }   @media    h    {  *   ,  selector {    }  }  
 -->-->  
  @import  "\#="  	 s 	   ;   
@charset '' ; 	  	  @import  url("** ") 	  ; 	 @import   url(""w"")    ;    -->  @media   _   {   }   * : n(   ) 	   { 	   }  	  --> <!-- 
@charset '\
\"' ; --> @import   url("		")  -is    ,  s ,  a  ;  -->  
 --> @import  				 ''   ; 	 <!-- 	 
 <!--	-->  	  @import   ''  	  ;   
@charset "\" ; <!--	 @import  url(" """)  -bv  	  ;  <!-- @import 		 url("'('") 	  ;  --> 	 @import 		  "I"  b   ;  <!--  
@charset '' ;  @import 	 url("$ ") 	 		 kp  ,  n  ;   
@charset '' ;   


In [135]:
class PySupercompiledTester(Tester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"python testers/grammar_producer_pysupercompiled.py {seed} {self.max_num} {max_depth} > {fn}"

In [136]:
PySupercompiledTester().run_test().show()

depth= 8 size= 808534 time= 0.706 stdev(0.008) throughput= 1119.1780991388514 stdev(1)
depth= 16 size= 1368361 time= 1.557 stdev(0.089) throughput= 859.435245321564 stdev(42)
depth= 32 size= 1564651.5 time= 1.96 stdev(0.055) throughput= 779.7500192877299 stdev(12)
depth= 64 size= 1571573 time= 1.954 stdev(0.083) throughput= 785.9478044241946 stdev(24)
depth= 128 size= 1571543 time= 1.951 stdev(0.069) throughput= 786.9546986929877 stdev(18)
depth= 256 size= 1571543 time= 2.016 stdev(0.017) throughput= 761.3306088221117 stdev(16)
Throughput of  1119.1780991388514  kilobytes per second at depth =  8
Total time: 0:00:38.844699


### Compile to C

In [137]:
class CTrans(Sanitize):
    def split_tokens(self, t, grammar):
        if t in grammar: return [t]
        my_tokens = []
        esc = {
           '\r': '\\r',
           '\n': '\\n',
           '\t': '\\t',
           '\\': '\\\\',
        }
        for i in t:
            #if i in esc:
            #    my_tokens.append(esc[i])
            #else:
                my_tokens.append(i)
        return my_tokens

In [138]:
c_grammar = CTrans(my_grammar).translate()

In [139]:
c_grammar

{'<start>': [['<stylesheet>']],
 '<stylesheet>': [['<OsqCHARSET_SYM_STRING_SEMICsq_1>',
   ' ',
   '<OsqS_OR_CDO_OR_CDCCsq_1>',
   ' ',
   '<OsqXimportX_CDO_S_OR_CDC_SCsq_1>',
   ' ',
   '<Osqstylesheet_closing_GROUPINGCsq_1>']],
 '<OsqCHARSET_SYM_STRING_SEMICsq>': [['<CHARSET_SYM>',
   ' ',
   '<STRING>',
   ' ',
   ';']],
 '<OsqS_OR_CDO_OR_CDCCsq>': [['<Sp>'], ['<CDO>'], ['<CDC>']],
 '<OsqXimportX_CDO_S_OR_CDC_SCsq>': [['<XimportX>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_1>']],
 '<OsqCDO_S_OR_CDC_SCsq>': [['<CDO>', ' ', '<Ss>'], ['<CDC>', ' ', '<Ss>']],
 '<Osqruleset_OR_media_OR_pageCsq>': [['<ruleset>'], ['<media>'], ['<page>']],
 '<Osqstylesheet_closing_GROUPINGCsq>': [['<Osqruleset_OR_media_OR_pageCsq>',
   ' ',
   '<OsqCDO_S_OR_CDC_SCsq_2>']],
 '<XimportX>': [['<IMPORT_SYM>',
   ' ',
   '<Ss>',
   ' ',
   '<OsqSTRING_OR_URICsq>',
   ' ',
   '<Ss>',
   ' ',
   '<media_list_1>',
   ' ',
   ';',
   ' ',
   '<Ss>']],
 '<OsqSTRING_OR_URICsq>': [['<STRING>'], ['<URI>']],
 '<media>': [['<ME

We compile a grammar into a C program that produces from the grammar.  Just how fast can we be?

In [140]:
class CFuzzer(PyRecCompiledFuzzer):    
    def cheap_chars(self, string):
        # to be embedded within single quotes
        escaped = {'t':'\t', 'n': '\n', "'": "\\'", "\\":"\\\\", 'r': '\r'}
        slst = []
        while string:
            c, *string = string
            if c in {'\\'}:
                c1, *string = string
                slst.append(escaped[c1])
            elif c in {"'"}:
                slst.append("\'")
            else:
                slst.append(c)
        return slst
    
    def gen_rule_src(self, rule, key, i):
        res = []
        for token in rule:
            if token in self.grammar:
                res.append('gen_%s(depth +1);' % self.k_to_s(token))
            else:
                res.append("out('%s');" % self.esc_char(token))
        return '\n        '.join(res)

    def gen_alt_src(self, k):
        rules = self.grammar[k]
        cheap_strings = self.pool_of_strings[k]
        result = ['''
void gen_%(name)s(int depth) {
    if (depth > max_depth) {
        int val = map(%(num_cheap_strings)d);
        const char* str = pool_%(name)s[val];
        const int str_l = pool_l_%(name)s[val];
        for (int i = 0; i < str_l; i++) {
            out(str[i]);
        }
        return;
    }

    int val = map(%(nrules)d);
    switch(val) {''' % {'name':self.k_to_s(k), 'nrules':len(rules),
                        'num_cheap_strings': len(cheap_strings),
                       }]
        for i, rule in enumerate(rules):
            result.append('''
    case %d:
        %s
        break;''' % (i, self.gen_rule_src(rule, k, i)))
        result.append('''
    }
}
    ''')
        return '\n'.join(result)
    
    def string_pool_defs(self):
        result = []
        for k in self.grammar:
            cheap_strings = self.pool_of_strings[k]
            result.append('''
const char* pool_%(k)s[] =  {%(cheap_strings)s};
const int pool_l_%(k)s[] =  {%(cheap_strings_len)s};
        ''' % {'k':self.k_to_s(k),
               'cheap_strings': ', '.join(['"%s"' % self.esc(s) for s in cheap_strings]),
               'cheap_strings_len': ', '.join([str(len(s)) for s in cheap_strings])})
        return '\n'.join(result)

    
    def fn_fuzz_decs(self):
        result = []
        for k in self.grammar:
            result.append('''void gen_%s(int depth);''' % self.k_to_s(k))
        return '\n'.join(result)
    
    def fn_map_def(self):
        return '''
int map(int v) {
    return random() % v;
}
 '''    
    def fn_out_def(self):
        return '''
void out(const char s) {
    fputc(s, stdout);
}       
 '''

    def fuzz_hdefs(self):
        return '''
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
'''
    
    def fuzz_out_var_defs(self):
        return '''
void out(const char s);'''
    
    def fuzz_rand_var_defs(self):
        return '''
int map(int v);'''
    def fuzz_stack_var_defs(self):
        return '''
extern int max_depth;'''

    def fuzz_var_defs(self):
        return '\n'.join([self.fuzz_out_var_defs(), self.fuzz_rand_var_defs(), self.fuzz_stack_var_defs()])

    def fn_main_input_frag(self):
        return '''
    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);'''
    
    def fn_main_loop_frag(self):
        return '''
    for(int i=0; i < max_num; i++) {
        gen_init__();
    }'''

    def fn_main_def(self):
        result = '''
int main(int argc, char** argv) {
    int seed, max_num;
%(input_frag)s
    //srandom(time(0));
    srandom(seed);
%(loop_frag)s
    return 0;
}''' % {'input_frag':self.fn_main_input_frag(),
        'loop_frag': self.fn_main_loop_frag()}
        return result
    
    def main_stack_var_defs(self):
        return '''
int max_depth = 0;'''
    
    def main_init_var_defs(self):
        return '''
void gen_init__();'''
    
    def main_var_defs(self):
        return '\n'.join([self.main_stack_var_defs(), self.main_init_var_defs()])
    
    def fuzz_fn_defs(self):
        result = []
        for key in self.grammar:
            result.append(self.gen_alt_src(key))
        return '\n'.join(result)
    
    def fuzz_entry(self):
        return '''
void gen_init__() {
    gen_start(0);
    out('\\n');
    return;
}'''

    def main_hdefs(self):
        return '''
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
'''

    def gen_main_src(self):
        return '\n'.join([self.main_hdefs(),
                          self.main_var_defs(),
                          self.fn_map_def(),
                          self.fn_out_def(),
                          self.fn_main_def()])
    
    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_hdefs(),
                          self.fuzz_var_defs(),
                          self.fn_fuzz_decs(),
                          self.string_pool_defs(),
                          self.fuzz_fn_defs(),
                          self.fuzz_entry()])

    def fuzz_src(self, key='<start>'):
        return self.gen_main_src(), self.gen_fuzz_src()

In [141]:
main_src, fuzz_src = CFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_c_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/grammar_producer_c_main.c', 'w+') as f:
    print(main_src, file=f)

In [142]:
!nl -ba testers/grammar_producer_c_fuzz.c

     1	
     2	#include <stdlib.h>
     3	#include <stdio.h>
     4	#include <time.h>
     5	#include <string.h>
     6	
     7	
     8	void out(const char s);
     9	
    10	int map(int v);
    11	
    12	extern int max_depth;
    13	void gen_start(int depth);
    14	void gen_stylesheet(int depth);
    15	void gen_OsqCHARSET_SYM_STRING_SEMICsq(int depth);
    16	void gen_OsqS_OR_CDO_OR_CDCCsq(int depth);
    17	void gen_OsqXimportX_CDO_S_OR_CDC_SCsq(int depth);
    18	void gen_OsqCDO_S_OR_CDC_SCsq(int depth);
    19	void gen_Osqruleset_OR_media_OR_pageCsq(int depth);
    20	void gen_Osqstylesheet_closing_GROUPINGCsq(int depth);
    21	void gen_XimportX(int depth);
    22	void gen_OsqSTRING_OR_URICsq(int depth);
    23	void gen_media(int depth);
    24	void gen_media_list(int depth);
    25	void gen_OsqCOMMA_S_mediumCsq(int depth);
    26	void gen_medium(int depth);
    27	void gen_page(int depth);
    28	void gen_OsqSEMI_S_declarationCsq(int depth);
    29	

In [143]:
!cat testers/grammar_producer_c_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>


int max_depth = 0;

void gen_init__();

int map(int v) {
    return random() % v;
}
 

void out(const char s) {
    fputc(s, stdout);
}       
 

int main(int argc, char** argv) {
    int seed, max_num;

    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);
    //srandom(time(0));
    srandom(seed);

    for(int i=0; i < max_num; i++) {
        gen_init__();
    }
    return 0;
}


In [144]:
%cd testers
!cc -Ofast grammar_producer_c_main.c grammar_producer_c_fuzz.c  -o grammar_producer_c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [145]:
!./testers/grammar_producer_c 0 10 10

@charset '' ;  @import  '' 	  ;  @import   url("")   m   ,  x ,  t ,  n  ;   
@charset '\m' ;  @import  '\\\
\' 	  ;  @import 	 "" 	  l   ;  			  @page   {    }  
 <!--  @media  f 	  {   }   @page    { 	  v  :    8khz     }  -->  --> 		
@charset "\*\" ;  @import  'Y\]k' 	  ;  @import  ''  n   ; 		  
@charset "" ;   
@charset '' ; 	-->  
@charset 'o9\G' ; <!--  
  @import  '\G\S\
'  e  	 ,  n ,  g  ;  <!-- -->   
   
 <!-- @import 	 "@\
"   ;   


In [146]:
# II
class CTester(Tester):
    def __init__(self, name=None, max_num=10000, start_depth=3, limit_depth=9, timeout=3600, iterations=100):
        super().__init__(name, max_num, start_depth, limit_depth, timeout)

    def exec_program(self, seed, max_depth, t):
        # seed, maxnum, max_depth
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_c {seed} {self.max_num} {max_depth} > {fn}"

In [147]:
CTester().run_test().show()

depth= 8 size= 807582 time= 0.043 stdev(0.001) throughput= 18558.91007838455 stdev(281)
depth= 16 size= 1383654 time= 0.082 stdev(0.001) throughput= 16579.980865726626 stdev(124)
depth= 32 size= 1577385 time= 0.096 stdev(0.001) throughput= 16129.890361585116 stdev(30)
depth= 64 size= 1588344.5 time= 0.098 stdev(0.001) throughput= 15831.039992791964 stdev(459)
depth= 128 size= 1588344.5 time= 0.101 stdev(0.005) throughput= 15458.280462012415 stdev(986)
depth= 256 size= 1588344.5 time= 0.097 stdev(0.002) throughput= 16075.073950221537 stdev(120)
Throughput of  18558.91007838455  kilobytes per second at depth =  8
Total time: 0:00:12.991791


### Partial Evaluation in C

In [148]:
class PECFuzzer(CFuzzer):
    def pe_rule(self, rule, seen, depth):
        res = []
        res.append('''\
/* rule=%(rule)s len[%(len)d]*/''' % {'rule': str(rule), 'len': len(rule)})
        if not rule:
            res.append('''
/*break;*/
            ''')
        for token in rule:
            res.append('''
/* token=%(token)s*/''' % {'token': token})
            if token in self.grammar:
                if token in seen:
                    res.append('''\
/* sc */
gen_%(key)s(depth+%(depth)d);''' % {'key': self.k_to_s(token), 'depth': depth+2})
                else:
                    res.append(self.pe_key(token, seen=(seen | {token}), depth=depth+1))
            else:
                res.append('''\
out('%s');''' % self.esc_char(token))
        return '\n'.join(res)
        
        
    def pe_key(self, key, seen, depth):
        if depth == self.MAX_PE_DEPTH:
            return 'gen_%(key)s(depth+%(depth)d);' % {'key': self.k_to_s(key), 'depth': depth+1}
        rules = self.grammar[key] # ordered by the cost
        cheap_strings = self.pool_of_strings[key] 
        # we haven't restricted map to 256 yet.
        result = ['''\
/* %(key)s begins*/
if ((depth + %(depth)d) > max_depth) {
    int val = map(%(num_cheap_strings)d);
    const char* str = pool_%(name)s[val];
    const int str_l = pool_l_%(name)s[val];
    for (int i = 0; i < str_l; i++) {
        out(str[i]);
    }
} else {''' %  {'name':self.k_to_s(key),
                'key': key,
                'depth': depth+1,
                'num_cheap_strings': len(cheap_strings)}]
        if len(rules) == 0:
            result.append('''\
    /*indent dummy*/
            ''')
        elif len(rules) == 1:
            result.append('''\
    /*indent inline*/
            ''')
            result.append(self.add_indent(self.pe_rule(rules[0], seen, depth),'    '))
        else:
            result.append('''\
    int val = map(%(nrules)d);
            ''' % {'nrules': len(rules)})
            result.append('''\
    switch(val) {
            ''')
            for i, rule in enumerate(rules):
                result.append('''\
    case %d:
        {''' % i)
                result.append(self.add_indent(self.pe_rule(rule, seen, depth),'            '))
                result.append('''\
            break;
        }''')
            result.append('''\
    }''')
        result.append('''\
}
/* %(key)s ends*/''' % {'key': key})
        return '\n'.join(result)
        
    def gen_rule_src(self, rule, key, i):
        res = []
        for token in rule:
            if token in self.grammar:
                if token == key:
                    res.append('''\
/* not unrolling*/
gen_%(key)s(depth +1);''' % {'key':self.k_to_s(token)})
                else:
                    res.append('''\
/*indent -<*/''')
                    res.append(self.pe_key(token, seen={key, token}, depth=0))
                    res.append('''\
/*indent >-*/''')
            else:
                res.append("out('%s');" % self.esc_char(token))
        return '\n'.join(res)
    
    def fuzz_src(self, key='<start>'):
        self.MAX_PE_DEPTH = 4
        return self.gen_main_src(), self.gen_fuzz_src()

In [149]:
main_src, fuzz_src = PECFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_pec_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/grammar_producer_pec_main.c', 'w+') as f:
    print(main_src, file=f)

In [150]:
!wc -l testers/grammar_producer_pec_fuzz.c

  109307 testers/grammar_producer_pec_fuzz.c


In [151]:
!cat testers/grammar_producer_pec_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>


int max_depth = 0;

void gen_init__();

int map(int v) {
    return random() % v;
}
 

void out(const char s) {
    fputc(s, stdout);
}       
 

int main(int argc, char** argv) {
    int seed, max_num;

    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);
    //srandom(time(0));
    srandom(seed);

    for(int i=0; i < max_num; i++) {
        gen_init__();
    }
    return 0;
}


In [152]:
%cd testers
!cc -Ofast grammar_producer_pec_main.c grammar_producer_pec_fuzz.c  -o grammar_producer_pec
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [153]:
!./testers/grammar_producer_pec 0 10 10

@charset '\$' ;  @import   ''    ; 	 <!-- -->  --> --> @import 	    url(" *")   ;   @import    ")" 		   ;  @import  ''  q   ;  --> --> @import  ""   ;   @import  ""   ;  @import  ""   ;  <!-- @import  ""   ;  @import  ''   ;   
@charset '' ; 		  @import  "\m\
" 		  ; 	   <!-- 	 
   j   ,  selector {    ; 		 f  :  #33f   @import  }  
  @import  '\6u'   ;   w #b +  *      {   p  :  url("")   #653096  #b06228    } 		 *  +  *    {   ;    }  
  @import   url("	"s"")  -s3 		 ,  y  ;  -->  : b(   )   { 	  	 k   :   #245f90   #ec0e2f  #fdf    } 	 @page  : j  {  r  :  t(  #4b9915   )   #7beffd  @import   }  
   
 <!--  @page   { 	   }  <!--   --> 
  @import   url("%")   ;   @import  ''   i   ;  @import  ''   ;     
  @import  url("	'\B'	") 	 -v   ,  x ,  a ,  e  ;  --> <!-- --> 		 : l >  .x   {      }     -->  .m  ,  selector,  selector {  u  : 	 url("")   #c5b0ab   ;  m  :  #cae   ;   }  --> *   ,  selector,  selector { 	   } 	 @media  c  ,  g  {    }  
@charset "" ; --> @import  '

In [154]:
class CPETester(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_pec {seed} {self.max_num} {max_depth} > {fn}"

In [155]:
CPETester().run_test().show()

depth= 8 size= 813969 time= 0.041 stdev(0.0) throughput= 19387.600038109755 stdev(29)
depth= 16 size= 1388865.5 time= 0.084 stdev(0.003) throughput= 16154.419153032826 stdev(465)
depth= 32 size= 1593247 time= 0.098 stdev(0.001) throughput= 15877.827078777465 stdev(172)
depth= 64 size= 1596752.5 time= 0.097 stdev(0.001) throughput= 16076.459612165178 stdev(124)
depth= 128 size= 1596752.5 time= 0.105 stdev(0.001) throughput= 14851.411550934325 stdev(98)
depth= 256 size= 1596752.5 time= 0.097 stdev(0.001) throughput= 16158.87451171875 stdev(8)
Throughput of  19387.600038109755  kilobytes per second at depth =  8
Total time: 0:00:12.925107


### Supercompile C

In [156]:
# II
class CSuperCompiledFuzzer(CFuzzer):

    def kr_to_s(self, key, i): return 'gen_%s_%d' % (self.k_to_s(key), i)
    
    def supercompile_rule(self, key, rule, i_rule, depth):
        gen_name = self.kr_to_s(key, i_rule)
        if self.rule_recursion[gen_name]:
            self.current_lst.append(gen_name)
            return '''\
%(gen_name)s(depth_%(depth)d); /* recurse*/''' % {'gen_name':gen_name, 'depth':depth}
        res = []
        if len(rule) == 0:
            res.append('/*pass*/')
        else:
            for token in rule:
                if token not in self.grammar:
                    res.append('''\
out('%s');''' % self.esc_char(token))
                else:
                    res.append(# no indent
                        self.supercompile_key(token,
                                              depth=(depth+1)))
        return '\n'.join(res)
    
    def supercompile_key_internal(self, key, trule, i_trule, depth):
        if depth > self.MAX_SUPERCOMPILE_DEPTH:
            self.current_lst.append(self.kr_to_s(key, i_trule))
            return '%(gen_name)s(depth_%(depth)d); /*slimit*/' % {
                        'gen_name':self.kr_to_s(key, i_trule), 'depth':depth}
        else:
            return self.supercompile_rule(key, trule, i_trule, depth=depth)
        
    def choose_from_cheap_strings(self, key):
        cheap_strings = self.pool_of_strings[key]
        if len(cheap_strings) == 1:
            return '\n'.join(["out('%s');" % c for c in self.cheap_chars(cheap_strings[0])])
        l = [len(s) for s in cheap_strings]
        if len(set(l)) == 1:
            name = ['''\
int val = map(%(num_cheap_strings)d);
const char* str = pool_%(name)s[val];''' % {
                'name':self.k_to_s(key), 'num_cheap_strings': len(cheap_strings)}]
            out = ["out(str[%d]);" % i for i in range(l[0])]
            return '\n'.join(name + out)
        else:
            return '''\
int val = map(%(num_cheap_strings)d);
const char* str = pool_%(name)s[val];
const int str_l = pool_l_%(name)s[val];
for (int i = 0; i < str_l; i++) {
    out(str[i]);
}
    '''%{
            'name':self.k_to_s(key),
            'num_cheap_strings': len(cheap_strings)}

    def supercompile_key(self, key, depth):
        # Should check for MAX_SUPERCOMPILE_DEPTH
        # should first get the random number curresponding to
        # len(grammar[key]) then it should unroll that elif cond.
        if len(self.grammar[key]) == 0: return '' # no more jumping on the bed

        res = ['''\
if (depth_%(depth)d > max_depth) {
%(select_from_pool)s
} else {
    int depth_%(d_1)d = depth + %(d_1)d;'''%{
            'select_from_pool': self.add_indent(self.choose_from_cheap_strings(key), '    '),
            'depth': depth,
            'd_1': depth+1}]
        if len(self.grammar[key]) == 1:
            # we do not have to get the random number, and check for
            # equality first.
            i_trule, trule  = 0, self.grammar[key][0]
            res.append(self.add_indent(
                self.supercompile_key_internal(key, trule, i_trule, depth),
                '''\
    '''))
        else:
            # First get the random number, then compare and
            # unroll
            res.append('''\
    int val = map(%(len_rules)d);
    switch(val) {''' % {'len_rules': len(self.grammar[key])})
            for i_trule, trule in enumerate(self.grammar[key]):
                res.append('''\
    case %(i_trule)d:
        {''' % {'i_trule': i_trule})
                res.append(self.add_indent(
                    self.supercompile_key_internal(key, trule, i_trule, depth),
                    '''\
            '''))
                res.append('''\
            break;
        } /*case %d*/''' % i_trule)
            res.append('''\
    }/*switch*/''')
        res.append('''\
}/*ifelse %d*/''' % depth)
        return '\n'.join(res)
   
    def gen_rule_src(self, rule, key, i_rule):
        res = ['''\
void %(gen_name)s(int depth) {
    if (depth > max_depth) {
%(select_from_pool)s
    } else {
        int depth_%(d_1)d = depth + %(d_1)d; ''' % {
            'gen_name':self.kr_to_s(key,i_rule),
            'select_from_pool': self.add_indent(self.choose_from_cheap_strings(key),'        '),
            'depth':0,
            'd_1': 1}]
        # These should be a sequence of getting randon numbers
        # and unrolling appropriately.
        for token in rule:
            if token not in self.grammar:
                res.append('''\
        out('%s');''' % self.esc_char(token))
            else:
                res.append(self.add_indent(
                    self.supercompile_key(token, depth=1), '''\
        '''))
        res.append('''\
    } /*else*/
} /* %s */''' % self.kr_to_s(key, i_rule))
        return '\n'.join(res)
    
    # ----  
 
    def fn_fuzz_decs(self):
        result = []
        for k in self.grammar:
            for i,r in enumerate(self.grammar[k]):
                result.append('void %(name)s(int depth);' % {'name':self.kr_to_s(k, i)})
        return '\n'.join(result)
    
    def fuzz_fn_defs(self):
        keys_used = {}
        result = []
        key_defs = {}
        for key in self.grammar:
            for i_rule,rule in enumerate(self.grammar[key]):
                self.current_lst = []
                ks = self.kr_to_s(key, i_rule)
                keys_used[ks] = self.current_lst
                key_defs[ks] = self.gen_rule_src(rule, key, i_rule)
        key_set = set(keys_used['gen_start_0']) | {'gen_start_0'}
        old_len = 0
        while old_len != len(key_set):
            old_len = len(key_set)
            key_set.update(k1 for k in list(key_set) for k1 in keys_used[k])
            
        for k in key_set:
            result.append(key_defs[k])
        return '\n'.join(result)
 
    def fuzz_entry(self):
        return '''
void gen_init__() {
    gen_start_0(0);
    out('\\n');
    return;
}'''
 
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 0
        return self.gen_main_src(), self.gen_fuzz_src()

Going full supercompilation (below) actually reduces the speed by a small %. It is not clear why.

In [157]:
class CSuperCompiledFuzzer(CSuperCompiledFuzzer):
    def string_pool_defs(self): return ''
    
    def choose_from_cheap_strings(self, key):
        short = False
        cheap_strings = self.pool_of_strings[key]
        if len(cheap_strings) == 1:
            return '\n'.join(["out('%s');" % self.esc_char(c) for c in cheap_strings[0]])
        elif len(cheap_strings) == 0:
            return ''
        else:
            lst = ['''
int val = map(%(num_cheap_strings)d);
switch(val){'''% {
            'name':self.k_to_s(key),
            'num_cheap_strings': len(cheap_strings)}]
            for i in range(len(cheap_strings)):
                lst.append('''
case %d:
    {''' % i)
                lst.extend(["    out('%s');" % self.esc_char(c) for c in  cheap_strings[i]])
                lst.append('''
    break;
    }''')
            lst.append('''
}''')
            return '\n'.join(lst)

In [158]:
main_src, fuzz_src = CSuperCompiledFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_superc_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/grammar_producer_superc_main.c', 'w+') as f:
    print(main_src, file=f)

In [159]:
!wc -l testers/grammar_producer_superc_fuzz.c

  363562 testers/grammar_producer_superc_fuzz.c


In [160]:
!cat testers/grammar_producer_superc_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>


int max_depth = 0;

void gen_init__();

int map(int v) {
    return random() % v;
}
 

void out(const char s) {
    fputc(s, stdout);
}       
 

int main(int argc, char** argv) {
    int seed, max_num;

    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);
    //srandom(time(0));
    srandom(seed);

    for(int i=0; i < max_num; i++) {
        gen_init__();
    }
    return 0;
}


In [161]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [162]:
!./testers/grammar_producer_superc 0 10 10

@charset '\
7' ; <!--	   *     *    { 	  a  : 	 url("""")   @import  ;  	 c  :  #f4465a   @import ;  j  :  #59f    }  @media  n   ,  v ,  s  {   }    --> 		w   ,  selector { 	  ;   } 		 --> 
  @import  url(" ")   ;  @import 	  url(" ''") 	  ;  <!-- 			--> -->   
 	<!--  
@charset "\" ;  @import  url(" "\["")   ;  --> @import  "\[" 	 o  ,  p  ;  <!-- <!-- <!--  @page     {    }  @page 	 : h 	 { 	 	 d  :   8mm    ; 	 ;  i  :  #4d6    }   
    
@charset "Y\k\?\p" ;   
 <!-- @import 	 url("""	") 	  ; 	  @page  : t  {  t 	 :   1cm     }  --> <!-- <!-- 	--> 
  @import    	  ''  w 		  ;  @import  '' 	  ;  -->    <!-- --> --> <!-- --> @import  '$'   ; 	  @page   {   u   :   1em   #440dc4  @import   } 	  
   
   


In [163]:
class CSupercompiledTester(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_superc {seed} {self.max_num} {max_depth} > {fn}"

In [164]:
CSupercompiledTester().run_test().show() # 0

depth= 8 size= 817229 time= 0.041 stdev(0.0) throughput= 19465.248666158535 stdev(244)
depth= 16 size= 1392425 time= 0.078 stdev(0.001) throughput= 17433.141509894376 stdev(7)
depth= 32 size= 1573788.5 time= 0.09 stdev(0.002) throughput= 16984.42276664402 stdev(177)
depth= 64 size= 1575617.5 time= 0.091 stdev(0.002) throughput= 16818.196089549732 stdev(166)
depth= 128 size= 1575617.5 time= 0.091 stdev(0.0) throughput= 16908.66994333791 stdev(225)
depth= 256 size= 1575617.5 time= 0.095 stdev(0.01) throughput= 16273.841681114915 stdev(1480)
Throughput of  19465.248666158535  kilobytes per second at depth =  8
Total time: 0:00:12.856686


In [165]:
class CSuperCompiledFuzzer(CFuzzer):
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 1
        return self.gen_main_src(), self.gen_fuzz_src()

In [166]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [167]:
CSupercompiledTester().run_test().show() # 1

depth= 8 size= 817229 time= 0.041 stdev(0.001) throughput= 19706.411668730943 stdev(98)
depth= 16 size= 1392425 time= 0.079 stdev(0.001) throughput= 17321.422662335688 stdev(165)
depth= 32 size= 1573788.5 time= 0.091 stdev(0.001) throughput= 16796.40765675391 stdev(89)
depth= 64 size= 1575617.5 time= 0.093 stdev(0.003) throughput= 16549.34509877458 stdev(283)
depth= 128 size= 1575617.5 time= 0.099 stdev(0.013) throughput= 15658.412905092591 stdev(1806)
depth= 256 size= 1575617.5 time= 0.098 stdev(0.011) throughput= 15863.860212053569 stdev(1516)
Throughput of  19706.411668730943  kilobytes per second at depth =  8
Total time: 0:00:12.938362


In [168]:
class CSuperCompiledFuzzer(CFuzzer):
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 2
        return self.gen_main_src(), self.gen_fuzz_src()

In [169]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [170]:
CSupercompiledTester().run_test().show() # 2

depth= 8 size= 817229 time= 0.053 stdev(0.008) throughput= 15268.787822867833 stdev(2633)
depth= 16 size= 1392425 time= 0.088 stdev(0.004) throughput= 15382.823481024789 stdev(899)
depth= 32 size= 1573788.5 time= 0.11 stdev(0.013) throughput= 14130.778131565126 stdev(1551)
depth= 64 size= 1575617.5 time= 0.12 stdev(0.001) throughput= 12824.305052922511 stdev(322)
depth= 128 size= 1575617.5 time= 0.115 stdev(0.001) throughput= 13379.819894226406 stdev(14)
depth= 256 size= 1575617.5 time= 0.113 stdev(0.013) throughput= 13786.42280477827 stdev(1829)
Throughput of  15382.823481024789  kilobytes per second at depth =  16
Total time: 0:00:13.124186


In [171]:
class CSuperCompiledFuzzer(CFuzzer):
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 3
        return self.gen_main_src(), self.gen_fuzz_src()

In [172]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [173]:
CSupercompiledTester().run_test().show() # 3

depth= 8 size= 817229 time= 0.054 stdev(0.009) throughput= 14839.209217842812 stdev(2320)
depth= 16 size= 1392425 time= 0.095 stdev(0.011) throughput= 14431.719634911004 stdev(1984)
depth= 32 size= 1573788.5 time= 0.108 stdev(0.004) throughput= 14175.537856363146 stdev(647)
depth= 64 size= 1575617.5 time= 0.103 stdev(0.017) throughput= 15127.668539850096 stdev(2293)
depth= 128 size= 1575617.5 time= 0.113 stdev(0.006) throughput= 13638.345779853269 stdev(864)
depth= 256 size= 1575617.5 time= 0.096 stdev(0.007) throughput= 16063.72290785483 stdev(970)
Throughput of  16063.72290785483  kilobytes per second at depth =  256
Total time: 0:00:13.072541


In [174]:
class CSuperCompiledFuzzer(CFuzzer):
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 5
        return self.gen_main_src(), self.gen_fuzz_src()

In [175]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [176]:
CSupercompiledTester().run_test().show() # 5

depth= 8 size= 817229 time= 0.041 stdev(0.001) throughput= 19706.411668730943 stdev(98)
depth= 16 size= 1392425 time= 0.077 stdev(0.001) throughput= 17544.924476565102 stdev(165)
depth= 32 size= 1573788.5 time= 0.096 stdev(0.006) throughput= 16122.028835422389 stdev(865)
depth= 64 size= 1575617.5 time= 0.095 stdev(0.006) throughput= 16219.056605287073 stdev(750)
depth= 128 size= 1575617.5 time= 0.093 stdev(0.004) throughput= 16557.244873046875 stdev(535)
depth= 256 size= 1575617.5 time= 0.092 stdev(0.002) throughput= 16636.309356368365 stdev(160)
Throughput of  19706.411668730943  kilobytes per second at depth =  8
Total time: 0:00:12.701946


In [177]:
class CSuperCompiledFuzzer(CFuzzer):
    def fuzz_src(self, key='<start>'):
        self.MAX_SUPERCOMPILE_DEPTH = 10
        return self.gen_main_src(), self.gen_fuzz_src()

In [178]:
%cd testers
!cc -Ofast grammar_producer_superc_main.c grammar_producer_superc_fuzz.c  -o grammar_producer_superc
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [179]:
CSupercompiledTester().run_test().show() # 10

depth= 8 size= 817229 time= 0.048 stdev(0.008) throughput= 17012.46126214342 stdev(2576)
depth= 16 size= 1392425 time= 0.083 stdev(0.001) throughput= 16484.218313803995 stdev(447)
depth= 32 size= 1573788.5 time= 0.112 stdev(0.001) throughput= 13661.068613701013 stdev(92)
depth= 64 size= 1575617.5 time= 0.106 stdev(0.0) throughput= 14515.933630601416 stdev(193)
depth= 128 size= 1575617.5 time= 0.116 stdev(0.004) throughput= 13216.386266677724 stdev(577)
depth= 256 size= 1575617.5 time= 0.109 stdev(0.008) throughput= 14080.830429230247 stdev(813)
Throughput of  17012.46126214342  kilobytes per second at depth =  8
Total time: 0:00:13.049628


## Making the random -> choices map faster.

### [Replace the division by mulitplication](//https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)

For convenience, here is our map function.
```c
int map(int v) {
    return random() % v;
}
```

```c
uint32_t
__attribute__((always_inline))
map(uint32_t to) {
    uint32_t from = random();
    return ((uint64_t) from * (uint64_t) to) >> 32;
}
```

### Replace random() by a faster pseudo random.

[Xoshiro**](http://xoshiro.di.unimi.it/xoshiro128starstar.c)

In [180]:
# https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 
class CFuzzerPRNG(CFuzzer):
    def fn_map_def(self):
        return '''
uint64_t next(void);
uint32_t map(uint32_t to) {
    uint32_t from = next();
    return ((uint64_t) from * (uint64_t) to) >> 32;
}

static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t next(void) {
    const uint64_t result_starstar = rotl(r__s[1] * 5, 7) * 9;

    const uint64_t t = r__s[1] << 17;

    r__s[2] ^= r__s[0];
    r__s[3] ^= r__s[1];
    r__s[1] ^= r__s[2];
    r__s[0] ^= r__s[3];

    r__s[2] ^= t;

    r__s[3] = rotl(r__s[3], 45);

    return result_starstar;
}
'''

In [181]:
main_src, fuzz_src = CFuzzerPRNG(c_grammar).fuzz_src()
with open('testers/grammar_producer_cprng_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_cprng_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [182]:
!cat testers/grammar_producer_cprng_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>


int max_depth = 0;

void gen_init__();

uint64_t next(void);
uint32_t map(uint32_t to) {
    uint32_t from = next();
    return ((uint64_t) from * (uint64_t) to) >> 32;
}

static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t next(void) {
    const uint64_t result_starstar = rotl(r__s[1] * 5, 7) * 9;

    const uint64_t t = r__s[1] << 17;

    r__s[2] ^= r__s[0];
    r__s[3] ^= r__s[1];
    r__s[1] ^= r__s[2];
    r__s[0] ^= r__s[3];

    r__s[2] ^= t;

    r__s[3] = rotl(r__s[3], 45);

    return result_starstar;
}


void out(const char s) {
    fputc(s, stdout);
}       
 

int main(int argc, char** argv) {
    int seed, max_num;

    if (argc < 3) {
       

In [183]:
%cd testers
!cc -Ofast grammar_producer_cprng_main.c grammar_producer_cprng_fuzz.c -o grammar_producer_cprng
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [184]:
class CTesterPRNG(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_cprng {seed} {self.max_num} {max_depth} > {fn}"

In [185]:
!./testers/grammar_producer_cprng 0 10 10

   --> @import  url("	")   q1   ;  <!-- 		--> 	--> -->  @import   url("")   	  ;   @import  'k'   ;  --> @import  ""   ;   <!--  --> -->  
@charset '' ;  @import  url("	"\
7"")   ; 			   @import  '\,'   b 	 ,  g  ;  @import  	 "`"    ;  <!-- 	@import 		 url(" ") 		 v  ,  j  ;  -->   .k  ,  selector, 	 selector,  selector,  selector {  _  :  #4a8   #a1a  @import   }   -->  --> 	
@charset "\B	" ;  @import 		 url("	'\' ") 	 ch   ,  c ,  p  ;     *   ,  selector {  			   }  <!-- 
@charset "" ;  @import   url("	"" ")   	  ; 	 	 	  
@charset "" ;  @import  ''      y 	   ;  --> @import 	 ''  v 	 ,  f  ; 	  
@charset '\
' ;  @import  url("& ")  	  -z8     ; 			 <!--  @page   {  a  :  	 ""   #8bb631  @import   } 	 @media  y   { 	 .s 	  ,  selector,  selector {     } *    {   ;   }  }  z [  w   ]  ,  selector { 	  x  :  #42038e   @import   }  --> 
@charset '\
\r' ; <!--  
   .g: x(   )     *   , 	 selector,   selector { 	   }  
@charset "" ;  @import  "\"  -k3  ,  i ,  y ,  v  ;  

In [186]:
CTesterPRNG().run_test().show()

depth= 8 size= 813847 time= 0.037 stdev(0.001) throughput= 21778.67479295702 stdev(422)
depth= 16 size= 1379174 time= 0.065 stdev(0.001) throughput= 20725.668704870975 stdev(451)
depth= 32 size= 1572259 time= 0.072 stdev(0.0) throughput= 21325.12749565972 stdev(0)
depth= 64 size= 1581547 time= 0.074 stdev(0.0) throughput= 20871.344489020266 stdev(0)
depth= 128 size= 1581547 time= 0.074 stdev(0.001) throughput= 21014.298903328625 stdev(202)
depth= 256 size= 1581547 time= 0.073 stdev(0.0) throughput= 21157.253317636983 stdev(0)
Throughput of  21778.67479295702  kilobytes per second at depth =  8
Total time: 0:00:12.332315


### Can we make the random go faster?

**idea**: Do the random allocation in one place, and use that later.

#### How fast is /dev/random (and variants)?

Using the best block size, and fastest #counts

In [187]:
with timeit() as t:
    !dd if=/dev/random of=random.x bs=1024 count=1000 2>/dev/null
print("throughput=",os.stat('random.x').st_size/1024/t.runtime, 'kb per second')

throughput= 19691.24133572426 kb per second


In [188]:
with timeit() as t:
    !dd if=/dev/urandom of=random.x bs=1024 count=1000 2>/dev/null
print("throughput=",os.stat('random.x').st_size/1024/t.runtime, 'kb per second')

throughput= 20553.71713982809 kb per second


In [189]:
with timeit() as t:
    !dd if=/dev/zero of=io.x bs=1024 count=1000 2>/dev/null
print("throughput=",os.stat('io.x').st_size/1024/t.runtime, 'kb per second')

throughput= 59959.22772571298 kb per second


**Idea**:
* Pre-allocate random bits, and use only as much as necessary.
* Optimize for < 256 bits by using only a single byte at a time.

```c
uint8_t
map(uint8_t to) {
    uint8_t from = rand_region[rand_cursor++];
    if (rand_cursor >= rand_region_size)
        rand_cursor = 0;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}
```

**idea**: Wraparound at 4 GB to avoid comparisons (this did not work as expected.)

In [190]:
u_max_int = 4096 * 1024 * 1024

(The better idea is to use a pointer to the last element, and increment it rather than use an array address, which is required for this trick.)

### Can we make random faster?

In [191]:
my_str = '''
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>
#include <errno.h>
#include <string.h>

uint8_t* rand_region;
void* stack[INT_MAX];

static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t
next(void) {
    const uint64_t result_starstar = rotl(r__s[1] * 5, 7) * 9;

    const uint64_t t = r__s[1] << 17;

    r__s[2] ^= r__s[0];
    r__s[3] ^= r__s[1];
    r__s[1] ^= r__s[2];
    r__s[0] ^= r__s[3];

    r__s[2] ^= t;

    r__s[3] = rotl(r__s[3], 45);

    return result_starstar;
}

uint8_t* rand_region_size = 0;

void
__attribute__((flatten))
initialize_random(uint64_t max_chars) {
    uint64_t* arr = (uint64_t*) rand_region;
    uint64_t i;
    for (i=0; i < max_chars/8; i++) { /*max_space/8 because we have 8 bytes*/
        arr[i] = next();
    }
    rand_region_size = (uint8_t*) (arr+i);
}

int main(int argc, char** argv) {
    struct stat st;
    int rand_fd;
    uint8_t* rand_region_init;

    char* rand_file = argv[1];
    rand_fd = open(rand_file, O_RDWR | O_CREAT, 0600);
    size_t u_max = (uint64_t)powl(2,32);
    int res = ftruncate(rand_fd, u_max);
    if (res == -1) {
        fprintf(stdout, "Error: %s\\n", strerror(errno));
        return 4;
    }
    rand_region = mmap(0, u_max, PROT_READ| PROT_WRITE, MAP_SHARED, rand_fd, 0);
    rand_region_init = rand_region;
    if (rand_region == (uint8_t*)-1) {
        exit(3);
    }
    initialize_random(u_max);
    msync(rand_region, st.st_size, MS_SYNC);
    munmap(rand_region, st.st_size);
    long rand_size = rand_region_size - rand_region_init;
    ftruncate(rand_fd, rand_size);
    /*fprintf(stdout, "%ld\\n", rand_size);*/
    close(rand_fd);
    return 0;
}
'''

In [192]:
with open('testers/rand.c', 'w+') as f:
    print(my_str, file=f)

In [193]:
%cd testers
!cc -g -o rand -Ofast rand.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [194]:
with timeit() as t:
    !./testers/rand random1.x
print("throughput=",os.stat('random1.x').st_size/1024/t.runtime)
!rm random1.x

throughput= 598326.6612525583


Allocate a file `u_max_int` size, and mmap it to memory.

```c
uint32_t rand_cursor = 0;
uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = rand_region[rand_cursor++];
    return ((uint16_t) from * (uint16_t) to) >> 8;
}
```

Does it help?

In [195]:
# II
# https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
class CFuzzerExtRand(CFuzzer):
    def main_hdefs(self):
        s = super().main_hdefs()
        return s + '''
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>
'''
    
    def fn_map_def(self):
        return '''
uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = rand_regionp[rand_cursor++];
    if (rand_cursor >= rand_region_size)
        rand_cursor = 0;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}

static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t
next(void) {
    const uint64_t result_starstar = rotl(r__s[1] * 5, 7) * 9;

    const uint64_t t = r__s[1] << 17;

    r__s[2] ^= r__s[0];
    r__s[3] ^= r__s[1];
    r__s[1] ^= r__s[2];
    r__s[0] ^= r__s[3];

    r__s[2] ^= t;

    r__s[3] = rotl(r__s[3], 45);

    return result_starstar;
}

uint8_t* rand_region_sizep = 0;

void
__attribute__((flatten))
initialize_random(uint64_t max_chars) {
    uint64_t* arr = (uint64_t*) rand_regionp;
    uint64_t i;
    for (i=0; i < max_chars/8; i++) { /*max_space/8 because we have 8 bytes*/
        arr[i] = next();
    }
    rand_region_sizep = (uint8_t*) (arr+i);
}
'''
    def main_rand_var_defs(self):
        return '''
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_regionp[rand_region_size];
uint64_t rand_cursor = 0;
'''
    
    def main_var_defs(self):
        s = super().main_var_defs()
        return s + self.main_rand_var_defs()

    def fuzz_hdefs(self):
        s = super().fuzz_hdefs()
        return s + '''
#include <unistd.h>
#include <stdint.h>'''
    
    def fuzz_rand_var_defs(self):
        return '''
extern uint8_t* rand_regionp;
extern uint64_t rand_cursor;
extern uint64_t rand_region_size;
uint8_t map(uint8_t to);'''
 
    def fn_main_rand_frag(self):
        return '''\
    initialize_random(rand_region_size);
    rand_cursor = seed;
    '''
 
    def fn_main_def(self):
        return '''
int main(int argc, char** argv) {
    struct stat st;
    int max_num, seed, rand_fd, out_fd;
    long out_size;
%(input_frag)s
%(rand_frag)s
%(loop_frag)s
    return 0;
}''' % { 'input_frag': self.fn_main_input_frag(),
         'loop_frag': self.fn_main_loop_frag(),
         'rand_frag': self.fn_main_rand_frag()}

In [196]:
main_src, fuzz_src = CFuzzerExtRand(c_grammar).fuzz_src()
with open('testers/grammar_producer_cprngextr_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_cprngextr_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [197]:
!cat testers/grammar_producer_cprngextr_fuzz.c


#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <stdint.h>

void out(const char s);

extern uint8_t* rand_regionp;
extern uint64_t rand_cursor;
extern uint64_t rand_region_size;
uint8_t map(uint8_t to);

extern int max_depth;
void gen_start(int depth);
void gen_stylesheet(int depth);
void gen_OsqCHARSET_SYM_STRING_SEMICsq(int depth);
void gen_OsqS_OR_CDO_OR_CDCCsq(int depth);
void gen_OsqXimportX_CDO_S_OR_CDC_SCsq(int depth);
void gen_OsqCDO_S_OR_CDC_SCsq(int depth);
void gen_Osqruleset_OR_media_OR_pageCsq(int depth);
void gen_Osqstylesheet_closing_GROUPINGCsq(int depth);
void gen_XimportX(int depth);
void gen_OsqSTRING_OR_URICsq(int depth);
void gen_media(int depth);
void gen_media_list(int depth);
void gen_OsqCOMMA_S_mediumCsq(int depth);
void gen_medium(int depth);
void gen_page(int depth);
void gen_OsqSEMI_S_declarationCsq(int depth);
void gen_pseudo_page(int depth);
void gen_operator(int 

In [198]:
!nl -ba testers/grammar_producer_cprngextr_main.c

     1	
     2	#define _LARGEFILE64_SOURCE
     3	#define _FILE_OFFSET_BITS 64
     4	
     5	#include <stdlib.h>
     6	#include <stdio.h>
     7	#include <time.h>
     8	#include <string.h>
     9	
    10	#include <unistd.h>
    11	#include <limits.h>
    12	#include <fcntl.h>
    13	#include <sys/types.h>
    14	#include <sys/mman.h>
    15	#include <sys/stat.h>
    16	#include <math.h>
    17	
    18	
    19	int max_depth = 0;
    20	
    21	void gen_init__();
    22	const uint64_t rand_region_size = 1ULL << 16;
    23	uint8_t rand_regionp[rand_region_size];
    24	uint64_t rand_cursor = 0;
    25	
    26	
    27	uint8_t
    28	__attribute__((always_inline))
    29	map(uint8_t to) {
    30	    uint8_t from = rand_regionp[rand_cursor++];
    31	    if (rand_cursor >= rand_region_size)
    32	        rand_cursor = 0;
    33	    return ((uint16_t) from * (uint16_t) to) >> 8;
    34	}
    35	
    36	static inline uint64_t rotl(const uint64_t x, int k)

In [199]:
%cd testers
!cc -g -Ofast -o grammar_producer_cprngextr grammar_producer_cprngextr_main.c grammar_producer_cprngextr_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [200]:
# II
!./testers/grammar_producer_cprngextr 0 10 10

@charset "" ;   @page  		  {  o   : 		 u   @import   }  --> <!--   
@charset '' ; --><!--<!--<!--  @media  e4    ,  b  { 	 #p  ,  selector {  e  :  #e96     }  } 		  -->  
@charset "" ;  @import 	 url("'\k/'	")   ; 	    <!-- -->  <!-- 	 
   
@charset '' ;  @import  url("""")  -g_  ,   f  ;  	  *    { 	   }  * : h.h   { 	    } 	 
  @import  ""  -n6   ; 	 @import 		 url("") 	  ;   @import  url(" ")    ;  -->  	@import  ''   ;  <!--  --> @import  ""   n   ;  <!-- <!-- <!--  @page   {  p  :  url("''")   #dc4833    } 		 	 @media  b 	 ,  w  {   }   -->  	<!--  <!-- --> 
 -->-->  @media  -d     {  *   ,  selector,  selector,  selector {  	 p  :  #eed    ;  z  :  #f63b82    }  }  --> @media  t 	 ,  o ,  t ,  b  {   }   
@charset '\C' ;   	<!--	 @import 		 '' 	 -m  ,  k  ;  --> 	 @import  url("""")    ;   <!--  @media    mx 	 ,  l  {   * [  q   ] >  *    {  a  :  #203661   @import   }  } 		 	 @media  b   {   } 	 <!-- s   ,  selector,  selector,  selector {      } 	 *   ,  selector { 	 l 

In [201]:
# II
class CTesterPRNGExt(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_cprngextr {seed} {self.max_num} {max_depth} ./random.x > {fn}"

In [202]:
CTesterPRNGExt().run_test().show()

depth= 8 size= 825944 time= 0.037 stdev(0.001) throughput= 22102.473811702323 stdev(437)
depth= 16 size= 1363999.5 time= 0.061 stdev(0.001) throughput= 21660.528980821986 stdev(260)
depth= 32 size= 1489954.5 time= 0.072 stdev(0.004) throughput= 20243.819916213768 stdev(1189)
depth= 64 size= 1474938.5 time= 0.069 stdev(0.001) throughput= 21028.434161538255 stdev(221)
depth= 128 size= 1474938.5 time= 0.068 stdev(0.001) throughput= 21339.959711438212 stdev(220)
depth= 256 size= 1474938.5 time= 0.068 stdev(0.001) throughput= 21339.959711438212 stdev(220)
Throughput of  22102.473811702323  kilobytes per second at depth =  8
Total time: 0:00:12.345378


In [203]:
# II
class CFuzzerExtRandP(CFuzzerExtRand):
    def fn_map_def(self):
        return '''
uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}


static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t
next(void) {
    const uint64_t result_starstar = rotl(r__s[1] * 5, 7) * 9;

    const uint64_t t = r__s[1] << 17;

    r__s[2] ^= r__s[0];
    r__s[3] ^= r__s[1];
    r__s[1] ^= r__s[2];
    r__s[0] ^= r__s[3];

    r__s[2] ^= t;

    r__s[3] = rotl(r__s[3], 45);

    return result_starstar;
}

void
__attribute__((flatten))
initialize_random(uint64_t max_chars) {
    uint64_t* arr = (uint64_t*) rand_regionp;
    uint64_t i;
    for (i=0; i < max_chars/8; i++) { /*max_space/8 because we have 8 bytes*/
        arr[i] = next();
    }
    rand_region_sizep = (uint8_t*) (arr+i);
}
'''
    def main_rand_var_defs(self):
        return '''
uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;
'''
    def fuzz_rand_var_defs(self):
        return '''
uint8_t map(uint8_t to);
'''
    
    def fn_main_loop_frag(self):
        return '''
    for (int i = 0; i < max_num; i++) {
        gen_init__();
    }
'''
    def fn_main_rand_frag(self):
        return '''\
    initialize_random(rand_region_size);
    rand_regionp += seed;
    '''
    def fn_main_def(self):
        return '''
int main(int argc, char** argv) {
    struct stat st;
    long out_size;
    char* out_region_sizep = 0;
    char* out_region_initp;
    int out_fd;
    int seed, max_num;
%(input_frag)s
%(rand_frag)s
%(loop_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'loop_frag': self.fn_main_loop_frag()
       }

In [204]:
main_src, fuzz_src = CFuzzerExtRandP(c_grammar).fuzz_src()
with open('testers/grammar_producer_cprngextrP_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_cprngextrP_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [205]:
!cat testers/grammar_producer_cprngextrP_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth = 0;

void gen_init__();
uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;


uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}


static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/
uint64_t
next(void) {
    const uint64_t res

In [206]:
!cat testers/grammar_producer_cprngextrP_fuzz.c


#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <stdint.h>

void out(const char s);

uint8_t map(uint8_t to);


extern int max_depth;
void gen_start(int depth);
void gen_stylesheet(int depth);
void gen_OsqCHARSET_SYM_STRING_SEMICsq(int depth);
void gen_OsqS_OR_CDO_OR_CDCCsq(int depth);
void gen_OsqXimportX_CDO_S_OR_CDC_SCsq(int depth);
void gen_OsqCDO_S_OR_CDC_SCsq(int depth);
void gen_Osqruleset_OR_media_OR_pageCsq(int depth);
void gen_Osqstylesheet_closing_GROUPINGCsq(int depth);
void gen_XimportX(int depth);
void gen_OsqSTRING_OR_URICsq(int depth);
void gen_media(int depth);
void gen_media_list(int depth);
void gen_OsqCOMMA_S_mediumCsq(int depth);
void gen_medium(int depth);
void gen_page(int depth);
void gen_OsqSEMI_S_declarationCsq(int depth);
void gen_pseudo_page(int depth);
void gen_operator(int depth);
void gen_combinator(int depth);
void gen_unary_operator(int depth);
void gen_proper

In [207]:
%cd testers
!cc -g -Ofast -o grammar_producer_cprngextrP grammar_producer_cprngextrP_main.c grammar_producer_cprngextrP_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [208]:
!./testers/grammar_producer_cprngextrP 0 10 10

@charset "" ;   @page  		  {  p   : 		 q   @import   }  --> <!--   
@charset '' ; --><!--<!--<!--  @media  ea    ,  v  { 	 [  k   ]  ,  selector {  o  :  #3a5c3f     }  } 		  -->  
@charset "" ;  @import 	 url("'\{6'	")   ; 	    <!-- -->  <!-- 	 
   
@charset '' ;  @import  url("""")  -gi  ,   o  ;  	  *    { 	   }  * : n(   ).z   { 	    } 	 
  @import  ""  -nk   ; 	 @import 		 url("") 	  ;   @import  url(" ")    ;  -->  	@import  ''   ;  <!--  --> @import  ""   q   ;  <!-- <!-- -->  @page   {  c  :  url("""")   #744    } 		 	 @media  q 	 ,  m  {   }   -->  	<!--  <!-- --> 
 -->-->  @media  -d     {  *   ,  selector,  selector,  selector {  	 e  :  #71a    ;  n  :  #b6b061    }  }  --> @media  m 	 ,  a ,  s ,  n  {   }   
@charset '\C' ;   	<!--	 @import 		 '' 	 -m  ,  d  ;  --> 	 @import  url("""")    ;   <!--  @media    m2 	 ,  i  {   * .l +  *    {  y  :  #22a   @import   }  } 		 	 @media  q   {   } 	 <!-- i   ,  selector,  selector,  selector {      } 	 *   ,  selector { 	 d

In [209]:
# II
class CTesterPRNGExtP(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_cprngextrP {seed} {self.max_num} {max_depth} > {fn}"

In [210]:
CTesterPRNGExtP().run_test().show()

depth= 8 size= 826132.5 time= 0.037 stdev(0.001) throughput= 22107.352225272145 stdev(420)
depth= 16 size= 1363001.5 time= 0.061 stdev(0.0) throughput= 21820.592661372953 stdev(10)
depth= 32 size= 1489957.5 time= 0.072 stdev(0.005) throughput= 20398.89236749387 stdev(1408)
depth= 64 size= 1474938.5 time= 0.067 stdev(0.0) throughput= 21498.05416277985 stdev(4)
depth= 128 size= 1474938.5 time= 0.071 stdev(0.005) throughput= 20481.386320876612 stdev(1442)
depth= 256 size= 1474938.5 time= 0.068 stdev(0.0) throughput= 21181.90630744485 stdev(4)
Throughput of  22107.352225272145  kilobytes per second at depth =  8
Total time: 0:00:12.388546


## Using faster IO.

### MMap

**Idea**:
* `mmap` to a file, write the bits, and `ftruncate()` to the new size.

In [211]:
# II
class CMMapIOFuzzer(CFuzzerExtRandP):
    def main_out_var_defs(self):
        return '''
char* out_regionp;
uint64_t out_cursor = 0;
'''
    def main_var_defs(self):
        s = super().main_var_defs()
        return s + self.main_out_var_defs()
     
    def fn_out_def(self):
        return '''
void
__attribute__((always_inline))
out(char c) {
    out_regionp[out_cursor++] = c;
}
'''
    
    def fuzz_out_var_defs(self):
        return '''
void out(char c);
extern char* out_regionp;
extern uint64_t out_cursor;
'''

    def fn_main_input_frag(self):
        return '''
    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);'''

    def fn_main_out_frag(self):
        return '''
    char* iomax = getenv("IO_LIMIT");
    uint64_t u_iomax = UINT_MAX * 10ULL; // 40G
    if (iomax) {
        u_iomax = 1ULL << atoi(iomax);
    }
    if (argc > 4) {
        out_fd = open(argv[4], O_RDWR | O_CREAT, 0600);
    } else {
        out_fd = open("io.x", O_RDWR | O_CREAT, 0600);
    }
    if (iomax) {
        int res = ftruncate(out_fd, u_iomax);
        if (res != 0) {
            perror("truncate failed");
            exit(2);
        }
    } else {
        int res = try_truncate(out_fd);
        if (res < 32) {
            perror("truncate failed");
            fprintf(stderr,"%d\\n", res);
            exit(5);
        }
    }
    fstat(out_fd, &st);
    out_regionp = mmap(0, st.st_size, PROT_READ|
                      PROT_WRITE, MAP_SHARED, out_fd, 0);
    if (out_regionp == (caddr_t)-1) {
        exit(3);
    }
    '''
    
    def fn_main_sync_frag(self):
        return '''
    msync(out_regionp, st.st_size, MS_SYNC);
    munmap(out_regionp, st.st_size);
    ftruncate(out_fd, out_cursor);
    close(out_fd);
'''

    def fn_truncateio(self):
        return '''
#include <errno.h>
int try_truncate(int fd) {
    for (off_t len = 63; len > 0; len--) {
      uint64_t m = 1ULL << len;
      errno = 0;
      int ret = ftruncate(fd, m);
      if (ret == 0) {
        return len;
      }
    }
    return -1;
}
'''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    int rand_fd, out_fd;
    int seed, max_num;
%(input_frag)s
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'out_frag': self.fn_main_out_frag(),
        'sync_frag': self.fn_main_sync_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'loop_frag': self.fn_main_loop_frag(),
       }

In [212]:
main_src, fuzz_src = CMMapIOFuzzer(c_grammar).fuzz_src()
with open('./testers/grammar_producer_mmapio_main.c', 'w+') as f:
    print(main_src, file=f)
with open('./testers/grammar_producer_mmapio_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [213]:
!cat testers/grammar_producer_mmapio_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth = 0;

void gen_init__();
uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;

char* out_regionp;
uint64_t out_cursor = 0;


uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}


static inline uint64_t rotl(const uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
static uint64_t r__s[4] = {13343, 9838742, 223185, 802124}; /*TODO: initialize with seed.*/

In [214]:
%cd testers
!cc -g -Ofast -o grammar_producer_mmapio grammar_producer_mmapio_main.c grammar_producer_mmapio_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [215]:
# II
!./testers/grammar_producer_mmapio 0 10 10 io.x

In [216]:
!cat io.x

@charset "" ;   @page  		  {  _ 	 : 	  v   @import   }  --> <!--   
@charset '' ; --><!--<!--<!--  @media  e4  	 ,  q  { 	 #q  ,  selector {  q  :  o     }  } 		  -->  
@charset "" ;  @import 	 url("'\	/'	")   ; 	    <!-- -->  <!-- 	 
   
@charset '' ;  @import  url("""")  -g0  , 	 x  ;  	  *    { 	   }  * : d[  _   ]   { 	    } 	 
  @import  ""  -nq   ; 	 @import 		 url("") 	  ;   @import  url(" ")    ;  -->   @import  ''   ;  <!-- 	--> @import  "" 	 a   ;  <!-- <!-- -->  @page   {  e  :  url("""")   #701b99    } 		   @media  n   ,  r  {   }   -->  	<!--  <!-- --> 
 -->-->  @media  -d  	  {  *   ,  selector,  selector,  selector {    g  :  #613    ;  z  :  #c5b84e    }  }  --> @media  f   ,  k ,  e ,  s  {   }   
@charset '\C' ;   	<!--	 @import 		 '' 	 -m  ,  b  ;  --> 	 @import  url("""")    ;   <!--  @media    my 	 ,  t  {   * [  p   ] >  *    {  m  :  #720   @import   }  } 		   @media  n   {   } 	 <!-- a   ,  selector,  selector,  selector {  	   } 	 *   ,  selector {   n  

In [217]:
# II
class CTesterMMap(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_mmapio {seed} {self.max_num} {max_depth} {fn}"

In [218]:
CTesterMMap().run_test().show()

depth= 8 size= 826461.5 time= 0.029 stdev(0.006) throughput= 28369.46200284091 stdev(5524)
depth= 16 size= 1363682 time= 0.035 stdev(0.006) throughput= 38554.217392990075 stdev(6250)
depth= 32 size= 1489957.5 time= 0.04 stdev(0.005) throughput= 36196.69276431972 stdev(4416)
depth= 64 size= 1474938.5 time= 0.041 stdev(0.002) throughput= 35613.70603680174 stdev(1872)
depth= 128 size= 1474938.5 time= 0.043 stdev(0.0) throughput= 33496.96811409884 stdev(6)
depth= 256 size= 1474938.5 time= 0.041 stdev(0.006) throughput= 35468.112155123876 stdev(4887)
Throughput of  38554.217392990075  kilobytes per second at depth =  16
Total time: 0:00:12.036837


### FWrite

In [219]:
class CFWriteFuzzer(CFuzzerExtRandP):
    def main_out_var_defs(self):
        return '''
const uint64_t size = UINT_MAX; /*max size of a single input -- 4G*/
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;
FILE* fs;
'''
    def main_var_defs(self):
        s = super().main_var_defs()
        return s + self.main_out_var_defs()
     
    def fn_out_def(self):
        return '''
void
__attribute__((always_inline))
out(char c) {
    out_regionp[out_cursor++] = c;
}'''
    
    def fuzz_out_var_defs(self):
        return '''
void out(char c);
extern char* out_regionp;
extern uint64_t out_cursor;
'''

    def fn_main_input_frag(self):
        return '''
    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);'''

    def fn_main_out_frag(self):
        return '''
    if (argc > 4) {
        out_fd = open(argv[4], O_RDWR | O_CREAT, 0600);
    } else {
        out_fd = open("io.x", O_RDWR | O_CREAT, 0600);
    }
    fs = fdopen(out_fd, "w");
'''

    def fn_main_sync_frag(self):
        return '''
    fclose(fs);
    close(out_fd);
'''

    def fn_truncateio(self):
        return '''
'''
    def fn_main_loop_frag(self):
        return '''
    for(int i=0; i < max_num; i++) {
        gen_init__();
        fwrite(out_regionp, sizeof(char), out_cursor, fs);
        out_cursor = 0;
    }
'''

    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    int rand_fd, out_fd;
    int seed, max_num;
%(input_frag)s
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'out_frag': self.fn_main_out_frag(),
        'sync_frag': self.fn_main_sync_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'loop_frag': self.fn_main_loop_frag(),
       }

In [220]:
main_src, fuzz_src = CFWriteFuzzer(c_grammar).fuzz_src()
with open('./testers/grammar_producer_fwrite_main.c', 'w+') as f:
    print(main_src, file=f)
with open('./testers/grammar_producer_fwrite_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [221]:
!cat testers/grammar_producer_fwrite_fuzz.c


#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <stdint.h>

void out(char c);
extern char* out_regionp;
extern uint64_t out_cursor;


uint8_t map(uint8_t to);


extern int max_depth;
void gen_start(int depth);
void gen_stylesheet(int depth);
void gen_OsqCHARSET_SYM_STRING_SEMICsq(int depth);
void gen_OsqS_OR_CDO_OR_CDCCsq(int depth);
void gen_OsqXimportX_CDO_S_OR_CDC_SCsq(int depth);
void gen_OsqCDO_S_OR_CDC_SCsq(int depth);
void gen_Osqruleset_OR_media_OR_pageCsq(int depth);
void gen_Osqstylesheet_closing_GROUPINGCsq(int depth);
void gen_XimportX(int depth);
void gen_OsqSTRING_OR_URICsq(int depth);
void gen_media(int depth);
void gen_media_list(int depth);
void gen_OsqCOMMA_S_mediumCsq(int depth);
void gen_medium(int depth);
void gen_page(int depth);
void gen_OsqSEMI_S_declarationCsq(int depth);
void gen_pseudo_page(int depth);
void gen_operator(int depth);
void gen_combinator(int depth);


In [222]:
!cat testers/grammar_producer_fwrite_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth = 0;

void gen_init__();
uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;

const uint64_t size = UINT_MAX; /*max size of a single input -- 4G*/
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;
FILE* fs;


uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}


static inline uint64_t rotl(const uint64_t x, int k) {
    

In [223]:
%cd testers
!cc -g -Ofast -o grammar_producer_fwrite grammar_producer_fwrite_main.c grammar_producer_fwrite_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [224]:
# II
!./testers/grammar_producer_fwrite 0 10 10 io.x

In [225]:
!cat io.x

@charset "" ;   @page  		  {  z 	 : 	  i   @import   }  --> <!--   
@charset '' ; --><!--<!--<!--  @media  ei  	 ,  g  { 	 [  i   ]  ,  selector {  w  :  #09a7ca     }  } 		  -->  
@charset "" ;  @import 	 url("'\GH'	")   ; 	    <!-- -->  <!-- 	 
   
@charset '' ;  @import  url("""")  -gn  , 	 k  ;  	  *    { 	   }  * : c(   )#r   { 	    } 	 
  @import  ""  -n2   ; 	 @import 		 url("") 	  ;   @import  url(" ")    ;  -->   @import  ''   ;  <!-- 	--> @import  "" 	 p   ;  <!-- <!-- -->  @page   {  c  :  url("''")   #6eb5ca    } 		   @media  w   ,  f  {   }   -->  	<!--  <!-- --> 
 -->-->  @media  -d  	  {  *   ,  selector,  selector,  selector {    i  :  #4cfbd9    ;  e  :  #b64057    }  }  --> @media  f   ,  v ,  c ,  w  {   }   
@charset '\C' ;   	<!--	 @import 		 '' 	 -m  ,  c  ;  --> 	 @import  url("""")    ;   <!--  @media    my 	 ,  t  {   * #q >  *    {  b  :  #a7a   @import   }  } 		   @media  w   {   } 	 <!-- z   ,  selector,  selector,  selector {  	   } 	 *   ,  selector

In [226]:
# II
class CTesterFWrite(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_fwrite {seed} {self.max_num} {max_depth} {fn}"

In [227]:
CTesterFWrite().run_test().show()

depth= 8 size= 826594 time= 0.01 stdev(0.0) throughput= 80722.0703125 stdev(31)
depth= 16 size= 1362877 time= 0.021 stdev(0.001) throughput= 64961.74897693453 stdev(2210)
depth= 32 size= 1489957.5 time= 0.022 stdev(0.001) throughput= 64700.46164772727 stdev(2047)
depth= 64 size= 1474938.5 time= 0.027 stdev(0.006) throughput= 55968.675540689146 stdev(13451)
depth= 128 size= 1474938.5 time= 0.022 stdev(0.0) throughput= 65471.346768465904 stdev(12)
depth= 256 size= 1474938.5 time= 0.022 stdev(0.001) throughput= 64047.871827136856 stdev(2001)
Throughput of  80722.0703125  kilobytes per second at depth =  8
Total time: 0:00:11.630662


### No output 

In [228]:
class CNoWriteFuzzer(CFuzzerExtRandP):
    def main_out_var_defs(self):
        return '''
const uint64_t size = UINT_MAX; // size of a single output item -- 4G
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;'''
    def main_var_defs(self):
        s = super().main_var_defs()
        return s + self.main_out_var_defs()
     
    def fn_out_def(self):
        return '''
void
__attribute__((always_inline))
out(char c) {
    out_regionp[out_cursor++] = c;
}'''
    
    def fuzz_out_var_defs(self):
        return '''\
void out(char c);
extern char* out_regionp;
extern uint64_t out_cursor;
'''

    def fn_main_input_frag(self):
        return '''
    if (argc < 3) {
        printf("%s <seed> <max_num> <max_depth>\\n", argv[0]);
        return 0;
    }
    seed = atoi(argv[1]);
    max_num = atoi(argv[2]);
    max_depth = atoi(argv[3]);'''

    def fn_main_out_frag(self):
        return '''
    '''
    
    def fn_main_sync_frag(self):
        return '''
    '''

    def fn_truncateio(self):
        return '''
        '''
    def fn_main_loop_frag(self):
        return '''
    uint64_t out_size = 0;
    for(int i=0; i < max_num; i++) {
        gen_init__();
        // throw away
        out_size += out_cursor;
        out_cursor = 0;
    }
    printf("%lld\\n", out_size);
    '''

    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    int rand_fd;
    int seed, max_num;
%(input_frag)s
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'out_frag': self.fn_main_out_frag(),
        'sync_frag': self.fn_main_sync_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'loop_frag': self.fn_main_loop_frag(),
       }

In [229]:
main_src, fuzz_src = CNoWriteFuzzer(c_grammar).fuzz_src()
with open('./testers/grammar_producer_nowrite_main.c', 'w+') as f:
    print(main_src, file=f)
with open('./testers/grammar_producer_nowrite_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [230]:
%cd testers
!cc -g -Ofast -o grammar_producer_nowrite grammar_producer_nowrite_main.c grammar_producer_nowrite_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [231]:
# II
!./testers/grammar_producer_nowrite 0 10 10

1205


In [232]:
# II
class CTesterNoWrite(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_nowrite {seed} {self.max_num} {max_depth} > {fn}"
  
    def post_time(self):
        super().post_time()
        with open(self.file) as f:
            self.size = int(f.read())

In [233]:
CTesterNoWrite().run_test().show()

depth= 8 size= 826967 time= 0.007 stdev(0.0) throughput= 115369.28013392857 stdev(43)
depth= 16 size= 1362956 time= 0.014 stdev(0.0) throughput= 95072.265625 stdev(45)
depth= 32 size= 1489954.5 time= 0.017 stdev(0.0) throughput= 85590.21714154411 stdev(18)
depth= 64 size= 1474938.5 time= 0.016 stdev(0.0) throughput= 90023.10180664062 stdev(17)
depth= 128 size= 1474938.5 time= 0.018 stdev(0.001) throughput= 82374.38565921159 stdev(3344)
depth= 256 size= 1474938.5 time= 0.017 stdev(0.001) throughput= 85022.46771918403 stdev(7088)
Throughput of  115369.28013392857  kilobytes per second at depth =  8
Total time: 0:00:11.534302


## Fuzzer as a VM

### MMap

#### Direct threaded VM

In [234]:
# II
class DTMMapFuzzer(CMMapIOFuzzer):
    def fn_out_def(self): return ''
    def gen_rule_src(self, rule, k, j):
        res = []
        leaf = True
        for i, token in enumerate(rule):
            if token in self.grammar:
                leaf = False
                trules = self.grammar[token] # ordered by cost
                len_min_choices = len(self.c_grammar[token])
                assert len(trules) < 256
                cheap_strings = self.pool_of_strings[token]
                if len(cheap_strings) < 256: # we only have 255 random choices
                    check_pool = '''
        val = map(%(len_cheap_strings)s);
        const char* str = pool_%(k)s[val];
        const int str_l = pool_l_%(k)s[val];
        for (int i = 0; i < str_l; i++) {
            *out_regionp++ = str[i];
        }
        --returnp;
        goto **returnp; 
            ''' % { 'len_cheap_strings': len(cheap_strings), 'k': self.k_to_s(token)}
                else:
                    check_pool = '''
        val = map(%(len_min_choices)s);
                ''' % {'len_min_choices':len_min_choices}
                res.append('''\
    *returnp = &&return__%(i)d__%(j)d__%(k)s;
    if (returnp > max_depthp) {
        %(check_pool)s;
    } else {
        val = map(%(len_rules)s);
    }
    goto *gen_%(t)s[val];
return__%(i)d__%(j)d__%(k)s:;
            ''' % {'i':i, 'j':j, 'k':self.k_to_s(k),
                   't':self.k_to_s(token), 'rnum':0, 'len_rules':len(trules), 'len_min_choices':len_min_choices, 'check_pool':check_pool})
            else:
                res.append('''\
    *out_regionp++ = '%s';''' % self.esc_char(token))
        return res, leaf
    
    def gen_alt_src_1rule(self, k):
        rule = self.grammar[k][0]
        ri = 0
        src, leaf = self.gen_rule_src(rule, k, ri)
        body = '\n'.join(src)
        result = []
        if leaf:
            return '''
gen_%(name)s_0: {
%(body)s
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}
        else:
             return '''
gen_%(name)s_0: {
    ++returnp;
    // single -- no switch
%(body)s
    --returnp;
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}

    def gen_alt_src(self, k):
        rules = self.grammar[k]
        ret = self.k_to_s(k)
        result = []
        if len(rules) == 1: return self.gen_alt_src_1rule(k)
        for ri, rule in enumerate(rules):
            src, leaf = self.gen_rule_src(rule, k, ri)
            body = '\n'.join(src)
            if leaf:
                result.append('''
gen_%(name)s_%(rnum)d: {
%(body)s
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
            else:
                 result.append('''
gen_%(name)s_%(rnum)d: {
    ++returnp;
%(body)s
    --returnp;
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
        return '\n'.join(result)

    def fuzz_out_var_defs(self):
        return '''\
extern char* out_regionp;'''
    
    def fuzz_rand_var_defs(self):
        return '''
uint8_t map(uint8_t to);'''
    
    def fuzz_stack_var_defs(self):
        return '''
extern void* stackp[];
'''

    def fuzz_entry(self):
        result = ['''
void gen_init__(void** max_depthp) {
    uint8_t val;
    void** returnp = stackp;
    *returnp =  &&return__init;
''']
        for k in self.grammar:
            l = []
            for ri,rule in enumerate(self.grammar[k]):
                l.append('&&gen_%(k)s_%(ri)d' % {'k':self.k_to_s(k), 'ri':ri})
            s = '''
    void** gen_%(k)s[] = {
%(body)s
    };''' % {'k': self.k_to_s(k), 'body': ',\n'.join(l)}
            result.append(s)
        result.append('''
    goto gen_start_0;''')
        result.append(self.fuzz_fn_defs())
        result.append("""
return__init:
    return;
return_abort:
    exit(10); 
}""")
        return '\n'.join(result)
    
    def main_out_var_defs(self):
        return'''
char* out_regionp;
int out_cursor;
'''
    
    def main_stack_var_defs(self):
        return'''
int max_depth;
void** max_depthp;
void* stackp[INT_MAX];
'''
    def main_init_var_defs(self):
        return'''
void gen_init__(void** max_depthp);
'''

    def fn_main_loop_frag(self):
        return '''
    for(int i=0; i < max_num; i++) {
        gen_init__(max_depthp);
        *out_regionp++ = '\\n';
    }
    *out_regionp = 0;'''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    long out_size;
    char* out_region_sizep = 0;
    char* out_region_initp;
    int rand_fd, out_fd;
    int seed, max_num;
%(input_frag)s
    max_depthp = stackp + max_depth;
%(rand_frag)s
%(out_frag)s
    out_region_initp = out_regionp;
    out_region_sizep = out_regionp + st.st_size;
%(loop_frag)s
    out_size = out_regionp - out_region_initp;
    out_cursor = out_size;
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'out_frag': self.fn_main_out_frag(),
        'loop_frag': self.fn_main_loop_frag(),
        'sync_frag': self.fn_main_sync_frag()
       }


    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_hdefs(),
                          self.fuzz_var_defs(),
                          self.fn_fuzz_decs(),
                          self.string_pool_defs(),
                          # self.fuzz_fn_defs(),
                          self.fuzz_entry()])

In [235]:
main_src, fuzz_src = DTMMapFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_dtmmap_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_dtmmap_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [236]:
!wc -l testers/grammar_producer_dtmmap_fuzz.c

   10235 testers/grammar_producer_dtmmap_fuzz.c


In [237]:
%cd testers
!cc -g -Ofast -o grammar_producer_dtmmap grammar_producer_dtmmap_main.c grammar_producer_dtmmap_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [238]:
# II
!./testers/grammar_producer_dtmmap 0 10 10 io.x

In [239]:
!cat io.x

 -->  
 -->  
@charset '' ;   
@charset "" ; <!-- @import 				 url("	") 	  -x  , 	 m ,  j ;  -->   
@charset "\\j" ; 	 	  @page    { 	   }  -->  --> 
 <!-- @import 	  "\*" 	    ; 	   --> --> 	 	<!-- 	 --> 	@import  "" 	  ;  	 -->    <!--  
@charset '\|' ;  @import 	 url("  ") 	  ;   	  
@charset "" ;   * : v: m(   )  , 	 selector { 	   }  
@charset "" ;   
  	   [     ,   selector {   	 : 		 url("     } 	 s    { 		   }   @page   {   e :  #2bcb80    }  -->  @media  _  { 	 *   }   @media  n  {  *    {    }  }  <!-- -->*   <!----> 


In [240]:
# II
class CTesterMMapDT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_dtmmap {seed} {self.max_num} {max_depth} {fn}"

In [241]:
CTesterMMapDT().run_test().show()

depth= 8 size= 736237.5 time= 0.025 stdev(0.008) throughput= 30518.000086216045 stdev(10363)
depth= 16 size= 1292028.5 time= 0.032 stdev(0.005) throughput= 40556.20465959821 stdev(6374)
depth= 32 size= 1608608 time= 0.042 stdev(0.006) throughput= 37745.09260583524 stdev(5087)
depth= 64 size= 1511572 time= 0.037 stdev(0.008) throughput= 40228.897804437685 stdev(8341)
depth= 128 size= 1511572 time= 0.037 stdev(0.006) throughput= 39939.244918492965 stdev(6781)
depth= 256 size= 1511572 time= 0.042 stdev(0.001) throughput= 34737.59749403031 stdev(575)
Throughput of  40556.20465959821  kilobytes per second at depth =  16
Total time: 0:00:12.072356


#### Context threaded VM

In [242]:
# II
class CTMMapFuzzer(DTMMapFuzzer):
    def fn_choice(self, val):
        return '''
    # [ random 
    # extract one byte from the random stream %%r14,
    movq (%%r14), %%rdi
    # advance the random cursor
    inc %%r14                                     # rand_region++
    movzbl %%dil, %%edi                           # X  --- (rdi:(edi:(di:(dil))))
    # then multiply with the choices we have

    xor %%rsi, %%rsi                              # avoid data dependencies
    movb $%(val)s, %%sil                          # = %(val)s       
    movzbl %%sil, %%edx
    imull %%edi, %%edx                            # m = (short) x * (short) N)
    sarl $8, %%edx                                # return (char)(m >> 8) ;
    # random ]
    # %%edx now contains the selected random value from %(val)d options''' % {'val':val}

    def cheap_strings(self, k):
        cheap_strings = self.pool_of_strings[k]
        results = ['''
    # --- cheap -- [''']
        results.append('''
%(choices)s
''' % {'choices':self.fn_choice(len(cheap_strings)), 'len_choices': len(cheap_strings)})
        # get the choices from vm, then call it, and return.
        
        results.append('''
    # now we have the right print quad in %%edx. Load the right address and call it.
    leaq _%(key)s_prints(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ret
    ''' % {'key': self.k_to_s(k)})
        results.append('''
    # --- cheap -- ]''')
        return '\n'.join(results)
    
    def output_char(self, c):
        if len(c) != 1:
            assert c[0] == '\\'
            c = c[-1]
        return '''
   movb $%(ichar)d, (%%r13)                     # '%(char)s'
   inc %%r13                                    # out_region++   : increment a byte (r13++)
   ''' % {'char':repr(c), 'ichar':ord(c)}

    def gen_rule_src(self, rule, k, j):
        # in each rule, there are a number of tokens.
        # iter each token in turn, choose the right rule and call.
        result = []
        for token in rule:
            if token not in self.grammar:
                result.append(self.output_char(token))
                continue
            else:
                # how many choices do we have?
                rules = self.grammar[token]
                result.append('''
    # start the choice machine.
    # length of rules = %(len_rules)d
%(choices)s
    # --- switch ---
    ''' % {'choices': self.fn_choice(len(rules)), 'len_rules':len(rules)})
                result.append('''
    # now we have the right choice in %%edx. Load the right address and call it.
    leaq _%(key)s_choices(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ''' % {'key': self.k_to_s(token)})
        return '\n'.join(result)

    def gen_alt_src(self, k):
        result = []
        for ruleid, rule in enumerate(self.grammar[k]):
            # produce a skeletal subroutine structure.
            result.append('''
gen_%(key)s_%(ruleid)s:
    # check if the max depth is breached.
    cmpq %%rsp, %%r8                             # returnp(rbp) <> max_depth(r8) ?
    jle _%(key)s_%(ruleid)s_fi                       # returnp <= max_depth
    
%(return_cheap_string)s
_%(key)s_%(ruleid)s_fi:
''' % {'return_cheap_string': self.cheap_strings(k),
       'key':self.k_to_s(k),
       'ruleid':ruleid,
       'last_label':self.last_label})
            self.last_label += 1
            result.append(self.gen_rule_src(rule, k, ruleid))
            # we were called. So simply return.
            result.append('''
    ret
            ''')
        return '\n'.join(result)
 
    def fn_fuzz_decs(self):
        result = ['''
  .section  __DATA,__data

# Virtual Machine OPS.
        ''']
        for k in self.grammar:
            result.append('''
    .globl  _%(key)s_choices
    .p2align 4
_%(key)s_choices:''' % {'key':self.k_to_s(k)})
            for i, rule in enumerate(self.grammar[k]):
                result.append('''\
    .quad gen_%s_%d''' % (self.k_to_s(k), i))
                
        for k in self.pool_of_strings:
            result.append('''
    .globl  _%(key)s_prints
    .p2align 4
_%(key)s_prints:''' % {'key':self.k_to_s(k)})
            for string in self.pool_of_strings[k]:
                result.append('''\
    .quad %s''' % (self.all_prints[string]))
                
                
        result.append('''
# End Virtual Machine OPS.''')
        return '\n'.join(result)

    def gen_cheap(self, grammar):
        all_strings = set()
        for k in grammar:
            all_strings |= set(self.pool_of_strings[k])
        all_strings = list(all_strings)
        all_strings.sort(key=lambda item: (-len(item), item))
        all_prints_hash = {}
        result = ['''
.text
        ''']
        for i, s_ in enumerate(all_strings):
            s = s_
            result.append('''\
print_%(name)d: # "%(value)s"''' % {'name': i, 'value': repr(s)})
            for j in s:
                result.append('''\
    movb $%(ichar)s, (%%r13)            # '%(char)s'
    inc %%r13''' % {'ichar':ord(j), 'char':repr(j)})
            result.append('''\
    ret''')
            all_prints_hash[s_] = 'print_%d' % i
        return ('\n'.join(result), all_prints_hash)
 
    def fuzz_entry(self):
        result = ["""
#include "ctmmap_vm_ops.s"
.macro pushaq
    push %%rsp
    push %%rbp
    push %%r8
    push %%r9
    push %%r10
    push %%r11
    push %%r12
    push %%r13
    push %%r14
    push %%r15
.endm


.macro popaq
    pop %%r15
    pop %%r14
    pop %%r13
    pop %%r12
    pop %%r11
    pop %%r10
    pop %%r9
    pop %%r8
    pop %%rbp
    pop %%rsp
.endm

.global %(os)sgen_init__
.global return__init
.text
%(os)sgen_init__:
    # 1 rdi = max_depth
    # 2 rsi = returnp
    # 3 rdx = &out_region
    # 4 rcx = &rand_region
    pushaq

    leal 0(,%%rdi,8), %%eax
    movq %%rsp, %%r8
    subq %%rax, %%r8

    movq %%rdx, %%r11                              # &out_region
    movq %%rcx, %%r12                              # &rand_region
    movq (%%r11),%%r13                             # out_region
    movq (%%r12),%%r14                             # rand_region

    # general regs
    # rax, rcx, rdx, rbx, rsi,rdi
    # rbp, r8-r15
    
    call gen_start_0
    movq %%r13, (%%r11)                            # *(&out_region) <-
    movq %%r14, (%%r12)                            # *(&rand_region) <-
    popaq
    movq  $0, %%rax
    ret   
""" % {'os': '_' if sys.platform == 'darwin' else ''}]
        result.append(self.fuzz_fn_defs())
        return ''.join(result)

    def main_init_var_defs(self):
        return'''\
void gen_init__(uint32_t max_depth, void** returnp, char** out_region, uint8_t** rand_region);
'''

    def fn_main_loop_frag(self):
        return '''\
    for(int i=0; i < max_num; i++) {
        gen_init__(max_depth32, stackp, &out_regionp, &rand_regionp);
        *out_regionp++ = '\\n';
    }
    *out_regionp = 0;'''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    long out_size;
    char* out_region_initp;
    int out_fd;
    uint32_t max_depth32;
    int seed, max_num;
%(input_frag)s
    max_depth32 = max_depth;
%(rand_frag)s
%(out_frag)s
    out_region_initp = out_regionp;
%(loop_frag)s
    out_size = out_regionp - out_region_initp;
    out_cursor = out_size;
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'out_frag': self.fn_main_out_frag(),
        'loop_frag': self.fn_main_loop_frag(),
        'sync_frag': self.fn_main_sync_frag()
       }
    
    def fuzz_src(self, key='<start>'):
        self.last_label = 0
        self.cheap, self.all_prints = self.gen_cheap(self.grammar)
        ext_strings = '\n'.join([self.fn_fuzz_decs(), self.cheap])
        return ext_strings, self.gen_main_src(), self.gen_fuzz_src()
    
    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_entry()])

In [243]:
vm_ops, main_src, fuzz_src = CTMMapFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_ctmmap_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_ctmmap_fuzz.s', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/ctmmap_vm_ops.s', 'w+') as f:
    print(vm_ops, file=f)

In [244]:
!nl -ba testers/grammar_producer_ctmmap_fuzz.s

     1	
     2	#include "ctmmap_vm_ops.s"
     3	.macro pushaq
     4	    push %rsp
     5	    push %rbp
     6	    push %r8
     7	    push %r9
     8	    push %r10
     9	    push %r11
    10	    push %r12
    11	    push %r13
    12	    push %r14
    13	    push %r15
    14	.endm
    15	
    16	
    17	.macro popaq
    18	    pop %r15
    19	    pop %r14
    20	    pop %r13
    21	    pop %r12
    22	    pop %r11
    23	    pop %r10
    24	    pop %r9
    25	    pop %r8
    26	    pop %rbp
    27	    pop %rsp
    28	.endm
    29	
    30	.global _gen_init__
    31	.global return__init
    32	.text
    33	_gen_init__:
    34	    # 1 rdi = max_depth
    35	    # 2 rsi = returnp
    36	    # 3 rdx = &out_region
    37	    # 4 rcx = &rand_region
    38	    pushaq
    39	
    40	    leal 0(,%rdi,8), %eax
    41	    movq %rsp, %r8
    42	    subq %rax, %r8
    43	
    44	    movq %rdx, %r11                              # &out_region
    45	    mo

  9270	    inc %r14                                     # rand_region++
  9271	    movzbl %dil, %edi                           # X  --- (rdi:(edi:(di:(dil))))
  9272	    # then multiply with the choices we have
  9273	
  9274	    xor %rsi, %rsi                              # avoid data dependencies
  9275	    movb $38, %sil                          # = 38       
  9276	    movzbl %sil, %edx
  9277	    imull %edi, %edx                            # m = (short) x * (short) N)
  9278	    sarl $8, %edx                                # return (char)(m >> 8) ;
  9279	    # random ]
  9280	    # %edx now contains the selected random value from 38 options
  9281	
  9282	
  9283	    # now we have the right print quad in %edx. Load the right address and call it.
  9284	    leaq _HASH_prints(%rip), %rcx
  9285	    leaq (%rcx, %rdx, 8), %rax
  9286	    callq *(%rax)
  9287	    ret
  9288	    
  9289	
  9290	    # --- cheap -- ]
  9291	_HASH_0_fi:
  9292	
  9293	
  9294	   mo

 21228	    # %edx now contains the selected random value from 81 options
 21229	
 21230	
 21231	    # now we have the right print quad in %edx. Load the right address and call it.
 21232	    leaq _echar_prints(%rip), %rcx
 21233	    leaq (%rcx, %rdx, 8), %rax
 21234	    callq *(%rax)
 21235	    ret
 21236	    
 21237	
 21238	    # --- cheap -- ]
 21239	_echar_5_fi:
 21240	
 21241	
 21242	   movb $35, (%r13)                     # ''#''
 21243	   inc %r13                                    # out_region++   : increment a byte (r13++)
 21244	   
 21245	
 21246	    ret
 21247	            
 21248	
 21249	gen_echar_6:
 21250	    # check if the max depth is breached.
 21251	    cmpq %rsp, %r8                             # returnp(rbp) <> max_depth(r8) ?
 21252	    jle _echar_6_fi                       # returnp <= max_depth
 21253	    
 21254	
 21255	    # --- cheap -- [
 21256	
 21257	
 21258	    # [ random 
 21259	    # extract one byte from the random stream %

In [245]:
!nl -ba testers/grammar_producer_ctmmap_main.c

     1	
     2	#define _LARGEFILE64_SOURCE
     3	#define _FILE_OFFSET_BITS 64
     4	
     5	#include <stdlib.h>
     6	#include <stdio.h>
     7	#include <time.h>
     8	#include <string.h>
     9	
    10	#include <unistd.h>
    11	#include <limits.h>
    12	#include <fcntl.h>
    13	#include <sys/types.h>
    14	#include <sys/mman.h>
    15	#include <sys/stat.h>
    16	#include <math.h>
    17	
    18	
    19	int max_depth;
    20	void** max_depthp;
    21	void* stackp[INT_MAX];
    22	
    23	void gen_init__(uint32_t max_depth, void** returnp, char** out_region, uint8_t** rand_region);
    24	
    25	uint8_t* rand_region_sizep = 0;
    26	const uint64_t rand_region_size = 1ULL << 16;
    27	uint8_t rand_region_initp[rand_region_size];
    28	
    29	uint8_t* rand_regionp = rand_region_initp;
    30	
    31	char* out_regionp;
    32	int out_cursor;
    33	
    34	
    35	uint8_t
    36	__attribute__((always_inline))
    37	map(uint8_t to) {
    3

In [246]:
%cd testers
!cc -g -Ofast -o grammar_producer_ctmmap grammar_producer_ctmmap_main.c grammar_producer_ctmmap_fuzz.s
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [247]:
# II
!./testers/grammar_producer_ctmmap 0 10 10 io.x

In [248]:
!cat io.x

 -->  
 -->  
@charset '' ;   
@charset "" ; <!-- @import 				 url("		") 	 	   ; 	    <!-- @import  	 "r"   ;  	  <!-- @import  url("""")   ;  --> 	 
   [  g   ]  ,  selector {   l  : 	  0in   #7098c5  @import  ;   w  :  #264    }    
 	 @import 	 '\k '   ;   --> --> <!-- @import  url(" "" ")    ;  @import  ""  i  ,  i  ;  <!-- <!-- --> -->  
@charset '' ;  	-->	--> @import  url("") 	  ;   @import  url(" ")    ;  -->   @import 		 ''   ;   
@charset '' ;   [  a   ]  ,  selector,  selector,  selector,  selector {    } 	 <!--  
 --> @import 	 ''  j   ; 	 <!--  	-->  
@charset '' ; 	      * .l   {    j  :   ''     }  -->  <!--  	@media  o   {   } 		 @page   { 	 u  :  #867aec    ;   }  @page  : m  {  z  :  #0ab    ;   }  @page   {    }  


In [249]:
# II
class CTesterMMapCT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_ctmmap {seed} {self.max_num} {max_depth} {fn}"

In [250]:
CTesterMMapCT().run_test().show()

depth= 8 size= 99351.5 time= 0.014 stdev(0.0) throughput= 6930.210658482143 stdev(2)
depth= 16 size= 87024.5 time= 0.014 stdev(0.0) throughput= 6070.347377232143 stdev(0)
depth= 32 size= 84274.5 time= 0.017 stdev(0.004) throughput= 5105.083264802632 stdev(1094)
depth= 64 size= 84457.5 time= 0.019 stdev(0.007) throughput= 4664.015997023809 stdev(1736)
depth= 128 size= 84457.5 time= 0.021 stdev(0.006) throughput= 4075.435431985294 stdev(1098)
depth= 256 size= 84457.5 time= 0.014 stdev(0.0) throughput= 5891.287667410714 stdev(1)
Throughput of  6930.210658482143  kilobytes per second at depth =  8
Total time: 0:00:11.696641


### FWrite

#### Direct threaded VM

In [251]:
# II
class CFWriteDTFuzzer(CFWriteFuzzer):
    def fn_out_def(self): return ''
    def gen_rule_src(self, rule, k, j):
        res = []
        leaf = True
        for i, token in enumerate(rule):
            if token in self.grammar:
                leaf = False
                trules = self.grammar[token] # ordered by cost
                len_min_choices = len(self.c_grammar[token])
                assert len(trules) < 256
                cheap_strings = self.pool_of_strings[token]
                if len(cheap_strings) < 256: # we only have 255 random choices
                    check_pool = '''
        val = map(%(len_cheap_strings)s);
        const char* str = pool_%(k)s[val];
        const int str_l = pool_l_%(k)s[val];
        for (int i = 0; i < str_l; i++) {
            *out_regionp++ = str[i];
        }
        --returnp;
        goto **returnp; 
            ''' % { 'len_cheap_strings': len(cheap_strings), 'k': self.k_to_s(token)}
                else:
                    check_pool = '''
        val = map(%(len_min_choices)s);
                ''' % {'len_min_choices':len_min_choices}
                res.append('''\
    *returnp = &&return__%(i)d__%(j)d__%(k)s;
    if (returnp > max_depthp) {
        %(check_pool)s;
    } else {
        val = map(%(len_rules)s);
    }
    goto *gen_%(t)s[val];
return__%(i)d__%(j)d__%(k)s:;
            ''' % {'i':i, 'j':j, 'k':self.k_to_s(k),
                   't':self.k_to_s(token), 'rnum':0, 'len_rules':len(trules), 'len_min_choices':len_min_choices, 'check_pool':check_pool})
            else:
                t = self.esc_char(token)
                res.append('''\
    *out_regionp++ = '%s';''' % t)
        return res, leaf
    
    def gen_alt_src_1rule(self, k):
        rule = self.grammar[k][0]
        ri = 0
        src, leaf = self.gen_rule_src(rule, k, ri)
        body = '\n'.join(src)
        result = []
        if leaf:
            return '''
gen_%(name)s_0: {
%(body)s
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}
        else:
             return '''
gen_%(name)s_0: {
    ++returnp;
    // single -- no switch
%(body)s
    --returnp;
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}

    def gen_alt_src(self, k):
        rules = self.grammar[k]
        ret = self.k_to_s(k)
        result = []
        if len(rules) == 1: return self.gen_alt_src_1rule(k)
        for ri, rule in enumerate(rules):
            src, leaf = self.gen_rule_src(rule, k, ri)
            body = '\n'.join(src)
            if leaf:
                result.append('''
gen_%(name)s_%(rnum)d: {
%(body)s
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
            else:
                 result.append('''
gen_%(name)s_%(rnum)d: {
    ++returnp;
%(body)s
    --returnp;
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
        return '\n'.join(result)

    def fuzz_out_var_defs(self):
        return '''\
extern char* out_regionp;'''
    
    def fuzz_rand_var_defs(self):
        return '''
uint8_t map(uint8_t to);'''
    
    def fuzz_stack_var_defs(self):
        return '''
extern void* stackp[];
'''

    def fuzz_entry(self):
        result = ['''
void gen_init__(void** max_depthp) {
    uint8_t val;
    void** returnp = stackp;
    *returnp =  &&return__init;
''']
        for k in self.grammar:
            l = []
            for ri,rule in enumerate(self.grammar[k]):
                l.append('&&gen_%(k)s_%(ri)d' % {'k':self.k_to_s(k), 'ri':ri})
            s = '''
    void** gen_%(k)s[] = {
%(body)s
    };''' % {'k': self.k_to_s(k), 'body': ',\n'.join(l)}
            result.append(s)
        result.append('''
    goto gen_start_0;''')
        result.append(self.fuzz_fn_defs())
        result.append("""
return__init:
    *out_regionp++ = '\\n';
    return;
return_abort:
    exit(10); 
}""")
        return '\n'.join(result)

    
    def main_stack_var_defs(self):
        return'''
int max_depth;
void** max_depthp;
void* stackp[INT_MAX];
'''
    def main_init_var_defs(self):
        return'''
void gen_init__(void** max_depthp);
'''

    def fn_main_loop_frag(self):
        return '''
    fs = fdopen(out_fd, "w");
    for(int i=0; i < max_num; i++) {
        out_regionp = out_region_initp;
        gen_init__(max_depthp);
        out_cursor = out_regionp - out_region_initp;
        fwrite(out_region_initp, sizeof(char), out_cursor, fs);
    }
    '''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    long out_size;
    char* out_region_sizep = 0;
    int out_fd;
    int seed, max_num;
%(input_frag)s
    max_depthp = stackp + max_depth;
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'out_frag': self.fn_main_out_frag(),
        'loop_frag': self.fn_main_loop_frag(),
        'sync_frag': self.fn_main_sync_frag()
       }


    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_hdefs(),
                          self.fuzz_var_defs(),
                          self.fn_fuzz_decs(),
                          self.string_pool_defs(),
                          # self.fuzz_fn_defs(),
                          self.fuzz_entry()])

In [252]:
main_src, fuzz_src = CFWriteDTFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_fwritedt_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_fwritedt_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [253]:
!wc -l testers/grammar_producer_fwritedt_fuzz.c

   10236 testers/grammar_producer_fwritedt_fuzz.c


In [254]:
!cat testers/grammar_producer_fwritedt_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth;
void** max_depthp;
void* stackp[INT_MAX];


void gen_init__(void** max_depthp);

uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;

const uint64_t size = UINT_MAX; /*max size of a single input -- 4G*/
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;
FILE* fs;


uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}



In [255]:
%cd testers
!cc -g -Ofast -o grammar_producer_fwritedt grammar_producer_fwritedt_main.c grammar_producer_fwritedt_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [256]:
# II
class CTesterFWriteDT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_fwritedt {seed} {self.max_num} {max_depth} {fn}"

In [257]:
# II
!./testers/grammar_producer_fwritedt 0 10 10 io.x

In [258]:
!cat io.x

 -->  
 -->  
@charset '' ;   
@charset "" ; <!-- @import 				 url("	") 	  -x  , 	 q ,  s ;  -->   
@charset "\\j" ; 	 	  @page    { 	   }  -->  --> 
 <!-- @import 	  "\w" 	    ; 	   --> --> 	 	<!-- 	 --> 	@import  "" 	  ;  	 -->    <!--  
@charset '\|' ;  @import 	 url("  ") 	  ;   	  
@charset "" ;   * : h(   ): b(   )  , 	 selector { 	   }  
@charset "" ;   
  	   [     ,   selector {   	 : 		 url("     } 	 v    { 		   }   @page   {   p :  #8d88dc    }  -->  @media  i  { 	 *   }   @media  y  {  *    {    }  }  <!-- -->*   <!--<!-- 
 	 ''  j   ; 	 <!--  	-->  
@charset '' ; 	      * .l   {    j  :   ''     }  -->  <!--  	@media  o   {   } 		 @page   { 	 u  :  #867aec    ;   }  @page  : m  {  z  :  #0ab    ;   }  @page   {    }  


In [259]:
CTesterFWriteDT().run_test().show()

depth= 8 size= 736913 time= 0.01 stdev(0.0) throughput= 71964.16015625 stdev(14)
depth= 16 size= 1292144.5 time= 0.022 stdev(0.001) throughput= 58722.89616308171 stdev(1930)
depth= 32 size= 1608538 time= 0.03 stdev(0.004) throughput= 52889.862163299666 stdev(7476)
depth= 64 size= 1511572 time= 0.026 stdev(0.001) throughput= 57910.221604567305 stdev(1601)
depth= 128 size= 1511572 time= 0.026 stdev(0.0) throughput= 56774.78966346153 stdev(5)
depth= 256 size= 1511572 time= 0.026 stdev(0.001) throughput= 57910.221604567305 stdev(1601)
Throughput of  71964.16015625  kilobytes per second at depth =  8
Total time: 0:00:11.685617


#### Context threaded VM

In [260]:
# II
class CFWriteCTFuzzer(CFWriteDTFuzzer):
    
    def fn_choice(self, val):
        return '''
    # [ random 
    # extract one byte from the random stream %%r14,
    movq (%%r14), %%rdi
    # advance the random cursor
    inc %%r14                                     # rand_region++
    movzbl %%dil, %%edi                           # X  --- (rdi:(edi:(di:(dil))))
    # then multiply with the choices we have

    xor %%rsi, %%rsi                              # avoid data dependencies
    movb $%(val)s, %%sil                          # = %(val)s       
    movzbl %%sil, %%edx
    imull %%edi, %%edx                            # m = (short) x * (short) N)
    sarl $8, %%edx                                # return (char)(m >> 8) ;
    # random ]
    # %%edx now contains the selected random value from %(val)d options''' % {'val':val}

    def cheap_strings(self, k):
        cheap_strings = self.pool_of_strings[k]
        results = ['''
    # --- cheap -- [''']
        results.append('''
%(choices)s
''' % {'choices':self.fn_choice(len(cheap_strings)), 'len_choices': len(cheap_strings)})
        # get the choices from vm, then call it, and return.
        
        results.append('''
    # now we have the right print quad in %%edx. Load the right address and call it.
    leaq _%(key)s_prints(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ret
    ''' % {'key': self.k_to_s(k)})
        results.append('''
    # --- cheap -- ]''')
        return '\n'.join(results)
    
    def output_char(self, c):
        if len(c) != 1:
            assert c[0] == '\\'
            c = c[-1]
        return '''
   movb $%(ichar)d, (%%r13)                     # '%(char)s'
   inc %%r13                                    # out_region++   : increment a byte (r13++)
   ''' % {'char':self.esc(c), 'ichar':ord(c)}

    def gen_rule_src(self, rule, k, j):
        # in each rule, there are a number of tokens.
        # iter each token in turn, choose the right rule and call.
        result = []
        for token in rule:
            if token not in self.grammar:
                result.append(self.output_char(token))
                continue
            else:
                # how many choices do we have?
                rules = self.grammar[token]
                result.append('''
    # start the choice machine.
    # length of rules = %(len_rules)d
%(choices)s
    # --- switch ---
    ''' % {'choices': self.fn_choice(len(rules)), 'len_rules':len(rules)})
                result.append('''
    # now we have the right choice in %%edx. Load the right address and call it.
    leaq _%(key)s_choices(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ''' % {'key': self.k_to_s(token)})
        return '\n'.join(result)

    def gen_alt_src(self, k):
        result = []
        for ruleid, rule in enumerate(self.grammar[k]):
            # produce a skeletal subroutine structure.
            result.append('''
gen_%(key)s_%(ruleid)s:
    # check if the max depth is breached.
    cmpq %%rsp, %%r8                             # returnp(rbp) <> max_depth(r8) ?
    jle _%(key)s_%(ruleid)s_fi                       # returnp <= max_depth
    
%(return_cheap_string)s
_%(key)s_%(ruleid)s_fi:
''' % {'return_cheap_string': self.cheap_strings(k),
       'key':self.k_to_s(k),
       'ruleid':ruleid,
       'last_label':self.last_label})
            self.last_label += 1
            result.append(self.gen_rule_src(rule, k, ruleid))
            # we were called. So simply return.
            result.append('''
    ret
            ''')
        return '\n'.join(result)
 
    def fn_fuzz_decs(self):
        result = ['''
  .section  __DATA,__data

# Virtual Machine OPS.
        ''']
        for k in self.grammar:
            result.append('''
    .globl  _%(key)s_choices
    .p2align 4
_%(key)s_choices:''' % {'key':self.k_to_s(k)})
            for i, rule in enumerate(self.grammar[k]):
                result.append('''\
    .quad gen_%s_%d''' % (self.k_to_s(k), i))
                
        for k in self.pool_of_strings:
            result.append('''
    .globl  _%(key)s_prints
    .p2align 4
_%(key)s_prints:''' % {'key':self.k_to_s(k)})
            for string in self.pool_of_strings[k]:
                result.append('''\
    .quad %s''' % (self.all_prints[string]))
                
                
        result.append('''
# End Virtual Machine OPS.''')
        return '\n'.join(result)

    def gen_cheap(self, grammar):
        all_strings = set()
        for k in grammar:
            all_strings |= set(self.pool_of_strings[k])
        all_strings = list(all_strings)
        all_strings.sort(key=lambda item: (-len(item), item))
        all_prints_hash = {}
        result = ['''
.text
        ''']
        for i, s_ in enumerate(all_strings):
            s = s_
            result.append('''\
print_%(name)d: # "%(value)s"''' % {'name': i, 'value': self.esc(s)})
            for j in s:
                result.append('''\
    movb $%(ichar)s, (%%r13)            # '%(char)s'
    inc %%r13''' % {'ichar':ord(j), 'char':self.esc(j)})
            result.append('''\
    ret''')
            all_prints_hash[s_] = 'print_%d' % i
        return ('\n'.join(result), all_prints_hash)
 
    def fuzz_entry(self):
        result = ["""
#include "ctfwrite_vm_ops.s"
.macro pushaq
    push %%rsp
    push %%rbp
    push %%r8
    push %%r9
    push %%r10
    push %%r11
    push %%r12
    push %%r13
    push %%r14
    push %%r15
.endm


.macro popaq
    pop %%r15
    pop %%r14
    pop %%r13
    pop %%r12
    pop %%r11
    pop %%r10
    pop %%r9
    pop %%r8
    pop %%rbp
    pop %%rsp
.endm

.global %(os)sgen_init__
.global return__init
.text
%(os)sgen_init__:
    # 1 rdi = max_depth
    # 2 rsi = returnp
    # 3 rdx = &out_region
    # 4 rcx = &rand_region
    pushaq

    leal 0(,%%rdi,8), %%eax
    movq %%rsp, %%r8
    subq %%rax, %%r8

    movq %%rdx, %%r11                              # &out_region
    movq %%rcx, %%r12                              # &rand_region
    movq (%%r11),%%r13                             # out_region
    movq (%%r12),%%r14                             # rand_region

    # general regs
    # rax, rcx, rdx, rbx, rsi,rdi
    # rbp, r8-r15
    
    call gen_start_0
    movq %%r13, (%%r11)                            # *(&out_region) <-
    movq %%r14, (%%r12)                            # *(&rand_region) <-
    popaq
    movq  $0, %%rax
    ret   
""" % {'os': '_' if sys.platform == 'darwin' else ''}]
        result.append(self.fuzz_fn_defs())
        return ''.join(result)

    def main_init_var_defs(self):
        return'''
void gen_init__(uint32_t max_depth, void** returnp, char** out_region, uint8_t** rand_region);
'''

    def fn_main_loop_frag(self):
        return '''
    fs = fdopen(out_fd, "w");
    for(int i=0; i < max_num; i++) {
        out_regionp = out_region_initp;
        gen_init__(max_depth32, stackp, &out_regionp, &rand_regionp);
        *out_regionp++ = '\\n';
        out_cursor = out_regionp - out_region_initp;
        fwrite(out_region_initp, sizeof(char), out_cursor, fs);
    }
    '''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    long out_size;
    int out_fd;
    uint32_t max_depth32;
    int seed, max_num;
%(input_frag)s
    max_depth32 = max_depth;
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'out_frag': self.fn_main_out_frag(),
        'loop_frag': self.fn_main_loop_frag(),
        'sync_frag': self.fn_main_sync_frag()
       }
    
    def fuzz_src(self, key='<start>'):
        self.last_label = 0
        self.cheap, self.all_prints = self.gen_cheap(self.grammar)
        ext_strings = '\n'.join([self.fn_fuzz_decs(), self.cheap])
        return ext_strings, self.gen_main_src(), self.gen_fuzz_src()
    
    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_entry()])

In [261]:
vm_ops, main_src, fuzz_src = CFWriteCTFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_ctfwrite_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_ctfwrite_fuzz.s', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/ctfwrite_vm_ops.s', 'w+') as f:
    print(vm_ops, file=f)

In [262]:
!nl -ba testers/ctfwrite_vm_ops.s

     1	
     2	  .section  __DATA,__data
     3	
     4	# Virtual Machine OPS.
     5	        
     6	
     7	    .globl  _start_choices
     8	    .p2align 4
     9	_start_choices:
    10	    .quad gen_start_0
    11	
    12	    .globl  _stylesheet_choices
    13	    .p2align 4
    14	_stylesheet_choices:
    15	    .quad gen_stylesheet_0
    16	
    17	    .globl  _OsqCHARSET_SYM_STRING_SEMICsq_choices
    18	    .p2align 4
    19	_OsqCHARSET_SYM_STRING_SEMICsq_choices:
    20	    .quad gen_OsqCHARSET_SYM_STRING_SEMICsq_0
    21	
    22	    .globl  _OsqS_OR_CDO_OR_CDCCsq_choices
    23	    .p2align 4
    24	_OsqS_OR_CDO_OR_CDCCsq_choices:
    25	    .quad gen_OsqS_OR_CDO_OR_CDCCsq_0
    26	    .quad gen_OsqS_OR_CDO_OR_CDCCsq_1
    27	    .quad gen_OsqS_OR_CDO_OR_CDCCsq_2
    28	
    29	    .globl  _OsqXimportX_CDO_S_OR_CDC_SCsq_choices
    30	    .p2align 4
    31	_OsqXimportX_CDO_S_OR_CDC_SCsq_choices:
    32	    .quad gen_OsqXimportX_CDO_S_OR_CDC_SCsq

 12321	    movb $32, (%r13)            # ' '
 12322	    inc %r13
 12323	    movb $32, (%r13)            # ' '
 12324	    inc %r13
 12325	    movb $35, (%r13)            # '#'
 12326	    inc %r13
 12327	    movb $52, (%r13)            # '4'
 12328	    inc %r13
 12329	    movb $56, (%r13)            # '8'
 12330	    inc %r13
 12331	    movb $57, (%r13)            # '9'
 12332	    inc %r13
 12333	    movb $57, (%r13)            # '9'
 12334	    inc %r13
 12335	    movb $50, (%r13)            # '2'
 12336	    inc %r13
 12337	    movb $102, (%r13)            # 'f'
 12338	    inc %r13
 12339	    movb $32, (%r13)            # ' '
 12340	    inc %r13
 12341	    movb $32, (%r13)            # ' '
 12342	    inc %r13
 12343	    movb $32, (%r13)            # ' '
 12344	    inc %r13
 12345	    ret
 12346	print_196: # "t  :  #5b37d8   "
 12347	    movb $116, (%r13)            # 't'
 12348	    inc %r13
 12349	    movb $32, (%r13)            # ' '
 12350	    inc %r13
 123

 26521	    ret
 26522	print_758: # "#f67d0d  "
 26523	    movb $35, (%r13)            # '#'
 26524	    inc %r13
 26525	    movb $102, (%r13)            # 'f'
 26526	    inc %r13
 26527	    movb $54, (%r13)            # '6'
 26528	    inc %r13
 26529	    movb $55, (%r13)            # '7'
 26530	    inc %r13
 26531	    movb $100, (%r13)            # 'd'
 26532	    inc %r13
 26533	    movb $48, (%r13)            # '0'
 26534	    inc %r13
 26535	    movb $100, (%r13)            # 'd'
 26536	    inc %r13
 26537	    movb $32, (%r13)            # ' '
 26538	    inc %r13
 26539	    movb $32, (%r13)            # ' '
 26540	    inc %r13
 26541	    ret
 26542	print_759: # "#f88eac  "
 26543	    movb $35, (%r13)            # '#'
 26544	    inc %r13
 26545	    movb $102, (%r13)            # 'f'
 26546	    inc %r13
 26547	    movb $56, (%r13)            # '8'
 26548	    inc %r13
 26549	    movb $56, (%r13)            # '8'
 26550	    inc %r13
 26551	    movb $101, (%r13

 40213	    movb $48, (%r13)            # '0'
 40214	    inc %r13
 40215	    movb $32, (%r13)            # ' '
 40216	    inc %r13
 40217	    ret
 40218	print_1655: # "#338 "
 40219	    movb $35, (%r13)            # '#'
 40220	    inc %r13
 40221	    movb $51, (%r13)            # '3'
 40222	    inc %r13
 40223	    movb $51, (%r13)            # '3'
 40224	    inc %r13
 40225	    movb $56, (%r13)            # '8'
 40226	    inc %r13
 40227	    movb $32, (%r13)            # ' '
 40228	    inc %r13
 40229	    ret
 40230	print_1656: # "#33f "
 40231	    movb $35, (%r13)            # '#'
 40232	    inc %r13
 40233	    movb $51, (%r13)            # '3'
 40234	    inc %r13
 40235	    movb $51, (%r13)            # '3'
 40236	    inc %r13
 40237	    movb $102, (%r13)            # 'f'
 40238	    inc %r13
 40239	    movb $32, (%r13)            # ' '
 40240	    inc %r13
 40241	    ret
 40242	print_1657: # "#34a "
 40243	    movb $35, (%r13)            # '#'
 40244	    

In [263]:
!wc -l testers/grammar_producer_ctfwrite_fuzz.s

   32927 testers/grammar_producer_ctfwrite_fuzz.s


In [264]:
!cat testers/grammar_producer_ctfwrite_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth;
void** max_depthp;
void* stackp[INT_MAX];


void gen_init__(uint32_t max_depth, void** returnp, char** out_region, uint8_t** rand_region);

uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;

const uint64_t size = UINT_MAX; /*max size of a single input -- 4G*/
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;
FILE* fs;


uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;

In [265]:
%cd testers
!cc -g -Ofast -o grammar_producer_ctfwrite grammar_producer_ctfwrite_main.c grammar_producer_ctfwrite_fuzz.s
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [266]:
# II
!./testers/grammar_producer_ctfwrite 0 10 10 io.x

In [267]:
!cat io.x

 -->  
 -->  
@charset '' ;   
@charset "" ; <!-- @import 			  url("		") 	 		  ; 	  	 <!-- @import  	 "A"   ;  		 <!-- @import  url("""")   ;  -->   
   [  s   ]  ,  selector {   u  :    4   v  @import  ; 	 _  :  #b00c40    }    
 	 @import 	 '\-{'   ;   --> --> <!-- @import  url(" "" ")    ;  @import  ""  v  ,  e  ;  <!-- <!-- --> <!--  
@charset '' ;  	-->	--> @import  url("") 	  ;   @import  url(" ")    ;  -->  	@import 	  ''   ;   
@charset '' ;   [  u   ]  ,  selector,  selector,  selector,  selector {    } 	 <!--  
 --> @import 	 ''  q   ; 	 <!--  	-->  
@charset '' ; 	      * .t   {    f  : 	 ''     }  -->  <!--   @media  a   {   } 	  @page   {   r  :  #58b    ;   }  @page  : u  {  l  :  #077    ;   }  @page   {    }  
   }  


In [268]:
class CTesterFWriteCT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_ctfwrite {seed} {self.max_num} {max_depth} {fn}"

In [269]:
CTesterFWriteCT().run_test().show()

depth= 8 size= 99345.5 time= 0.003 stdev(0.0) throughput= 32339.029947916664 stdev(10)
depth= 16 size= 87076 time= 0.003 stdev(0.0) throughput= 28345.052083333332 stdev(3)
depth= 32 size= 84274.5 time= 0.003 stdev(0.0) throughput= 27433.10546875 stdev(3)
depth= 64 size= 84457.5 time= 0.003 stdev(0.0) throughput= 27492.67578125 stdev(3)
depth= 128 size= 84457.5 time= 0.004 stdev(0.001) throughput= 21994.498697916664 stdev(7778)
depth= 256 size= 84457.5 time= 0.003 stdev(0.0) throughput= 27492.67578125 stdev(3)
Throughput of  32339.029947916664  kilobytes per second at depth =  8
Total time: 0:00:11.323014


### No Output

#### Direct threaded VM

In [270]:
class DTNoWriteFuzzer(CNoWriteFuzzer):
    def fn_out_def(self): return ''
    def fn_main_out_frag(self): return ''
    def gen_rule_src(self, rule, k, j):
        res = []
        leaf = True
        for i, token in enumerate(rule):
            if token in self.grammar:
                leaf = False
                trules = self.grammar[token] # ordered by cost
                len_min_choices = len(self.c_grammar[token])
                assert len(trules) < 256
                cheap_strings = self.pool_of_strings[token]
                if len(cheap_strings) < 256: # we only have 255 random choices
                    check_pool = '''
        val = map(%(len_cheap_strings)s);
        const char* str = pool_%(k)s[val];
        const int str_l = pool_l_%(k)s[val];
        for (int i = 0; i < str_l; i++) {
            *out_regionp++ = str[i];
        }
        --returnp;
        goto **returnp; 
            ''' % { 'len_cheap_strings': len(cheap_strings), 'k': self.k_to_s(token)}
                else:
                    check_pool = '''
        val = map(%(len_min_choices)s);
                ''' % {'len_min_choices':len_min_choices}
                res.append('''\
    *returnp = &&return__%(i)d__%(j)d__%(k)s;
    if (returnp > max_depthp) {
        %(check_pool)s;
    } else {
        val = map(%(len_rules)s);
    }
    goto *gen_%(t)s[val];
return__%(i)d__%(j)d__%(k)s:;
            ''' % {'i':i, 'j':j, 'k':self.k_to_s(k),
                   't':self.k_to_s(token), 'rnum':0, 'len_rules':len(trules), 'len_min_choices':len_min_choices, 'check_pool':check_pool})
            else:
                t = self.esc_char(token)
                res.append('''\
    *out_regionp++ = '%s';''' % t)
        return res, leaf
    
    def gen_alt_src_1rule(self, k):
        rule = self.grammar[k][0]
        ri = 0
        src, leaf = self.gen_rule_src(rule, k, ri)
        body = '\n'.join(src)
        result = []
        if leaf:
            return '''
gen_%(name)s_0: {
%(body)s
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}
        else:
             return '''
gen_%(name)s_0: {
    ++returnp;
    // single -- no switch
%(body)s
    --returnp;
    goto **returnp;
}''' % {'name':self.k_to_s(k), 'body':body}

    def gen_alt_src(self, k):
        rules = self.grammar[k]
        ret = self.k_to_s(k)
        result = []
        if len(rules) == 1: return self.gen_alt_src_1rule(k)
        for ri, rule in enumerate(rules):
            src, leaf = self.gen_rule_src(rule, k, ri)
            body = '\n'.join(src)
            if leaf:
                result.append('''
gen_%(name)s_%(rnum)d: {
%(body)s
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
            else:
                 result.append('''
gen_%(name)s_%(rnum)d: {
    ++returnp;
%(body)s
    --returnp;
    goto **returnp;
}
    ''' % {'name': self.k_to_s(k), 'rnum': ri, 'body':body})
        return '\n'.join(result)

    def fuzz_out_var_defs(self):
        return '''\
extern char* out_regionp;'''
    
    def fuzz_rand_var_defs(self):
        return '''
uint8_t map(uint8_t to);'''
    
    def fuzz_stack_var_defs(self):
        return '''
extern void* stackp[];
'''

    def fuzz_entry(self):
        result = ['''
void gen_init__(void** max_depthp) {
    uint8_t val;
    void** returnp = stackp;
    *returnp =  &&return__init;
''']
        for k in self.grammar:
            l = []
            for ri,rule in enumerate(self.grammar[k]):
                l.append('&&gen_%(k)s_%(ri)d' % {'k':self.k_to_s(k), 'ri':ri})
            s = '''
    void** gen_%(k)s[] = {
%(body)s
    };''' % {'k': self.k_to_s(k), 'body': ',\n'.join(l)}
            result.append(s)
        result.append('''
    goto gen_start_0;''')
        result.append(self.fuzz_fn_defs())
        result.append("""
return__init:
    *out_regionp++ = '\\n';
    return;
return_abort:
    exit(10); 
}""")
        return '\n'.join(result)
    
    def main_stack_var_defs(self):
        return'''
int max_depth;
void** max_depthp;
void* stackp[INT_MAX];
'''
    def main_init_var_defs(self):
        return'''
void gen_init__(void** max_depthp);
'''

    def fn_main_loop_frag(self):
        return '''
    uint64_t out_size = 0;
    for(int i=0; i < max_num; i++) {
        out_regionp = out_region_initp;
        gen_init__(max_depthp);
        out_cursor = out_regionp - out_region_initp;
        out_size += out_cursor;
    }
    printf("%lld\\n", out_size);
    '''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    char* out_region_sizep = 0;
    int out_fd;
    int seed, max_num;
%(input_frag)s
    max_depthp = stackp + max_depth;
%(rand_frag)s
%(out_frag)s
%(loop_frag)s
%(sync_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'out_frag': self.fn_main_out_frag(),
        'loop_frag': self.fn_main_loop_frag(),
        'sync_frag': self.fn_main_sync_frag()
       }


    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_hdefs(),
                          self.fuzz_var_defs(),
                          self.fn_fuzz_decs(),
                          self.string_pool_defs(),
                          # self.fuzz_fn_defs(),
                          self.fuzz_entry()])

In [271]:
main_src, fuzz_src = DTNoWriteFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_dtnowrite_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_dtnowrite_fuzz.c', 'w+') as f:
    print(fuzz_src, file=f)

In [272]:
!cat testers/grammar_producer_dtnowrite_main.c


#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <math.h>


int max_depth;
void** max_depthp;
void* stackp[INT_MAX];


void gen_init__(void** max_depthp);

uint8_t* rand_region_sizep = 0;
const uint64_t rand_region_size = 1ULL << 16;
uint8_t rand_region_initp[rand_region_size];

uint8_t* rand_regionp = rand_region_initp;

const uint64_t size = UINT_MAX; // size of a single output item -- 4G
char out_region_initp[size];
char *out_regionp = out_region_initp;
uint64_t out_cursor = 0;

uint8_t
__attribute__((always_inline))
map(uint8_t to) {
    uint8_t from = *rand_regionp++;
    if (rand_regionp >= rand_region_sizep)
        rand_regionp = rand_region_initp;
    return ((uint16_t) from * (uint16_t) to) >> 8;
}


static inli

In [273]:
%cd testers
!cc -g -Ofast -o grammar_producer_dtnowrite grammar_producer_dtnowrite_main.c grammar_producer_dtnowrite_fuzz.c
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [274]:
!./testers/grammar_producer_dtnowrite  0 10 10

538


In [275]:
class CTesterNoWriteDT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_dtnowrite {seed} {self.max_num} {max_depth} > {fn}"
  
    def post_time(self):
        super().post_time()
        with open(self.file) as f:
            self.size = int(f.read())

In [276]:
CTesterNoWriteDT().run_test().show()

depth= 8 size= 736258.5 time= 0.007 stdev(0.001) throughput= 96294.11969866071 stdev(9061)
depth= 16 size= 1292505.5 time= 0.018 stdev(0.004) throughput= 72126.25093005953 stdev(16999)
depth= 32 size= 1608608 time= 0.023 stdev(0.001) throughput= 68429.79847301135 stdev(4213)
depth= 64 size= 1511572 time= 0.02 stdev(0.0) throughput= 73807.2265625 stdev(6)
depth= 128 size= 1511572 time= 0.025 stdev(0.006) throughput= 62353.73703529095 stdev(16192)
depth= 256 size= 1511572 time= 0.023 stdev(0.001) throughput= 64301.90762606534 stdev(3959)
Throughput of  96294.11969866071  kilobytes per second at depth =  8
Total time: 0:00:11.561921


#### Context threaded VM

In [277]:
class CTNoWriteFuzzer(DTNoWriteFuzzer):
    def fn_main_out_frag(self): return ''
    def fn_main_sync_frag(self): return ''
    
    def fn_choice(self, val):
        return '''
    # [ random 
    # extract one byte from the random stream %%r14,
    movq (%%r14), %%rdi
    # advance the random cursor
    inc %%r14                                     # rand_region++
    movzbl %%dil, %%edi                           # X  --- (rdi:(edi:(di:(dil))))
    # then multiply with the choices we have

    xor %%rsi, %%rsi                              # avoid data dependencies
    movb $%(val)s, %%sil                          # = %(val)s       
    movzbl %%sil, %%edx
    imull %%edi, %%edx                            # m = (short) x * (short) N)
    sarl $8, %%edx                                # return (char)(m >> 8) ;
    # random ]
    # %%edx now contains the selected random value from %(val)d options''' % {'val':val}

    def cheap_strings(self, k):
        cheap_strings = self.pool_of_strings[k]
        results = ['''
    # --- cheap -- [''']
        results.append('''
%(choices)s
''' % {'choices':self.fn_choice(len(cheap_strings)), 'len_choices': len(cheap_strings)})
        # get the choices from vm, then call it, and return.
        
        results.append('''
    # now we have the right print quad in %%edx. Load the right address and call it.
    leaq _%(key)s_prints(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ret
    ''' % {'key': self.k_to_s(k)})
        results.append('''
    # --- cheap -- ]''')
        return '\n'.join(results)
    
    def output_char(self, c):
        if len(c) != 1:
            assert c[0] == '\\'
            c = c[-1]
        return '''
   movb $%(ichar)d, (%%r13)                     # '%(char)s'
   inc %%r13                                    # out_region++   : increment a byte (r13++)
   ''' % {'char':self.esc(c), 'ichar':ord(c)}

    def gen_rule_src(self, rule, k, j):
        # in each rule, there are a number of tokens.
        # iter each token in turn, choose the right rule and call.
        result = []
        for token in rule:
            if token not in self.grammar:
                result.append(self.output_char(token))
                continue
            else:
                # how many choices do we have?
                rules = self.grammar[token]
                result.append('''
    # start the choice machine.
    # length of rules = %(len_rules)d
%(choices)s
    # --- switch ---
    ''' % {'choices': self.fn_choice(len(rules)), 'len_rules':len(rules)})
                result.append('''
    # now we have the right choice in %%edx. Load the right address and call it.
    leaq _%(key)s_choices(%%rip), %%rcx
    leaq (%%rcx, %%rdx, 8), %%rax
    callq *(%%rax)
    ''' % {'key': self.k_to_s(token)})
        return '\n'.join(result)

    def gen_alt_src(self, k):
        result = []
        for ruleid, rule in enumerate(self.grammar[k]):
            # produce a skeletal subroutine structure.
            result.append('''
gen_%(key)s_%(ruleid)s:
    # check if the max depth is breached.
    cmpq %%rsp, %%r8                             # returnp(rbp) <> max_depth(r8) ?
    jle _%(key)s_%(ruleid)s_fi                       # returnp <= max_depth
    
%(return_cheap_string)s
_%(key)s_%(ruleid)s_fi:
''' % {'return_cheap_string': self.cheap_strings(k),
       'key':self.k_to_s(k),
       'ruleid':ruleid,
       'last_label':self.last_label})
            self.last_label += 1
            result.append(self.gen_rule_src(rule, k, ruleid))
            # we were called. So simply return.
            result.append('''
    ret
            ''')
        return '\n'.join(result)
 
    def fn_fuzz_decs(self):
        result = ['''
  .section  __DATA,__data

# Virtual Machine OPS.
        ''']
        for k in self.grammar:
            result.append('''
    .globl  _%(key)s_choices
    .p2align 4
_%(key)s_choices:''' % {'key':self.k_to_s(k)})
            for i, rule in enumerate(self.grammar[k]):
                result.append('''\
    .quad gen_%s_%d''' % (self.k_to_s(k), i))
                
        for k in self.pool_of_strings:
            result.append('''
    .globl  _%(key)s_prints
    .p2align 4
_%(key)s_prints:''' % {'key':self.k_to_s(k)})
            for string in self.pool_of_strings[k]:
                result.append('''\
    .quad %s''' % (self.all_prints[string]))
                
                
        result.append('''
# End Virtual Machine OPS.''')
        return '\n'.join(result)

    def gen_cheap(self, grammar):
        all_strings = set()
        for k in grammar:
            all_strings |= set(self.pool_of_strings[k])
        all_strings = list(all_strings)
        all_strings.sort(key=lambda item: (-len(item), item))
        all_prints_hash = {}
        result = ['''
.text
        ''']
        for i, s_ in enumerate(all_strings):
            s = s_
            result.append('''\
print_%(name)d: # "%(value)s"''' % {'name': i, 'value': self.esc(s)})
            for j in s:
                result.append('''\
    movb $%(ichar)s, (%%r13)            # '%(char)s'
    inc %%r13''' % {'ichar':ord(j), 'char':self.esc(j)})
            result.append('''\
    ret''')
            all_prints_hash[s_] = 'print_%d' % i
        return ('\n'.join(result), all_prints_hash)
 
    def fuzz_entry(self):
        result = ["""
#include "ctnowrite_vm_ops.s"
.macro pushaq
    push %%rsp
    push %%rbp
    push %%r8
    push %%r9
    push %%r10
    push %%r11
    push %%r12
    push %%r13
    push %%r14
    push %%r15
.endm


.macro popaq
    pop %%r15
    pop %%r14
    pop %%r13
    pop %%r12
    pop %%r11
    pop %%r10
    pop %%r9
    pop %%r8
    pop %%rbp
    pop %%rsp
.endm

.global %(os)sgen_init__
.global return__init
.text
%(os)sgen_init__:
    # 1 rdi = max_depth
    # 2 rsi = returnp
    # 3 rdx = &out_region
    # 4 rcx = &rand_region
    pushaq

    leal 0(,%%rdi,8), %%eax
    movq %%rsp, %%r8
    subq %%rax, %%r8

    movq %%rdx, %%r11                              # &out_region
    movq %%rcx, %%r12                              # &rand_region
    movq (%%r11),%%r13                             # out_region
    movq (%%r12),%%r14                             # rand_region

    # general regs
    # rax, rcx, rdx, rbx, rsi,rdi
    # rbp, r8-r15
    
    call gen_start_0
    movq %%r13, (%%r11)                            # *(&out_region) <-
    movq %%r14, (%%r12)                            # *(&rand_region) <-
    popaq
    movq  $0, %%rax
    ret   
""" % {'os': '_' if sys.platform == 'darwin' else ''}]
        result.append(self.fuzz_fn_defs())
        return ''.join(result)

    def main_init_var_defs(self):
        return'''
void gen_init__(uint32_t max_depth, void** returnp, char** out_region, uint8_t** rand_region);
'''

    def fn_main_loop_frag(self):
        return '''
    uint64_t out_size = 0;
    for(int i=0; i < max_num; i++) {
        out_regionp = out_region_initp;
        gen_init__(max_depth32, stackp, &out_regionp, &rand_regionp);
        *out_regionp++ = '\\n';
        out_cursor = out_regionp - out_region_initp;
        out_size += out_cursor;
    }
    printf("%lld\\n", out_size);
    '''
    
    def fn_main_def(self):
        return self.fn_truncateio() + '''
int main(int argc, char** argv) {
    struct stat st;
    int rand_fd;
    uint32_t max_depth32;
    int seed, max_num;
%(input_frag)s
    max_depth32 = max_depth;
%(rand_frag)s
%(loop_frag)s
    return 0;
}''' % {'input_frag': self.fn_main_input_frag(),
        'rand_frag': self.fn_main_rand_frag(),
        'loop_frag': self.fn_main_loop_frag()
       }
    
    def fuzz_src(self, key='<start>'):
        self.last_label = 0
        self.cheap, self.all_prints = self.gen_cheap(self.grammar)
        ext_strings = '\n'.join([self.fn_fuzz_decs(), self.cheap])
        return ext_strings, self.gen_main_src(), self.gen_fuzz_src()
    
    def gen_fuzz_src(self):
        return '\n'.join([self.fuzz_entry()])

In [278]:
vm_ops, main_src, fuzz_src = CTNoWriteFuzzer(c_grammar).fuzz_src()
with open('testers/grammar_producer_ctnowrite_main.c', 'w+') as f:
    print(main_src, file=f)
with open('testers/grammar_producer_ctnowrite_fuzz.s', 'w+') as f:
    print(fuzz_src, file=f)
with open('testers/ctnowrite_vm_ops.s', 'w+') as f:
    print(vm_ops, file=f)

In [279]:
%cd testers
!cc -g -Ofast -o grammar_producer_ctnowrite grammar_producer_ctnowrite_main.c grammar_producer_ctnowrite_fuzz.s
%cd ..

/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix/testers
/Users/rahul/Research/fastgrammarfuzzing/notebook/usenix


In [280]:
!./testers/grammar_producer_ctnowrite 0 10 10

744


In [281]:
class CTesterNoWriteCT(CTester):
    def exec_program(self, seed, max_depth, t):
        fn = self.ofile(max_depth, seed)
        return f"./testers/grammar_producer_ctnowrite {seed} {self.max_num} {max_depth} > {fn}"
 
    def post_time(self):
        super().post_time()
        with open(self.file) as f:
            self.size = int(f.read())

In [282]:
CTesterNoWriteCT().run_test().show()

depth= 8 size= 99364 time= 0.003 stdev(0.001) throughput= 40432.45442708333 stdev(11444)
depth= 16 size= 87005.5 time= 0.003 stdev(0.0) throughput= 28322.102864583332 stdev(2)
depth= 32 size= 84277.5 time= 0.003 stdev(0.0) throughput= 27434.08203125 stdev(3)
depth= 64 size= 84457.5 time= 0.003 stdev(0.0) throughput= 27492.67578125 stdev(3)
depth= 128 size= 84457.5 time= 0.003 stdev(0.0) throughput= 27492.67578125 stdev(3)
depth= 256 size= 84457.5 time= 0.003 stdev(0.0) throughput= 27492.67578125 stdev(3)
Throughput of  40432.45442708333  kilobytes per second at depth =  8
Total time: 0:00:11.334354


# Results

In [283]:
for k in TX:
    print(k)
    for depth in TX[k]:
        print(depth)
        if 'avgruntime' not in TX[k][depth]: continue
        print('\truntime =',TX[k][depth]['avgruntime'])
        print('\tsize = ',TX[k][depth]['avgsize'])
        print('\tthroughput =',TX[k][depth]['avgthroughput'])
    print()
    
END_TIME = datetime.now()

RandomTester
8
	runtime = 0.038000000000000006
	size =  14.5
	throughput = 0.3732323068260568
16
	runtime = 0.037000000000000005
	size =  22.5
	throughput = 0.5939327485380117
32
	runtime = 0.036500000000000005
	size =  41
	throughput = 1.09789965746997
64
	runtime = 0.037000000000000005
	size =  75
	throughput = 1.9795185810810807
128
	runtime = 0.037500000000000006
	size =  146
	throughput = 3.80553765113798
256
	runtime = 0.03900000000000001
	size =  279.5
	throughput = 7.006208285266974

GrammarinatorTester
8
	runtime = 4.9645
	size =  283535.5
	throughput = 55.77911743717848
16
	runtime = 7.3575
	size =  450632
	throughput = 59.81331993314168
32
	runtime = 7.0809999999999995
	size =  454815
	throughput = 62.743195759392094
64
	runtime = 6.952
	size =  456442
	throughput = 64.11886914896179
128
	runtime = 7.054
	size =  457099.5
	throughput = 63.2813654472841
256
	runtime = 7.0055
	size =  456886
	throughput = 63.69052207146876

GramFuzzTester
8
	runtime = 3.2455
	size =  559219.5


	runtime = 0.014
	size =  99351.5
	throughput = 6930.210658482143
16
	runtime = 0.014
	size =  87024.5
	throughput = 6070.347377232143
32
	runtime = 0.0165
	size =  84274.5
	throughput = 5105.083264802632
64
	runtime = 0.019
	size =  84457.5
	throughput = 4664.015997023809
128
	runtime = 0.021
	size =  84457.5
	throughput = 4075.435431985294
256
	runtime = 0.014
	size =  84457.5
	throughput = 5891.287667410714

CTesterFWriteDT
8
	runtime = 0.01
	size =  736913
	throughput = 71964.16015625
16
	runtime = 0.021500000000000002
	size =  1292144.5
	throughput = 58722.89616308171
32
	runtime = 0.03
	size =  1608538
	throughput = 52889.862163299666
64
	runtime = 0.025500000000000002
	size =  1511572
	throughput = 57910.221604567305
128
	runtime = 0.026000000000000002
	size =  1511572
	throughput = 56774.78966346153
256
	runtime = 0.025500000000000002
	size =  1511572
	throughput = 57910.221604567305

CTesterFWriteCT
8
	runtime = 0.003
	size =  99345.5
	throughput = 32339.029947916664
16
	runti

In [284]:
str(END_TIME - START_TIME)

'1:09:05.088692'

In [285]:
import json

In [286]:
!mkdir -p results

In [287]:
from datetime import datetime
curtime = datetime.now().isoformat()
name = 'results/tx-%s.json' % curtime
with open(name, 'w+') as f:
    print(json.dumps(TX), file=f)
print(name)

results/tx-2019-08-23T11:25:38.652068.json


# Grammar Transformations

## Speed vs Code size Tradeoffs

A sliding scale of

* Completely String pools
* Compile to a state machine (CFG to Regular expression of fixed depth)
* Encode depth in function name (Remove depth comparisons)

### Expanding the use of string pools from just closing to before max_depth is exhausted

 We are not stuck with a pool of strings only after exhaustion of max_depth. But we will have to account for differing probabilties of different strings if we want to achieve a distribution of strings as the original. Whether it is required to have the same distribution as the original is a different question (because the original is clearly non-optimial -- shallow paths have more chance of being explroed again).

## Inlining

For inlining, we simply iterate through each key in the grammar, and each rule corresponding to a single key. For each rule, we inline one level, which will give us a list of corresponding rules. This set of rules will replace the original rule for the key.

# Probabilistic Fuzzing

One of the problems with our dumb grammar fuzzer is that each alternative rule for a key expansion is given the same probability. Hence, given a JSON element that can be a boolean, number, object or an array, the boolean (true and false) values will occur very 5 elements. This is clearly non-optimal. Hence, we need to extend our fuzzer to include probabilities in the grammar definition.

Once we adopt probabilistic fuzzing, we can generate a probabilistic profile of the grammar rules by expanding the grammar to a given depth, and simply counting the number of complete items produced by each expansion. This can ensure that there is a high probability of exploring at least to that depth.

In [288]:
from functools import reduce
import operator

def items_in_rule(grammar, rule, depth, max_depth):
    if depth > max_depth: return 1
    return reduce(operator.mul,
                  [items_in_key(grammar, key, depth+1, max_depth)
                   for key in  rule], 1)


def items_in_key(grammar, key, depth=0, max_depth=10):
    if key not in grammar: return 1
    return sum(items_in_rule(grammar, rule, depth, max_depth)
               for rule in grammar[key])

def explore_grammar(grammar, max_depth):
    new_g = {}
    for k in grammar:
        new_rules = []
        for rule in grammar[k]:
            items = items_in_rule(grammar, rule, depth=0, max_depth=max_depth)
            new_rule = (rule, items)
            new_rules.append(new_rule)
        new_g[k] = new_rules
    return new_g

def to_ranges(pgrammar):
    new_g = {}
    for k in pgrammar:
        last = 0
        elts = []
        for elt in pgrammar[k]:
            rule, count = elt
            frm = last
            last += count
            to = last
            new_elt = rule, (frm, to)
            elts.append(new_elt)
        new_g[k] = elts
    return new_g
p_grammar = explore_grammar(my_grammar, 1)

In [289]:
p_grammar

{'<start>': [(['<stylesheet>'], 16)],
 '<stylesheet>': [(['<[CHARSET_SYM_STRING_SEMI]-1>',
    ' ',
    '<[S_OR_CDO_OR_CDC]-1>',
    ' ',
    '<[import_CDO_S_OR_CDC_S]-1>',
    ' ',
    '<[stylesheet_closing_GROUPING]-1>'],
   126)],
 '<[CHARSET_SYM_STRING_SEMI]>': [(['<CHARSET_SYM>', ' ', '<STRING>', ' ;'],
   2)],
 '<[S_OR_CDO_OR_CDC]>': [(['<Sp>'], 2), (['<CDO>'], 1), (['<CDC>'], 1)],
 '<[import_CDO_S_OR_CDC_S]>': [(['<import>', ' ', '<[CDO_S_OR_CDC_S]-1>'],
   20)],
 '<[CDO_S_OR_CDC_S]>': [(['<CDO>', ' ', '<Ss>'], 2),
  (['<CDC>', ' ', '<Ss>'], 2)],
 '<[ruleset_OR_media_OR_page]>': [(['<ruleset>'], 8),
  (['<media>'], 2),
  (['<page>'], 8)],
 '<[stylesheet_closing_GROUPING]>': [(['<[ruleset_OR_media_OR_page]>',
    ' ',
    '<[CDO_S_OR_CDC_S]-2>'],
   15)],
 '<import>': [(['<IMPORT_SYM>',
    ' ',
    '<Ss>',
    ' ',
    '<[STRING_OR_URI]>',
    ' ',
    '<Ss>',
    ' ',
    '<media_list-1>',
    ' ; ',
    '<Ss>'],
   64)],
 '<[STRING_OR_URI]>': [(['<STRING>'], 2), (['<URI>'], 12

In [290]:
def get_included_rule(idx, rules_):
    for r,rng in rules_:
        if rng[0] <= idx and idx < rng[1]: return r
    assert False

def gen_key(grammar, key, depth=0, max_depth=10):
    if key not in grammar: return [key]
    if depth > max_depth: return [random.choice(pool_of_strings[key])]
    
    rules_ = grammar[key]
    max_val = max([j for rule, (i,j) in rules_])
    rule = get_included_rule(random.randrange(max_val), rules_)
    return gen_rule(grammar, rule, depth+1, max_depth)

def gen_rule(grammar, rule, depth, max_depth):
    return sum([gen_key(grammar, token, depth, max_depth) for token in rule], [])

def grammar_producer_p(grammar, key='<start>'):
    cp_grammar = to_ranges(grammar)
    return ''.join(gen_key(cp_grammar, key))

In [291]:
pf = PooledFuzzer(my_grammar)
pool_of_strings = pf.pool_of_strings
for i in range(10):
    print(grammar_producer_p(p_grammar))

@charset "$}\_~/" ;  				  	  			  --> @import 	  url(" \E\v* ") 		 		 -\Qy 		  ;   		  <!-- 		  -->  	-->  	--> --> <!-- <!-- @import 	 	 url("'h'")  c   ,  s ,  u ,  t  ;    	 --> 			<!-- 		<!--  --> --> <!--  @page      : -\Ox 		 { 	 		   } 			  --> 			 <!--    -->   --> 	<!-- <!-- --> 
 	 		 	 --><!----> 	   -->--><!-- @import  	  url("\j\:!")  			  \s- 		 , 	 u ,  l ,  u  ; 	 	 -->    	--> 	 	<!--   --> 	@import 		 	 url("	'~' ")      _ 	 ,  q  ;   	  <!--   	<!-- 	 <!--  <!-- --> <!-- @import 			 url(""" ") 			 a  ,  m ,  a  ; 			 --> 	 <!-- 	--> <!-- <!-- @import 	 url("""	")    l   ; 		 <!-- 	<!-- --> <!-- @import   url("''") 	 h   ; 	 <!--  @page  	   { 			  z   : 		 ""   #b81  #c1e11b   ; 	   } 		 	 <!-- 	  <!-- 	--> 	<!-- 	--> --> <!-- #0[  f   ][  d   ] >  *   ,  selector,  selector,  selector,  selector { 		  p  : 	 url("")   #c1c   ; 	 ;  b  :  #d39   ;  ;   }   <!-- 		-->   --> 	i [  l   ] 	   {  	 z  :  h     } 	  <!-- 	 -->  --> <!-- --> .a    ,  selector {    ;  ;   }

# Generating Large Inputs Fast