# Generating Samples

In this notebook we will show how to generate valid samples for a given parser without using a grammar.

## Examples

Convenience utilities

In [1]:
import src.utils as utils

### Calculator.py

In [2]:
calculator = utils.load_file('subjects/calculator.py', 'calculator')

#### The error handler

We often need to interpret the error we get back. We use a simple context manager, `utils.ExpectError`, to capture the error.

In [3]:
with utils.ExpectError():
     calculator.main('xyz')

('xyz', 0)


#### A random fuzzer.

In [4]:
import random
random.seed(0)

In [5]:
import string

In [6]:
def fuzzer(max_length=100):
    """Return a random string of 1 to ``max_length`` printable characters.

    The length is drawn first; every character is then chosen independently
    from ``string.printable``.
    """
    alphabet = string.printable
    n_chars = random.randint(1, max_length)
    picked = []
    for _ in range(n_chars):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [7]:
fuzzer()

"\rR5x$!PCZJ-r#hAh\nc<w'{:iDc~9^GY*cJTE;>q)ZU%x7)1b}P"

What happens if you feed this input to the program?

In [8]:
with utils.ExpectError():
    s = fuzzer()
    print(repr(s))
    calculator.main(s)

'\\=0;"Gv~F{8o+sui(VbaE$!dC)B{f)G(q:).AUb/NE,uBnon4;[xY8b]\ngj4a`(^O{&z%ur].R-zV"[?`JaF;e!.=Go'


('\\=0;"Gv~F{8o+sui(VbaE$!dC)B{f)G(q:).AUb/NE,uBnon4;[xY8b]\ngj4a`(^O{&z%ur].R-zV"[?`JaF;e!.=Go', 0)


This is rather unsatisfying. We need a better way to reach deeper into the program. Let us observe the error again, this time with a plausible partial input.

In [9]:
with utils.ExpectError():
     calculator.main('(1+2)de')

('(1+2)de', 5)


As you can see, the exception tells us exactly where the parse error occurred: its second element is the index of the first character that could not be parsed.

In [10]:
'(1+2)de'[0:5]

'(1+2)'

Can we make use of it to construct better inputs?

In [11]:
import enum

In [12]:
class ExprStatus(enum.Enum):
    """Outcome of feeding one input string to the calculator parser."""
    # The parser ran to the end without raising.
    Complete = 0
    # The reported error position is at or beyond the end of the input.
    Unterminated = -1
    # The reported error position lies inside the input.
    Unexpected = -2

In [13]:
class ExpectExprError:
    """Context manager that classifies the outcome of parsing ``s``.

    Run the parser inside ``with ExpectExprError(s) as e:``.  Afterwards:

    * ``e.result`` is an ``ExprStatus`` member,
    * ``e.boundary`` is the error position reported by the parser
      (``0`` when no error was raised),
    * ``e.msg`` is the text of the parse error (``None`` when there was none).

    A parse error is recognised by its shape: an ``Exception`` whose ``args``
    are ``(input, position)`` with an integer position.  Such errors are
    suppressed.  Every other exception (``KeyboardInterrupt``, programming
    errors, ...) propagates unchanged instead of being swallowed or masked.

    Parameters:
        s:   the input string handed to the parser.
        log: kept for interface compatibility; stored but not otherwise used
             by this class.
    """

    def __init__(self, s, log=True):
        self.msg = None
        self.boundary = None
        self.result = None
        self.s = s
        self.log = log

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is None:
            self.boundary = 0
            self.result = ExprStatus.Complete
            return False

        # Never intercept interpreter-level signals such as KeyboardInterrupt.
        if not issubclass(exc_type, Exception):
            return False

        # Only ``(input, position)`` errors are parse errors; anything else is
        # a genuine failure the caller needs to see.
        args = exc_value.args
        if len(args) != 2 or not isinstance(args[1], int):
            return False

        _inp, self.boundary = args
        self.msg = str(exc_value)
        if self.boundary >= len(self.s):
            # The parser consumed everything and still wanted more.
            self.result = ExprStatus.Unterminated
        else:
            # The parser stopped on a character inside the input.
            self.result = ExprStatus.Unexpected
        return True

In [14]:
# Index 4 holds the stray 'x' and more input follows it, so the outcome
# is classified as Unexpected.
with ExpectExprError('(1+2x)') as e:
    calculator.main(e.s)
e.boundary, e.result

(4, <ExprStatus.Unexpected: -2>)

In [15]:
# The error position equals the input length (4): the parser ran out of
# input, so the outcome is classified as Unterminated.
with ExpectExprError('(1+2') as e:
    calculator.main(e.s)
e.boundary, e.result

(4, <ExprStatus.Unterminated: -1>)

In [16]:
# No exception is raised for a well-formed expression; boundary is reported as 0.
with ExpectExprError('(1+2)') as e:
    calculator.main(e.s)
e.boundary, e.result

(0, <ExprStatus.Complete: 0>)

In [17]:
# A valid prefix followed by a stray character: the error position (5) is
# still inside the 6-character input, hence Unexpected.
with ExpectExprError('(1+2)x') as e:
    calculator.main(e.s)
e.boundary, e.result

(5, <ExprStatus.Unexpected: -2>)

This is a chicken-and-egg problem: we need to use dynamic analysis, which means we first need to have a few valid inputs.

#### Building the Evolutionary Algorithm

In [18]:
def get_expr_fitness(s):
    """Score ``s`` by how the calculator parser reacts to it.

    The evolver in this notebook picks individuals with ``min``, so smaller
    scores are preferred.

    Returns:
        ``1.0 / len(s)`` when the input parses completely,
        ``len(s) - boundary`` (characters from the error position onwards)
        when an unexpected character is met, and ``1`` when the parser ran
        out of input.
    """
    with ExpectExprError(s) as outcome:
        calculator.main(outcome.s)

    status = outcome.result
    if status == ExprStatus.Complete:
        return 1.0/len(outcome.s)
    if status == ExprStatus.Unexpected:
        return len(outcome.s) - outcome.boundary
    if status == ExprStatus.Unterminated:
        return 1
    # Every status is handled above; reaching this line means no result was set.
    assert False, (s, outcome)

In [19]:
class Evolver:
    """Base of the evolutionary search; later cells extend it step by step.

    Parameters:
        fitness_fn: callable mapping an individual to a numeric fitness.
                    May be omitted by subclasses that override
                    ``get_fitness``.
        delta:      fitness threshold stored for the search loop.
        log:        whether progress output is wanted.
    """

    def __init__(self, fitness_fn=None, delta=0.1, log=True):
        self.fitness_fn = fitness_fn
        self.log = log
        self.delta = delta

    def get_fitness(self, s):
        """Return the fitness of ``s`` as computed by ``fitness_fn``.

        Raises:
            TypeError: if no fitness function was supplied and the method was
                not overridden.
        """
        if self.fitness_fn is None:
            # Same exception type as calling ``None`` would raise, but with a
            # message that says what is actually wrong.
            raise TypeError(
                'no fitness function configured: pass fitness_fn or override get_fitness')
        return self.fitness_fn(s)

In [20]:
expr_evolver = Evolver(get_expr_fitness)

In [21]:
expr_evolver.get_fitness('(1+2)')

0.2

In [22]:
expr_evolver.get_fitness('(1+(2*4+4))')

0.09090909090909091

In [23]:
expr_evolver.get_fitness('(1+2)234')

3

In [24]:
expr_evolver.get_fitness('(1+2+3')

1

In [25]:
expr_evolver.get_fitness('(1+2+3XXY')

3

In [26]:
class Evolver(Evolver):
    def create_population(self, size):
        """Return a list of ``size`` fresh random strings produced by ``fuzzer()``."""
        individuals = []
        for _ in range(size):
            individuals.append(fuzzer())
        return individuals

In [27]:
expr_evolver = Evolver(get_expr_fitness)

In [28]:
expr_evolver.create_population(10)

["2~ye{sLlGS7ci`s5,>':^93f>o:,fObL",
 '4:2on|fZq~7]2(S',
 'cx8s9?CITn7#X5/c`OpxJ~Y+l`]q\x0b7]kkH&wf/U\\m1Y^Q+$D@JN[wj*_1W aG 5(zhu\rZJ;A]J.><g|D',
 '\tR@a0/o`Gkus>VM{]+R4P`+R\x0b[{5lV8x`kV&!*:\n04"FDX6Ro)',
 'a}g1P]RE0r1|\n0]&;cof:@pCz_ncYO=a2zVewh@%@?Iejz255q^x*EK+5\t`:@"|?W>TL\'mqM.B1hjyGHL|',
 'H\x0c<45ykj-BKO',
 'gBeZ~u6Dm%~9CPGCRdc*ZYHHfZe`"S4CG ^jl=+M>b8ap\ts7N1cO*%BV!-|]rSaLsx-\x0clTo',
 'e8`3&V\n]pf"Owq?5r<idpWMK(jd/!i+P>^S%"]F"">\\p(;',
 '1H{\tEF4&iw:jM-B|{Y8a%58sg5C1\r',
 'Gkj@WL#M&#4,b]%\r/9\tS\nqB\'/RZN:.t2[0 nC#+wG8"xC\x0bQNN7k?guA~G7']

In [29]:
class Evolver(Evolver):
    def evaluate_population(self, population):
        """Return ``(individual, fitness)`` pairs in population order."""
        scores = list(map(self.get_fitness, population))
        return list(zip(population, scores))

In [30]:
expr_evolver = Evolver(get_expr_fitness)

In [31]:
population = expr_evolver.create_population(100)
expr_evolver.evaluate_population(population)

[('ZRi!:', 5),
 ('a]`jJQ4;XNW6cY\x0cj24/<g=Fd`)@IoN\x0c\x0c!e7;`X;=H@f^|<BgNB\t^f%o4OUL\noWJ=955!w3%\\+,rtb\x0c=\x0c#`&R#DeiS+Sa',
  92),
 ('R8cR\x0cj~3VT^R3"', 14),
 ('}waJ9fJ_3IIm1tK9/iq0q[]~f\t0BL_3:tinWeZI{xg', 42),
 ('qKGY', 4),
 ("B)>Fn.ad'-DkMigsE$vu\nnBLR[5g/2O9`9gRC)", 38),
 (' i.SC>JavU=L>&7MQ1R~FUqLBYbndze*:_j`VPn\x0bRTmvWH%iJX=>bZ', 54),
 ('qB0`V<X0rCe\x0b=C(:jS{\nYb]"\rt(\rPz=2fy\\50wO&-{OUd\twJA\n]p/a49xD\'Hf&v\rk8RBA%h,%=q\'dQ>(P \x0czBUL+=hkf`fMP.',
  97),
 ('h*\\CJ=Y\tRrZ!_#E"@7UCi\t"6<r3JYO1&8^a^ \\O0K5e<0y>`B~ti\n,AodTW|', 60),
 ('NlGR?^TiV{i&EgqnUINS!N~spUq.{6N4t>anK7\t>]mt', 43),
 ("C;b{$\nA\x0bJQW6=`%\\@) T-W!w{YrHy556kI0B@0h8S^s:O*sWoH:d:aEF'WFw3%5oLaq&Iopw]~ CD%N",
  79),
 ('ZI|u5D)91W"}U6Q"WUfaauc\rjQrU;9S*\n', 33),
 ("5nv!sgzJETd*A;(p|B\x0cU$:X'>xyt2f;|cm~RvrA [0 '$S6fN?y", 50),
 (' +Jt]|{([As u8%D', 16),
 ('FtL=ZA-lh1)#FK->3gOjm$9h\rq\x0c"+\x0b`ru~gt\rNJ:.g="d;3&/J!WD1s*@k[" Z(E{axh:P{oEBN7q4E}\tvHU\\}[',
  87),
 ("sxI\\kD2J,(7~=jJ2!=73u51s@F87I[ShrVTiJDm@G~\

In [32]:
class Evolver(Evolver):
    def selection(self, evaluated_population, tournament_size):
        """Tournament selection over ``(individual, fitness)`` pairs.

        Draws ``tournament_size`` distinct entries at random and returns a
        copy of the individual with the smallest fitness among them.
        """
        def fitness_of(entry):
            return entry[1]

        contenders = random.sample(evaluated_population, tournament_size)
        best_entry = min(contenders, key=fitness_of)
        winner = best_entry[0]
        assert winner
        # Hand back a copy so callers never alias the stored individual.
        return winner[:]

In [33]:
class Evolver(Evolver):
    def crossover(self, parent1, parent2):
        """Single-point crossover of two non-empty parents.

        The cut position is drawn from ``1 .. len(parent1)``; both parents are
        split there and their tails exchanged.  Returns the two children as a
        tuple.
        """
        assert parent1
        assert parent2
        cut = random.randint(1, len(parent1))

        head1, tail1 = parent1[:cut], parent1[cut:]
        head2, tail2 = parent2[:cut], parent2[cut:]

        child_a = head1 + tail2
        child_b = head2 + tail1
        assert child_a
        assert child_b

        return (child_a, child_b)

In [34]:
class Evolver(Evolver):
    def mutate(self, chromosome):
        """Return a mutated copy of the non-empty string ``chromosome``.

        Each position is replaced with probability ``1 / len(chromosome)``.
        The replacement's code point is drawn from a Gaussian centred on the
        current character (standard deviation 100) and wrapped modulo 65536.
        """
        assert chromosome
        genes = list(chromosome)
        rate = 1.0 / len(genes)

        for idx, gene in enumerate(genes):
            if random.random() < rate:
                # NOTE(review): the wrap-around can yield any code point in
                # 0..65535, including surrogates and other non-printables.
                code_point = int(random.gauss(ord(gene), 100) % 65536)
                genes[idx] = chr(code_point)
        return ''.join(genes)

In [35]:
class Evolver(Evolver):
    @staticmethod
    def _fittest(evaluated_population):
        """Return ``(individual, fitness)`` of the entry with the lowest fitness."""
        best = min(evaluated_population, key=lambda item: item[1])
        return best[0], best[1]

    def genetic_algorithm(self, population_size=100, max_generations=1000,
                          tournament_size=10, crossover_rate=0.7):
        """Run the evolutionary search and return ``(best_individual, best_fitness)``.

        Fitness is minimised.  The search stops as soon as the best fitness of
        the current generation is no larger than ``self.delta`` or after
        ``max_generations`` generations.  The returned pair describes the best
        individual of the *last* generation (there is no elitism).

        Parameters (defaults reproduce the previously hard-coded values):
            population_size: number of individuals per generation.
            max_generations: upper bound on the number of generations.
            tournament_size: number of contenders per tournament selection.
            crossover_rate:  probability that a selected pair is crossed over.

        Raises:
            ValueError: if ``population_size`` is below 1 or ``tournament_size``
                is not within ``1 .. population_size``.
        """
        if population_size < 1:
            raise ValueError('population_size must be at least 1')
        if not 1 <= tournament_size <= population_size:
            raise ValueError('tournament_size must be within 1..population_size')

        generation = 0
        population = self.create_population(population_size)
        fitness = self.evaluate_population(population)
        best_individual, best_fitness = self._fittest(fitness)
        if self.log:
            print("Best fitness of initial population: %s - %.10f" % (repr(best_individual), best_fitness))

        # Stop when optimum found, or we run out of patience
        while best_fitness > self.delta and generation < max_generations:
            if self.log:
                print('.', best_fitness)
            new_population = []
            while len(new_population) < len(population):
                # Selection
                offspring1 = self.selection(fitness, tournament_size)
                offspring2 = self.selection(fitness, tournament_size)

                # Crossover
                if random.random() < crossover_rate:
                    (offspring1, offspring2) = self.crossover(offspring1, offspring2)

                # Mutation
                offspring1 = self.mutate(offspring1)
                offspring2 = self.mutate(offspring2)

                new_population.append(offspring1)
                new_population.append(offspring2)

            # Offspring arrive in pairs; drop the surplus one when the
            # population size is odd so every generation has the same size.
            generation += 1
            population = new_population[:len(population)]
            fitness = self.evaluate_population(population)

            best_individual, best_fitness = self._fittest(fitness)
            if self.log:
                print("Best fitness at generation %d: %s - %.8f" % (generation, repr(best_individual), best_fitness))

        if self.log:
            print("Best individual: %s, fitness %.10f" %(repr(best_individual), best_fitness))
        return best_individual, best_fitness

In [36]:
expr_evolver = Evolver(get_expr_fitness)

In [37]:
expr_evolver.genetic_algorithm()

Best fitness of initial population: '?' - 1.0000000000
. 1
Best fitness at generation 1: 'S' - 1.00000000
. 1
Best fitness at generation 2: 'Ü' - 1.00000000
. 1
Best fitness at generation 3: 'Î' - 1.00000000
. 1
Best fitness at generation 4: 'Ę' - 1.00000000
. 1
Best fitness at generation 5: 'ﾞ' - 1.00000000
. 1
Best fitness at generation 6: '`' - 1.00000000
. 1
Best fitness at generation 7: 'ﻰ' - 1.00000000
. 1
Best fitness at generation 8: 'ﻃ' - 1.00000000
. 1
Best fitness at generation 9: 'C' - 1.00000000
. 1
Best fitness at generation 10: 'Ƈ' - 1.00000000
. 1
Best fitness at generation 11: ' ' - 1.00000000
. 1
Best fitness at generation 12: 'ź' - 1.00000000
. 1
Best fitness at generation 13: '\ufff8' - 1.00000000
. 1
Best fitness at generation 14: 'ﾸ' - 1.00000000
. 1
Best fitness at generation 15: 'Ǫ' - 1.00000000
. 1
Best fitness at generation 16: 'ē' - 1.00000000
. 1
Best fitness at generation 17: 'ā' - 1.00000000
. 1
Best fitness at generation 18: 'ŵ' - 1.00000000
. 1
Best fitn

Best fitness at generation 296: 'ﲢ' - 1.00000000
. 1
Best fitness at generation 297: 'Υ' - 1.00000000
. 1
Best fitness at generation 298: 'ۗ' - 1.00000000
. 1
Best fitness at generation 299: '\\' - 1.00000000
. 1
Best fitness at generation 300: 'ڳ' - 1.00000000
. 1
Best fitness at generation 301: 'Ò' - 1.00000000
. 1
Best fitness at generation 302: 'ϛ' - 1.00000000
. 1
Best fitness at generation 303: 'ﳽ' - 1.00000000
. 1
Best fitness at generation 304: 'Ɨ' - 1.00000000
. 1
Best fitness at generation 305: 'ҙ' - 1.00000000
. 1
Best fitness at generation 306: 'ߨ' - 1.00000000
. 1
Best fitness at generation 307: 'Ғ' - 1.00000000
. 1
Best fitness at generation 308: 'ӂ' - 1.00000000
. 1
Best fitness at generation 309: 'ￜ' - 1.00000000
. 1
Best fitness at generation 310: 'ҵ' - 1.00000000
. 1
Best fitness at generation 311: 'ց' - 1.00000000
. 1
Best fitness at generation 312: 'ﺳ' - 1.00000000
. 1
Best fitness at generation 313: 'ﱦ' - 1.00000000
. 1
Best fitness at generation 314: '\u0603' - 1.

Best fitness at generation 566: 'µ' - 1.00000000
. 1
Best fitness at generation 567: 'בֿ' - 1.00000000
. 1
Best fitness at generation 568: '\ufe67' - 1.00000000
. 1
Best fitness at generation 569: '\ufaee' - 1.00000000
. 1
Best fitness at generation 570: 'ﴒ' - 1.00000000
. 1
Best fitness at generation 571: '穀' - 1.00000000
. 1
Best fitness at generation 572: 'ﭓ' - 1.00000000
. 1
Best fitness at generation 573: '泌' - 1.00000000
. 1
Best fitness at generation 574: 'ﯬ' - 1.00000000
. 1
Best fitness at generation 575: 'ﲊ' - 1.00000000
. 1
Best fitness at generation 576: 'ﻲ' - 1.00000000
. 1
Best fitness at generation 577: '/' - 1.00000000
. 1
Best fitness at generation 578: 'ﰭ' - 1.00000000
. 1
Best fitness at generation 579: '\uf7ea' - 1.00000000
. 1
Best fitness at generation 580: '塀' - 1.00000000
. 1
Best fitness at generation 581: 'ﮠ' - 1.00000000
. 1
Best fitness at generation 582: '\uf7a9' - 1.00000000
. 1
Best fitness at generation 583: '諾' - 1.00000000
. 1
Best fitness at generation

Best fitness at generation 815: '\uf6bc' - 1.00000000
. 1
Best fitness at generation 816: 'ƹ' - 1.00000000
. 1
Best fitness at generation 817: 'Ě' - 1.00000000
. 1
Best fitness at generation 818: 'Ӄ' - 1.00000000
. 1
Best fitness at generation 819: '\uf7fb' - 1.00000000
. 1
Best fitness at generation 820: 'Ҡ' - 1.00000000
. 1
Best fitness at generation 821: 'Ũ' - 1.00000000
. 1
Best fitness at generation 822: '\uf84c' - 1.00000000
. 1
Best fitness at generation 823: 'ﲚ' - 1.00000000
. 1
Best fitness at generation 824: 'Ǎ' - 1.00000000
. 1
Best fitness at generation 825: 'ｈ' - 1.00000000
. 1
Best fitness at generation 826: '\uf156' - 1.00000000
. 1
Best fitness at generation 827: 'ʳ' - 1.00000000
. 1
Best fitness at generation 828: 'ￎ' - 1.00000000
. 1
Best fitness at generation 829: 'ﲉ' - 1.00000000
. 1
Best fitness at generation 830: 'å' - 1.00000000
. 1
Best fitness at generation 831: '×' - 1.00000000
. 1
Best fitness at generation 832: '\uf282' - 1.00000000
. 1
Best fitness at gener

('\ufdeb', 1)

In [38]:
class ExprEvolver(Evolver):
    """Evolver with a built-in fitness for the calculator subject."""

    def get_fitness(self, s):
        """Score ``s`` by the calculator parser's reaction (smaller is preferred).

        Returns ``1.0 / len(s)`` for a complete parse, the number of
        characters from the error position onwards for an unexpected
        character, and ``1`` when the parser ran out of input.
        """
        with ExpectExprError(s, log=self.log) as outcome:
            calculator.main(outcome.s)

        status = outcome.result
        if status == ExprStatus.Complete:
            return 1.0/len(outcome.s)
        if status == ExprStatus.Unexpected:
            return len(outcome.s) - outcome.boundary
        if status == ExprStatus.Unterminated:
            return 1
        # Every status is handled above; reaching this line means no result was set.
        assert False, (s, outcome)

In [39]:
expr_evolver = ExprEvolver(log=False)

In [40]:
for i in range(10):
    v = expr_evolver.genetic_algorithm()
    print(repr(v))

('ׅ', 1)
('37', 0.5)
('\ueeb0', 1)
('ﴲ', 1)
('F', 1)
('蓼', 1)
('115', 0.3333333333333333)
('\uf8d0', 1)
('45', 0.5)
('\u0603', 1)


### JSON
Generating JSON can be slow, so run this section only if you have enough time.

In [41]:
class JStatus(enum.Enum):
    """Outcome of feeding one input string to the microjson parser."""
    # The parser ran to the end without raising.
    Complete = 0
    # The error message starts with 'extra data after JSON at position'.
    Extra = 1
    # The error message starts with 'truncated JSON data at position'.
    Unterminated = -1
    # Any other parse error (malformed data, missing colon, expected null/boolean, ...).
    Expecting = -2

In [42]:
import subjects.microjson as microjson

In [43]:
class ExpectJSONError:
    """Context manager that classifies the outcome of a ``microjson`` parse.

    Run the parser inside ``with ExpectJSONError(s) as e:``.  Afterwards:

    * ``e.result`` is a ``JStatus`` member,
    * ``e.boundary`` is the position carried by the parse error
      (``0`` when parsing succeeded),
    * ``e.msg`` is the error text (``None`` when parsing succeeded).

    Only exceptions that carry a ``pos`` attribute are treated as parse
    errors and suppressed.  Anything else (``KeyboardInterrupt``, programming
    errors, ...) propagates unchanged rather than being masked.

    Parameters:
        s:   the input string; optional because the demo cells pass the
             string to the parser directly.
        log: when true, every parse-error message is echoed to ``stderr``.
    """

    def __init__(self, s=None, log=False):
        self.msg = None
        self.boundary = None
        self.result = None
        self.s = s
        self.log = log

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is None:
            self.boundary = 0
            self.result = JStatus.Complete
            return False

        # Without a position this is not a parse error we can interpret.
        pos = getattr(exc_value, 'pos', None)
        if pos is None:
            return False

        msg = str(exc_value)
        self.msg = msg
        if self.log:
            # Imported here because the notebook never imports ``sys``.
            import sys
            print(msg, file=sys.stderr)
        self.result = self._classify(msg)
        self.boundary = pos
        return True

    @staticmethod
    def _classify(msg):
        """Map a parser error message to a ``JStatus`` member."""
        if msg.startswith('extra data after JSON at position'):
            return JStatus.Extra
        if msg.startswith('truncated JSON data at position'):
            return JStatus.Unterminated
        # Known messages landing here: 'malformed JSON data', 'missing colon
        # after key', 'expected null', 'expected boolean'.  Messages not
        # listed are treated the same way in the interest of simplicity.
        return JStatus.Expecting

In [44]:
# Sample inputs for the JSON error handler. Each comment records the parser
# message that was noted down for the input that follows it.
error_data = [
    # "expected null at position 0"
    'n%m\ri<Q8P<t{STo~V&iH|_pJu}8_*fB\r',
    # "expected boolean at position 0"
    'tWI6n )AB/',
    # NOTE(review): noted as "missing colon after key at position 36", but this
    # string is only 15 characters long and the run below reports position 0;
    # the recorded message probably belongs to a different input. Confirm.
    'fn1+"AC8fwp{@cQ'
]

In [45]:
for x in error_data:
    with ExpectJSONError(x) as e:
        microjson.main(e.s)
    print(e.boundary, e.result)

0 JStatus.Expecting
0 JStatus.Expecting
0 JStatus.Expecting


In [46]:
# A complete array followed by stray text: the parser reports extra data
# starting at index 7 (the 'd').
with ExpectJSONError() as e:
    microjson.main('["abc"]de')
e.boundary, e.result

(7, <JStatus.Extra: 1>)

In [47]:
# The string literal is never closed, so the input is reported as truncated.
with ExpectJSONError() as e:
    microjson.main('["abc')
e.boundary, e.result

(4, <JStatus.Unterminated: -1>)

In [48]:
# 'a' at index 1 cannot start a JSON value, so the parser reports an error there.
with ExpectJSONError() as e:
    microjson.main('[ab')
e.boundary, e.result

(1, <JStatus.Expecting: -2>)

In [49]:
# Well-formed JSON: no exception is raised and boundary is reported as 0.
with ExpectJSONError() as e:
    microjson.main('[1,2,3]')
e.boundary, e.result

(0, <JStatus.Complete: 0>)

In [50]:
class JSONEvolver(Evolver):
    def get_fitness(self, s):
        with ExpectJSONError(s, self.log) as e:
            microjson.main(e.s)
        match e.result:
            case JStatus.Complete:
                return 1.0/len(e.s)
            case JStatus.Extra:
                return len(s) - e.boundary
                # better to be incomplete than incorrect.
                return len(s) * 0.1
            case JStatus.Expecting:
                if len(s) == e.boundary:
                    return 1
                return len(s) - e.boundary
            case JStatus.Unterminated:
                return 1
        assert False, (s, e)

In [51]:
json_evolver = JSONEvolver(log=False)

**Can be really slow**

In [52]:
for i in range(10):
    v = json_evolver.genetic_algorithm()
    print(repr(v))

('"Ðđ𢡄＂ｾǳ\x89Ôￓ\t\x8fﻪｦŴģȣĿŉȕy\x00Â["', 0.04)
(' 6', 0.5)
('ᅀ', 1)
('--', 0.5)
('שּׁ', 1)
('899', 0.3333333333333333)
('ʺ', 1)
('292', 0.3333333333333333)
('}', 1)
('"ÞąＵ\x9cｩr±_ａￚ\x1dç"', 0.07142857142857142)


# Done

In [53]:
#%tb

No traceback available to show.
