## An attempt at a more formal grammar

```
function_to_call  ::= <wordcharacters>
parameters        ::= "" | "(" ( wordcharacters | number ) ")"
as_name           ::= "" | "as" <whitespace> <wordcharacters>
column_name       ::= as_name | function_to_call
reference         ::= "" |  "[" number "]"
unique_mark       ::= "" | "*"
column_definition ::= <function_to_call> <parameters> <whitespace> \
                      <as_name> <whitespace> <reference> <unique_mark>
df_size           ::= "" |  "[" integer "]"
df_sep            ::= "--" ("-"*)
df_definition     ::= <wordcharacters> <df_size> <newline> <df_sep> <newline> \
                      (<column_definition>*) <newline> <newline>
language_spec     ::= <df_definition>*
```


In [9]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)
import pandas as pd
from faker import Faker
import re
import random

# Config for generator
# Please tweak these numbers sensibly, keeping in mind unique faker values and reference linking
NUMROWS = 99           # number of rows per DataFrame
MAX_REPEATS = 4        # maximum number of times a reference value can appear in a referencing column (1 = appears at most once)
ORPHANED_UNIQUES = 0.2 # fraction (0.0 - 1.0) of unique reference values which won't be used by referencing columns

# Sanity check against crazy configs
assert NUMROWS < 1000, "Putting this here because there aren't enough last names for the default example"
# The non-orphaned reference values, each used at most MAX_REPEATS times, must be able to fill NUMROWS rows
assert (1-ORPHANED_UNIQUES) * MAX_REPEATS >= 1, "Can't generate enough data with these settings (ORPHANED_UNIQUES, MAX_REPEATS)"

# Global variables
reference_dict = {} # persistent store of generated values for the reference columns: reference key -> set of unique values
fake = Faker()      # shared Faker instance; its methods are looked up by name when generating data

# Helper functions for casting parameters to int, float or string
def cast_parameter(x):
    """Convert a textual parameter to an int, a float, or leave it as a string.

    Args:
        x: The parameter text, or None when no parameter was supplied.

    Returns:
        None if x is None; an int when the text denotes a whole number
        (including forms such as "1e2" or "3.0"); a float when it denotes any
        other number (including "inf" and "nan"); otherwise x unchanged.

    Raises:
        ValueError: If x is neither a string nor None.
    """
    if x is None:
        return None
    if not isinstance(x, str):
        raise ValueError('Input must be a string or None')

    # Parse integer literals directly so large values keep full precision
    # instead of being rounded through a float.
    try:
        return int(x)
    except ValueError:
        pass

    try:
        as_float = float(x)
    except ValueError:
        # Not numeric at all: hand the text back untouched.
        return x

    # Whole-valued floats collapse to int. inf/nan are not integral, so they
    # stay floats instead of blowing up in int().
    return int(as_float) if as_float.is_integer() else as_float

# Pass set() to set_for_uniqueness for uniqueness requirement
# Pass None to set_for_uniqueness for no uniqueness requirement
def gen_data(_, function_name, parameter, set_for_uniqueness):
    """Generate one fake value by calling the named method on the global ``fake``.

    Args:
        _: Ignored. Present so the function can be used with ``Series.apply``,
            which passes the current cell value as the first argument.
        function_name: Name of the attribute of ``fake`` to call.
        parameter: Optional single parameter as text (or None); it is converted
            with ``cast_parameter`` before being passed to the generator.
        set_for_uniqueness: A set of values already handed out. When a set is
            given, the returned value is guaranteed not to be in it and is
            added to it before returning. Anything else disables the check.

    Returns:
        The generated value.

    Raises:
        RuntimeError: If uniqueness is required and no new value could be
            produced within the retry budget (the generator has most likely
            run out of distinct values).
    """
    # QUESTION: not sure if passing by set() by reference for set_for_uniqueness will mess up by panda's
    #           concurrency. It SEEMS OK, but have not tested rigorously
    # TODO:     parameter currently only takes 1 variable, need to convert to support multiple?
    # TODO:     For some function_name like first_name, we can directly get the Provider list of possible
    #           values and get a subset from it rather than repeatedly generating them
    #               from faker.providers.person.en import Provider
    #               set(Provider.first_names)
    generate = getattr(fake, function_name)
    parameter = cast_parameter(parameter)

    def next_value():
        return generate() if parameter is None else generate(parameter)

    if not isinstance(set_for_uniqueness, set):
        return next_value()

    # Bound the retries: an exhausted value space would otherwise spin forever.
    max_attempts = 100000
    for _attempt in range(max_attempts):
        value = next_value()
        if value not in set_for_uniqueness:
            set_for_uniqueness.add(value)
            return value

    raise RuntimeError(
        "Could not generate a new unique value with '{}' after {} attempts "
        "({} unique values already generated)".format(
            function_name, max_attempts, len(set_for_uniqueness)))

def get_reference_column_data(reference_key, function_name, parameter, is_unique):
    """Return NUMROWS values for a reference column, sharing values per reference key.

    The first call for a given ``reference_key`` generates NUMROWS unique
    values and stores them in ``reference_dict``; later calls reuse them.

    Args:
        reference_key: Key identifying the shared pool of reference values.
        function_name: Generator name, forwarded to ``gen_data``.
        parameter: Generator parameter, forwarded to ``gen_data``.
        is_unique: True for the "one" side (every pooled value exactly once, in
            random order); False for the "many" side (a subset of the pool,
            with values repeated to fill NUMROWS rows).

    Returns:
        A list of NUMROWS values.
    """
    # ASSUMPTION: assuming that function_name and parameter for each reference is configured the same way
    # first generate unique set of data if it doesn't already exist
    if reference_key not in reference_dict:
        new_data = set()
        while len(new_data) < NUMROWS:
            new_data.add(gen_data(None, function_name, parameter, new_data))
        reference_dict[reference_key] = new_data

    # random.sample() requires a sequence; passing a set raises TypeError on Python 3.11+
    pool = list(reference_dict[reference_key])

    if is_unique:
        # if we are just looking for the unique data, we can return the shuffled pool
        return random.sample(pool, len(pool))

    # sample items to exclude some reference values (the excluded ones are "orphaned")
    values = random.sample(pool, int(NUMROWS*(1-ORPHANED_UNIQUES)))
    # return sampled items (so they appear at least once) and fill the rest with the same sampled items
    # based on the MAX_REPEATS configuration: each value is available MAX_REPEATS-1 more times
    return values + random.sample(values * (MAX_REPEATS-1), NUMROWS - len(values))

@register_cell_magic
def fakedata(line, cell):
    """Cell magic that builds DataFrames of fake data from a small definition language.

    Each DataFrame definition in the cell is a name line (optionally followed
    by " [size]"), a line of dashes, and one column definition per line. Every
    resulting DataFrame is assigned to a global variable with the given name.

    Args:
        line: Text on the ``%%fakedata`` line itself; not used.
        cell: The cell body containing the DataFrame definitions.

    Raises:
        ValueError: If a column definition line does not match the grammar.
    """
    # do the DataFrames one by one
    df_pattern = re.compile(r'(?P<df_name>[\w]+)(?: \[(?P<df_size>\d+)\])?(?:\n-+\n)(?P<df_def>(?:.+\n?)+)')
    # compiled once here rather than once per column line
    column_pattern = re.compile(r"^(?P<function>\w+)(?:(?:\((?P<parameter>[-+]?\d*\.?\d+|\w+)\))?)(?: as (?P<as_name>\w*))?(?: \[(?P<reference>\d+)\])?(?P<unique_mark>\*)?$")

    for df_match in df_pattern.finditer(cell.strip()):
        dfdict = df_match.groupdict()
        # NOTE: the optional "[size]" (dfdict['df_size']) is parsed but not applied;
        #       every DataFrame is created with NUMROWS rows.
        column_info = {}

        # break down each line of a DataFrame based on provided definitions
        for d in dfdict['df_def'].strip().split('\n'):
            # try the line as written first, then tolerate stray surrounding whitespace
            dmatch = column_pattern.search(d) or column_pattern.search(d.strip())
            if dmatch is None:
                raise ValueError(
                    "Invalid column definition in DataFrame '{}': {!r}".format(dfdict['df_name'], d))
            spec = dmatch.groupdict()
            column_name = spec['as_name'] if spec['as_name'] is not None else spec['function']
            column_info[column_name] = spec

        # create DataFrame
        df = pd.DataFrame(index=range(0,NUMROWS), columns=column_info.keys())

        # populate each column with data
        for k, v in column_info.items():
            if v['reference'] is not None:
                # this is a reference column, we need to reference our references of the reference
                # since we're demonstrating One to Many
                df[k] = get_reference_column_data(v['reference'],v['function'],v['parameter'],v['unique_mark'] is not None)
            else:
                # a fresh set per column enforces uniqueness within that column only
                df[k] = df[k].apply(gen_data, args=(v['function'],v['parameter'],None if v['unique_mark'] is None else set()))

        # assign the created DataFrame as a global variable using the provided name
        globals()[dfdict['df_name']] = df

In [10]:
%%fakedata
persons
-------
first_name
last_name*
phone_number
random_number(5) as customer_number [1]*

purchases
---------
isbn10
credit_card_full
random_number(3) as price
random_number(5) as customer_number [1]

99
99


In [3]:
persons

Unnamed: 0,first_name,last_name,phone_number,customer_number
0,Jeremiah,Castillo,+1-323-496-9462x27807,58861
1,Raymond,Rowe,+1-934-687-7266,60153
2,Jennifer,Wolf,207.789.7382x488,4050
3,Andrea,Miller,+1-696-907-0916x022,14187
4,Robert,Armstrong,368.372.6263x4960,36996
...,...,...,...,...
94,Tiffany,Burch,030-483-4082x3961,28233
95,Walter,Schultz,656.747.6172,6798
96,Stacy,Casey,066-643-9944,45137
97,Tyler,Davis,108.279.1420x241,11020


In [4]:
purchases

Unnamed: 0,isbn10,credit_card_full,price,customer_number
0,0-11-773548-5,American Express\nAndrew Clark\n37550002493694...,333,34391
1,1-152-78741-1,Mastercard\nJeffrey Schultz\n2554867928641719 ...,374,30999
2,1-69062-745-X,VISA 19 digit\nMisty Adams\n468941753504668202...,580,7096
3,1-04-770967-8,Maestro\nStephanie Allen\n675965825424 03/29\n...,521,11020
4,0-8001-5706-0,JCB 15 digit\nVictoria Thornton\n1800580041189...,847,16682
...,...,...,...,...
94,0-8419-5047-4,JCB 16 digit\nMaxwell Ryan\n3556745298333495 0...,663,12303
95,0-665-94779-8,Mastercard\nCorey Gonzales\n2702808051654120 0...,220,28196
96,0-581-72707-X,VISA 19 digit\nJamie Hill\n4047775650164331987...,495,33880
97,0-550-26427-2,JCB 15 digit\nNatalie Burns\n180061283747321 0...,225,96034


In [5]:
# stats for purchases['customer_number'] reference values
# distribution of how often each customer_number appears (index = number of appearances, value = how many customer_numbers appear that often)
purchases['customer_number'].value_counts().value_counts()

1    61
2    16
3     2
Name: customer_number, dtype: int64

In [6]:
# # testing my cast_parameter() helper function
# l = ["-1","12313","-1.0.0","1.546","-.4","+31","+.7","abc123","d7f8g8h8jh","1e2","-6e-3","", None]
# for v in l:
#     a = cast_parameter(v)
#     print("{} value: [{}]".format(type(a),a))