## An attempt at a more formal grammar

```
function_to_call  ::= <wordcharacters>
parameters        ::= "" | "(" ( wordcharacters | number ) ")"
as_name           ::= "" | "as" <whitespace> <wordcharacters>
column_name       ::= as_name | function_to_call
reference         ::= "" |  "[" integer "]"
unique_mark       ::= "" | "*"
column_definition ::= <function_to_call> <parameters> <whitespace> \
                      <as_name> <whitespace> <reference> <unique_mark>
df_size           ::= "" |  "[" integer "]"
df_sep            ::= "--" ("-"*)
df_definition     ::= <wordcharacters> <df_size> <newline> <df_sep> <newline> \
                      (<column_definition>*) <newline> <newline>
language_spec     ::= <df_definition>*
```


In [None]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)
import pandas as pd
from faker import Faker
import re
import random

# Config for generator
# Please tweak these numbers sensibly, keeping in mind unique faker values and reference linking
DEFAULT_DF_SIZE = 99   # number of rows used for a DataFrame whose definition gives no "[size]"
MAX_REPEATS = 4        # upper bound on how often one reference value appears in a non-unique reference column (1 = each value appears at most once)
ORPHANED_UNIQUES = 0.2 # fraction (0-1, not a percentage): a non-unique reference column draws int(count * (1 - ORPHANED_UNIQUES)) distinct values, leaving the rest unused
IS_DEBUG = True        # when truthy, dprint() prints its argument.  not sure if and how __debug__ should be used instead?
# Global variables
reference_dict = {} # persistent store for reference columns: maps a reference key to the set of values generated for it
fake = Faker()      # shared Faker instance whose methods gen_data() looks up by name

# Helper functions for casting parameters to int, float or string
def cast_parameter(x):
    """Convert a textual parameter to the most specific type it represents.

    Args:
        x: A string taken from a column definition, or None.

    Returns:
        None if ``x`` is None; an ``int`` if the text denotes a whole number
        (e.g. ``"12"``, ``"+31"``, ``"1e2"``, ``"4.0"``); a ``float`` if it
        denotes any other number (including ``"inf"`` and ``"nan"``);
        otherwise the original string unchanged.

    Raises:
        ValueError: If ``x`` is neither a string nor None.
    """
    if x is None:
        return x
    if not isinstance(x, str):
        raise ValueError('Input must be a string or None')

    # Parse integers directly so that large values keep every digit instead
    # of being rounded by a detour through float.
    try:
        return int(x)
    except ValueError:
        pass

    try:
        as_float = float(x)
    except ValueError:
        # Not numeric at all: hand the text back untouched.
        return x

    # is_integer() is False for inf and nan, so int() is only called on
    # finite whole numbers and can no longer raise OverflowError.
    if as_float.is_integer():
        return int(as_float)
    return as_float
    
def dprint(s):
    """Print ``s`` only while the module-level IS_DEBUG flag is truthy."""
    if not IS_DEBUG:
        return
    print(s)

In [None]:
# This function generates a single value using the named Faker function.
# Uniqueness is checked against the set "set_for_uniqueness"; the caller must keep its own reference to that set,
# because every accepted value is added to it.
# Pass None as set_for_uniqueness when uniqueness is not required.

def gen_data(_, function_name, parameter, set_for_uniqueness, max_attempts=100000):
    """Generate one value by calling the Faker method named ``function_name``.

    Args:
        _: Ignored. Present so the function can be passed to ``Series.apply``,
            which supplies the current cell value as the first argument.
        function_name: Name of the attribute to look up on the shared
            ``fake`` object and call.
        parameter: Optional single argument for that call, given as a string
            (or None); it is converted with ``cast_parameter`` first.
        set_for_uniqueness: A ``set`` of values already used. When a set is
            given, only a value not yet in it is returned, and that value is
            added to the set. Pass None (or any non-set) for no uniqueness
            requirement.
        max_attempts: Maximum number of generation attempts before giving up
            on finding an unused value.

    Returns:
        The generated value.

    Raises:
        AttributeError: If ``fake`` has no attribute ``function_name``.
        RuntimeError: If no unused value was produced within
            ``max_attempts`` attempts (previously this looped forever).
    """
    # QUESTION: not sure if passing by set() by reference for set_for_uniqueness will mess up by panda's
    #           concurrency. It SEEMS OK, but have not tested rigorously
    # TODO:     parameter currently only takes 1 variable, need to convert to support multiple?
    # TODO:     For some function_name like first_name, we can directly get the Provider list of possible
    #           values and get a subset from it rather than repeatedly generating them
    #               from faker.providers.person.en import Provider
    #               set(Provider.first_names)
    func = getattr(fake, function_name)
    parameter = cast_parameter(parameter)
    must_be_unique = isinstance(set_for_uniqueness, set)

    for _attempt in range(max_attempts):
        value = func() if parameter is None else func(parameter)
        if not must_be_unique:
            return value
        if value not in set_for_uniqueness:
            set_for_uniqueness.add(value)
            return value

    # Every attempt produced an already-used value: the generator's value
    # space is most likely exhausted, so fail loudly instead of hanging.
    raise RuntimeError(
        'Could not generate a new unique value with "{}" after {} attempts'.format(
            function_name, max_attempts))

In [None]:
# This function generates data specifically for reference columns since the data needs to be shared amongst DataFrames

def get_reference_column_data(reference_key, function_name, parameter, is_unique, count):
    """Return the values for one reference column of a DataFrame.

    The first call for a given ``reference_key`` generates ``count`` distinct
    values with ``gen_data`` and stores them in ``reference_dict``; later
    calls reuse that stored set.

    Args:
        reference_key: Key identifying the shared reference in
            ``reference_dict``.
        function_name: Faker method name, forwarded to ``gen_data``.
        parameter: Optional parameter string, forwarded to ``gen_data``.
        is_unique: If true, return every stored value exactly once in random
            order. If false, return ``count`` values in which some stored
            values are left out and others repeat.
        count: Number of values to generate on first use, and the number of
            values returned for a non-unique column.

    Returns:
        A list of values. For a non-unique column its length is ``count``;
        for a unique column it is the size of the stored reference set.

    Raises:
        ValueError: From ``random.sample`` when a non-unique column cannot be
            filled because the chosen values, repeated up to ``MAX_REPEATS``
            times each, are fewer than ``count``.
    """
    # ASSUMPTION: assuming that function_name and parameter for each reference is configured the same way
    # first generate unique set of data if it doesn't already exist
    if reference_key not in reference_dict:
        # FIXME: need to generate unique reference key first.
        new_data = set()
        while len(new_data) < count:
            new_data.add(gen_data(None, function_name, parameter, new_data))
        reference_dict[reference_key] = new_data

    # random.sample() needs a sequence: sampling directly from a set was
    # deprecated in Python 3.9 and raises TypeError from 3.11 on.
    pool = list(reference_dict[reference_key])

    if is_unique:
        # if we are just looking for the unique data, we can return the shuffled set
        return random.sample(pool, len(pool))

    # sample items to exclude some reference values; never ask for more
    # distinct values than the stored reference actually holds (it may have
    # been created for a smaller DataFrame)
    distinct_wanted = min(int(count * (1 - ORPHANED_UNIQUES)), len(pool))
    values = random.sample(pool, distinct_wanted)
    # return sampled items (so they appear at least once) and fill the rest with the same sampled items
    # based on the MAX_REPEATS configuration
    return values + random.sample(values * (MAX_REPEATS - 1), count - len(values))

In [None]:
@register_cell_magic
def fakedata(line, cell):
    """Cell magic that parses DataFrame definitions written in the cell body.

    Args:
        line: Text following ``%%fakedata`` on the magic line (unused).
        cell: The cell body: one or more blocks made of a DataFrame name with
            an optional ``[size]``, a line of dashes, and one column
            definition per line.

    Raises:
        ValueError: If a column definition line cannot be parsed.
        AssertionError: If a DataFrame name or a column name within one
            DataFrame is duplicated, or if the reference consistency check
            in step 2 fails.

    NOTE(review): the loop that would build the DataFrames and publish them
    via ``globals()`` is currently disabled (``while(False)``), so this magic
    only parses, validates and debug-prints the definitions.
    """

    # We can probably do some checking here, such as:
    # * no dataframes with same name
    # * no duplicate names for columns with same name
    
    # Step 1, we parse the input into 2 separate dataframes
    # dataframe_size contains the size for each dataframe
    # dataframe_col_def contains column definitions of each dataframe
    dataframe_size = pd.DataFrame(columns=['df_name', 'size'])
    dataframe_size.set_index('df_name', inplace=True)
    dataframe_col_def = pd.DataFrame(columns=['df_name', 'col_name', 'function', 'parameter', 'as_name', 'reference', 'unique_mark'])
    dataframe_col_def.set_index(['df_name','col_name'], inplace=True)

    # we split the dataframes out to process them one at a time
    regex = re.compile(r'(?P<df_name>[\w]+)(?: \[(?P<df_size>\d+)\])?(?:\n-+\n)(?P<df_def>(?:.+\n?)+)')
    # pattern for a single column definition, compiled once for all lines
    definition_regex = re.compile(r"^(?P<function>\w+)(?:(?:\((?P<parameter>[-+]?\d*\.?\d+|\w+)\))?)(?: as (?P<as_name>\w*))?(?: \[(?P<reference>\d+)\])?(?P<unique_mark>\*)?$")
    
    for dfdict in [m.groupdict() for m in regex.finditer(cell.strip())]:
        df_name = dfdict['df_name']
        df_size = int(dfdict['df_size']) if dfdict['df_size'] is not None else DEFAULT_DF_SIZE
        definitions = dfdict['df_def'].strip().split('\n')

        # we can check that there are no duplicated dataframe names
        assert df_name not in dataframe_size.index, "DataFrames with duplicated names found: {}".format(df_name)
        dataframe_size.loc[df_name] = [df_size]
        
        # break down each column of the DataFrame based on provided definitions
        for d in definitions:
            # Try the line as written first; if that fails, retry without
            # surrounding whitespace so stray trailing spaces are tolerated.
            parsed = definition_regex.search(d) or definition_regex.search(d.strip())
            if parsed is None:
                # Previously this surfaced as an AttributeError on None.
                raise ValueError('Cannot parse column definition {!r} in DataFrame "{}"'.format(d, df_name))
            dmatch = parsed.groupdict()
            col_name = dmatch['as_name'] if dmatch['as_name'] is not None else dmatch['function']
            dmatch['unique_mark'] = dmatch['unique_mark'] is not None
        
            # we can check that there are no duplicated column names within this dataframe
            assert (df_name, col_name) not in dataframe_col_def.index, "Columns with duplicated names found in DataFrame \"{}\": {}".format(df_name, col_name)
            dataframe_col_def.loc[(df_name, col_name),dataframe_col_def.columns] = [dmatch['function'], dmatch['parameter'], dmatch['as_name'], dmatch['reference'], dmatch['unique_mark']]
    
    # we don't need as_name anymore since column names are set, so let's drop it
    dataframe_col_def.drop(columns=['as_name'], inplace=True)

    # These are the parsed input
    dprint(dataframe_size)
    dprint(dataframe_col_def)
    
    # Now let's create the dataframes!
    
    # Step 2, pre-generate all references first
    # for each reference that has  a unique_mark, we generate the exact amount needed to fill its dataframe
    # if no unique, just find the highest
    ref_df = dataframe_col_def.join(dataframe_size, how='inner', on='df_name')
    ref_df = ref_df.groupby(['reference','function','parameter','unique_mark'], as_index=False)[['size']].max()
    ref_check = ref_df[ref_df.reset_index().duplicated(subset=['reference','unique_mark'])]
    ref_df.sort_values(by=['unique_mark'], ascending=False, inplace=True)
    ref_df.drop_duplicates(subset=['reference','function','parameter'], inplace=True)
    dprint(ref_df)
    assert len(ref_check) == 0, 'Inconsistent "function" and "parameter" definitions for references: {}'.format(ref_check.reference.tolist())
    
    # NOTE(review): the generation step below never runs (while(False)) and
    # refers to `column_info`, which is not defined in this function; it
    # looks like work in progress kept for reference.
    while(False):
        # create DataFrame 
        df = pd.DataFrame(index=range(0,df_size), columns=column_info.keys())
        
        #populate each column with data
        for k,v in column_info.items():
            if v['reference'] is not None:
                # this is a reference column, we need to reference our references of the reference
                # since we're demonstrating One to Many
                df[k] = get_reference_column_data(v['reference'],v['function'],v['parameter'],v['unique_mark'], df_size)
            else:
                df[k] = df[k].apply(gen_data, args=(v['function'],v['parameter'],set() if v['unique_mark'] else None))
        
        # assign the created DataFrame as a global variable using the provided name
        globals()[df_name] = df

In [None]:
%%fakedata
persons [20]
-------
first_name
last_name*
phone_number
random_number(5) as customer_number [1]*

purchasesA [20]
---------
isbn10
credit_card_full
random_number(3) as price
random_number(5) as customer_number [1]

purchasesB [40]
---------
isbn10
credit_card_full
random_number(3) as price
random_number(5) as customer_number [1]

purchasesC [30]
---------
isbn10
credit_card_full
random_number(3) as price
random_number(5) as customer_number [1]

In [None]:
# Display the `persons` DataFrame requested in the %%fakedata cell above.
persons

In [None]:
# Display the purchases DataFrame.
# NOTE(review): the %%fakedata cell above defines purchasesA, purchasesB and purchasesC,
# but no `purchases` - confirm which name is intended here.
purchases

In [None]:
# Stats for the purchases['customer_number'] reference values:
# the inner value_counts() counts how often each customer_number appears, and the outer
# value_counts() counts how many customer_numbers share each of those frequencies.
# NOTE(review): the %%fakedata cell above defines purchasesA/B/C, not `purchases` - confirm the name.
purchases['customer_number'].value_counts().value_counts()

In [None]:
# # testing my cast_parameter() helper function
# l = ["-1","12313","-1.0.0","1.546","-.4","+31","+.7","abc123","d7f8g8h8jh","1e2","-6e-3","", None]
# for v in l:
#     a = cast_parameter(v)
#     print("{} value: [{}]".format(type(a),a))