# 1 Setup & Initial Configuration
##### Import required packages and configure the generator.

In [2]:
# import common packages
import math
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn
import sklearn
import tslearn

from collections import OrderedDict
from tqdm import tqdm

In [3]:
# import Faker package
from faker import Faker

# Import BaseProvider (base class of the built-in providers: address, bank, barcode, company, job, etc.)
from faker.providers import BaseProvider

# Call DynamicProvider (optional - it allows injection of external sources)
from faker.providers import DynamicProvider

# 2 Generator Setup
##### Define the required functions and config parameters that determine the data generated.

## Locale Configuration
### Option 1 - Single Localized Provider: Use the cell below for single locale configuration.
#### **Format: fake = Faker('locale')

In [12]:
# Single-locale generator. Pick the locale code from: https://faker.readthedocs.io/en/master/locales.html
fake = Faker(locale = 'en_AU')

In [None]:
# Seeding (optional - default is 0)
# fake.seed_locale('en_AU', 0)

### Option 2 - Multi Localized Provider: Use the cell below for multiple locale configuration; this allows data from different localized providers to be generated at the same time according to a given weight.
#### **Format: ('locale', weight)

In [5]:
# Add all wanted localized providers to OrderedDict and assign a weight
# locales = OrderedDict([
#     ('en_AU', 98),
#     ('en_US', 1),
#     ('ja_JP', 1)
# ])
# fake = Faker(locales)

# # Get list of locales (optional - view config result)
# # fake.locales

# # Get list of internal generators (optional - view config result)
# fake.factories

In [6]:
# Seeds (optional - default is 0)
# Faker.seed(0)

# Explicitly seed this generator instance only (replace 0 with your seed value)
# fake.seed_instance(0)

# Call random instance
# fake.random

# Seeds for multi-locale
# fake.seed_locale('en_AU', 0)

## Generator Configuration
### Defines the data generation functions; generated data will be populated in a pandas DataFrame.

In [8]:
# Define Dynamic Provider Functions
def dp_variables(dp_name, dp_elements):
    """Register a custom DynamicProvider on the notebook-level ``fake`` generator.

    dp_name     -- passed to DynamicProvider as ``provider_name``
    dp_elements -- passed to DynamicProvider as ``elements``

    Returns nothing; the provider is added to ``fake`` as a side effect.
    """
    fake.add_provider(
        DynamicProvider(provider_name = dp_name, elements = dp_elements)
    )

In [None]:
# Define Data Elements Function


### Configure Parameters
* Number of rows (x)
* Dynamic providers (optional)
* Data Elements

In [None]:
# Number of rows (x): the row count generate_fake() is meant to produce.
x = 200

In [9]:
# Define all required dynamic providers, which is only required to generate self-defined random variables
# (optional - only required if using DynamicProvider)
# Note: This CELL can be reused to add multiple dynamic providers; just give each one a different name and re-run/copy the CELL.

# Name under which the provider is registered - choose something descriptive.
dp_name = "dp_name"

# Pool of candidate values; elements are selected randomly when generating data.
dp_elements = ["element 1", "element 2", "..."]

# Registers the provider on the generator - leave this call unchanged.
dp_variables(dp_name = dp_name, dp_elements = dp_elements)

In [None]:
# Define data elements (more info: https://faker.readthedocs.io/en/master/providers.html)

def generate_fake(rows = None):
    """Generate a DataFrame of fake records using the ``fake`` generator.

    Parameters
    ----------
    rows : int, optional
        Number of rows to generate. When omitted, the notebook-level
        variable ``x`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per generated record and one column per configured
        data element.
    """
    # Do not assign to `x` inside this function: that would make `x` a local
    # name and reading it would raise UnboundLocalError.
    n_rows = x if rows is None else rows

    # Collect the rows first and build the DataFrame once; growing a frame
    # row by row with .loc re-allocates on every insertion.
    records = []
    for _ in tqdm(range(n_rows), desc = 'Creating DataFrame'):
        # Configure element name, provider, and format below.
        records.append({
            'element_1_name': fake.bothify(text = '###-###-###'),  # Add more entries for additional data elements.
        })
    return pd.DataFrame(records)

### Below is a slightly more complicated sample, also can be used as an alternative.

In [None]:
# Single-Function Alternative (optional - one self-contained function that bypasses all of the Generator Configuration above)
# def create_dataframe():
#     x = 200
#     data = pd.DataFrame()
#     fake_titles_provier = DynamicProvider(
#         provider_name = "fake_titles",
#         elements = ["", "","", "", "", "", " ", "Mr.", "Ms", "Miss", "Dr.", "Mr", "Prof", "staff"])
#     fake.add_provider(fake_titles_provier)
#     for i in tqdm(range(x), desc = 'Creating DataFrame'):
#         data.loc[i, 'TFN'] = fake.bothify(text = '###-###-###')
#         data.loc[i, 'Title'] = fake.fake_titles()
#         data.loc[i, 'FirstName'] = fake.first_name()
#         data.loc[i, 'MiddleName'] = fake.last_name()
#         data.loc[i, 'LastName'] = fake.last_name()
#         data.loc[i, 'Phone'] = fake.phone_number()
#         data.loc[i, 'Email'] = fake.email()
#         data.loc[i, 'Address_Street'] = fake.street_address()
#         data.loc[i, 'Address_Suburb'] = fake.city()
#         data.loc[i, 'Address_State'] = fake.administrative_unit()
#         data.loc[i, 'Address_PostCode'] = fake.postcode()
#         data.loc[i, 'ABN'] = fake.bothify(text = '## ### ### ###')
#         data.loc[i, 'ACN'] = fake.bothify(text = '### ### ###')
#         data.loc[i, 'BusinessName'] = fake.company() + " " + fake.company_suffix()
#     return data

# 3 Generate Data
##### Simply run the code below to generate your data.

In [9]:
# Build the DataFrame of fake records and keep it for viewing/export below.
dummy_data = generate_fake()

Creating DataFrame: 100%|███████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 106.25it/s]


In [14]:
# Optional - print the generated data to inspect it before exporting
print(dummy_data)

<pandas.io.formats.style.Styler object at 0x0000022C8464A880>


### Export to CSV

In [15]:
# Write the generated data to 'Dummy_Data_No_Index.csv'; index = False leaves the row index out of the file.
dummy_data.to_csv('Dummy_Data_No_Index.csv', index = False)

##  Done - Go to your project folder to retrieve the generated data.

# Test Ground
##### Test ground for trying out the values you want to generate; contains some samples.

### Att 1. TFN

In [102]:
# TFN
# for _ in range(20):
#     print(fake.bothify(text = '###-###-###'))

### Att 2. Prefix

In [103]:
# Create external sources called fake_titles (only required for custom outputs)
# fake_titles_provier = DynamicProvider(
#     provider_name = "fake_titles",
#     elements = ["", "","", "", "", "", " ", "Mr.", "Ms", "Miss", "Dr.", "Mr", "Prof", "staff"]
# )

In [104]:
# add new provider to instance
# fake.add_provider(fake_titles_provier)

In [105]:
# Test
# for _ in range(20):
#     print(fake.fake_titles())

In [64]:
# Alternatively, use .random_element with weighted elements
# for _ in range (20):
#     print(fake.random_element(elements = OrderedDict(
#     [
#         ("", 50),
#         ("Mr", 10),
#         ("Mr.", 10),
#         ("Ms", 10),
#         ("Miss", 10),
#         ("Dr.", 6),
#         ("Prof", 3),
#         ("staff", 1)
#     ])))

### Att 3. Name

In [77]:
# Freetxt name
# for _ in range(20):
#     print(fake.name())

In [106]:
# # First Name
# for _ in range(20):
#     print(fake.first_name())

In [107]:
# Middle & Last Name
# for _ in range(20):
#     print(fake.last_name())

### Att 4. Phone No

In [108]:
# Phone No.
# for _ in range(20):
#     print(fake.phone_number())

### Att 5. Email

In [109]:
# Test: generate random email addresses
# for _ in range(20):
#     print(fake.email())

### Att 6. Address

In [90]:
# Address
# for _ in range(20):
#     print(fake.address())

In [110]:
# Street Address
# for _ in range(20):
#     print(fake.street_address())

In [111]:
# Suburb
# for _ in range(20):
#     print(fake.city())

In [112]:
# # State
# for _ in range(20):
#     print(fake.administrative_unit())

In [113]:
# Postcode
# for _ in range(20):
#     print(fake.postcode())

### Att 7. ABN

In [114]:
# ABN
# for _ in range(20):
#     print(fake.bothify(text = '## ### ### ###'))

### Att 8. ACN

In [115]:
# ACN
# for _ in range(20):
#     print(fake.bothify(text = '### ### ###'))

### Att 9. Business Name

In [116]:
# for _ in range(20):
#     print(fake.company() + " " + fake.company_suffix())