<h1 style="color: red;">Generating fake data with Faker</h1>

In [1]:
from faker import Faker
import decimal, random, secrets, sqlalchemy, json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
# import pandas_profiling

In [2]:
# initialize Faker instance
faker = Faker()

In [3]:
# creating our sqlalchemy engine and creating connection
def create_engine(username, password, database, *args, **kwargs):
    return sqlalchemy.create_engine(f'postgresql://{username}:{password}@localhost:5432/{database}')

# creating a connection
engine = create_engine('', '', '')

# general column fields
general_columns = ['id', 'appstech_labs_id', 'sparse', 'active', 'sync_token', 'domain', 'metadata_createtime', 'metadata_last_updatedtime']

In [4]:
# functions for general fields..
def general_active():
    return np.random.choice([True, False], p=[0.9998, 0.0002])

def general_sync_token():
    return str(faker.uuid4())

def general_domain_name():
    return np.random.choice(['QBO', 'OTHER'], p=[0.9998, 0.0002])

def general_sparse():
    return np.random.choice([True, False], p=[0.00001, 0.99999])

def general_meta_time():
    return faker.date_between(start_date='-2y')

In [5]:
def get_item_name():
    return faker.name()

def get_txn_date():
    return faker.date_between(start_date='-2y')

def get_unit_price():
    return random.randint(100, 200)

def get_item_qyt():
    return random.randint(1, 10)

def get_item_amt(qty, unit_price, *args, **kwargs):
    return qty * unit_price

def get_is_discount_percent():
    return faker.boolean()

def get_discount_per():
    return random.randint(1, 20)

def get_currency_code():
    currency_codes = ['USD', 'AUD', 'XAF']
    return faker.random_element(elements=currency_codes)

def get_total_tax():
    return random.randint(0, 2000)

def get_tax_code_value():
    tax_codes = ['TAX', 'NON']
    return faker.random_element(elements=tax_codes)

def get_total_tax():
    return np.random.choice(10, p=[0.6, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0])

def get_total_amt(amt, total_tax, discount_amt, is_discount_per, *args, **kwargs):
    if is_discount_per:
        discount_per = kwargs.get('discount_per', None)
        
        if not discount_per:
            raise Exception('Discount percentage value required')
        
        return amt - (discount_per / 100 * amt) + total_tax
        
    else:
        
        return amt + total_tax - discount_amt

<h1 style="color: red;">Generate Dataframe</h1>

In [6]:
def generate_dataframe():
    df = pd.DataFrame(columns=['date', 'item_id', 'item_name', 'qty', 'unit_price', 'amt', 'discount_amt', 'tax_code_value','is_discount_percent','discount_per', 'txn_total_tax', 'margin', 'total_amt'])
    
    # loop for 1000 customers for a company
    for transaction_id in range(500):
        qty = get_item_qyt()
        unit_price = get_unit_price()
        amt = get_item_amt(qty, unit_price)
        is_discount_percent = get_is_discount_percent()
        discount_amt = get_discount_per() / 100 * amt
        tax_code = get_tax_code_value()
        discount_per = get_discount_per()
        total_tax = get_total_tax()
        total_amt = get_total_amt(amt, total_tax, discount_amt, is_discount_percent, discount_per=discount_per)
        
        df = df.append({'date': get_txn_date(), 'item_id': transaction_id, 'item_name': get_item_name(), 'qty': qty, 'unit_price': unit_price, 'amt': amt, 'discount_amt': discount_amt if not is_discount_percent else discount_per / 100 * amt, 'tax_code_value': tax_code, 'is_discount_percent': is_discount_percent, 'discount_per': discount_per if is_discount_percent else 0, 'txn_total_tax': total_tax, 'total_amt': total_amt}, ignore_index=True)

    return df

In [7]:
generated_df = generate_dataframe()
generated_df.sort_values(by=['date'], inplace=True, ignore_index=True)
generated_df.head()

Unnamed: 0,date,item_id,item_name,qty,unit_price,amt,discount_amt,tax_code_value,is_discount_percent,discount_per,txn_total_tax,margin,total_amt
0,2018-04-28,461,Oscar Jenkins,4,200,800,32.0,NON,False,0,5,,773.0
1,2018-04-29,278,Samantha Lopez,8,188,1504,225.6,TAX,False,0,0,,1278.4
2,2018-04-29,87,Katherine Stone,4,122,488,43.92,TAX,True,9,0,,444.08
3,2018-05-01,77,Jessica Fleming,9,165,1485,118.8,TAX,True,8,0,,1366.2
4,2018-05-01,146,William Kelly,9,164,1476,118.08,NON,False,0,7,,1364.92


In [8]:
generated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 500 non-null    object 
 1   item_id              500 non-null    object 
 2   item_name            500 non-null    object 
 3   qty                  500 non-null    object 
 4   unit_price           500 non-null    object 
 5   amt                  500 non-null    object 
 6   discount_amt         500 non-null    float64
 7   tax_code_value       500 non-null    object 
 8   is_discount_percent  500 non-null    object 
 9   discount_per         500 non-null    object 
 10  txn_total_tax        500 non-null    object 
 11  margin               0 non-null      float64
 12  total_amt            500 non-null    float64
dtypes: float64(3), object(10)
memory usage: 50.9+ KB


<h1 style="color: red;">Pickle Dataframe</h1>

In [9]:
# generated_df.to_pickle('./fake_data.pkl')

<h1 style="color: red;">Generate User Table</h1>

In [10]:
# create dataframe and columns
user_columns = ['id', 'first_name', 'last_name', 'email', 'company', 'password', 'active', 'secret_code', 'created_on', 'updated_on']

In [11]:
# function definations for getting fake data...
def get_first_name():
    return faker.first_name()

def get_last_name():
    return faker.last_name()

def get_email(id):
    email = faker.email()
    split = email.split('@')
    return f'{split[0]}_{id}@{split[-1]}'

def get_company_name():
    return faker.company()

def get_secrets(length):
    return str(secrets.token_hex(int(length)))

def get_creation_date():
    return faker.date_between(start_date='-2y')

In [12]:
# generate dataframe
def generate_user_dataframe(num_of_users):
    df = pd.DataFrame(columns=user_columns)
    
    # generating 1000 users
    for user_id in range(num_of_users):
        df = df.append({'id': user_id + 1, 'first_name': get_first_name(), 'last_name': get_last_name(), 'email': get_email(user_id + 1), 'company': get_company_name(), 'password': get_secrets(16), 'active': general_active(), 'secret_code': get_secrets(4), 'created_on': get_creation_date(), 'updated_on': get_creation_date()}, ignore_index=True)
        
    df.sort_values(by=['id', 'created_on'], inplace=True, ignore_index=True)
    
    return df

In [13]:
user_df = generate_user_dataframe(10000)
user_df.head()

Unnamed: 0,id,first_name,last_name,email,company,password,active,secret_code,created_on,updated_on
0,1,Cheryl,Williams,tdavis_1@pineda-hopkins.com,Fletcher Inc,1eabc49af2f6b752305c77b8a64dcac2,True,362337db,2019-06-17,2020-03-04
1,2,Aaron,Spencer,ashleycooper_2@reid-smith.com,Becker Group,b0ef82890e6b9e1a8416dd9df05cc7be,True,b1e29ff4,2018-06-12,2019-10-23
2,3,Katherine,Knight,wendysummers_3@myers-ellis.info,"James, Burch and Snow",2ffde329bf0f81bebd79075ea952eeb3,True,5f2e1861,2020-03-24,2018-09-27
3,4,Alexandria,Dominguez,amy55_4@gmail.com,Garcia-White,0bb250e8d55237fa58dd1d00b07c89bf,True,a54e1edc,2019-09-19,2020-03-20
4,5,Joshua,Maddox,ronaldwhite_5@gmail.com,Villarreal Group,753b764d196df273eb226905abbf6ae3,True,8effe100,2019-07-27,2020-04-18


In [14]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           10000 non-null  object
 1   first_name   10000 non-null  object
 2   last_name    10000 non-null  object
 3   email        10000 non-null  object
 4   company      10000 non-null  object
 5   password     10000 non-null  object
 6   active       10000 non-null  object
 7   secret_code  10000 non-null  object
 8   created_on   10000 non-null  object
 9   updated_on   10000 non-null  object
dtypes: object(10)
memory usage: 781.4+ KB


In [15]:
# save dataframe to db
# user_df.to_sql('users', con=engine, if_exists='append', index=False)

<h1 style="color: red;">Generate TaxAgency Table</h1>

In [16]:
def tax_tracked_onpurchases():
    on_purchase_tax = [True, False]
    return faker.random_element(elements=on_purchase_tax)

def tax_tracked_onsales():
    on_sales_tax = [True, False]
    return faker.random_element(elements=on_sales_tax)

def display_tax_name():
    return faker.word().upper()

def tax_registration_number():
    return faker.credit_card_number()

def get_last_file_date():
    return faker.date_between(start_date='-2y')

In [17]:
tax_tracked_onsales()

True

In [18]:
# generate dataframe
def generate_tax_agency_dataframe(num_rows):
    columns = ['tax_tracked_onpurchases', 'tax_tracked_onsales', 'last_file_date', 'display_name', 'tax_registration_number']
    columns = columns.extend(general_columns)
    df = pd.DataFrame(columns=columns)
    
    k = 0
    for user_id in range(num_rows):
        for row_id in range(100):
            if k >= num_rows:
                break
            else:
                df = df.append({'id': row_id + 1, 'appstech_labs_id': k + 1, 'tax_tracked_onpurchases': tax_tracked_onpurchases(), 'tax_tracked_onsales': tax_tracked_onsales(), 'last_file_date': get_last_file_date(), 'display_name': display_tax_name(), 'tax_registration_number': tax_registration_number(), 'domain': general_domain_name(), 'sparse': general_sparse(), 'active': general_active(), 'sync_token': general_sync_token(), 'metadata_createtime': general_meta_time(), 'metadata_last_updatedtime': general_meta_time()}, ignore_index=True)
                k = k + 1
                
#     df = df.sample(n=df.shape[0], axis=0).reset_index(drop=True)
    df[['tax_tracked_onpurchases', 'tax_tracked_onsales', 'sparse', 'active']]= df[['tax_tracked_onpurchases', 'tax_tracked_onsales', 'sparse', 'active']].astype('boolean')
    return df

In [19]:
tax_agency_df = generate_tax_agency_dataframe(10000)
tax_agency_df.head()

Unnamed: 0,active,appstech_labs_id,display_name,domain,id,last_file_date,metadata_createtime,metadata_last_updatedtime,sparse,sync_token,tax_registration_number,tax_tracked_onpurchases,tax_tracked_onsales
0,True,1.0,RECOGNIZE,QBO,1.0,2018-05-18,2019-12-12,2019-04-20,False,405270c1-27f7-4b8e-b42d-1f11dd238eb2,4508208634004,True,False
1,True,2.0,ENVIRONMENTAL,QBO,2.0,2019-11-07,2018-10-11,2019-08-25,False,4e397440-a390-4115-b476-d5d8c4fcb5e9,376580637004927,False,True
2,True,3.0,FIGURE,QBO,3.0,2018-10-21,2018-06-16,2018-09-04,False,a86a4cda-37bc-46c6-88a3-a0de7c400949,3598413371370740,True,True
3,True,4.0,RECORD,QBO,4.0,2019-11-08,2020-02-17,2018-11-01,False,5fa259c1-42a6-40bd-a264-b9eab3970599,4641297769470088,False,True
4,True,5.0,YET,QBO,5.0,2018-09-26,2018-09-24,2019-08-09,False,258480b6-6be5-4718-badd-85fba24d529b,36742529123437,False,True


In [20]:
# save dataframe to db
# tax_agency_df.to_sql('quickbooks_tax_agencies', con=engine, if_exists='append', index=False)

<h1 style="color: red;">Generate TaxRate Table</h1>

In [21]:
# tax code values
tax_code_dic = [
    {'id': 'Z', 'description': 'Zero-rated', 'rate_in_percent': 0, 'tax_onsale': 'Taxable'},
    {'id': 'E', 'description': 'Tax-exempt', 'rate_in_percent': 'N/A', 'tax_onsale': 'Non-taxable'},
    {'id': 'Out of scope', 'description': 'Nontaxable goods and services', 'rate_in_percent': 0, 'tax_onsale': 'N/A'},
    {'id': 'GST', 'description': 'Federal goods & services tax', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'GST BC', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST BC', 'description': 'BC provincial tax only', 'rate_in_percent': 7, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST BC', 'description': 'Federal and provincial tax (BC)', 'rate_in_percent': 12, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST MB', 'description': 'Combined federal and Manitoba provincial tax', 'rate_in_percent': 13, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST MB', 'description': 'Manitoba provincial tax only', 'rate_in_percent': 8, 'tax_onsale': 'Taxable'},
    {'id': 'HST NB', 'description': 'Harmonized federal and New Brunswick provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST NL', 'description': 'Harmonized federal and Newfoundland and Labrador provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST NS', 'description': 'Harmonized federal and Nova Scotia provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST ON', 'description': 'Harmonized federal and Ontario provincial tax', 'rate_in_percent': 13, 'tax_onsale': 'Taxable'},
    {'id': 'HST', 'description': 'Harmonized federal and PEI provincial tax', 'rate_in_percent': 14, 'tax_onsale': 'Taxable'},
    {'id': 'GST/QST QC', 'description': 'Combined federal and Quebec provincial tax', 'rate_in_percent': 14.975, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'QST QC', 'description': 'Quebec provincial tax only', 'rate_in_percent': 9.975, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST SK', 'description': 'Combined federal and Saskatchewan provincial tax', 'rate_in_percent': 10, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST SK', 'description': 'Zero-rated', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'}, 
]

In [22]:
def generate_tax_rate_dataframe(num_rows):
    columns = ['name', 'rate_value', 'agency_ref_value', 'agency_ref_name', 'desciption', 'display_type', 'tax_return_line_ref_value', 'tax_return_line_ref_name', 'effective_tax_rate_data_value', 'effective_tax_rate_data_end_date', 'effective_tax_rate_data_effective_date', 'special_tax_type']
    columns = columns.extend(general_columns)
    df = pd.DataFrame(columns=columns)
    
    k = 0
    for user_id in range(num_rows):
        for index, tax_obj in enumerate(tax_code_dic):
            if k >= num_rows:
                break
            else:
                tax_agency_obj = tax_agency_df.loc[tax_agency_df['appstech_labs_id'] == k + 1]
                agency_ref_value = tax_agency_obj['id']
                agency_ref_name = tax_agency_obj['display_name']
                df = df.append({'id': index + 1, 'appstech_labs_id': k + 1, 'name': tax_obj['id'], 'rate_value': tax_obj['rate_in_percent'], 'agency_ref_value': agency_ref_value, 'agency_ref_name': agency_ref_name, 'active': general_active(), 'description': tax_obj['description'], 'display_type': 'ReadOnly', 'tax_return_line_ref_value': np.nan, 'tax_return_line_ref_name': np.nan, 'effective_tax_rate_data_value': np.nan, 'effective_tax_rate_data_end_date': get_creation_date(), 'effective_tax_rate_data_effective_date': get_creation_date(), 'special_tax_type': 'NONE', 'domain': general_domain_name(), 'sparse': general_sparse(), 'sync_token': general_sync_token(), 'metadata_createtime': general_meta_time(), 'metadata_updatedtime': general_meta_time()}, ignore_index=True)
                k = k + 1
#     df = df.sample(n=df.shape[0], axis=0).reset_index(drop=True)
    df[['sparse', 'active']]= df[['sparse', 'active']].astype('boolean')
    return df

In [23]:
tax_rate_df = generate_tax_rate_dataframe(10000)
tax_rate_df.head()

Unnamed: 0,active,agency_ref_name,agency_ref_value,appstech_labs_id,description,display_type,domain,effective_tax_rate_data_effective_date,effective_tax_rate_data_end_date,effective_tax_rate_data_value,id,metadata_createtime,metadata_updatedtime,name,rate_value,sparse,special_tax_type,sync_token,tax_return_line_ref_name,tax_return_line_ref_value
0,True,"0 RECOGNIZE Name: display_name, dtype: object","0 1.0 Name: id, dtype: float64",1.0,Zero-rated,ReadOnly,QBO,2020-04-21,2019-02-28,,1.0,2018-09-29,2019-07-02,Z,0.0,False,NONE,b2a68f34-f291-4755-8ff0-cc23f647fc29,,
1,True,"1 ENVIRONMENTAL Name: display_name, dtype: ...","1 2.0 Name: id, dtype: float64",2.0,Tax-exempt,ReadOnly,QBO,2018-10-25,2020-04-05,,2.0,2019-03-06,2020-04-05,E,,False,NONE,e250808b-3b9d-449e-8c25-b13b05707296,,
2,True,"2 FIGURE Name: display_name, dtype: object","2 3.0 Name: id, dtype: float64",3.0,Nontaxable goods and services,ReadOnly,QBO,2019-06-10,2019-06-04,,3.0,2019-05-09,2019-10-31,Out of scope,0.0,False,NONE,9ee5e3b2-53d9-49f4-916d-40e68f4b6900,,
3,True,"3 RECORD Name: display_name, dtype: object","3 4.0 Name: id, dtype: float64",4.0,Federal goods & services tax,ReadOnly,QBO,2018-08-10,2019-01-01,,4.0,2018-08-19,2018-05-15,GST,5.0,False,NONE,f947fbf0-6055-49ec-8998-2d8f426f1333,,
4,True,"4 YET Name: display_name, dtype: object","4 5.0 Name: id, dtype: float64",5.0,Federal tax (GST) only,ReadOnly,QBO,2019-07-01,2020-04-18,,5.0,2018-06-08,2018-07-23,GST BC,5.0,False,NONE,8c6f28e2-729e-4c61-8c2f-e95fe096a7e3,,


In [24]:
# save dataframe to db
# tax_rate_df.to_sql('quickbooks_tax_rates', con=engine, if_exists='append', index=False)

<h1 style="color: red;">Generate TaxCode Table</h1>

In [25]:
def generate_tax_name(taxable, tax_group, *args, **kwargs):
    if taxable and tax_group:
        return kwargs['id']
        
    elif taxable and not tax_group:
        return 'TAX'
        
    elif not taxable and not tax_group or not taxable:
        return 'NON'
    
def is_taxable(*args, **kwargs):
    if 'taxable' in kwargs:
        if kwargs['taxable'] == True:
            return np.random.choice([True, False], p=[0.35, 0.65])
        else:
            return False
    else:
        return np.random.choice([True, False], p=[0.55, 0.45])
    
def generate_tax_rate(tax_group, tax_ref, *args, **kwargs):
    if tax_group:
        if 'purchase' in kwargs:
            return generate_tax_list(np.random.randint(low=1,high=4), tax_ref, purchase=kwargs['purchase'])
        else:
            return generate_tax_list(np.random.randint(low=1,high=4), tax_ref)
    else:
        return []
    
def generate_tax_list(list_len, tax_ref, *args, **kwargs):
    for i in range(list_len):
        if 'purchase' in kwargs:
            return np.random.choice([{
                'TaxRateDetail': [
                    {"TaxRateRef": {
                        "value": int(tax_ref['value']),
                        "name": tax_ref['name']
                        }
                    }],
                "TaxTypeApplication": np.random.choice(["TaxOnAmount", "TaxOnAmountPlusTax", "TaxOnTax"], p=[0.8, 0.1, 0.1]),
                "TaxOrder": 0}, []], p=[0.15, 0.85])
        
        return {
            'TaxRateDetail': [
                {"TaxRateRef": {
                    "value": int(tax_ref['value']),
                    "name": tax_ref['name']
                }},
            ],
            "TaxTypeApplication": np.random.choice(["TaxOnAmount", "TaxOnAmountPlusTax", "TaxOnTax"], p=[0.8, 0.1, 0.1]),
            "TaxOrder": 0
        }

In [26]:
# tax code values
tax_code = [
    {'id': 'Z', 'description': 'Zero-rated', 'rate_in_percent': 0, 'tax_onsale': 'Taxable'},
    {'id': 'E', 'description': 'Tax-exempt', 'rate_in_percent': 'N/A', 'tax_onsale': 'Non-taxable'},
    {'id': 'Out of scope', 'description': 'Nontaxable goods and services', 'rate_in_percent': 0, 'tax_onsale': 'N/A'},
    {'id': 'GST', 'description': 'Federal goods & services tax', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'GST BC', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST BC', 'description': 'BC provincial tax only', 'rate_in_percent': 7, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST BC', 'description': 'Federal and provincial tax (BC)', 'rate_in_percent': 12, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST MB', 'description': 'Combined federal and Manitoba provincial tax', 'rate_in_percent': 13, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST MB', 'description': 'Manitoba provincial tax only', 'rate_in_percent': 8, 'tax_onsale': 'Taxable'},
    {'id': 'HST NB', 'description': 'Harmonized federal and New Brunswick provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST NL', 'description': 'Harmonized federal and Newfoundland and Labrador provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST NS', 'description': 'Harmonized federal and Nova Scotia provincial tax', 'rate_in_percent': 15, 'tax_onsale': 'Taxable'},
    {'id': 'HST ON', 'description': 'Harmonized federal and Ontario provincial tax', 'rate_in_percent': 13, 'tax_onsale': 'Taxable'},
    {'id': 'HST', 'description': 'Harmonized federal and PEI provincial tax', 'rate_in_percent': 14, 'tax_onsale': 'Taxable'},
    {'id': 'GST/QST QC', 'description': 'Combined federal and Quebec provincial tax', 'rate_in_percent': 14.975, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'QST QC', 'description': 'Quebec provincial tax only', 'rate_in_percent': 9.975, 'tax_onsale': 'Taxable'},
    {'id': 'GST/PST SK', 'description': 'Combined federal and Saskatchewan provincial tax', 'rate_in_percent': 10, 'tax_onsale': 'Taxable'},
    {'id': 'GST', 'description': 'Federal tax (GST) only', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'},
    {'id': 'PST SK', 'description': 'Zero-rated', 'rate_in_percent': 5, 'tax_onsale': 'Taxable'}, 
]

In [27]:
def generate_tax_code_dataframe(num_rows):
    columns = ['name', 'description', 'hidden', 'taxable', 'tax_group', 'purchase_tax_ratelist_nested', 'sales_tax_ratelist_nested']
    columns = columns.extend(general_columns)
    df = pd.DataFrame(columns=columns)
    
    k = 0
    for user_id in range(num_rows):
        for index, tax_obj in enumerate(tax_code_dic):
            if k >= num_rows:
                break
            else:
                taxable = is_taxable() # randomly generate boolean
                tax_group = is_taxable(taxable=taxable) # randomly generated boolean
                tax_id = generate_tax_name(taxable, tax_group, id=tax_obj['id'])
                name = generate_tax_name(taxable, tax_group, id=tax_obj['id'])
                purchase_tax_rateList = generate_tax_rate(tax_group, {'value': tax_rate_df['id'][user_id], 'name': tax_rate_df['name'][user_id]}, purchase=True)
                sales_tax_rateList = generate_tax_rate(tax_group, {'value': tax_rate_df['id'][user_id], 'name': tax_rate_df['name'][user_id]})
                
                df = df.append({'id': tax_id, 'appstech_labs_id': k + 1, 'name': name, 'description': tax_obj['description'], 'active': general_active(), 'hidden': np.random.choice([True, False], p=[0.25, 0.75]), 'taxable': taxable, 'tax_group': tax_group, 'purchase_tax_ratelist_nested': purchase_tax_rateList, 'sales_tax_ratelist_nested': sales_tax_rateList, 'domain': general_domain_name(), 'sparse': general_sparse(), 'sync_token': general_sync_token(), 'metadata_createtime': general_meta_time(), 'metadata_updatedtime': general_meta_time()}, ignore_index=True)
                
            k = k + 1
    
    df[['sparse', 'active', 'hidden', 'taxable', 'tax_group']]= df[['sparse', 'active', 'hidden', 'taxable', 'tax_group']].astype('boolean')
    return df

In [28]:
tax_code_df = generate_tax_code_dataframe(10000)
tax_code_df.head()

Unnamed: 0,active,appstech_labs_id,description,domain,hidden,id,metadata_createtime,metadata_updatedtime,name,purchase_tax_ratelist_nested,sales_tax_ratelist_nested,sparse,sync_token,tax_group,taxable
0,True,1.0,Zero-rated,QBO,True,NON,2018-09-01,2018-07-23,NON,[],[],False,00ad95fa-a28d-423b-81b8-c2f5f6d6af17,False,False
1,True,2.0,Tax-exempt,QBO,False,NON,2018-10-13,2020-01-11,NON,[],[],False,172014d8-4e84-41da-a527-13ac40b63099,False,False
2,True,3.0,Nontaxable goods and services,QBO,True,TAX,2018-08-23,2019-09-08,TAX,[],[],False,50bf1013-d233-4bdd-8a88-718cd4b934bf,False,True
3,True,4.0,Federal goods & services tax,QBO,True,TAX,2019-05-04,2019-01-04,TAX,[],[],False,ae031c20-2881-4d18-b26f-898f2cde4995,False,True
4,True,5.0,Federal tax (GST) only,QBO,False,NON,2019-09-15,2019-01-31,NON,[],[],False,b6c097e1-02ce-4fa8-aa76-3b7b31aa202b,False,False


<h1 style="color: red;">Generate Items Table</h1>

### Fetch data from site with bs4

In [29]:
url = "https://supergrocerylist.com/list/498?iframe=1"
page = requests.get(url)
print(page)
soup = BeautifulSoup(page.content, 'html.parser')

<Response [200]>


In [46]:
categories = []
items = []

for text in soup.find_all('tr', {'class': 'category'})[1:]:
    categories.append(text.get_text().strip())

cat_len = len(categories)
i = 0
for tag in soup.find_all('tr')[1:]:
    if i <= cat_len - 1:       
        if tag.find_all('td')[0].get_text().strip() not in categories:
            item = {'name': tag.find_all('td')[0].get_text(),
                    'description': tag.find_all('td')[1].get_text(),
                    'category': categories[i],
                    "unit_price": float(str(tag.find_all('td')[4].get_text()).split('/')[0].split('$')[1]) if tag.find_all('td')[4].get_text() != '' else '',
                    "inventory": np.random.choice([True, False], p=(0.90, 0.10))}
            items.append(item)
        else:
            i = i + 1
            
items = items[1:]
items[:3]

[{'name': '7Up Cake',
  'description': ' Village Bakery ',
  'category': 'Baking',
  'unit_price': 0.15,
  'inventory': True},
 {'name': 'Apple Cider Donuts (Seasonal)',
  'description': ' Bake Shop ',
  'category': 'Baking',
  'unit_price': 0.38,
  'inventory': True},
 {'name': 'Artisan Flatbread (Stone Oven Baked - 2 Pack)',
  'description': ' Specially Selected  ',
  'category': 'Baking',
  'unit_price': 0.18,
  'inventory': True}]

In [49]:
def generate_items(num_rows, items):
    columns = ['name', 'item_category_type', 'item_type', 'income_account_ref_value', 'income_account_ref_name', 'expense_account_ref_name', 'expense_account_ref_value', 'asset_account_ref_value', 'asset_account_ref_name', 'sales_tax_code_ref_vale', 'sales_tax_code_ref_name', 'class_ref_value', 'class_ref_name', 'parent_ref_value', 'parent_ref_name', 'fully_qualified_name', 'unitprice', 'purchase_cost', 'service_type', 'purchase_tax_code_ref_value', 'purchase_tax_code_ref_name', 'pref_vendor_ref_value', 'pref_vendor_ref_name', 'purchase_desc', 'reorder_point', 'uqc_display_text', 'uqc_id', 'sub_item', 'taxable', 'abatement_rate', 'reverse_charge_rate', 'description', 'level', 'sales_tax_included', 'purchase_tax_included', 'track_qty_onhead', 'stock_keeping_unit', 'qty_onhand', 'inv_start_date', 'tax_classification_ref_value', 'tax_classificaiton_ref_name']
    columns = columns.extend(general_columns)
    df = pd.DataFrame(columns=columns)
    
    k = 0
    for user_id in range(num_rows):
        for index, item in enumerate(items):
            name = item['name']
            item_category_type = item['category']
            item_type = np.random.choice(['inventory', 'service', 'non-inventory'], p=(0.6, 0.2, 0.2))
            income_account_ref_value = np.NaN
            income_account_ref_name = np.NaN
            expense_account_ref_value = np.NaN
            expense_account_ref_name = np.NaN
            asset_account_ref_value = np.NaN
            asset_account_ref_name = np.NaN
            sales_tax_code_ref_value = tax_code_df['id'][user_id]
            sales_tax_code_ref_name = tax_code_df['name'][user_id]
            class_ref_value = np.NaN
            class_ref_name = np.NaN
            parent_ref_value = np.NaN
            parent_ref_name = np.NaN
            fully_qualified_name = item['name']
            unitprice = item['unit_price']
            purchase_cost = item['unit_price'] - 0.02
            service_type = np.NaN
            purchase_tax_code_ref_value = tax_code_df['purchase_tax_ratelist_nested'][user_id]['TaxRateDetail'][0]['TaxRateRef']['value'] if tax_code_df['purchase_tax_ratelist_nested'][user_id] != [] else np.NaN
            purchase_tax_code_ref_name = tax_code_df['purchase_tax_ratelist_nested'][user_id]['TaxRateDetail'][0]['TaxRateRef']['name'] if tax_code_df['purchase_tax_ratelist_nested'][user_id] != [] else np.NaN
            pref_vendor_ref_value = np.NaN
            pref_vendor_ref_name = np.NaN
            purchase_desc = np.NaN
            reorder_point = np.NaN
            uqc_display_text = np.NaN
            uqc_id = np.NaN
            sub_item = np.NaN
            taxable = np.random.choice([True, False], p=(0.3, 0.7))
            abatement_rate = np.NaN
            reverse_charge_rate = np.NaN
            description = item['description']
            level = np.NaN
            sales_tax_included = np.random.choice([True, False], p=(0.6, 0.4))
            purchase_tax_included = np.random.choice([True, False], p=(0.3, 0.7))
            track_qty_onhand = False
            stock_keeping_unit = np.NaN
            qty_onhand = np.NaN
            inv_start_date = faker.date_between(start_date='-2y')
            tax_classification_ref_value = np.NaN
            tax_classification_ref_name = np.NaN

            df = df.append({'id': index + 1, 'appstech_labs_id': k + 1, 'name': name, 'item_category_type': item_category_type, 'item_type': item_type, 'income_account_ref_value': income_account_ref_value,
                           'income_account_ref_name': income_account_ref_name, 'expense_account_ref_value': expense_account_ref_value, 'expense_account_ref_name': expense_account_ref_name, 'asset_account_ref_value': asset_account_ref_value,
                           'asset_account_ref_name': asset_account_ref_name, 'sales_tax_code_ref_value': sales_tax_code_ref_value, 'sales_tax_code_ref_name': sales_tax_code_ref_name, 'class_ref_value': class_ref_value,
                           'class_ref_name': class_ref_name, 'parent_ref_value': parent_ref_value, 'parent_ref_name': parent_ref_name, 'fully_qualified_name': fully_qualified_name, 'unitprice': unitprice, 'purchase_cost': purchase_cost,
                           'service_type': service_type, 'purchase_tax_code_ref_value': purchase_tax_code_ref_value, 'purchase_tax_code_ref_name': purchase_tax_code_ref_name, 'pref_vendor_ref_value': pref_vendor_ref_value, 'pref_vendor_ref_name': pref_vendor_ref_name,
                           'purchase_desc': purchase_desc, 'reorder_point': reorder_point, 'uqc_display_text': uqc_display_text, 'uqc_id': uqc_id, 'sub_item':sub_item, 'active': general_active(), 'taxable': taxable, 'abatement_rate': abatement_rate,
                           'reverse_charge_rate': reverse_charge_rate, 'description': description, 'level': level, 'sales_tax_included': sales_tax_included, 'purchase_tax_included': purchase_tax_included, 'track_qty_onhand': track_qty_onhand,
                           'stock_keeping_unit': stock_keeping_unit, 'qty_onhand': qty_onhand, 'inv_start_date': inv_start_date, 'tax_classification_ref_value': tax_classification_ref_value, 'tax_classification_ref_name': tax_classification_ref_name,
                           'domain': general_domain_name(), 'sparse': general_sparse(), 'sync_token': general_sync_token(), 'metadata_createtime': general_meta_time(), 'metadata_last_updatedtime': general_meta_time()}, ignore_index=True)
            k = k + 1
                
    df[['sparse', 'active', 'sales_tax_included', 'purchase_tax_included', 'taxable']]= df[['sparse', 'active', 'sales_tax_included', 'purchase_tax_included', 'taxable']].astype('boolean')
    return df



In [52]:
items_df = generate_items(10000, items)
print(items_df.shape)
items_df.head()

KeyboardInterrupt: 