In [2]:
import pandas as pd
import random
from faker import Faker

fake = Faker('en_AU')  # Initialize Faker with Australian-specific data

# Set random seed for reproducibility
# random.seed(42)

In [3]:
import requests

# Send a GET request to the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_cities_in_Australia_by_population"
response = requests.get(url)

html = requests.get(url)

df = pd.read_html(html.text)

dfs = df[2].copy().iloc[:,[1,2]]
dfs.columns = ['City', 'Population']

dfs['Population_n'] = round(dfs['Population']/sum(dfs['Population']),2)

In [4]:
df_state = df[2].copy().iloc[:,[1,2,8,9,10,11,12,13,14,15]]
df_state.columns = ['City', 'Population', 'ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'WA', 'VIC']

In [5]:
dfu = pd.melt(df_state, id_vars=['City', 'Population'], var_name='State', value_name='Value')
dfu = dfu[dfu['Value'].notna()]
dfu = dfu[['City', 'State']]

In [6]:
choice_account_type = ['Savings', 'Cheque', 'Offset']
choice_states = ['NSW', 'VIC', 'TAS', 'SA', 'WA', 'NT', 'QLD']
choice_managers = [fake.name() for _ in range(50)]

In [7]:
# Generate 100,000 records
data = []
list_size = 100000
for _ in range(list_size):
    account_number = random.randint(10000000, 99999999)  # Generate random 8-digit account number
    date = fake.date_between(start_date='-1y', end_date='today')
    balance = round(random.uniform(1000, 100000), 2)  # Generate random balance between $1,000 and $100,000
    currency = 'AUD'
    account_type = random.choice(['Savings', 'Cheque', 'Offset'])
    account_owner = fake.name()
    account_lead = random.choice(choice_managers)
    interest_rate = round(random.uniform(4, 5), 2)  # Generate random interest rate between 4% and 5%
    account_status = random.choice(['Active', 'Closed', 'Frozen'])
    
    data.append([account_number, date, balance, currency, account_type, account_owner,
                 interest_rate, account_status])

# Create a pandas DataFrame
df = pd.DataFrame(data, columns=['AccountNumber', 'CreationDate', 'Balance', 'Currency', 'AccountType',
                                 'AccountOwner', 'InterestRate', 'AccountStatus'])

In [8]:
choice_city = dfs['City'].tolist()
city_prob = dfs['Population_n'].tolist()

df['Branch'] = random.choices(choice_city, weights=city_prob, k=list_size)

In [9]:
df.head()

Unnamed: 0,AccountNumber,CreationDate,Balance,Currency,AccountType,AccountOwner,InterestRate,AccountStatus,Branch
0,59122995,2023-05-26,83840.29,AUD,Savings,Michelle Morales,4.83,Active,Darwin
1,82286590,2023-05-15,62406.8,AUD,Savings,Theresa Watson,4.82,Closed,Melbourne
2,30460529,2023-06-01,24946.46,AUD,Savings,James Daniels,4.83,Frozen,Melbourne
3,44740867,2022-11-04,8608.83,AUD,Cheque,Brandon Cummings,4.61,Frozen,Sydney
4,38832379,2023-06-12,11889.45,AUD,Offset,Deborah Miller,4.57,Active,Sydney


In [10]:
dfm = df.merge(dfu, left_on='Branch', right_on='City', how='left')

In [11]:
dfm.pop('City')
dfm = dfm[dfm['State'].notna()]

In [12]:
dfm.head()

Unnamed: 0,AccountNumber,CreationDate,Balance,Currency,AccountType,AccountOwner,InterestRate,AccountStatus,Branch,State
0,59122995,2023-05-26,83840.29,AUD,Savings,Michelle Morales,4.83,Active,Darwin,NT
1,82286590,2023-05-15,62406.8,AUD,Savings,Theresa Watson,4.82,Closed,Melbourne,VIC
2,30460529,2023-06-01,24946.46,AUD,Savings,James Daniels,4.83,Frozen,Melbourne,VIC
3,44740867,2022-11-04,8608.83,AUD,Cheque,Brandon Cummings,4.61,Frozen,Sydney,NSW
4,38832379,2023-06-12,11889.45,AUD,Offset,Deborah Miller,4.57,Active,Sydney,NSW


In [13]:
dfm.to_csv("bank balance.csv", index=False)