# Sale, Inventory, Marketing Simulation

- [python faker](https://github.com/xfxf/faker-python)
- [thelook_ecommerce fake.py](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/datasets/thelook_ecommerce/pipelines/_images/run_thelook_kub/fake.py)
- [thelook_ecommerce pipeline by airflow .py](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/datasets/thelook_ecommerce/pipelines/thelook_ecommerce/thelook_ecommerce_dag.py)

## Required Packages

In [170]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from faker import Faker
import copy
import holidays

# Tables

## product

In [237]:
fake = Faker()

random.seed(42)
np.random.seed(42)

# Settings
num_products = 2000

# Define possible categories and brands
categories = ["Grocery", "Bakery", "Deli", "Dairy", "Meat", "Produce", "Frozen Foods", "Snacks", "Beverages", "Household"]
brands = ["Western Family", "Local Farms"]
for i in range(3):
    brands.append("CPG" + str(i+1))

# Generate Products
products = {
    "product_id": [i for i in range(1, num_products + 1)],
    "product_name": [fake.word().capitalize() + " " + fake.word().capitalize() for _ in range(num_products)],
    "cost": lambda df=None: np.round(df["Price ($)"] * np.random.uniform(0.5, 0.8, num_products), 2),
    "category": np.random.choice(categories, num_products),
    "brand": np.random.choice(brands, num_products),
    "retail_price": np.round(np.random.uniform(1.0, 100.0, num_products), 2),
    "sku": [fake.unique.ean(length=8) for _ in range(num_products)]
}

products_df = pd.DataFrame(products)
products_df["cost"] = np.round(products_df["retail_price"] * np.random.uniform(0.5, 0.8), 2)

# Save as CSV
products_df.to_csv("product.csv", index=False)
print("Mock Save-On-Foods Product Table generated!")

Mock Save-On-Foods Product Table generated!


In [238]:
products_df.head(5)

Unnamed: 0,product_id,product_name,cost,category,brand,retail_price,sku
0,1,Cost Heart,52.02,Frozen Foods,Local Farms,93.69,83918087
1,2,Not Bit,4.08,Dairy,CPG3,7.34,4619901
2,3,Easy Poor,45.89,Snacks,Western Family,82.65,50536641
3,4,Join Think,16.63,Meat,Western Family,29.95,27742150
4,5,Day Without,24.96,Frozen Foods,CPG1,44.95,22995728


## customers

In [241]:
# Initialize Faker instance for generating random names, addresses, emails
fake = Faker()

random.seed(42)
np.random.seed(42)

# Constants
num_customers = 100000  # Number of customers
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 4, 25)
# end_date = datetime.now() + timedelta(days=1)

# Occupation/industry choices
occupations = [
    "Healthcare", "Technology", "Finance", "Retail", "Manufacturing", 
    "Education", "Construction", "Marketing", "Real Estate", "Hospitality"
]

# Generating the customer data
customers_data = []

# Function to generate random dates for account creation
def random_account_created_date():
    return fake.date_between(start_date=start_date, end_date=end_date)

# Function to generate fake address in the Greater Vancouver Area
def generate_vancouver_address():
    # List of cities within the Greater Vancouver Area
    cities = ['Vancouver', 'Burnaby', 'Richmond', 'Surrey', 'North Vancouver', 'West Vancouver', 'Langley', 'Coquitlam', 'Delta']
    # Randomly choose a city from the list
    city = random.choice(cities)
    # Generate a fake address using Faker
    address = fake.address().replace('\n', ',')
    # Replace city and province to match Greater Vancouver
    address = address.split(',')[0] + ', ' + city + ', BC'
    return address

# Function to generate random salary estimation based on occupation
def estimate_salary(occupation):
    if occupation == "Healthcare":
        return random.randint(50000, 120000)
    elif occupation == "Technology":
        return random.randint(70000, 150000)
    elif occupation == "Finance":
        return random.randint(60000, 130000)
    elif occupation == "Retail":
        return random.randint(30000, 70000)
    elif occupation == "Manufacturing":
        return random.randint(40000, 90000)
    elif occupation == "Education":
        return random.randint(35000, 75000)
    elif occupation == "Construction":
        return random.randint(40000, 90000)
    elif occupation == "Marketing":
        return random.randint(50000, 100000)
    elif occupation == "Real Estate":
        return random.randint(40000, 100000)
    elif occupation == "Hospitality":
        return random.randint(30000, 80000)

# Generate simulated customer data
for customer_id in range(1, num_customers + 1):
    # generate fake address
    address = generate_vancouver_address()
    
    # Extracting approximate longitude and latitude for Greater Vancouver area
    longitude = round(random.uniform(-123.5, -122.2), 6)  # Greater Vancouver area
    latitude = round(random.uniform(49.1, 49.5), 6)
    
    email = fake.email()
    age = random.randint(18, 70)
    gender = random.choice(['Male', 'Female'])
    account_created_date = random_account_created_date()
    is_vip_membership = random.choice([1, 0])
    family_size = random.randint(1, 5)  # Assuming family size between 1 and 5
    occupation = random.choice(occupations)
    annual_salary_estimate = estimate_salary(occupation)
    ad_group = random.randint(1, 10) # Assuming advertisement group from 1 to 10
    
    customers_data.append([
        customer_id, address, longitude, latitude, email, age, gender,
        account_created_date, is_vip_membership, family_size, occupation, annual_salary_estimate, ad_group
    ])

# Create a DataFrame from the generated data
customers_df = pd.DataFrame(customers_data, columns=[
    'customer_id', 'home_address', 'longitude', 'latitude', 'email', 'age', 'gender',
    'account_created_date', 'is_vip_membership', 'family_size', 'occupation', 'annual_salary_estimate', 'ad_group'
])

# save the data to a CSV file
customers_df.to_csv('customer.csv', index=False)
print("Mock Save-On-Foods Customer Table generated!")

Mock Save-On-Foods Customer Table generated!


In [242]:
customers_df.head(5)

Unnamed: 0,customer_id,home_address,longitude,latitude,email,age,gender,account_created_date,is_vip_membership,family_size,occupation,annual_salary_estimate,ad_group
0,1,"403 Frey Well Suite 067, Burnaby, BC",-123.467486,49.210012,ericksonrobert@example.org,32,Male,2024-08-14,1,5,Technology,147397,7
1,2,"56765 Lin Plaza Apt. 499, Vancouver, BC",-123.461264,49.187455,romannicole@example.com,50,Male,2024-03-22,1,5,Construction,54446,8
2,3,"5325 Christopher Plaza, North Vancouver, BC",-122.44774,49.1026,marcusknight@example.org,69,Male,2025-02-12,0,3,Manufacturing,50189,4
3,4,"03756 Charles Via Suite 872, West Vancouver, BC",-123.367127,49.251971,epowell@example.net,40,Female,2025-04-10,0,1,Marketing,85142,2
4,5,"USNV Herrera, Langley, BC",-123.39756,49.217271,shanecantrell@example.net,58,Female,2024-12-26,1,1,Healthcare,79871,5


## campaign

### budget_v3

In [555]:
# Setup
fake = Faker()
random.seed(42)
np.random.seed(42)

# Constants
campaign_types = [
    "Sales Promotion Campaign", "Seasonal Sales", "Loyalty Program",
    "Customer Retention Campaign", "Location-Based Campaign",
    "Holiday Campaign", "Back-to-School Campaign"
]
activities = [
    "In-store Setup", "Flyer Launch", "Google Ads Cost", 
    "Local Community Events", "Loyalty Program", "Digital Push"
]
agencies = [random.choice(['agency_', 'specialist_']) + fake.first_name() for _ in range(10)]
today = datetime(2025, 4, 25)

# Generate 20 campaigns and the products included in that campaign
campaigns = []
last_date = datetime(2024,1,1)
for i in range(1, 21):
    campaign_id = i
    start_date = fake.date_between_dates(date_start=last_date + timedelta(days=2), date_end=last_date + timedelta(weeks=3)) 
    duration_weeks = random.randint(1, 3)
    end_date = start_date + timedelta(weeks=duration_weeks)
    last_date = copy.deepcopy(end_date) # keep it no intersection.
    campaign_type = random.choice(campaign_types)
    estimated_budget = random.randint(80000, 150000)

    camp_num_products = random.randint(50, 200)
    products = []
    discounts = []
    used_products = set()
    
    while len(products) < camp_num_products:
        pid = random.randint(1, num_products)
        if pid not in used_products:
            products.append(pid)
            discounts.append(round(random.uniform(0.01, 0.1), 2))
            used_products.add(pid)
    
    campaigns.append({
        "campaign_id": campaign_id,
        "campaign_type": campaign_type,
        "start_date": start_date,
        "end_date": end_date,
        "estimated_budget": estimated_budget,
        "product_ids": products,
        "discounts": discounts
    })

# Create Daily Budget Monitor records
# each product shows up only once one day handled by any agency/specialist
daily_budget_monitor = []
for camp in campaigns:
    campaign_days = (camp["end_date"] - camp["start_date"]).days + 1
    selected_activities = random.sample(activities, random.randint(3, len(activities)))  # minimum 3 activities per campaign
    estimated_budget_per_activity = camp["estimated_budget"] // len(selected_activities)
    product_id_list = copy.deepcopy(camp["product_ids"])
    random.shuffle(product_id_list)
    m = len(product_id_list) // len(selected_activities) # estimated length of product_id_sublist
    
    for activity in selected_activities:
        spent_cumulative = 0
        for day in range(campaign_days):
            date = camp["start_date"] + timedelta(days=day)
            spent_today = max(0, int(np.random.normal(loc=estimated_budget_per_activity/campaign_days, scale=50)))
            spent_cumulative += spent_today
            left_budget = max(estimated_budget_per_activity - spent_cumulative, 0)
            estimated_m = int(random.uniform(1, 2) * m)
            daily_budget_monitor.append({
                "date": date,
                "campaign_id": camp["campaign_id"],
                "activity": activity,
                "spent": spent_today,
                "estimated_budget": estimated_budget_per_activity,
                "left_budget": left_budget,
                "product_id_list": random.sample(product_id_list, estimated_m),
                "product_list_len": estimated_m, # when it is product_id_list for 
                "agency_or_specialist": random.choice(agencies)
            })

# Convert to DataFrame
daily_budget_monitor_df = pd.DataFrame(daily_budget_monitor)

# save the data to a CSV file
daily_budget_monitor_df.to_csv('budget.csv', index=False)

In [556]:
# Show 5 demo rows
print("\n=== Demo: Daily Budget Monitor Table (5 rows) ===")
daily_budget_monitor_df.head(5)


=== Demo: Daily Budget Monitor Table (5 rows) ===


Unnamed: 0,date,campaign_id,activity,spent,estimated_budget,left_budget,product_id_list,product_list_len,agency_or_specialist
0,2024-01-13,1,Loyalty Program,4552,36219,31667,"[1812, 1084, 1558, 2, 1709, 1783, 1150, 412, 5...",48,agency_Julie
1,2024-01-14,1,Loyalty Program,4520,36219,27147,"[1903, 940, 450, 996, 1954, 1709, 1170, 286, 1...",60,specialist_Jennifer
2,2024-01-15,1,Loyalty Program,4559,36219,22588,"[940, 1783, 502, 1331, 1932, 131, 1954, 1482, ...",44,agency_Victor
3,2024-01-16,1,Loyalty Program,4603,36219,17985,"[1331, 996, 1150, 1394, 1471, 974, 394, 1131, ...",70,agency_Ashley
4,2024-01-17,1,Loyalty Program,4515,36219,13470,"[339, 1131, 319, 765, 570, 258, 736, 324, 1547...",46,specialist_Jennifer


In [557]:
daily_budget_monitor_df

Unnamed: 0,date,campaign_id,activity,spent,estimated_budget,left_budget,product_id_list,product_list_len,agency_or_specialist
0,2024-01-13,1,Loyalty Program,4552,36219,31667,"[1812, 1084, 1558, 2, 1709, 1783, 1150, 412, 5...",48,agency_Julie
1,2024-01-14,1,Loyalty Program,4520,36219,27147,"[1903, 940, 450, 996, 1954, 1709, 1170, 286, 1...",60,specialist_Jennifer
2,2024-01-15,1,Loyalty Program,4559,36219,22588,"[940, 1783, 502, 1331, 1932, 131, 1954, 1482, ...",44,agency_Victor
3,2024-01-16,1,Loyalty Program,4603,36219,17985,"[1331, 996, 1150, 1394, 1471, 974, 394, 1131, ...",70,agency_Ashley
4,2024-01-17,1,Loyalty Program,4515,36219,13470,"[339, 1131, 319, 765, 570, 258, 736, 324, 1547...",46,specialist_Jennifer
...,...,...,...,...,...,...,...,...,...
1424,2025-05-24,20,Loyalty Program,1124,25888,4343,"[1588, 1290, 1261, 493, 878, 1627, 1772, 1960,...",20,agency_Douglas
1425,2025-05-25,20,Loyalty Program,1078,25888,3265,"[1330, 327, 1102, 679, 493, 1059, 878, 782, 17...",18,agency_Robin
1426,2025-05-26,20,Loyalty Program,1279,25888,1986,"[437, 150, 197, 1291, 1800, 680, 90, 1261, 196...",23,agency_Jason
1427,2025-05-27,20,Loyalty Program,1121,25888,865,"[1197, 327, 1173, 1291, 1059, 1800, 197, 1530,...",16,agency_Julie


#### Split budget_v3

In [558]:
daily_budget_monitor_df_split = daily_budget_monitor_df.explode(['product_id_list'], ignore_index=True)
daily_budget_monitor_df_split.rename({'product_id_list': 'product_id'}, inplace=True, axis=1)

# save the data to a CSV file
daily_budget_monitor_df_split.to_csv('budget_split.csv', index=False)

In [559]:
daily_budget_monitor_df_split.head()

Unnamed: 0,date,campaign_id,activity,spent,estimated_budget,left_budget,product_id,product_list_len,agency_or_specialist
0,2024-01-13,1,Loyalty Program,4552,36219,31667,1812,48,agency_Julie
1,2024-01-13,1,Loyalty Program,4552,36219,31667,1084,48,agency_Julie
2,2024-01-13,1,Loyalty Program,4552,36219,31667,1558,48,agency_Julie
3,2024-01-13,1,Loyalty Program,4552,36219,31667,2,48,agency_Julie
4,2024-01-13,1,Loyalty Program,4552,36219,31667,1709,48,agency_Julie


### campaign_v3

In [560]:
# Track used product IDs to prevent overlap
random.seed(42)
np.random.seed(42)

# num_products = 2000

campaign_logs = []
for camp in campaigns:

    # Sum real spent budget from the daily monitor
    real_spent_budget = daily_budget_monitor_df[daily_budget_monitor_df["campaign_id"] == camp["campaign_id"]]["spent"].sum()
    approved_budget = int(camp["estimated_budget"] * random.uniform(0.95, 1.1))

    campaign_logs.append({
        "campaign_id": camp["campaign_id"],
        "campaign_name": f"{camp['campaign_type']} {fake.word().capitalize()}",
        "campaign_type": camp["campaign_type"],
        "start_date": camp["start_date"],
        "end_date": camp["end_date"],
        "estimated_budget": camp["estimated_budget"],
        "approved_budget": approved_budget,
        "real_spent_budget": real_spent_budget,
        "product_ids": camp["product_ids"],
        "discounts": camp["discounts"],
        "estimated_sales_increase": round(random.uniform(2, 20), 2)
    })

# Convert to DataFrame
campaign_logs_df = pd.DataFrame(campaign_logs)

# save the data to a CSV file
campaign_logs_df.to_csv('campaign.csv', index=False)

In [561]:
# Show 5 demo rows
print("\n=== Demo: Campaign Management Table (5 rows) ===")
campaign_logs_df.head(5)


=== Demo: Campaign Management Table (5 rows) ===


Unnamed: 0,campaign_id,campaign_name,campaign_type,start_date,end_date,estimated_budget,approved_budget,real_spent_budget,product_ids,discounts,estimated_sales_increase
0,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,"[1035, 1150, 1331, 860, 1207, 1781, 1651, 866,...","[0.06, 0.03, 0.07, 0.03, 0.04, 0.01, 0.02, 0.0...",2.45
1,2,Loyalty Program Set,Loyalty Program,2024-02-05,2024-02-19,135519,134333,135223,"[957, 111, 1324, 825, 1640, 510, 1099, 865, 94...","[0.09, 0.07, 0.02, 0.08, 0.09, 0.03, 0.05, 0.0...",6.02
2,3,Customer Retention Campaign Building,Customer Retention Campaign,2024-02-29,2024-03-07,89545,94959,89417,"[849, 1179, 1427, 819, 1344, 1829, 1578, 1595,...","[0.09, 0.03, 0.04, 0.03, 0.07, 0.08, 0.09, 0.0...",14.18
3,4,Sales Promotion Campaign Rest,Sales Promotion Campaign,2024-03-20,2024-04-10,107235,116224,107698,"[542, 322, 1131, 6, 1412, 597, 591, 1440, 146,...","[0.07, 0.03, 0.02, 0.05, 0.06, 0.01, 0.07, 0.0...",3.56
4,5,Sales Promotion Campaign Value,Sales Promotion Campaign,2024-04-18,2024-04-25,141537,143417,141376,"[694, 815, 1745, 1382, 779, 1284, 1556, 1109, ...","[0.07, 0.07, 0.04, 0.09, 0.1, 0.07, 0.05, 0.01...",2.54


#### Split campaign_v3

In [562]:
campaign_logs_df_split = campaign_logs_df.explode(['product_ids', 'discounts'], ignore_index=True)
campaign_logs_df_split.rename({'product_ids': 'product_id', 'discounts': 'discount'}, inplace=True, axis=1)

# save the data to a CSV file
campaign_logs_df_split.to_csv('campaign_split.csv', index=False)

In [563]:
campaign_logs_df_split.head()

Unnamed: 0,campaign_id,campaign_name,campaign_type,start_date,end_date,estimated_budget,approved_budget,real_spent_budget,product_id,discount,estimated_sales_increase
0,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,1035,0.06,2.45
1,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,1150,0.03,2.45
2,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,1331,0.07,2.45
3,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,860,0.03,2.45
4,1,Sales Promotion Campaign Response,Sales Promotion Campaign,2024-01-13,2024-01-20,108657,113645,108468,1207,0.04,2.45


### ads_v3

In [564]:
# =============================
# Now: Generate Online Advertisement Cost Table
# =============================

ads_campaign_types = ["online-search", "online-displays", "online-event", "retargeting-ads", "email-ads"]

ad_cost_records = []

# Filter only Google Ads Cost rows
google_ads_rows = daily_budget_monitor_df[daily_budget_monitor_df["activity"] == "Google Ads Cost"]

for idx,row in google_ads_rows.iterrows():
    date = row["date"]
    campaign_id = row["campaign_id"]
    total_cost = row["spent"]
    agencies = row["agency_or_specialist"]

    # Simulate ad group data (1-10)
    ad_group = random.randint(1, 10)
    
    # Iterate over each product id everyday from campaign_logs_df to check its ads performance. 
    product_id_list = copy.deepcopy(row["product_id_list"])
    # copy.deepcopy(campaign_logs_df[campaign_logs_df["campaign_id"] == campaign_id]["product_ids"].values[0])
    random.shuffle(product_id_list)

    while total_cost > 0:
        if len(product_id_list) > 0:
            product_id = product_id_list.pop(0)
            impressions = random.randint(10000, 30000)
            ctr = random.uniform(0.01, 0.03)  # click-through rate
            clicks = max(1, int(impressions * ctr))
            cpc = max(0.1, min(0.3, total_cost / clicks))  # cost per click ($0.2 to $0.8)
            conversions = max(0, int(clicks * random.uniform(0.02, 0.2))) # conversions estimated from google ads data.
            cost = round(clicks * cpc, 2)
        else:
            # deal with the shortage of product_list
            # solution: put all the remaining cost to the last product
            cost = total_cost
            clicks = round(cost / cpc, 0)
            ctr = clicks / impressions
            conversions = max(0, int(clicks * random.uniform(0.02, 0.2)))
            
        if total_cost - cost >= 0:
            total_cost = total_cost - cost
        else:
            # deal with the overdraft total ads cost
            # solution: match clicks with left cost.
            cost = total_cost
            clicks = round(cost / cpc, 0)
            ctr = clicks / impressions
            conversions = max(0, int(clicks * random.uniform(0.02, 0.2)))
            total_cost = total_cost - cost

        ad_cost_records.append({
                "date": date,
                "ad_group": ad_group,
                "product_id": product_id,
                "impressions": impressions,
                "clicks": clicks,
                "cpc": round(cpc, 2),
                "conversions": conversions,
                "cost": round(clicks * cpc, 2),  # match clicks * CPC
                "campaign_id": campaign_id,
                "ads_campaign_type": random.choice(ads_campaign_types),
                "agency_or_specialist": agencies
            })

# Convert to DataFrame
ad_cost_df = pd.DataFrame(ad_cost_records)

# save the data to a CSV file
ad_cost_df.to_csv('ads.csv', index=False)

In [565]:
print("\n=== Demo: Online Advertisement Cost Table (5 rows) ===")
ad_cost_df.head(5)


=== Demo: Online Advertisement Cost Table (5 rows) ===


Unnamed: 0,date,ad_group,product_id,impressions,clicks,cpc,conversions,cost,campaign_id,ads_campaign_type,agency_or_specialist
0,2024-02-29,2,1449,15182,289.0,0.3,55,86.7,3,online-event,specialist_Donald
1,2024-02-29,2,1553,26403,666.0,0.3,74,199.8,3,online-search,specialist_Donald
2,2024-02-29,2,874,19779,530.0,0.3,59,159.0,3,online-displays,specialist_Donald
3,2024-02-29,2,1333,15008,262.0,0.3,12,78.6,3,email-ads,specialist_Donald
4,2024-02-29,2,1603,10018,220.0,0.3,23,66.0,3,online-search,specialist_Donald


In [566]:
ad_cost_df

Unnamed: 0,date,ad_group,product_id,impressions,clicks,cpc,conversions,cost,campaign_id,ads_campaign_type,agency_or_specialist
0,2024-02-29,2,1449,15182,289.0,0.30,55,86.7,3,online-event,specialist_Donald
1,2024-02-29,2,1553,26403,666.0,0.30,74,199.8,3,online-search,specialist_Donald
2,2024-02-29,2,874,19779,530.0,0.30,59,159.0,3,online-displays,specialist_Donald
3,2024-02-29,2,1333,15008,262.0,0.30,12,78.6,3,email-ads,specialist_Donald
4,2024-02-29,2,1603,10018,220.0,0.30,23,66.0,3,online-search,specialist_Donald
...,...,...,...,...,...,...,...,...,...,...,...
2386,2025-04-18,3,1537,27925,596.0,0.30,118,178.8,19,online-displays,specialist_Donald
2387,2025-04-18,3,1194,10018,298.0,0.30,14,89.4,19,online-search,specialist_Donald
2388,2025-04-18,3,1297,18700,458.0,0.30,74,137.4,19,online-displays,specialist_Donald
2389,2025-04-18,3,272,25345,717.0,0.30,35,215.1,19,retargeting-ads,specialist_Donald


## sale & inventory - v3

In [567]:
fake = Faker()

# Setting the seed for reproducibility
Faker.seed(42)
random.seed(42)

# Constants
START_DATE = "2023-12-30"
END_DATE = "2025-04-25"
NUM_STORES = 34
NUM_PRODUCTS = 2000
NUM_CUSTOMERS = 50000

ca_bc_holidays = holidays.country_holidays('CA', subdiv='BC') 

# Helper function to generate sale data
def generate_sale_data(date, sale_ratio):
    num_orders = random.randint(200, 500) * random.randint(sale_ratio[0], sale_ratio[1]) # Simulating orders for this date
    sales_data = []

    campaign_row = campaign_logs_df_split.loc[(campaign_logs_df_split['start_date'] <= date.date()) & 
                                            (campaign_logs_df_split['end_date'] >= date.date())] # TODO: add 7 days decay effect?
    
    if campaign_row_.size > 0: # non empty
        product_discount_df = copy.deepcopy(campaign_row.loc[:, ['product_id', 'discount']])
        product_discount_dict = product_discount_df.set_index('product_id').to_dict().get('discount')
    else:
        product_discount_dict = dict()
    
    for _ in range(num_orders):
        order_id = len(sales_data) + 1  # Incremental order ID
        store_id = random.randint(1, NUM_STORES)
        customer_id = random.randint(1, NUM_CUSTOMERS)
        purchase_type = random.choice(["online", "instore"])  # 80/20 ratio
        num_items = random.randint(3, 15)
        
        for _ in range(num_items):
            product_id = random.randint(1, NUM_PRODUCTS)
            max_items = 5
            if product_discount_dict.get(product_id):
                number_of_item = random.randint(1, int(max_items * np.exp(5 * product_discount_dict.get(product_id)))) # exponential increase relationship of the discount
            else:
                number_of_item = random.randint(1, max_items)
                
            
            sales_data.append({
                "sale_date": date,
                "customer_id": customer_id,
                "order_id": order_id,
                "store_id": store_id,
                "product_id": product_id,
                "number_of_item": number_of_item,
                "purchase_type": purchase_type
            })
    
    return sales_data

# Helper function to generate inventory data
def generate_inventory_data(date, sales_data):
    inventory_data = []
    product_stock = {product_id: random.randint(50, 100) for product_id in range(1, NUM_PRODUCTS + 1)}
    
    for store_id in range(1, NUM_STORES + 1):
        for product_id, stock in product_stock.items():
            sold_quantity = sum(
                sale["number_of_item"] for sale in sales_data if sale["store_id"] == store_id and sale["product_id"] == product_id
            )
            remaining_stock = max(0, stock - sold_quantity)
            
            # Simulating weekly replenishment (Every Wednesday, add random stock)
            if date.weekday() == 2:
                product_stock[product_id] += random.randint(50, 100)
            
            inventory_data.append({
                "date": date,
                "product_id": product_id,
                "store_id": store_id,
                "stock_quantity": remaining_stock
            })
    
    return inventory_data

# Generating the sales and inventory logs
def generate_logs(start_date, end_date):
    current_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    
    sales_logs = []
    inventory_logs = []
    
    while current_date <= end_date:
        # progress printing
        if current_date.day == 1:
            print(current_date.year, '-', current_date.month, 'Start.')
        # weekend and holiday has more sales:
        if current_date.weekday() < 4:
            sale_ratio = [1,1]
        elif current_date in ca_bc_holidays:
            sale_ratio = [2,4]
        elif current_date.weekday() > 4:
            sale_ratio = [3,5]
        
        sales_data = generate_sale_data(current_date, sale_ratio)
        sales_logs.extend(sales_data)
        
        # Generate inventory data based on sales logs
        inventory_data = generate_inventory_data(current_date, sales_data)
        inventory_logs.extend(inventory_data)
        
        current_date += timedelta(days=1)

    # Convert lists to pandas DataFrames
    sales_df = pd.DataFrame(sales_logs)
    inventory_df = pd.DataFrame(inventory_logs)
    
    return sales_df, inventory_df

# Generate sales and inventory logs
sales_df, inventory_df = generate_logs(START_DATE, END_DATE)

2024 - 1 Start.
2024 - 2 Start.
2024 - 3 Start.
2024 - 4 Start.
2024 - 5 Start.
2024 - 6 Start.
2024 - 7 Start.
2024 - 8 Start.
2024 - 9 Start.
2024 - 10 Start.
2024 - 11 Start.
2024 - 12 Start.
2025 - 1 Start.
2025 - 2 Start.
2025 - 3 Start.
2025 - 4 Start.


In [568]:
# save to CSV
sales_df.to_csv("sale.csv", index=False)
inventory_df.to_csv("inventory.csv", index=False)

In [569]:
# Display sample outputs
print("\n=== Demo: Sales Table (5 rows) ===")
sales_df.head()


=== Demo: Sales Table (5 rows) ===


Unnamed: 0,sale_date,customer_id,order_id,store_id,product_id,number_of_item,purchase_type
0,2023-12-30,16050,1,18,1509,1,online
1,2023-12-30,16050,1,18,1386,5,online
2,2023-12-30,16050,1,18,179,5,online
3,2023-12-30,16050,1,18,865,1,online
4,2023-12-30,16050,1,18,62,1,online


In [570]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838395 entries, 0 to 2838394
Data columns (total 7 columns):
 #   Column          Dtype         
---  ------          -----         
 0   sale_date       datetime64[ns]
 1   customer_id     int64         
 2   order_id        int64         
 3   store_id        int64         
 4   product_id      int64         
 5   number_of_item  int64         
 6   purchase_type   object        
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 151.6+ MB


In [571]:
# Display sample outputs
print("\n=== Demo: Inventory Table (5 rows) ===")
inventory_df.head()


=== Demo: Inventory Table (5 rows) ===


Unnamed: 0,date,product_id,store_id,stock_quantity
0,2023-12-30,1,1,77
1,2023-12-30,2,1,59
2,2023-12-30,3,1,66
3,2023-12-30,4,1,64
4,2023-12-30,5,1,87


## sale & inventory - v2

In [417]:
fake = Faker()

# Setting the seed for reproducibility
Faker.seed(42)
random.seed(42)

# Constants
START_DATE = "2024-01-01"
END_DATE = "2025-04-25"
NUM_STORES = 37
NUM_PRODUCTS = 2000
NUM_CUSTOMERS = 50000

ca_bc_holidays = holidays.country_holidays('CA', subdiv='BC') 

# Helper function to generate sale data
def generate_sale_data(date, sale_ratio):
    num_orders = random.randint(200, 500) * random.randint(sale_ratio[0], sale_ratio[1]) # Simulating orders for this date
    sales_data = []
    
    for _ in range(num_orders):
        order_id = len(sales_data) + 1  # Incremental order ID
        store_id = random.randint(1, NUM_STORES)
        customer_id = random.randint(1, NUM_CUSTOMERS)
        purchase_type = random.choice(["online", "instore"])  # 80/20 ratio
        num_items = random.randint(3, 15)
        
        for _ in range(num_items):
            product_id = random.randint(1, NUM_PRODUCTS)
            number_of_item = random.randint(1, 5)
            
            sales_data.append({
                "sale_date": date,
                "customer_id": customer_id,
                "order_id": order_id,
                "store_id": store_id,
                "product_id": product_id,
                "number_of_item": number_of_item,
                "purchase_type": purchase_type
            })
    
    return sales_data

# Helper function to generate inventory data
def generate_inventory_data(date, sales_data):
    inventory_data = []
    product_stock = {product_id: random.randint(50, 100) for product_id in range(1, NUM_PRODUCTS + 1)}
    
    for store_id in range(1, NUM_STORES + 1):
        for product_id, stock in product_stock.items():
            sold_quantity = sum(
                sale["number_of_item"] for sale in sales_data if sale["store_id"] == store_id and sale["product_id"] == product_id
            )
            remaining_stock = max(0, stock - sold_quantity)
            
            # Simulating weekly replenishment (Every Wednesday, add random stock)
            if date.weekday() == 2:
                product_stock[product_id] += random.randint(50, 100)
            
            inventory_data.append({
                "date": date,
                "product_id": product_id,
                "store_id": store_id,
                "stock_quantity": remaining_stock
            })
    
    return inventory_data

# Generating the sales and inventory logs
def generate_logs(start_date, end_date):
    current_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    
    sales_logs = []
    inventory_logs = []
    
    while current_date <= end_date:
        # progress printing
        if current_date.day == 1:
            print(current_date.year, '-', current_date.month, 'Start.')
        # weekend and holiday has more sales:
        if current_date.weekday() < 4:
            sale_ratio = [1,1]
        elif current_date in ca_bc_holidays:
            sale_ratio = [2,4]
        elif current_date.weekday() > 4:
            sale_ratio = [3,5]
        
        sales_data = generate_sale_data(current_date.strftime("%Y-%m-%d"), sale_ratio)
        sales_logs.extend(sales_data)
        
        # Generate inventory data based on sales logs
        inventory_data = generate_inventory_data(current_date, sales_data)
        inventory_logs.extend(inventory_data)
        
        current_date += timedelta(days=1)

    # Convert lists to pandas DataFrames
    sales_df = pd.DataFrame(sales_logs)
    inventory_df = pd.DataFrame(inventory_logs)
    
    return sales_df, inventory_df

# Generate sales and inventory logs
sales_df, inventory_df = generate_logs(START_DATE, END_DATE)

2024 - 1 Start.
2024 - 2 Start.
2024 - 3 Start.
2024 - 4 Start.
2024 - 5 Start.
2024 - 6 Start.
2024 - 7 Start.
2024 - 8 Start.
2024 - 9 Start.
2024 - 10 Start.
2024 - 11 Start.
2024 - 12 Start.
2025 - 1 Start.
2025 - 2 Start.
2025 - 3 Start.
2025 - 4 Start.


In [418]:
# save to CSV
sales_df.to_csv("sale.csv", index=False)
inventory_df.to_csv("inventory.csv", index=False)

In [419]:
# Display sample outputs
print("\n=== Demo: Sales Table (5 rows) ===")
sales_df.head()


=== Demo: Sales Table (5 rows) ===


Unnamed: 0,sale_date,customer_id,order_id,store_id,product_id,number_of_item,purchase_type
0,2024-01-01,16050,1,18,1509,1,online
1,2024-01-01,16050,1,18,1386,5,online
2,2024-01-01,16050,1,18,179,5,online
3,2024-01-01,16050,1,18,865,1,online
4,2024-01-01,16050,1,18,62,1,online


In [420]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2815873 entries, 0 to 2815872
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   sale_date       object
 1   customer_id     int64 
 2   order_id        int64 
 3   store_id        int64 
 4   product_id      int64 
 5   number_of_item  int64 
 6   purchase_type   object
dtypes: int64(5), object(2)
memory usage: 150.4+ MB


In [421]:
# Display sample outputs
print("\n=== Demo: Inventory Table (5 rows) ===")
inventory_df.head()


=== Demo: Inventory Table (5 rows) ===


Unnamed: 0,date,product_id,store_id,stock_quantity
0,2024-01-01,1,1,89
1,2024-01-01,2,1,92
2,2024-01-01,3,1,82
3,2024-01-01,4,1,66
4,2024-01-01,5,1,60


# Upload .csv Data to Google BigQuery

- Reference: [Loading CSV data into a table](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#loading_csv_data_into_a_table)
- [Install Google Cloud SDK](https://cloud.google.com/sdk/docs/install)

In [None]:
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ],
    skip_leading_rows=1,
    # The source format defaults to CSV, so the line below is optional.
    source_format=bigquery.SourceFormat.CSV,
)
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

load_job = client.load_table_from_uri(
    uri, table_id, job_config=job_config
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)  # Make an API request.
print("Loaded {} rows.".format(destination_table.num_rows))

# Reference

## Marketing

### Google Ads Data

In [None]:
import pandas as pd
import numpy as np
from faker import Faker

fake = Faker()

# Settings
num_rows = 500

# Generate data
ads_data = {
    "Date": pd.date_range(start="2025-04-01", periods=num_rows//10).repeat(10),
    "Campaign Name": np.random.choice(["Spring_Sale", "Summer_Promo", "New_Arrival", "Clearance_Event"], num_rows),
    "Ad Group": np.random.choice(["Sneakers_AdGroup", "Apparel_AdGroup", "Accessories_AdGroup"], num_rows),
    "Impressions": np.random.randint(1000, 10000, num_rows),
    "Clicks": np.random.randint(50, 500, num_rows),
    "CPC ($)": np.round(np.random.uniform(0.5, 2.5, num_rows), 2),
    "Conversions": np.random.randint(0, 50, num_rows),
    "Cost ($)": lambda df=None: df["Clicks"] * df["CPC ($)"],  # We'll compute this later
    "Device": np.random.choice(["Mobile", "Desktop", "Tablet"], num_rows),
    "Campaign Type": np.random.choice(["Search", "Display", "Shopping"], num_rows)
}

ads_df = pd.DataFrame(ads_data)
ads_df["Cost ($)"] = ads_df["Clicks"] * ads_df["CPC ($)"]

ads_df.to_csv("mock_google_ads.csv", index=False)
print("Mock Google Ads data generated!")

### Google Analytics 4 Data

In [None]:
# Settings
num_rows = 500

# Generate data
ga4_data = {
    "Date": pd.date_range(start="2025-04-01", periods=num_rows//10).repeat(10),
    "Session ID": [f"GA4-{fake.uuid4()[:8]}" for _ in range(num_rows)],
    "User ID": [f"user_{fake.random_int(1000, 9999)}" for _ in range(num_rows)],
    "Source / Medium": np.random.choice(["google / organic", "facebook / cpc", "direct / none", "instagram / referral"], num_rows),
    "Device Category": np.random.choice(["mobile", "desktop", "tablet"], num_rows),
    "Sessions": np.ones(num_rows),
    "Engaged Sessions": np.random.choice([0, 1], num_rows, p=[0.3, 0.7]),
    "Page Views": np.random.randint(1, 10, num_rows),
    "Conversions": np.random.choice([0, 1], num_rows, p=[0.8, 0.2]),
    "Event Name": np.random.choice(["page_view", "purchase", "add_to_cart", "begin_checkout"], num_rows)
}

ga4_df = pd.DataFrame(ga4_data)

ga4_df.to_csv("mock_ga4_data.csv", index=False)
print("Mock GA4 data generated!")


### GA 360 Data

In [None]:
# Settings
num_rows = 500

# Generate data
ga360_data = {
    "Date": pd.date_range(start="2025-04-01", periods=num_rows//10).repeat(10),
    "Client ID": [f"{fake.random_int(100,999)}.{fake.random_int(100,999)}" for _ in range(num_rows)],
    "Session ID": [f"GA360-{fake.uuid4()[:8]}" for _ in range(num_rows)],
    "Hit Time": [fake.time(pattern="%H:%M:%S") for _ in range(num_rows)],
    "Page Path": np.random.choice(["/product/sneakers", "/product/jackets", "/checkout", "/home", "/cart"], num_rows),
    "Event Category": np.random.choice(["eCommerce", "Navigation", "Engagement"], num_rows),
    "Event Action": np.random.choice(["ViewProduct", "AddToCart", "Purchase", "ClickLink"], num_rows),
    "Event Label": [f"Item_{fake.random_int(100,999)}" for _ in range(num_rows)],
    "Campaign": np.random.choice(["Spring_Sale", "Summer_Collection", "VIP_Member_Deals"], num_rows),
    "Transaction Revenue": np.round(np.random.choice([0, 0, 0, 50, 100, 150, 200], num_rows), 2)
}

ga360_df = pd.DataFrame(ga360_data)

ga360_df.to_csv("mock_ga360_data.csv", index=False)
print("Mock GA360 data generated!")


## Obselete Code

### budget_v2

In [None]:
from faker import Faker
import pandas as pd
import random
from datetime import timedelta, datetime

# Initialize
fake = Faker()
random.seed(42)

# Define Constants
campaign_types = [
    "Sales Promotion", "Seasonal Sales", "Loyalty Program", "Customer Retention",
    "Location-Based Campaign", "Holiday Campaign", "Back-to-School"
]
activities = [
    "In-store Setup", "Flyer Launch", "Google Ads", "Local Community Events",
    "Loyalty Push", "Digital Push", "Loyalty Program Bonuses"
]
agencies_or_specialists = [fake.name() for _ in range(10)]  # Create random 10 specialists

# Create Campaigns
campaigns = []
campaign_start_dates = []
for i in range(1, 21):
    start_date = fake.date_between_dates(date_start=datetime(2024, 1, 1), date_end=datetime(2025, 3, 31))
    duration_weeks = random.randint(1, 4)
    end_date = start_date + timedelta(weeks=duration_weeks)
    campaign_start_dates.append(start_date)
    campaigns.append({
        "campaign_id": f"CAMP_{i:03d}",
        "campaign_type": random.choice(campaign_types),
        "start_date": start_date,
        "end_date": end_date,
        "duration_weeks": duration_weeks
    })

# Create Daily Budget Monitor
daily_budget_monitor = []
for camp in campaigns:
    current_date = camp["start_date"]
    activities_list = random.sample(activities, random.randint(3, 7))  # Each campaign 3–7 activities
    est_budget_total = random.randint(30000, 120000)
    est_budget_per_activity = est_budget_total // len(activities_list)
    
    for activity in activities_list:
        spent_total = 0
        days_span = (camp["end_date"] - camp["start_date"]).days + 1
        for day in range(days_span):
            date = camp["start_date"] + timedelta(days=day)
            # Simulate daily spent between 1%-5% of activity budget
            spent = round(est_budget_per_activity * random.uniform(0.01, 0.05))
            spent_total += spent
            left_budget = est_budget_per_activity - spent_total
            daily_budget_monitor.append({
                "date": date,
                "campaign_id": camp["campaign_id"],
                "activity": activity,
                "spent": spent,
                "estimated_budget": est_budget_per_activity,
                "left_budget": max(left_budget, 0),
                "agency_or_specialist": random.choice(agencies_or_specialists)
            })

# Convert to DataFrame
daily_df = pd.DataFrame(daily_budget_monitor)

# Display Demo 5 Rows
print(daily_df.head(5))

### campaign_v2

In [None]:
# Aggregate Real Spent Budget per Campaign
campaign_spent_summary = daily_df.groupby("campaign_id")["spent"].sum().reset_index()
campaign_spent_summary.rename(columns={"spent": "real_spent_budget"}, inplace=True)

# Create Campaign Management Table
campaign_management = []
used_product_ids = set()

for camp in campaigns:
    num_products = random.randint(3, 10)
    product_ids = []
    discounts = []
    for _ in range(num_products):
        # Ensure product_id is unique per campaign date
        pid = random.randint(1, 2000)
        while pid in used_product_ids:
            pid = random.randint(1, 2000)
        used_product_ids.add(pid)
        product_ids.append(pid)
        discounts.append(round(random.uniform(0.05, 0.50), 2))  # 5% to 50% discount

    est_budget = random.randint(30000, 120000)
    approved_budget = int(est_budget * random.uniform(0.9, 1.1))
    real_spent = campaign_spent_summary.loc[campaign_spent_summary["campaign_id"] == camp["campaign_id"], "real_spent_budget"].values[0]

    campaign_management.append({
        "campaign_id": camp["campaign_id"],
        "campaign_name": f"{camp['campaign_type']} {fake.word().capitalize()}",
        "product_ids": product_ids,
        "discounts": discounts,
        "start_date": camp["start_date"],
        "end_date": camp["end_date"],
        "campaign_type": camp["campaign_type"],
        "assumed_budget": est_budget,
        "approved_budget": approved_budget,
        "real_spent_budget": real_spent,
        "estimated_sales_increase (%)": round(random.uniform(2, 15), 2),
        "in_store_setup_cost": random.randint(1000, 5000),
        "flyer_cost": random.randint(1000, 5000),
        "digital_ads_cost": random.randint(5000, 20000),
        "event_cost": random.randint(1000, 7000),
        "loyalty_push_cost": random.randint(1000, 7000),
        "digital_push_cost": random.randint(1000, 7000),
        "loyalty_program_bonus_cost": random.randint(1000, 7000)
    })

# Convert to DataFrame
campaign_df = pd.DataFrame(campaign_management)

# Display Demo 5 Rows
print(campaign_df.head(5))

### campaign_v1

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# List of possible campaign types
campaign_types = [
    "Sales Promotion Campaign", "Seasonal Sales", "Loyalty Program", 
    "Customer Retention Campaign (Weekly Flyers)", "Location-Based Campaign", 
    "Holiday Campaign", "Back-to-School"
]

# Function to generate random dates
def random_date(start_date, end_date):
    return start_date + timedelta(seconds=random.randint(0, int((end_date - start_date).total_seconds())))

# Function to generate simulated campaign log
def generate_campaign_log(num_rows=100):
    start_date = datetime(2024, 1, 1)
    end_date = datetime.now()

    data = []
    used_products = {}  # Dictionary to track product_id and their assigned dates

    for campaign_id in range(num_rows):
        campaign_name = random.choice(campaign_types)
        campaign_start_date = random_date(start_date, end_date)
        campaign_end_date = campaign_start_date + timedelta(days=random.randint(7, 30))  # Campaign duration of 7-30 days
        
        # Select products that are not already assigned to campaigns on this date
        product_ids = []
        discounts = []  # List to store discounts for each product_id
        while len(product_ids) < random.randint(1, 10):  # Assign 1-10 products
            product_id = random.randint(1, 2000)
            if product_id not in used_products or not any(used_products[product_id] == campaign_start_date for campaign_start_date in used_products[product_id]):
                product_ids.append(product_id)
                discounts.append(round(random.uniform(0.05, 0.30), 2))  # Random discount between 5% and 30%
                if product_id not in used_products:
                    used_products[product_id] = []
                used_products[product_id].append(campaign_start_date)  # Mark this product as used for this date

        budget = round(random.uniform(10000, 100000), 2)  # Budget between 10k and 100k
        estimated_sales_increase = round(random.uniform(0.05, 0.20), 2)  # Estimated sales increase between 5% and 20%
        
        # Budget distribution across various costs
        marketing_cost = round(budget * random.uniform(0.3, 0.5), 2)
        in_store_cost = round(budget * random.uniform(0.2, 0.4), 2)
        influencer_cost = round(budget * random.uniform(0.1, 0.2), 2)
        other_cost = round(budget * random.uniform(0.05, 0.1), 2)
        
        data.append({
            "campaign_id": f"CAMP{campaign_id + 1}",
            "campaign_name": campaign_name,
            "product_id": product_ids,
            "discounts": discounts,
            "budget": budget,
            "start_date": campaign_start_date,
            "end_date": campaign_end_date,
            "estimated_sales_increase": estimated_sales_increase,
            "marketing_cost": marketing_cost,
            "in_store_cost": in_store_cost,
            "influencer_cost": influencer_cost,
            "other_cost": other_cost
        })
    
    return pd.DataFrame(data)

# Generate the campaign log
campaign_log_df = generate_campaign_log(num_rows=5)

# Show the first 5 rows of the campaign log table
print("Sample Campaign Log (First 5 Rows):")
print(campaign_log_df)


### ads_v1

In [None]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

# Define constants for simulation
start_date = datetime(2024, 1, 1)
end_date = datetime.now()
num_records = 1000  # number of records to generate
ad_campaign_types = ["online-search", "online-displays", "online-event", "retargeting-ads", "email-ads"]

# Function to generate random dates within the given range
def random_date(start_date, end_date):
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

# Simulate Advertisement Data
data = []
for i in range(num_records):
    date = random_date(start_date, end_date).strftime('%Y-%m-%d')
    ad_group = f"ad_group_{random.randint(1, 20)}"  # Random Ad Group
    product_id = random.randint(1, 2000)  # Random product_id
    impressions = random.randint(1000, 10000)  # Random impressions
    clicks = int(impressions * random.uniform(0.02, 0.1))  # Clicks as a percentage of impressions
    cpc = round(random.uniform(0.1, 3.0), 2)  # Cost per click ($0.1 to $3.0)
    conversions = int(clicks * random.uniform(0.1, 0.5))  # Conversion as a percentage of clicks
    cost = round(cpc * clicks, 2)  # Total cost calculation
    customer_id = random.randint(1, 100000)  # Random customer ID
    ads_campaign_id = f"campaign_{random.randint(1, 100)}"  # Random Ads Campaign ID
    ads_campaign_type = random.choice(ad_campaign_types)  # Random Ads Campaign Type
    
    # Append the generated data to the list
    data.append([date, ad_group, product_id, impressions, clicks, cpc, conversions, cost, customer_id, ads_campaign_id, ads_campaign_type])

# Convert the data to a pandas DataFrame
ads_data = pd.DataFrame(data, columns=[
    'Date', 'Ad_Group', 'Product_ID', 'Impressions', 'Clicks', 'CPC', 
    'Conversions', 'Cost', 'Customer_ID', 'Ads_Campaign_ID', 'Ads_Campaign_Type'
])

# Show the first 5 rows of the generated data
print(ads_data.head())

# Optionally, save to a CSV file
# ads_data.to_csv('simulated_ads_data.csv', index=False)


### sale and inventory - v1

In [169]:
# Constants for the simulation
start_date = datetime(2024, 1, 1)
end_date = datetime(2025,4,25) # datetime.now()
# num_products = 2000  # Number of products
num_stores = 37  # Number of stores
num_records_per_day = random.randint(100, 1000)  # Number of sales records per day (to ensure enough sales data)
order_value_target = 93  # Average order value
sales_campaign_types = [
    "Sales Promotion Campaign", "Seasonal Sales", "Loyalty Program", 
    "Customer Retention Campaign (Weekly Flyers)", "Location-Based Campaign", 
    "Holiday Campaign", "Back-to-School"
]

# Function to generate random dates within the range
def random_date(start_date, end_date):
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

# Simulating sales data
sales_data = []

while len(sales_data) < num_records_per_day * (end_date - start_date).days:
    sale_date = random_date(start_date, end_date).strftime('%Y-%m-%d')
    customer_id = random.randint(1, 100000)  # Random customer_id
    order_id = random.randint(1, 1000000)  # Random order_id
    store_id = random.randint(1, num_stores)  # Random store_id
    product_id = random.randint(1, num_products)  # Random product_id
    price = random.randint(10, 200)  # Price per product
    number_of_items = random.randint(1, 5)  # Random number of items in the order
    discount = random.uniform(0, 0.3)  # Random discount between 0 and 30%
    total_price = price * number_of_items * (1 - discount)
    
    campaign_type = random.choice(sales_campaign_types)
    
    # Adjust order value to meet the target average order value
    if abs(total_price - order_value_target) <= 5:
        sales_data.append([sale_date, customer_id, order_id, store_id, product_id, price, number_of_items, discount, campaign_type, total_price])

# Convert sales data to DataFrame
sales_df = pd.DataFrame(sales_data, columns=[
    'Sale_Date', 'Customer_ID', 'Order_ID', 'Store_ID', 'Product_ID', 
    'Price', 'Number_of_Items', 'Discount', 'Campaign_Type', 'Total_Price'
])

# save the data to a CSV file
sales_df.to_csv('sale.csv', index=False)
# Display the first few rows of sales data
print(sales_df.head())

    Sale_Date  Customer_ID  Order_ID  Store_ID  Product_ID  Price  \
0  2024-04-26        45294    982761         5         681    113   
1  2024-04-27        70764    407143         8         215     95   
2  2024-08-10        22829    907040        31        1733     34   
3  2024-11-11        25272    276033         8          74    110   
4  2024-02-12        93124    406193        11          34     31   

   Number_of_Items  Discount                                Campaign_Type  \
0                1  0.170042                     Sales Promotion Campaign   
1                1  0.035245                             Holiday Campaign   
2                3  0.062342                              Loyalty Program   
3                1  0.194147  Customer Retention Campaign (Weekly Flyers)   
4                4  0.247880                               Seasonal Sales   

   Total_Price  
0    93.785219  
1    91.651769  
2    95.641133  
3    88.643820  
4    93.262911  


In [None]:
# Simulating inventory data
inventory_data = []
inventory_initial_stock = {product_id: random.randint(50, 100) for product_id in range(1, num_products + 1)}  # Initial stock

# Start inventory tracking
for single_date in pd.date_range(start=start_date, end=end_date, freq='D'):
    date_str = single_date.strftime('%Y-%m-%d')
    
    for product_id in range(1, num_products + 1):
        store_id = random.randint(1, num_stores)
        # Get product initial stock from inventory
        stock_quantity = inventory_initial_stock[product_id]
        
        # Deduct the sold items from the inventory
        sales_in_day = sales_df[sales_df['Sale_Date'] == date_str]
        product_sales = sales_in_day[sales_in_day['Product_ID'] == product_id]
        total_sold = product_sales['Number_of_Items'].sum()
        
        stock_quantity -= total_sold  # Deduct sold products from inventory
        
        # Weekly stock additions (mimicking weekly inventory restocking)
        if single_date.weekday() == 0:  # Every Monday (i.e., weekly restocking)
            stock_quantity += random.randint(50, 150)  # Restock with a random number

        # Ensure that stock doesn't go below 0
        stock_quantity = max(0, stock_quantity)
        
        # Update the inventory data
        inventory_data.append([date_str, product_id, store_id, stock_quantity])

# Convert inventory data to DataFrame
inventory_df = pd.DataFrame(inventory_data, columns=[
    'Date', 'Product_ID', 'Store_ID', 'Stock_Quantity'
])

# Display the first few rows of inventory data
print(inventory_df.head())

# Optionally, save the DataFrames to CSVs

# inventory_df.to_csv('simulated_inventory_data.csv', index=False)
