In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from faker import Faker
import uuid

# Create output directory
output_dir = 'output'
import os
os.makedirs(output_dir, exist_ok=True)

# Initialize Faker for realistic data generation
fake = Faker()
random.seed(42)
np.random.seed(42)
Faker.seed(42)

print("Starting data generation for Sales Analytics Pipeline...")

# Configuration for data generation
NUM_LOCATIONS = 100
NUM_CUSTOMERS = 10000
NUM_PRODUCTS = 1000
NUM_SALES = 100000

print(f"Generating {NUM_LOCATIONS} locations, {NUM_CUSTOMERS} customers, {NUM_PRODUCTS} products, {NUM_SALES} sales transactions")

Starting data generation for Sales Analytics Pipeline...
Generating 100 locations, 10000 customers, 1000 products, 100000 sales transactions


In [6]:
# Generate Locations Data
print("Generating locations.csv...")
locations_data = []
states = ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'NC', 'MI']
regions = ['North', 'South', 'East', 'West', 'Central']

for i in range(NUM_LOCATIONS):
    location_id = f"LOC_{i+1:04d}"
    state = random.choice(states)
    region = random.choice(regions)
    
    locations_data.append({
        'location_id': location_id,
        'city': fake.city(),
        'state': state,
        'region': region,
        'country': 'USA',
        'zip_code': fake.zipcode(),
        'store_type': random.choice(['Mall', 'Strip', 'Standalone', 'Outlet']),
        'store_size_sqft': random.randint(1000, 50000),
        'opening_date': fake.date_between(start_date='-10y', end_date='-1y')
    })

locations_df = pd.DataFrame(locations_data)
locations_df.to_csv(f'{output_dir}/locations.csv', index=False)
print(f"Generated {len(locations_df)} locations")

Generating locations.csv...
Generated 100 locations


In [7]:
# Generate Products Data
print("Generating products.csv...")
categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Beauty', 'Toys', 'Automotive']
brands = ['BrandA', 'BrandB', 'BrandC', 'BrandD', 'BrandE', 'BrandF', 'BrandG', 'BrandH']

products_data = []
for i in range(NUM_PRODUCTS):
    product_id = f"PROD_{i+1:06d}"
    category = random.choice(categories)
    brand = random.choice(brands)
    base_price = random.uniform(10, 500)
    
    products_data.append({
        'product_id': product_id,
        'product_name': f"{fake.catch_phrase()} {fake.word().title()}",
        'category': category,
        'subcategory': f"{category}_{random.randint(1,5)}",
        'brand': brand,
        'unit_price': round(base_price, 2),
        'cost_price': round(base_price * 0.6, 2),
        'weight_lbs': round(random.uniform(0.1, 20), 2),
        'launch_date': fake.date_between(start_date='-5y', end_date='today'),
        'status': random.choice(['Active', 'Discontinued', 'Active', 'Active'])  # 75% active
    })

products_df = pd.DataFrame(products_data)
products_df.to_csv(f'{output_dir}/products.csv', index=False)
print(f"Generated {len(products_df)} products")

Generating products.csv...
Generated 1000 products


In [8]:
# Generate Customers Data
print("Generating customers.csv...")
customer_segments = ['Premium', 'Standard', 'Budget', 'VIP']
age_groups = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']

customers_data = []
for i in range(NUM_CUSTOMERS):
    customer_id = f"CUST_{i+1:08d}"
    
    # Generate some duplicate customers to test deduplication
    if i < 50:  # First 50 customers will have duplicates
        email = f"customer_{i+1}@example.com"
    else:
        email = fake.email()
    
    customers_data.append({
        'customer_id': customer_id,
        'first_name': fake.first_name(),
        'last_name': fake.last_name(),
        'email': email,
        'phone': fake.phone_number(),
        'date_of_birth': fake.date_of_birth(minimum_age=18, maximum_age=80),
        'gender': random.choice(['M', 'F', 'Other']),
        'age_group': random.choice(age_groups),
        'customer_segment': random.choice(customer_segments),
        'registration_date': fake.date_between(start_date='-3y', end_date='today'),
        'preferred_contact': random.choice(['Email', 'Phone', 'SMS']),
        'loyalty_points': random.randint(0, 10000),
        'address': fake.address().replace('\n', ', '),
        'city': fake.city(),
        'state': random.choice(['CA', 'NY', 'TX', 'FL', 'IL']),
        'zip_code': fake.zipcode()
    })

customers_df = pd.DataFrame(customers_data)
customers_df.to_csv(f'{output_dir}/customers.csv', index=False)
print(f"Generated {len(customers_df)} customers")

Generating customers.csv...
Generated 10000 customers


In [9]:
# Generate Sales Data
print("Generating sales.csv...")
channels = ['Online', 'In-Store', 'Mobile App', 'Phone']
payment_methods = ['Credit Card', 'Cash', 'Debit Card', 'PayPal', 'Gift Card']

# Get lists for foreign keys
location_ids = locations_df['location_id'].tolist()
customer_ids = customers_df['customer_id'].tolist()
product_ids = products_df['product_id'].tolist()

sales_data = []
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)

for i in range(NUM_SALES):
    transaction_id = f"TXN_{i+1:010d}"
    
    # Random transaction date
    transaction_date = fake.date_between(start_date=start_date, end_date=end_date)
    
    # Select random customer, product, location
    customer_id = random.choice(customer_ids)
    product_id = random.choice(product_ids)
    location_id = random.choice(location_ids)
    
    # Get product price for calculation
    product_price = products_df[products_df['product_id'] == product_id]['unit_price'].iloc[0]
    
    # Quantity and calculations
    quantity = random.randint(1, 5)
    gross_amount = product_price * quantity
    discount_pct = random.choice([0, 0, 0, 0.05, 0.1, 0.15, 0.2])  # 50% no discount
    discount_amount = gross_amount * discount_pct
    net_amount = gross_amount - discount_amount
    
    # Some records with nulls for testing
    sales_record = {
        'transaction_id': transaction_id,
        'customer_id': customer_id,
        'product_id': product_id,
        'location_id': location_id,
        'transaction_date': transaction_date,
        'quantity': quantity,
        'unit_price': product_price,
        'gross_amount': round(gross_amount, 2),
        'discount_amount': round(discount_amount, 2),
        'net_amount': round(net_amount, 2),
        'channel': random.choice(channels),
        'payment_method': random.choice(payment_methods),
        'sales_rep_id': f"REP_{random.randint(1, 50):03d}",
        'promo_code': random.choice([None, None, None, 'SAVE10', 'WELCOME', 'HOLIDAY'])
    }
    
    # Introduce some null values for testing (5% of records)
    if random.random() < 0.05:
        sales_record['sales_rep_id'] = None
    
    sales_data.append(sales_record)

sales_df = pd.DataFrame(sales_data)

# Add some duplicate transactions for testing deduplication
duplicate_transactions = sales_df.sample(n=100).copy()
duplicate_transactions['transaction_id'] = duplicate_transactions['transaction_id'] + '_DUP'
sales_df = pd.concat([sales_df, duplicate_transactions], ignore_index=True)

sales_df.to_csv(f'{output_dir}/sales.csv', index=False)
print(f"Generated {len(sales_df)} sales transactions (including {len(duplicate_transactions)} duplicates for testing)")

Generating sales.csv...
Generated 100100 sales transactions (including 100 duplicates for testing)


In [10]:
# Save the sales data
print("Saving sales data...")
sales_df.to_csv(f'{output_dir}/sales.csv', index=False)
print(f"✓ Generated sales.csv with {len(sales_df)} records")

# Show sample data
print("\n=== Sample Data Preview ===")
print("\nLocations sample:")
print(locations_df.head(3))
print(f"\nProducts sample:")
print(products_df.head(3))
print(f"\nCustomers sample:")
print(customers_df.head(3))
print(f"\nSales sample:")
print(sales_df.head(3))

Saving sales data...
✓ Generated sales.csv with 100100 records

=== Sample Data Preview ===

Locations sample:
  location_id               city state region country zip_code  store_type  \
0    LOC_0001   North Judithbury    NY  North     USA    29757  Standalone   
1    LOC_0002       Lake Joyside    FL  South     USA    11896        Mall   
2    LOC_0003  North Claytonbury    NC  North     USA    66738      Outlet   

   store_size_sqft opening_date  
0            17049   2016-09-09  
1            45348   2020-09-30  
2             3082   2020-11-07  

Products sample:
    product_id                           product_name       category  \
0  PROD_000001  Focused human-resource solution Claim  Home & Garden   
1  PROD_000002    Cloned intangible installation Give          Books   
2  PROD_000003     Operative analyzing database There     Automotive   

       subcategory   brand  unit_price  cost_price  weight_lbs launch_date  \
0  Home & Garden_4  BrandF      393.37      236.02     