[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wasim/Data-Science/blob/main/data-analyst-roadmap/09_end_to_end_projects/ecommerce_analysis/01_data_generation.ipynb)

# E-Commerce Data Generation

Generate a synthetic but realistic e-commerce transaction dataset for analysis.

## Fields
- **InvoiceNo:** Unique transaction ID.
- **StockCode:** Product ID.
- **Description:** Product Name.
- **Quantity:** Items sold.
- **InvoiceDate:** Time of sale.
- **UnitPrice:** Price per item.
- **CustomerID:** User ID.
- **Country:** User location.
- **TotalAmount:** Derived column, `Quantity * UnitPrice`, added after the rows are generated.

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Settings: seed NumPy's global RNG so every run produces the same dataset,
# then fix the dataset dimensions.
np.random.seed(42)
NUM_TRANSACTIONS = 10000
NUM_CUSTOMERS = 1000
NUM_PRODUCTS = 50

# Customers: consecutive IDs starting at 1000, each assigned one country.
customer_ids = np.arange(1000, 1000 + NUM_CUSTOMERS)
countries = ['USA', 'UK', 'Canada', 'Germany', 'France']
country_weights = [0.4, 0.3, 0.1, 0.1, 0.1]  # same order as `countries`
customer_country = {}
for customer in customer_ids:
    # One weighted draw per customer, in ID order.
    customer_country[customer] = np.random.choice(countries, p=country_weights)

# Products: PROD0 .. PROD{NUM_PRODUCTS - 1}, each with a description and a
# unit price drawn uniformly between 5 and 500, rounded to 2 decimals.
products = {}
for i in range(NUM_PRODUCTS):
    unit_price = round(np.random.uniform(5, 500), 2)
    products[f'PROD{i}'] = {
        'desc': f'Product {i}',
        'price': unit_price,
    }
product_keys = list(products)

# Generate Transactions: one row per invoice, collected as plain lists in the
# column order InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
# UnitPrice, CustomerID, Country.
data = []
start_date = datetime(2023, 1, 1)

for i in range(NUM_TRANSACTIONS):
    # NOTE: the order of the np.random calls below fixes the values produced
    # under the seeded RNG; keep it unchanged.

    # Invoice date: start_date plus an offset of 0..364 days.
    day_offset = np.random.randint(0, 365)
    invoice_date = start_date + timedelta(days=day_offset)

    # Buyer and purchased product, each picked uniformly.
    customer_id = np.random.choice(customer_ids)
    stock_code = np.random.choice(product_keys)
    product = products[stock_code]

    # Base quantity 1..9; when the draw exceeds 0.9, add a bulk top-up
    # of 10..49 items.
    quantity = np.random.randint(1, 10)
    is_bulk_buy = np.random.random() > 0.9
    if is_bulk_buy:
        quantity += np.random.randint(10, 50)

    row = [
        f'INV{10000+i}',
        stock_code,
        product['desc'],
        quantity,
        invoice_date,
        product['price'],
        customer_id,
        customer_country[customer_id],
    ]
    data.append(row)

# Create DataFrame: one row per generated transaction, columns in the same
# order the row lists were built.
df = pd.DataFrame(data, columns=[
    'InvoiceNo', 'StockCode', 'Description', 'Quantity', 
    'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'
])

# Add Total Amount (line total = quantity * unit price)
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

# Quick sanity check of the generated data.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

In [None]:
# Persist the generated dataset as CSV (without the DataFrame index) so later
# steps can load it from disk.
output_csv = 'ecommerce_data.csv'
df.to_csv(output_csv, index=False)
print("Data saved to ecommerce_data.csv")