In [0]:
# %sql

# create catalog if not exists data_modeling;
# use catalog data_modeling;

# drop schema if exists raw cascade;
# drop schema if exists bronze cascade;
# drop schema if exists silver cascade;
# drop schema if exists gold cascade;

# create schema raw;
# create schema bronze;
# create schema silver;
# create schema gold;

# create volume data_modeling.raw.landing_zone;

In [0]:
# # Reference the volume
# landing_zone_path = "/Volumes/data_modeling/raw/landing_zone"

# # Create subdirectories
# dbutils.fs.mkdirs(f"{landing_zone_path}/cust")
# dbutils.fs.mkdirs(f"{landing_zone_path}/product")
# dbutils.fs.mkdirs(f"{landing_zone_path}/loc")
# dbutils.fs.mkdirs(f"{landing_zone_path}/sales")

# # List contents
# dbutils.fs.ls(landing_zone_path)

In [0]:
%sql
-- Set data_modeling as the current catalog.
use catalog data_modeling;

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from datetime import datetime
from pyspark.sql.functions import col, current_timestamp, lit, trim
from pyspark.sql import DataFrame
import re
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType
from datetime import datetime, timedelta
import random


In [0]:
"""
PySpark Code to Generate Multi-Dimensional Customer-Store-Product Dataset
"""

# Run ID: the current date rendered as YYYYMMDD
run_id = f"{datetime.now():%Y%m%d}"

# Seed the global random generator so the random draws are repeatable
random.seed(42)

# ============================================================================
# STEP 1: Define Base Customer Data
# ============================================================================
# One tuple per customer, in the order unpacked by generate_transactions():
# (cust_id, phone, name, gender, age_group, age, marital, state, zone, occupation)
customers_data = [
    (1, "1234567", "Kartik", "F", "26-35", 35, 1, "Andhra Pradesh", "Southern", "Govt"),
    (2, "2345678", "Bindu", "F", "26-35", 35, 1, "Uttar Pradesh", "Central", "Automobile"),
    (3, "3456789", "Sudevi", "M", "0-17", 16, 0, "Karnataka", "Southern", "Construction"),
    (4, "4567890", "Joni", "M", "26-35", 28, 1, "Gujarat", "Western", "Food Processing"),
    (5, "5678901", "Joni", "M", "26-35", 28, 1, "Himachal Pradesh", "Northern", "Food Processing"),
    (6, "6789012", "Balk", "F", "18-25", 25, 1, "Uttar Pradesh", "Central", "Lawyer"),
    (7, "7890123", "Shivangi", "F", "55+", 61, 0, "Maharashtra", "Western", "IT Sector"),
    (8, "8901234", "Kushal", "M", "26-35", 35, 0, "Uttar Pradesh", "Central", "Govt"),
    (9, "9012345", "Ginny", "F", "26-35", 26, 1, "Andhra Pradesh", "Southern", "Media"),
    (10, "7654321", "Harshita", "M", "26-35", 34, 0, "Delhi", "Central", "Banking"),
    (11, "6543210", "Kargatis", "F", "18-25", 20, 0, "Andhra Pradesh", "Southern", "Retail"),
    (12, "5432109", "Elijah", "F", "18-25", 20, 1, "Andhra Pradesh", "Southern", "IT Sector"),
    (13, "4321098", "Vasudev", "M", "26-35", 26, 1, "Andhra Pradesh", "Southern", "Automobile"),
    (14, "3210987", "Cano", "M", "46-50", 46, 1, "Madhya Pradesh", "Central", "Hospitality"),
]

# ============================================================================
# STEP 2: Define Store Data
# ============================================================================
# One tuple per store, in the order unpacked by generate_transactions():
# (store_id, region)
stores_data = [
    (1, "Central"),
    (2, "Northern"),
    (3, "Southern"),
    (4, "Western"),
    (5, "Eastern"),
]

# ============================================================================
# STEP 3: Define Product Data
# ============================================================================
# One tuple per product, in the order unpacked by generate_transactions():
# (product_id, product_code, product_desc, category, product_cost)
products_data = [
    (1, "P001104566", "Auto Care Premium", "Auto", 2540),
    (2, "P0011854245", "Auto Protection Plus", "Auto", 4360),
    (3, "P00237842356", "Engine Guard Pro", "Auto", 6789),
    (4, "P0005794233", "Fuel Saver", "Auto", 4582),
    (5, "P00057942453", "Brake Fluid Advanced", "Auto", 3456),
]

# ============================================================================
# STEP 4: Generate Transactional Data
# ============================================================================
def generate_transactions():
    """Generate one transaction record per customer × store × product combination.

    With the 14 customers, 5 stores and 5 products defined in this notebook
    that is 350 records. Reads the module-level ``customers_data``,
    ``stores_data``, ``products_data`` and ``run_id``, and draws from the
    global ``random`` generator (seed it beforehand for repeatable draws).

    Returns:
        list[tuple]: 25-field tuples whose positions line up with ``schema``.
        ProductCost and Amount are emitted as ``float`` because the schema
        declares them as ``DoubleType`` and PySpark's schema verification
        does not accept a Python ``int`` for a double column.
    """
    transactions = []
    seen_transaction_ids = set()
    created_time = datetime.now()
    created_date = created_time.isoformat()  # identical for every record in the batch

    for cust_id, phone, name, gender, age_group, age, marital, state, zone, occupation in customers_data:
        for store_id, region in stores_data:
            for product_id, product_code, product_desc, category, product_cost in products_data:
                # Random quantity ordered
                orders = random.randint(1, 4)

                # Random 8-digit id; re-draw on the rare collision so that
                # Transaction_ID is unique within the generated batch.
                transaction_id = random.randint(10000000, 99999999)
                while transaction_id in seen_transaction_ids:
                    transaction_id = random.randint(10000000, 99999999)
                seen_transaction_ids.add(transaction_id)

                # Cast to float to match the DoubleType columns in the schema
                unit_cost = float(product_cost)
                amount = round(float(orders * product_cost), 2)

                # Generate timestamps with variance (0-30 days offset)
                # NOTE(review): the offset is added, so OrderTms can be up to 30 days
                # later than CreatedDate — confirm future-dated orders are intended.
                days_offset = random.randint(0, 30)
                order_time = created_time + timedelta(days=days_offset)

                # Create transaction record
                record = (
                    transaction_id,                   # Transaction_ID
                    cust_id,                          # User_ID
                    phone,                            # phone_number
                    name,                             # Cust_name
                    product_code,                     # Product_Code
                    unit_cost,                        # ProductCost
                    gender,                           # Gender
                    age_group,                        # Age_Group
                    age,                              # Age
                    marital,                          # Marital_Status
                    state,                            # State
                    zone,                             # Zone
                    occupation,                       # Occupation
                    category,                         # Product_Category
                    category,                         # Product_SubCategory (same value as the category)
                    orders,                           # Orders
                    amount,                           # Amount
                    product_id,                       # Product_ID
                    product_desc,                     # Product_Desc
                    order_time.isoformat(),           # OrderTms
                    "India",                          # Country
                    store_id,                         # StoreID
                    region,                           # Region
                    run_id,                           # RunID
                    created_date                      # CreatedDate
                )
                transactions.append(record)

    return transactions

# ============================================================================
# STEP 5: Define Schema
# ============================================================================
# Explicit schema for the sales DataFrame. Columns are listed in the same
# order as the tuples built by generate_transactions(); every column is nullable.
schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in (
        ("Transaction_ID", IntegerType()),
        ("User_ID", IntegerType()),
        ("phone_number", StringType()),
        ("Cust_name", StringType()),
        ("Product_Code", StringType()),
        ("ProductCost", DoubleType()),
        ("Gender", StringType()),
        ("Age_Group", StringType()),
        ("Age", IntegerType()),
        ("Marital_Status", IntegerType()),
        ("State", StringType()),
        ("Zone", StringType()),
        ("Occupation", StringType()),
        ("Product_Category", StringType()),
        ("Product_SubCategory", StringType()),
        ("Orders", IntegerType()),
        ("Amount", DoubleType()),
        ("Product_ID", IntegerType()),
        ("Product_Desc", StringType()),
        ("OrderTms", StringType()),      # ISO-8601 text, not a timestamp type
        ("Country", StringType()),
        ("StoreID", IntegerType()),
        ("Region", StringType()),
        ("RunID", StringType()),
        ("CreatedDate", StringType()),   # ISO-8601 text, not a timestamp type
    )
])

# Build the raw transaction tuples, then load them into a Spark DataFrame
# using the explicit schema defined in this cell.
transactions = generate_transactions()

print("🔄 Creating Spark DataFrame...")
sales_df = spark.createDataFrame(transactions, schema=schema)

# Render the DataFrame in the notebook output
sales_df.display()

In [0]:
# Define the volume path
# NOTE: Spark's DataFrame writer treats this path as an output directory and
# places the part file(s) inside it, so "sales_<run_id>.csv" ends up being a
# folder, not a single CSV file.
volume_path = f"/Volumes/data_modeling/raw/landing_zone/sales_{run_id}.csv"

# Save the DataFrame as CSV to the volume. coalesce(1) reduces the data to a
# single partition before writing, and mode("overwrite") replaces any output
# already present at the same path.
(
    sales_df.coalesce(1)
    .write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save(volume_path)
)

print(f"✅ Data saved to: {volume_path}")