In [1]:
import os
from pathlib import Path
import random
from datetime import datetime, timedelta
import uuid
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import round as spark_round
from notebookutils import mssparkutils

# rvu64a4orr5udfgwallmuq4ecy-ao2bnbs7cogexi6npsjyqaafqq.datawarehouse.fabric.microsoft.com



# Shared value pools and settings used by the generator functions below.
# Each list is sampled with random.choice / random.sample to fill a field.
client = "800"  # SAP client
company_codes = ["1000", "2000", "3000"]  # candidates for "bukrs"
plants = ["1000", "2000", "3000", "4000"]  # candidates for "werks"
storage_locations = ["0001", "0002", "0003", "0004"]  # candidates for "lgort" / "lgpro" / "lgfsb"
sales_orgs = ["1000", "2000"]  # candidates for "vkorg"
distr_channels = ["10", "20"]  # candidates for "vtweg"
divisions = ["00", "01"]  # candidates for "spart"
currencies = ["USD", "EUR", "GBP"]  # candidates for "waerk" / "waers"
countries = ["US", "DE", "FR", "GB", "IT", "ES", "JP", "CN"]  # candidates for "land1"
languages = ["E", "D", "F", "S", "I", "J"]  # candidates for "spras"
start_sequence = 1000  # base value added to every generated "sequence_id"

def generate_id(prefix, length, num):
    """Return ``prefix`` followed by ``num`` left-padded with zeros.

    The numeric part is padded so that the whole identifier is ``length``
    characters wide. Nothing is truncated: if ``num`` has more digits than the
    space left after ``prefix`` (or ``prefix`` alone is longer than
    ``length``), the result is simply longer than ``length``.
    """
    digits_width = length - len(prefix)
    padded_number = str(num).zfill(digits_width)
    return prefix + padded_number

def random_date(start_date, end_date):
    """Return ``start_date`` shifted forward by a random whole number of days.

    The offset is drawn uniformly from ``0 .. days_between - 1``, where
    ``days_between`` is the number of whole days in ``end_date - start_date``,
    so the result is always strictly before ``end_date`` when the span is at
    least one day.

    Args:
        start_date: Lower bound (inclusive); anything supporting ``- `` with
            ``end_date`` and ``+ timedelta``.
        end_date: Upper bound.

    Returns:
        ``start_date`` plus the random day offset. When the two bounds are
        less than one day apart, ``start_date`` itself is returned.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    days_between = (end_date - start_date).days
    if days_between < 0:
        raise ValueError("end_date must not be earlier than start_date")
    if days_between == 0:
        # random.randrange(0) raises on an empty range; a sub-day window has
        # exactly one valid whole-day offset, namely zero.
        return start_date
    return start_date + timedelta(days=random.randrange(days_between))

def write_to_partitioned_path(df, table_name, base_path="/lakehouse/default/Files/inbound-sap-seed", format="parquet"):
    """Write ``df`` to a timestamped file inside ``{base_path}/{table_name}``.

    Despite the name, no date-based sub-folders are created: every call adds
    one file named ``{table_name}_{YYYYmmdd_HHMMSS}.{format}`` to the table's
    folder, which is created if it does not exist yet.

    Args:
        df: pandas DataFrame to persist.
        table_name: Folder name and file-name prefix for the output.
        base_path: Root directory under which the table folder lives.
        format: Output file format, ``"parquet"`` (default) or ``"csv"``.

    Returns:
        The table folder path (not the path of the file written).

    Raises:
        ValueError: If ``format`` is not a supported format.
    """
    writers = {
        "parquet": lambda frame, target: frame.to_parquet(target),
        "csv": lambda frame, target: frame.to_csv(target, index=False),
    }
    # Validate before touching the filesystem so a bad format leaves no
    # empty directory behind.
    if format not in writers:
        supported = ", ".join(sorted(writers))
        raise ValueError(f"Unsupported format {format!r}; expected one of: {supported}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_path = f"{base_path}/{table_name}"
    Path(output_path).mkdir(parents=True, exist_ok=True)

    print(f"Writing {table_name} to {output_path}")
    writers[format](df, f"{output_path}/{table_name}_{timestamp}.{format}")

    return output_path

# Company master records: code, display name, country and currency.
# NOTE(review): no generator in the visible code reads this list — confirm
# whether a later cell uses it.
companies = [
    {"code": "1000", "name": "Bunn Corporation", "country": "US", "currency": "USD"},
    {"code": "2000", "name": "Premia", "country": "DE", "currency": "EUR"},
    {"code": "3000", "name": "Bunn-o-matic", "country": "GB", "currency": "GBP"}
]

# Module-level row accumulators. Each generate_* function appends row dicts to
# these lists in place and returns the same list objects; their lengths are
# also used to offset "sequence_id" values in later generators.
materials = []
materials_plant_mapping = []
customers = []
vendors = []
sales_headers = []
sales_items = []
purchase_headers = []
purchase_items = []
accounting_headers = []
accounting_items = []

# Generate Material Data
def generate_materials(count=500):
    """Append ``count`` material rows plus their plant assignments.

    Each material gets one row in the module-level ``materials`` list and one
    row in ``materials_plant_mapping`` for every plant in a random, non-empty
    subset of ``plants``.

    Sequence ids: materials use ``start_sequence + 1 .. start_sequence +
    count``; plant rows continue directly after that block, so the two tables
    never share a ``sequence_id``.

    Side effects:
        Extends ``materials`` and ``materials_plant_mapping`` and publishes
        the total number of plant rows in the module-level
        ``materials_plant_counter``, which later generators use as an offset.

    Args:
        count: Number of materials to generate.

    Returns:
        The module-level ``materials`` list.
    """
    # Without this declaration the counter would be a function local and the
    # module-level value read by the other generators would never change.
    global materials_plant_counter

    plant_rows_added = 0
    for i in range(1, count + 1):
        matnr = generate_id("MAT", 18, i)
        material_type = random.choice(["ROH", "HALB", "FERT", "HAWA"])

        materials.append({
            "mandt": client,
            "matnr": matnr,
            "mtart": material_type,
            "mbrsh": "M" if material_type in ["ROH", "HALB"] else "W",
            "matkl": random.choice(["01", "02", "03", "04", "05"]) + random.choice(["100", "200", "300", "400"]),
            "meins": random.choice(["EA", "PC", "KG", "L", "M"]),
            "bstme": random.choice(["EA", "PC", "KG", "L", "M"]),
            "mstae": random.choice(["01", "02", "03"]),
            "mstav": random.choice(["01", "02", "03"]),
            "prdha": generate_id("PRD", 18, random.randint(1, 999)),
            "gewei": random.choice(["KG", "G", "LB"]),
            "volum": float(f"{random.uniform(0.1, 100):.3f}"),
            "voleh": random.choice(["L", "M3"]),
            "behvo": random.choice(["01", "02"]),
            "spart": random.choice(divisions),
            # Roughly one material in ten carries a customer number.
            "kunnr": generate_id("", 10, random.randint(1, 10)) if random.random() < 0.1 else "",
            "eannr": str(random.randint(1000000000000, 9999999999999)),
            "wesch": float(f"{random.uniform(0.1, 10):.3f}"),
            # NOTE(review): brgew and ntgew are drawn independently, so ntgew
            # can exceed brgew.
            "brgew": float(f"{random.uniform(0.5, 50):.3f}"),
            "ntgew": float(f"{random.uniform(0.5, 45):.3f}"),
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": start_sequence + i
        })

        for plant in random.sample(plants, random.randint(1, len(plants))):
            materials_plant_mapping.append({
                "mandt": client,
                "matnr": matnr,
                "werks": plant,
                "pstat": random.choice(["", "I", "L"]),
                "dismm": random.choice(["VB", "ND", "VM"]),
                "beskz": random.choice(["E", "F"]),
                "sobsl": random.choice(["", "0", "8"]),
                "minbe": float(f"{random.uniform(0, 100):.3f}"),
                "eisbe": float(f"{random.uniform(100, 500):.3f}"),
                "bstmi": float(f"{random.uniform(10, 50):.3f}"),
                "bstma": float(f"{random.uniform(500, 1000):.3f}"),
                "bstfe": float(f"{random.uniform(0, 10):.3f}"),
                "mabst": float(f"{random.uniform(0, 5):.3f}"),
                "lgpro": random.choice(storage_locations),
                "lgfsb": random.choice(storage_locations),
                "dispo": str(random.randint(100, 999)),
                "disls": random.choice(["A1", "A2", "C1"]),
                "action_type": "I",
                "row_insert_timestamp": datetime.now(),
                "row_update_timestamp": datetime.now(),
                # Offset by the full material count (not the running length of
                # `materials`) so plant rows start after the last material id.
                "sequence_id": start_sequence + count + plant_rows_added + 1
            })
            plant_rows_added += 1

    materials_plant_counter = len(materials_plant_mapping)
    return materials

# Generate Customer Data
def generate_customers(count=50):
    """Append ``count`` customer rows to the module-level ``customers`` list.

    Customer numbers are ``1 .. count`` zero-padded to 10 characters. Sequence
    ids continue after every material and plant-assignment row already
    generated.

    Args:
        count: Number of customers to generate.

    Returns:
        The module-level ``customers`` list.
    """
    # Use the actual number of plant rows: the module-level
    # `materials_plant_counter` is only ever initialised to 0, which made
    # customer ids overlap the plant-assignment ids.
    sequence_base = start_sequence + len(materials) + len(materials_plant_mapping)

    for i in range(1, count + 1):
        kunnr = generate_id("", 10, i)
        country = random.choice(countries)

        customers.append({
            "mandt": client,
            "kunnr": kunnr,
            "land1": country,
            "name1": f"Customer {i} {country}",
            "name2": "Main Office" if random.random() < 0.3 else "",
            "ort01": f"City {i}",
            "pstlz": str(random.randint(10000, 99999)),
            "regio": random.choice(["CA", "NY", "TX", "IL", "BAV", "BER"]),
            "stras": f"{random.randint(1, 999)} Main St.",
            "telf1": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            "telfx": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            "spras": random.choice(languages),
            "brsch": random.choice(["0001", "0002", "0003", "0004"]),
            "ktokd": random.choice(["0001", "0002", "Z001", "Z002"]),
            "kukla": random.choice(["01", "02", "03"]),
            "adrnr": generate_id("", 10, i + 1000),
            "stcd1": str(random.randint(100000000, 999999999)),
            "stcd2": str(random.randint(10000000, 99999999)),
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": sequence_base + i
        })

    return customers

# Generate Vendor Data
def generate_vendors(count=30):
    """Append ``count`` vendor rows to the module-level ``vendors`` list.

    Vendor numbers are ``1 .. count`` zero-padded to 10 characters. Sequence
    ids continue after every material, plant-assignment and customer row
    already generated.

    Args:
        count: Number of vendors to generate.

    Returns:
        The module-level ``vendors`` list.
    """
    # Use the actual number of plant rows: the module-level
    # `materials_plant_counter` is only ever initialised to 0, which shifted
    # vendor ids into ranges already used by earlier tables.
    sequence_base = (
        start_sequence
        + len(materials)
        + len(materials_plant_mapping)
        + len(customers)
    )

    for i in range(1, count + 1):
        lifnr = generate_id("", 10, i)
        country = random.choice(countries)

        vendors.append({
            "mandt": client,
            "lifnr": lifnr,
            "land1": country,
            "name1": f"Vendor {i} {country}",
            "name2": "Headquarters" if random.random() < 0.3 else "",
            "ort01": f"City {i}",
            "pstlz": str(random.randint(10000, 99999)),
            "regio": random.choice(["CA", "NY", "TX", "IL", "BAV", "BER"]),
            "stras": f"{random.randint(1, 999)} Supply Ave.",
            "telf1": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            "telfx": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            "spras": random.choice(languages),
            "brsch": random.choice(["0001", "0002", "0003", "0004"]),
            # NOTE(review): same key name as the customer table ("ktokd");
            # confirm the target vendor table really expects this column name.
            "ktokd": random.choice(["0001", "0002", "Z001", "Z002"]),
            "adrnr": generate_id("", 10, i + 2000),
            "stcd1": str(random.randint(100000000, 999999999)),
            "stcd2": str(random.randint(10000000, 99999999)),
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": sequence_base + i
        })

    return vendors

# Generate Sales Documents
def generate_sales_documents(count=50):
    """Append ``count`` sales headers, each with 1-5 line items.

    Headers go to the module-level ``sales_headers`` list and items to
    ``sales_items``. Every header references a random entry of
    ``customer_list`` and every item a random entry of ``materials_list``
    (both module-level names assigned by the main script). A header's
    ``netwr`` is the sum of its items' net values.

    Sequence ids: headers occupy the ``count`` ids following all previously
    generated rows; items follow the header block contiguously.

    Args:
        count: Number of sales documents to generate.

    Returns:
        Tuple ``(sales_headers, sales_items)`` of the module-level lists.
    """
    # Rows emitted by the earlier generators. The real plant-row count is used
    # because the module-level `materials_plant_counter` is only ever set to 0.
    sequence_base = (
        start_sequence
        + len(materials)
        + len(materials_plant_mapping)
        + len(customers)
        + len(vendors)
    )
    sales_item_counter = 0

    for i in range(1, count + 1):
        vbeln = generate_id("", 10, i)
        document_date = random_date(datetime.now() - timedelta(days=365), datetime.now())
        customer = random.choice(customer_list)

        header = {
            "mandt": client,
            "vbeln": vbeln,
            "auart": random.choice(["OR", "TA", "ZOR"]),
            "erdat": document_date,
            "ernam": f"USER{random.randint(1, 999):03d}",
            "audat": document_date,
            "vdatu": document_date + timedelta(days=random.randint(1, 30)),
            "vkorg": random.choice(sales_orgs),
            "vtweg": random.choice(distr_channels),
            "spart": random.choice(divisions),
            "netwr": 0,  # Will be updated after items
            "waerk": random.choice(currencies),
            "kunnr": customer["kunnr"],
            "augru": random.choice(["001", "002", ""]),
            "bstnk": f"PO{random.randint(10000, 99999)}",
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": sequence_base + i
        }
        sales_headers.append(header)

        # Generate 1-5 items per sales document
        item_count = random.randint(1, 5)
        total_value = 0

        for j in range(1, item_count + 1):
            material = random.choice(materials_list)
            quantity = random.randint(1, 100)
            # Two decimal places. The previous ":.2" format spec meant two
            # significant digits, turning e.g. 537.23 into 540.0.
            price = round(random.uniform(10, 1000), 2)
            net_value = round(quantity * price, 2)
            total_value += net_value

            sales_items.append({
                "mandt": client,
                "vbeln": vbeln,
                "posnr": f"{j:06d}",
                "matnr": material["matnr"],
                "werks": random.choice(plants),
                "pstyv": random.choice(["TAN", "AGN", "TANN"]),
                "menge": quantity,
                "meins": material["meins"],
                "netwr": net_value,
                "waerk": header["waerk"],
                "kwmeng": quantity,
                "vrkme": material["meins"],
                "vstel": random.choice(["1000", "2000"]),
                "lgort": random.choice(storage_locations),
                "kunnr": header["kunnr"],
                "uepos": "",
                "charg": "",
                "action_type": "I",
                "row_insert_timestamp": datetime.now(),
                "row_update_timestamp": datetime.now(),
                # Items start right after the `count` header ids. Adding `i`
                # as well (as before) pushed item ids into the range used by
                # the next table.
                "sequence_id": sequence_base + count + sales_item_counter + 1
            })

            sales_item_counter += 1

        # Update header with total value (rounded to drop float-sum noise)
        header["netwr"] = round(total_value, 2)

    return sales_headers, sales_items

# Generate Purchase Documents
def generate_purchase_documents(count=40):
    """Append ``count`` purchase headers, each with 1-5 line items.

    Headers go to the module-level ``purchase_headers`` list and items to
    ``purchase_items``. Every header references a random entry of
    ``vendor_list`` and every item a random entry of ``materials_list`` (both
    module-level names assigned by the main script).

    Sequence ids: headers occupy the ``count`` ids following all previously
    generated rows; items follow the header block contiguously.

    Args:
        count: Number of purchase documents to generate.

    Returns:
        Tuple ``(purchase_headers, purchase_items)`` of the module-level lists.
    """
    # Rows emitted by the earlier generators. The real plant-row count is used
    # because the module-level `materials_plant_counter` is only ever set to 0.
    sequence_base = (
        start_sequence
        + len(materials)
        + len(materials_plant_mapping)
        + len(customers)
        + len(vendors)
        + len(sales_headers)
        + len(sales_items)
    )
    purchase_item_counter = 0

    for i in range(1, count + 1):
        ebeln = generate_id("", 10, i)
        document_date = random_date(datetime.now() - timedelta(days=365), datetime.now())
        vendor = random.choice(vendor_list)

        purchase_headers.append({
            "mandt": client,
            "ebeln": ebeln,
            "bukrs": random.choice(company_codes),
            "bstyp": random.choice(["F", "L", "K"]),
            "bsart": random.choice(["NB", "ZNB", "ZDR"]),
            "lifnr": vendor["lifnr"],
            "ekorg": random.choice(["1000", "2000"]),
            "ekgrp": random.choice(["001", "002", "003"]),
            "waers": random.choice(currencies),
            "bedat": document_date,
            "kdatb": document_date,
            "kdate": document_date + timedelta(days=random.randint(30, 365)),
            "bwbdt": document_date,
            "angdt": document_date - timedelta(days=random.randint(1, 30)),
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": sequence_base + i
        })

        # Generate 1-5 items per purchase document
        item_count = random.randint(1, 5)

        for j in range(1, item_count + 1):
            material = random.choice(materials_list)
            quantity = random.randint(1, 100)
            # Two decimal places. The previous ":.2" format spec meant two
            # significant digits, turning e.g. 537.23 into 540.0.
            price = round(random.uniform(10, 1000), 2)
            net_value = round(quantity * price, 2)

            purchase_items.append({
                "mandt": client,
                "ebeln": ebeln,
                "ebelp": f"{j:05d}",
                "matnr": material["matnr"],
                "werks": random.choice(plants),
                "lgort": random.choice(storage_locations),
                "matkl": material["matkl"],
                "meins": material["meins"],
                "menge": quantity,
                "netpr": price,
                "peinh": 1,
                "netwr": net_value,
                "brtwr": round(net_value * 1.2, 2),  # Gross value with tax
                "action_type": "I",
                "row_insert_timestamp": datetime.now(),
                "row_update_timestamp": datetime.now(),
                # Items start right after the `count` header ids. Adding `i`
                # as well (as before) pushed item ids into the range used by
                # the next table.
                "sequence_id": sequence_base + count + purchase_item_counter + 1
            })

            purchase_item_counter += 1

    return purchase_headers, purchase_items

# Generate Accounting Documents
def generate_accounting_documents(count=30):
    """Append ``count`` accounting headers, each with 2-6 line items.

    Headers go to the module-level ``accounting_headers`` list and items to
    ``accounting_items``. A random document total is split across the items:
    every item but the last takes ``remaining / item_count`` (rounded to two
    decimals) and the last item takes whatever is left, so the item amounts
    add up to the total. Items alternate between "H" and "S" in ``shkzg``.

    Sequence ids: headers occupy the ``count`` ids following all previously
    generated rows; items follow the header block contiguously.

    Args:
        count: Number of accounting documents to generate.

    Returns:
        Tuple ``(accounting_headers, accounting_items)`` of the module-level
        lists.
    """
    # Rows emitted by the earlier generators. The real plant-row count is used
    # because the module-level `materials_plant_counter` is only ever set to 0.
    sequence_base = (
        start_sequence
        + len(materials)
        + len(materials_plant_mapping)
        + len(customers)
        + len(vendors)
        + len(sales_headers)
        + len(sales_items)
        + len(purchase_headers)
        + len(purchase_items)
    )
    accounting_item_counter = 0

    for i in range(1, count + 1):
        belnr = generate_id("", 10, i)
        # NOTE(review): the fiscal year is drawn independently of
        # document_date, so gjahr and bldat/budat can disagree.
        gjahr = str(random.randint(2020, 2023))
        document_date = random_date(datetime.now() - timedelta(days=365), datetime.now())
        company_code = random.choice(company_codes)

        accounting_headers.append({
            "mandt": client,
            "bukrs": company_code,
            "belnr": belnr,
            "gjahr": gjahr,
            "blart": random.choice(["SA", "KR", "DR"]),
            "bldat": document_date,
            "budat": document_date,
            "monat": str(document_date.month).zfill(2),
            "waers": random.choice(currencies),
            "xblnr": f"REF{random.randint(1000, 9999)}",
            "bktxt": f"Accounting Document {i}",
            "bstat": "",
            "stjah": gjahr,
            "stblg": "",
            "xnetb": "",
            "action_type": "I",
            "row_insert_timestamp": datetime.now(),
            "row_update_timestamp": datetime.now(),
            "sequence_id": sequence_base + i
        })

        # Generate 2-6 items per accounting document
        item_count = random.randint(2, 6)
        # Two decimal places. The previous ":.2" format spec meant two
        # significant digits, turning e.g. 5372.31 into 5400.0.
        remaining_amount = round(random.uniform(1000, 10000), 2)

        # Inclusive upper bound: range(1, item_count) produced one item fewer
        # than intended and allowed single-line documents.
        for j in range(1, item_count + 1):
            # Generate line items to distribute the total
            if j == item_count:
                amount = remaining_amount
            else:
                amount = round(remaining_amount / item_count, 2)
                remaining_amount = round(remaining_amount - amount, 2)

            accounting_items.append({
                "mandt": client,
                "bukrs": company_code,
                "belnr": belnr,
                "gjahr": gjahr,
                "buzei": f"{j:03d}",
                "koart": random.choice(["S", "K", "D"]),
                "shkzg": "S" if j % 2 == 0 else "H",  # Alternate debit/credit
                "gsber": random.choice(["1000", "2000", ""]),
                "kostl": generate_id("", 10, random.randint(1, 999)),
                "wrbtr": amount,
                "dmbtr": amount,
                "mwskz": random.choice(["V1", "V2", ""]),
                "sgtxt": f"Line item {j} for doc {belnr}",
                "zuonr": generate_id("", 18, random.randint(1, 999)),
                "hkont": generate_id("", 10, random.randint(1, 999)),
                "prctr": generate_id("", 10, random.randint(1, 999)),
                # About 30% of items carry a material / plant reference.
                "matnr": random.choice(materials_list)["matnr"] if random.random() < 0.3 else "",
                "werks": random.choice(plants) if random.random() < 0.3 else "",
                "action_type": "I",
                "row_insert_timestamp": datetime.now(),
                "row_update_timestamp": datetime.now(),
                # Items start right after the `count` header ids. Adding `i`
                # as well (as before) left gaps and overran the block.
                "sequence_id": sequence_base + count + accounting_item_counter + 1
            })

            accounting_item_counter += 1

    return accounting_headers, accounting_items

# Main execution
# Runs the whole pipeline at cell level: generate every table in dependency
# order, wrap the row lists in pandas DataFrames, write one file per table and
# print a summary of the row counts.
print("Generating SAP simulation data...")

# Initialize counters
# Start the plant-assignment counter at zero before any generator runs.
materials_plant_counter = 0

# Generate all data
# Order matters: the document generators pick from materials_list,
# customer_list and vendor_list, and each generator offsets its sequence ids
# by the sizes of the lists filled before it.
materials_list = generate_materials(100)
customer_list = generate_customers(50)
vendor_list = generate_vendors(30)
sales_headers, sales_items = generate_sales_documents(50)
purchase_headers, purchase_items = generate_purchase_documents(40)
accounting_headers, accounting_items = generate_accounting_documents(30)

# Create DataFrames
# One DataFrame per table; each list element (a row dict) becomes one row.
df_materials = pd.DataFrame(materials_list)
df_marc = pd.DataFrame(materials_plant_mapping)
df_customers = pd.DataFrame(customer_list)
df_vendors = pd.DataFrame(vendor_list)
df_sales_headers = pd.DataFrame(sales_headers)
df_sales_items = pd.DataFrame(sales_items)
df_purchase_headers = pd.DataFrame(purchase_headers)
df_purchase_items = pd.DataFrame(purchase_items)
df_accounting_headers = pd.DataFrame(accounting_headers)
df_accounting_items = pd.DataFrame(accounting_items)

# Write data to tables
# Each call writes a timestamped file into a folder named after the table
# (default base path) and returns that folder path.
print("Writing data to warehouse tables...")

materials_path = write_to_partitioned_path(df_materials, "sap_mara")
marc_path = write_to_partitioned_path(df_marc, "sap_marc")
customers_path = write_to_partitioned_path(df_customers, "sap_kna1")
vendors_path = write_to_partitioned_path(df_vendors, "sap_lfa1")
sales_headers_path = write_to_partitioned_path(df_sales_headers, "sap_vbak")
sales_items_path = write_to_partitioned_path(df_sales_items, "sap_vbap")
purchase_headers_path = write_to_partitioned_path(df_purchase_headers, "sap_ekko")
purchase_items_path = write_to_partitioned_path(df_purchase_items, "sap_ekpo")
accounting_headers_path = write_to_partitioned_path(df_accounting_headers, "sap_bkpf")
accounting_items_path = write_to_partitioned_path(df_accounting_items, "sap_bseg")

# Summary of how many rows were generated per table.
print("Data generation complete!")
print(f"Generated {len(materials_list)} materials with {len(materials_plant_mapping)} plant assignments")
print(f"Generated {len(customer_list)} customers and {len(vendor_list)} vendors")
print(f"Generated {len(sales_headers)} sales documents with {len(sales_items)} line items")
print(f"Generated {len(purchase_headers)} purchase documents with {len(purchase_items)} line items")
print(f"Generated {len(accounting_headers)} accounting documents with {len(accounting_items)} line items")


StatementMeta(, 4119e645-5ffd-4207-baab-b14fa4d3228d, 3, Finished, Available, Finished)

Generating SAP simulation data...
Writing data to warehouse tables...
Writing sap_mara to /lakehouse/default/Files/inbound-sap-seed/sap_mara
Writing sap_marc to /lakehouse/default/Files/inbound-sap-seed/sap_marc
Writing sap_kna1 to /lakehouse/default/Files/inbound-sap-seed/sap_kna1
Writing sap_lfa1 to /lakehouse/default/Files/inbound-sap-seed/sap_lfa1
Writing sap_vbak to /lakehouse/default/Files/inbound-sap-seed/sap_vbak
Writing sap_vbap to /lakehouse/default/Files/inbound-sap-seed/sap_vbap
Writing sap_ekko to /lakehouse/default/Files/inbound-sap-seed/sap_ekko
Writing sap_ekpo to /lakehouse/default/Files/inbound-sap-seed/sap_ekpo
Writing sap_bkpf to /lakehouse/default/Files/inbound-sap-seed/sap_bkpf
Writing sap_bseg to /lakehouse/default/Files/inbound-sap-seed/sap_bseg
Data generation complete!
Generated 100 materials with 241 plant assignments
Generated 50 customers and 30 vendors
Generated 50 sales documents with 149 line items
Generated 40 purchase documents with 114 line items
Gene