In [1]:
import os
from pathlib import Path
import random
from datetime import datetime, timedelta
import uuid
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import round as spark_round
from notebookutils import mssparkutils

# Value pools sampled by the generators below when fabricating rows
client = "800"                                          # written to every "mandt" field
company_codes = ["1000", "2000", "3000"]                # "bukrs"
plants = ["1000", "2000", "3000", "4000"]               # "werks"
storage_locations = ["0001", "0002", "0003", "0004"]    # "lgpro" / "lgfsb" / "lgort"
sales_orgs = ["1000", "2000"]                           # "vkorg"
distr_channels = ["10", "20"]                           # "vtweg"
divisions = ["00", "01"]                                # "spart"
currencies = ["USD", "EUR", "GBP"]                      # "waerk" / "waers"
countries = ["US", "DE", "FR", "GB", "IT", "ES", "JP", "CN"]  # "land1"
languages = ["E", "D", "F", "S", "I", "J"]              # "spras"

# CDC configuration
MAX_SEQUENCE_ID = 100_000_000   # upper bound (inclusive) for the random "sequence_id"
DELETE_PROBABILITY = 0.05       # weight of action "D"
UPDATE_PROBABILITY = 0.60       # weight of action "U"
INSERT_PROBABILITY = 0.35       # weight of action "I"

# (table name, weight, key columns) for every table handled by this notebook
_TABLE_SPECS = (
    ("sap_mara", 10, ("mandt", "matnr")),
    ("sap_marc", 15, ("mandt", "matnr", "werks")),
    ("sap_kna1", 8, ("mandt", "kunnr")),
    ("sap_lfa1", 5, ("mandt", "lifnr")),
    ("sap_vbak", 12, ("mandt", "vbeln")),
    ("sap_vbap", 20, ("mandt", "vbeln", "posnr")),
    ("sap_ekko", 10, ("mandt", "ebeln")),
    ("sap_ekpo", 15, ("mandt", "ebeln", "ebelp")),
    ("sap_bkpf", 10, ("mandt", "bukrs", "belnr", "gjahr")),
    ("sap_bseg", 15, ("mandt", "bukrs", "belnr", "gjahr", "buzei")),
)

TABLES = [
    {"name": table_name, "weight": table_weight, "key_fields": list(table_keys)}
    for table_name, table_weight, table_keys in _TABLE_SPECS
]

def generate_id(prefix, length, num):
    """Return ``prefix`` followed by ``num`` zero-padded to fill ``length`` characters.

    Args:
        prefix: Leading text of the identifier (may be empty).
        length: Target total width of the identifier.
        num: Number rendered after the prefix.

    Returns:
        The identifier string. When ``num`` needs more digits than
        ``length - len(prefix)``, no digits are dropped and the result is
        simply longer than ``length``.
    """
    digit_width = length - len(prefix)
    padded_number = str(num).zfill(digit_width)
    return "{}{}".format(prefix, padded_number)

def random_date(start_date, end_date):
    """Return ``start_date`` shifted forward by a random whole number of days.

    The offset is drawn from ``[0, whole days between the two dates)``, so the
    result is always earlier than ``end_date`` unless the range is shorter
    than one day.

    Args:
        start_date: Lower bound (date or datetime).
        end_date: Upper bound; must not be earlier than ``start_date``.

    Returns:
        ``start_date`` plus 0..days-1 days. When both bounds are less than a
        full day apart, ``start_date`` itself is returned.

    Raises:
        ValueError: If ``end_date`` lies before ``start_date``.
    """
    days_between = (end_date - start_date).days

    if days_between < 0:
        raise ValueError(
            f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
        )

    # random.randrange(0) raises ValueError ("empty range"); a range shorter
    # than one day has exactly one valid answer, the start itself.
    if days_between == 0:
        return start_date

    return start_date + timedelta(days=random.randrange(days_between))

def get_parquet_files(table_name, base_path="/lakehouse/default/Files/inbound-sap-seed"):
    """Collect every ``.parquet`` file stored anywhere below a table's folder.

    Args:
        table_name: Sub-folder of ``base_path`` to search.
        base_path: Root folder containing one sub-folder per table.

    Returns:
        List of file paths found by walking ``{base_path}/{table_name}``
        recursively; an empty list (plus a console message) when that folder
        does not exist.
    """
    table_path = f"{base_path}/{table_name}"

    if os.path.exists(table_path):
        return [
            os.path.join(folder, filename)
            for folder, _subfolders, filenames in os.walk(table_path)
            for filename in filenames
            if filename.endswith('.parquet')
        ]

    print(f"Path not found: {table_path}")
    return []

def read_latest_data(table_name, base_path="/lakehouse/default/Files/inbound-sap-seed"):
    """Load the newest parquet file of a table into a pandas DataFrame.

    "Newest" is the file with the largest ``os.path.getctime`` value among all
    files returned by ``get_parquet_files``.

    Args:
        table_name: Table folder to read from.
        base_path: Root folder containing one sub-folder per table.

    Returns:
        The DataFrame read from the newest file, or an empty DataFrame when
        the table has no parquet files.
    """
    candidates = get_parquet_files(table_name, base_path)

    if candidates:
        # max() keeps the first of several equally-new files, exactly like
        # taking element 0 of a stable descending sort.
        latest_file = max(candidates, key=os.path.getctime)
        print(f"Reading latest data from: {latest_file}")
        return pd.read_parquet(latest_file)

    print(f"No parquet files found for {table_name}")
    return pd.DataFrame()

def write_cdc_data(df, table_name, base_path="/lakehouse/default/Files/inbound-sap-cdc"):
    """Write a CDC batch as a timestamped parquet file in a date-partitioned folder.

    The file goes to ``{base_path}/{table_name}/YYYY/MM/DD/`` (created when
    missing) and is named ``{table_name}_CDC_YYYYMMDD_HHMMSS.parquet``, both
    derived from the current local time.

    Args:
        df: DataFrame to persist.
        table_name: Table the batch belongs to.
        base_path: Root folder of the CDC landing area.

    Returns:
        The folder the file was written to.
    """
    written_at = datetime.now()

    # A datetime format spec inside an f-string is equivalent to strftime().
    output_path = f"{base_path}/{table_name}/{written_at:%Y}/{written_at:%m}/{written_at:%d}"
    Path(output_path).mkdir(parents=True, exist_ok=True)

    print(f"Writing CDC data for {table_name} to {output_path}")
    target_file = f"{output_path}/{table_name}_CDC_{written_at:%Y%m%d_%H%M%S}.parquet"
    df.to_parquet(target_file)

    return output_path

def generate_random_material_changes(df, num_changes):
    """Generate a random batch of CDC change rows for the material table.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" fabricates a new material whose ``matnr`` is guaranteed not to be
      used by ``df`` or by an earlier insert of this batch.
    * "U" copies a random existing row and re-rolls its status, volume and
      weight fields.
    * "D" copies a random existing row and marks it deleted; the key is then
      excluded from further updates/deletes in this batch.

    Args:
        df: Current material rows; needs at least "mandt" and "matnr" columns.
        num_changes: Number of change attempts. Fewer rows are returned when
            updates/deletes are drawn after every existing key was deleted.

    Returns:
        DataFrame with one row per generated change; empty when ``df`` is empty.
    """
    changes = []

    if df.empty:
        print("No existing material data to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_materials = df[["mandt", "matnr"]].drop_duplicates().values.tolist()

    if not existing_materials:
        return pd.DataFrame()

    # Material numbers already taken; extended with every insert so that a
    # batch never emits two rows with action "I" for the same key.
    used_material_ids = set(df["matnr"])

    def _current_row(key):
        # First row of df matching the (mandt, matnr) key, as a plain dict
        return df[(df["mandt"] == key[0]) & (df["matnr"] == key[1])].iloc[0].to_dict()

    for change_index in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Random base + key count + loop index is not unique by itself,
            # so bump the number until the resulting ID is unused.
            id_number = random.randint(1000, 9999) + len(existing_materials) + change_index
            new_id = generate_id("MAT", 18, id_number)
            while new_id in used_material_ids:
                id_number += 1
                new_id = generate_id("MAT", 18, id_number)
            used_material_ids.add(new_id)

            material_type = random.choice(["ROH", "HALB", "FERT", "HAWA"])
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "matnr": new_id,
                "mtart": material_type,
                "mbrsh": "M" if material_type in ["ROH", "HALB"] else "W",
                "matkl": random.choice(["01", "02", "03", "04", "05"]) + random.choice(["100", "200", "300", "400"]),
                "meins": random.choice(["EA", "PC", "KG", "L", "M"]),
                "bstme": random.choice(["EA", "PC", "KG", "L", "M"]),
                "mstae": random.choice(["01", "02", "03"]),
                "mstav": random.choice(["01", "02", "03"]),
                "prdha": generate_id("PRD", 18, random.randint(1, 999)),
                "gewei": random.choice(["KG", "G", "LB"]),
                "volum": float(f"{random.uniform(0.1, 100):.3f}"),
                "voleh": random.choice(["L", "M3"]),
                "behvo": random.choice(["01", "02"]),
                "spart": random.choice(divisions),
                "kunnr": generate_id("", 10, random.randint(1, 10)) if random.random() < 0.1 else "",
                "eannr": str(random.randint(1000000000000, 9999999999999)),
                "wesch": float(f"{random.uniform(0.1, 10):.3f}"),
                "brgew": float(f"{random.uniform(0.5, 50):.3f}"),
                "ntgew": float(f"{random.uniform(0.5, 45):.3f}"),
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_materials:
                original_row = _current_row(random.choice(existing_materials))

                # Modify a few fields
                original_row["mstae"] = random.choice(["01", "02", "03"])
                original_row["mstav"] = random.choice(["01", "02", "03"])
                original_row["volum"] = float(f"{random.uniform(0.1, 100):.3f}")
                original_row["wesch"] = float(f"{random.uniform(0.1, 10):.3f}")
                original_row["brgew"] = float(f"{random.uniform(0.5, 50):.3f}")
                original_row["ntgew"] = float(f"{random.uniform(0.5, 45):.3f}")

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            if existing_materials:
                random_material = random.choice(existing_materials)
                existing_materials.remove(random_material)  # prevents duplicate deletes

                original_row = _current_row(random_material)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_marc_changes(df, mara_df, num_changes):
    """Generate a random batch of CDC change rows for material-plant assignments.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" assigns a random material from ``mara_df`` to a random plant. The
      attempt is dropped when that (mandt, matnr, werks) key already exists
      in ``df`` or was already inserted earlier in this batch.
    * "U" copies a random existing row and re-rolls storage-location and
      quantity fields.
    * "D" copies a random existing row and marks it deleted; the key is then
      excluded from further updates/deletes in this batch.

    Args:
        df: Current assignment rows; needs "mandt", "matnr", "werks" columns.
        mara_df: Material rows; only the "matnr" column is read.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when either input
        is empty.
    """
    changes = []

    if df.empty or mara_df.empty:
        print("No existing material plant data or materials to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_marc = df[["mandt", "matnr", "werks"]].drop_duplicates().values.tolist()

    # Keys an insert must not reuse: existing rows plus inserts of this batch.
    # A set gives O(1) membership checks instead of scanning the key list.
    occupied_keys = {tuple(key) for key in existing_marc}

    # Get all materials for potential new plant assignments
    all_materials = mara_df["matnr"].unique().tolist()

    def _current_row(key):
        # First row of df matching the (mandt, matnr, werks) key, as a plain dict
        return df[
            (df["mandt"] == key[0]) &
            (df["matnr"] == key[1]) &
            (df["werks"] == key[2])
        ].iloc[0].to_dict()

    for _ in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            material = random.choice(all_materials)
            plant = random.choice(plants)

            new_key = (client, material, plant)
            if new_key in occupied_keys:
                continue
            # Remember the key so a later iteration cannot insert it again
            occupied_keys.add(new_key)

            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "matnr": material,
                "werks": plant,
                "pstat": random.choice(["", "I", "L"]),
                "dismm": random.choice(["VB", "ND", "VM"]),
                "beskz": random.choice(["E", "F"]),
                "sobsl": random.choice(["", "0", "8"]),
                "minbe": float(f"{random.uniform(0, 100):.3f}"),
                "eisbe": float(f"{random.uniform(100, 500):.3f}"),
                "bstmi": float(f"{random.uniform(10, 50):.3f}"),
                "bstma": float(f"{random.uniform(500, 1000):.3f}"),
                "bstfe": float(f"{random.uniform(0, 10):.3f}"),
                "mabst": float(f"{random.uniform(0, 5):.3f}"),
                "lgpro": random.choice(storage_locations),
                "lgfsb": random.choice(storage_locations),
                "dispo": str(random.randint(100, 999)),
                "disls": random.choice(["A1", "A2", "C1"]),
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_marc:
                original_row = _current_row(random.choice(existing_marc))

                # Modify a few fields
                original_row["lgpro"] = random.choice(storage_locations)
                original_row["lgfsb"] = random.choice(storage_locations)
                original_row["minbe"] = float(f"{random.uniform(0, 100):.3f}")
                original_row["eisbe"] = float(f"{random.uniform(100, 500):.3f}")
                original_row["bstmi"] = float(f"{random.uniform(10, 50):.3f}")
                original_row["bstma"] = float(f"{random.uniform(500, 1000):.3f}")

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            if existing_marc:
                random_marc = random.choice(existing_marc)
                existing_marc.remove(random_marc)  # prevents duplicate deletes
                # As before, a deleted key may be inserted again later on
                occupied_keys.discard(tuple(random_marc))

                original_row = _current_row(random_marc)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_customer_changes(df, num_changes):
    """Generate a random batch of CDC change rows for the customer table.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" fabricates a new customer whose ``kunnr`` is guaranteed not to be
      used by ``df`` or by an earlier insert of this batch.
    * "U" copies a random existing row and re-rolls phone, fax and "kukla".
    * "D" is honoured only 30% of the time; it copies a random existing row,
      marks it deleted and excludes the key from the rest of this batch.

    Args:
        df: Current customer rows; needs at least "mandt" and "kunnr" columns.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when ``df`` is empty.
    """
    changes = []

    if df.empty:
        print("No existing customer data to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_customers = df[["mandt", "kunnr"]].drop_duplicates().values.tolist()

    # Customer numbers already taken; extended with every insert so that a
    # batch never emits two rows with action "I" for the same key.
    used_customer_ids = set(df["kunnr"])

    def _current_row(key):
        # First row of df matching the (mandt, kunnr) key, as a plain dict
        return df[(df["mandt"] == key[0]) & (df["kunnr"] == key[1])].iloc[0].to_dict()

    for change_index in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Random base + key count + loop index is not unique by itself,
            # so bump the number until the resulting ID is unused.
            id_number = random.randint(1000, 9999) + len(existing_customers) + change_index
            new_id = generate_id("", 10, id_number)
            while new_id in used_customer_ids:
                id_number += 1
                new_id = generate_id("", 10, id_number)
            used_customer_ids.add(new_id)

            country = random.choice(countries)
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "kunnr": new_id,
                "land1": country,
                "name1": f"Customer {new_id} {country}",
                "name2": "Main Office" if random.random() < 0.3 else "",
                "ort01": f"City {random.randint(1, 100)}",
                "pstlz": str(random.randint(10000, 99999)),
                "regio": random.choice(["CA", "NY", "TX", "IL", "BAV", "BER"]),
                "stras": f"{random.randint(1, 999)} Main St.",
                "telf1": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
                "telfx": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
                "spras": random.choice(languages),
                "brsch": random.choice(["0001", "0002", "0003", "0004"]),
                "ktokd": random.choice(["0001", "0002", "Z001", "Z002"]),
                "kukla": random.choice(["01", "02", "03"]),
                "adrnr": generate_id("", 10, random.randint(1, 9999)),
                "stcd1": str(random.randint(100000000, 999999999)),
                "stcd2": str(random.randint(10000000, 99999999)),
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_customers:
                original_row = _current_row(random.choice(existing_customers))

                # Modify a few fields
                original_row["telf1"] = f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"
                original_row["telfx"] = f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"
                original_row["kukla"] = random.choice(["01", "02", "03"])

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            # The extra 30% gate makes customer deletes rarer than DELETE_PROBABILITY
            if existing_customers and random.random() < 0.3:
                random_customer = random.choice(existing_customers)
                existing_customers.remove(random_customer)  # prevents duplicate deletes

                original_row = _current_row(random_customer)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_vendor_changes(df, num_changes):
    """Generate a random batch of CDC change rows for the vendor table.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" fabricates a new vendor whose ``lifnr`` is guaranteed not to be
      used by ``df`` or by an earlier insert of this batch.
    * "U" copies a random existing row and re-rolls phone and fax.
    * "D" is honoured only 30% of the time; it copies a random existing row,
      marks it deleted and excludes the key from the rest of this batch.

    Args:
        df: Current vendor rows; needs at least "mandt" and "lifnr" columns.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when ``df`` is empty.
    """
    changes = []

    if df.empty:
        print("No existing vendor data to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_vendors = df[["mandt", "lifnr"]].drop_duplicates().values.tolist()

    # Vendor numbers already taken; extended with every insert so that a
    # batch never emits two rows with action "I" for the same key.
    used_vendor_ids = set(df["lifnr"])

    def _current_row(key):
        # First row of df matching the (mandt, lifnr) key, as a plain dict
        return df[(df["mandt"] == key[0]) & (df["lifnr"] == key[1])].iloc[0].to_dict()

    for change_index in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Random base + key count + loop index is not unique by itself,
            # so bump the number until the resulting ID is unused.
            id_number = random.randint(1000, 9999) + len(existing_vendors) + change_index
            new_id = generate_id("", 10, id_number)
            while new_id in used_vendor_ids:
                id_number += 1
                new_id = generate_id("", 10, id_number)
            used_vendor_ids.add(new_id)

            country = random.choice(countries)
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "lifnr": new_id,
                "land1": country,
                "name1": f"Vendor {new_id} {country}",
                "name2": "Headquarters" if random.random() < 0.3 else "",
                "ort01": f"City {random.randint(1, 100)}",
                "pstlz": str(random.randint(10000, 99999)),
                "regio": random.choice(["CA", "NY", "TX", "IL", "BAV", "BER"]),
                "stras": f"{random.randint(1, 999)} Supply Ave.",
                "telf1": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
                "telfx": f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
                "spras": random.choice(languages),
                "brsch": random.choice(["0001", "0002", "0003", "0004"]),
                # NOTE(review): same "ktokd" key as the customer generator -
                # confirm it matches the column name of the vendor seed data.
                "ktokd": random.choice(["0001", "0002", "Z001", "Z002"]),
                "adrnr": generate_id("", 10, random.randint(1, 9999)),
                "stcd1": str(random.randint(100000000, 999999999)),
                "stcd2": str(random.randint(10000000, 99999999)),
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_vendors:
                original_row = _current_row(random.choice(existing_vendors))

                # Modify a few fields
                original_row["telf1"] = f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"
                original_row["telfx"] = f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            # The extra 30% gate makes vendor deletes rarer than DELETE_PROBABILITY
            if existing_vendors and random.random() < 0.3:
                random_vendor = random.choice(existing_vendors)
                existing_vendors.remove(random_vendor)  # prevents duplicate deletes

                original_row = _current_row(random_vendor)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_sales_header_changes(df, customer_df, num_changes):
    """Generate a random batch of CDC change rows for sales document headers.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" fabricates a new document dated within the last 30 days for a
      random customer; its ``vbeln`` is guaranteed not to be used by ``df``
      or by an earlier insert of this batch.
    * "U" copies a random existing row and scales "netwr" by a factor
      between 0.9 and 1.1.
    * "D" is honoured only 30% of the time; it copies a random existing row,
      marks it deleted and excludes the key from the rest of this batch.

    Args:
        df: Current header rows; needs "mandt", "vbeln" and (for updates)
            "netwr" columns.
        customer_df: Customer rows; only the "kunnr" column is read.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when either input
        is empty.
    """
    changes = []

    if df.empty or customer_df.empty:
        print("No existing sales header data or customers to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_sales = df[["mandt", "vbeln"]].drop_duplicates().values.tolist()

    # Document numbers already taken; extended with every insert so that a
    # batch never emits two rows with action "I" for the same key.
    used_document_ids = set(df["vbeln"])

    # Get all customers for new sales documents
    all_customers = customer_df["kunnr"].unique().tolist()

    def _current_row(key):
        # First row of df matching the (mandt, vbeln) key, as a plain dict
        return df[(df["mandt"] == key[0]) & (df["vbeln"] == key[1])].iloc[0].to_dict()

    for change_index in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Random base + loop index is not unique by itself, so bump the
            # number until the resulting ID is unused.
            id_number = random.randint(10000, 99999) + change_index
            new_id = generate_id("", 10, id_number)
            while new_id in used_document_ids:
                id_number += 1
                new_id = generate_id("", 10, id_number)
            used_document_ids.add(new_id)

            document_date = random_date(datetime.now() - timedelta(days=30), datetime.now())
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "vbeln": new_id,
                "auart": random.choice(["OR", "TA", "ZOR"]),
                "erdat": document_date,
                "ernam": f"USER{random.randint(1, 999):03d}",
                "audat": document_date,
                "vdatu": document_date + timedelta(days=random.randint(1, 30)),
                "vkorg": random.choice(sales_orgs),
                "vtweg": random.choice(distr_channels),
                "spart": random.choice(divisions),
                "netwr": float(f"{random.uniform(100, 10000):.2f}"),
                "waerk": random.choice(currencies),
                "kunnr": random.choice(all_customers),
                "augru": random.choice(["001", "002", ""]),
                "bstnk": f"PO{random.randint(10000, 99999)}",
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_sales:
                original_row = _current_row(random.choice(existing_sales))

                # Adjust the net value by up to +/-10%
                original_row["netwr"] = float(f"{original_row['netwr'] * random.uniform(0.9, 1.1):.2f}")

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            # The extra 30% gate makes document deletes rarer than DELETE_PROBABILITY
            if existing_sales and random.random() < 0.3:
                random_sales = random.choice(existing_sales)
                existing_sales.remove(random_sales)  # prevents duplicate deletes

                original_row = _current_row(random_sales)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_purchase_header_changes(df, vendor_df, num_changes):
    """Generate a random batch of CDC change rows for purchase document headers.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" fabricates a new document dated within the last 30 days for a
      random vendor; its ``ebeln`` is guaranteed not to be used by ``df`` or
      by an earlier insert of this batch.
    * "U" copies a random existing row and sets "kdate" to "kdatb" plus
      30-365 days.
    * "D" is honoured only 30% of the time; it copies a random existing row,
      marks it deleted and excludes the key from the rest of this batch.

    Args:
        df: Current header rows; needs "mandt", "ebeln" and (for updates)
            "kdatb" columns.
        vendor_df: Vendor rows; only the "lifnr" column is read.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when either input
        is empty.
    """
    changes = []

    if df.empty or vendor_df.empty:
        print("No existing purchase header data or vendors to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_purchases = df[["mandt", "ebeln"]].drop_duplicates().values.tolist()

    # Document numbers already taken; extended with every insert so that a
    # batch never emits two rows with action "I" for the same key.
    used_document_ids = set(df["ebeln"])

    # Get all vendors for new purchase documents
    all_vendors = vendor_df["lifnr"].unique().tolist()

    def _current_row(key):
        # First row of df matching the (mandt, ebeln) key, as a plain dict
        return df[(df["mandt"] == key[0]) & (df["ebeln"] == key[1])].iloc[0].to_dict()

    for change_index in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Random base + loop index is not unique by itself, so bump the
            # number until the resulting ID is unused.
            id_number = random.randint(10000, 99999) + change_index
            new_id = generate_id("", 10, id_number)
            while new_id in used_document_ids:
                id_number += 1
                new_id = generate_id("", 10, id_number)
            used_document_ids.add(new_id)

            document_date = random_date(datetime.now() - timedelta(days=30), datetime.now())
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "ebeln": new_id,
                "bukrs": random.choice(company_codes),
                "bstyp": random.choice(["F", "L", "K"]),
                "bsart": random.choice(["NB", "ZNB", "ZDR"]),
                "lifnr": random.choice(all_vendors),
                "ekorg": random.choice(["1000", "2000"]),
                "ekgrp": random.choice(["001", "002", "003"]),
                "waers": random.choice(currencies),
                "bedat": document_date,
                "kdatb": document_date,
                "kdate": document_date + timedelta(days=random.randint(30, 365)),
                "bwbdt": document_date,
                "angdt": document_date - timedelta(days=random.randint(1, 30)),
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_purchases:
                original_row = _current_row(random.choice(existing_purchases))

                # Re-roll the end date relative to the stored start date
                original_row["kdate"] = original_row["kdatb"] + timedelta(days=random.randint(30, 365))

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            # The extra 30% gate makes document deletes rarer than DELETE_PROBABILITY
            if existing_purchases and random.random() < 0.3:
                random_purchase = random.choice(existing_purchases)
                existing_purchases.remove(random_purchase)  # prevents duplicate deletes

                original_row = _current_row(random_purchase)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_purchase_item_changes(df, header_df, material_df, num_changes):
    """Generate a random batch of CDC change rows for purchase document items.

    Each of the ``num_changes`` iterations draws an action weighted by
    INSERT_PROBABILITY / UPDATE_PROBABILITY / DELETE_PROBABILITY:

    * "I" adds an item to a random document from ``header_df``. Its position
      is 10 above the highest position known for that document, counting
      positions inserted earlier in this batch, so no key is emitted twice.
    * "U" copies a random existing row, scales "menge" by a factor between
      0.8 and 1.2 (never below 1) and, when the row has a "netpr", recomputes
      "netwr" and "brtwr" from the new quantity.
    * "D" is honoured only 30% of the time; it copies a random existing row,
      marks it deleted and excludes the key from the rest of this batch.

    Args:
        df: Current item rows; needs "mandt", "ebeln", "ebelp" and (for
            updates) "menge" columns. "ebelp" values must be parseable by int().
        header_df: Purchase header rows; only the "ebeln" column is read.
        material_df: Material rows; only the "matnr" column is read.
        num_changes: Number of change attempts (skipped attempts produce no row).

    Returns:
        DataFrame with one row per generated change; empty when any input is
        empty.
    """
    changes = []

    if df.empty or header_df.empty or material_df.empty:
        print("No existing purchase item data, headers, or materials to modify")
        return pd.DataFrame()

    # Keys still eligible for update/delete in this batch
    existing_items = df[["mandt", "ebeln", "ebelp"]].drop_duplicates().values.tolist()

    # Get all purchase documents and materials for potential new items
    all_purchase_docs = header_df["ebeln"].unique().tolist()
    all_materials = material_df["matnr"].unique().tolist()

    # Highest position handed out per document. Filled lazily on the first
    # insert for a document, then advanced with every further insert so that
    # two inserts for the same document never share a position.
    highest_position = {}

    def _current_row(key):
        # First row of df matching the (mandt, ebeln, ebelp) key, as a plain dict
        return df[
            (df["mandt"] == key[0]) &
            (df["ebeln"] == key[1]) &
            (df["ebelp"] == key[2])
        ].iloc[0].to_dict()

    for _ in range(num_changes):
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            purchase_doc = random.choice(all_purchase_docs)

            if purchase_doc not in highest_position:
                # A document without items starts from 0, i.e. first position "00010"
                highest_position[purchase_doc] = max(
                    (int(item[2]) for item in existing_items if item[1] == purchase_doc),
                    default=0,
                )
            highest_position[purchase_doc] += 10
            new_pos = f"{highest_position[purchase_doc]:05d}"

            material = random.choice(all_materials)
            quantity = random.randint(1, 100)
            price = float(f"{random.uniform(10, 1000):.2f}")
            net_value = float(f"{quantity * price:.2f}")
            created_at = datetime.now()  # one stamp for both audit columns

            changes.append({
                "mandt": client,
                "ebeln": purchase_doc,
                "ebelp": new_pos,
                "matnr": material,
                "werks": random.choice(plants),
                "lgort": random.choice(storage_locations),
                "matkl": random.choice(["01", "02", "03", "04", "05"]) + random.choice(["100", "200", "300", "400"]),
                "meins": random.choice(["EA", "PC", "KG", "L", "M"]),
                "menge": quantity,
                "netpr": price,
                "peinh": 1,
                "netwr": net_value,
                "brtwr": float(f"{net_value * 1.2:.2f}"),  # Gross value = net * 1.2
                "action_type": "I",
                "row_insert_timestamp": created_at,
                "row_update_timestamp": created_at,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Skipped silently once every existing key has been deleted
            if existing_items:
                original_row = _current_row(random.choice(existing_items))

                # Adjust the quantity by up to +/-20%. Truncation could turn a
                # small quantity into 0, which would leave the stored values
                # out of step with "menge", so keep at least 1.
                old_quantity = original_row["menge"]
                new_quantity = max(1, int(old_quantity * random.uniform(0.8, 1.2)))

                if "netpr" in original_row:
                    # Update net and gross values
                    original_row["netwr"] = float(f"{new_quantity * original_row['netpr']:.2f}")
                    original_row["brtwr"] = float(f"{original_row['netwr'] * 1.2:.2f}")

                original_row["menge"] = new_quantity

                # Update CDC metadata
                original_row["action_type"] = "U"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)
        elif action_type == "D":
            # The extra 30% gate makes item deletes rarer than DELETE_PROBABILITY
            if existing_items and random.random() < 0.3:
                random_item = random.choice(existing_items)
                existing_items.remove(random_item)  # prevents duplicate deletes

                original_row = _current_row(random_item)

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_accounting_header_changes(df, num_changes):
    """Generate random CDC change rows for accounting document headers.

    Each of the ``num_changes`` attempts picks an action with weights
    25% insert / 70% update / 5% delete. Deletes are additionally gated by a
    10% chance, and attempts that cannot be carried out are skipped, so the
    result may contain fewer than ``num_changes`` rows.

    Args:
        df: Existing accounting header rows. Must provide the key columns
            ``mandt``, ``bukrs``, ``belnr`` and ``gjahr``; updates also read
            ``bktxt``.
        num_changes: Number of change attempts to make.

    Returns:
        A pandas DataFrame with one row per generated change. Insert rows are
        built from scratch; update and delete rows are copies of the first
        matching row in ``df`` with the CDC metadata columns overwritten.
        An empty DataFrame is returned when ``df`` is empty.
    """
    changes = []

    if df.empty:
        print("No existing accounting header data to modify")
        return pd.DataFrame()

    # Get existing accounting documents to modify
    existing_documents = df[["mandt", "bukrs", "belnr", "gjahr"]].drop_duplicates().values.tolist()

    # Keys that are already taken (existing or inserted in this batch), compared
    # as strings so that an "I" row is never emitted for a key that is in use.
    used_keys = {tuple(str(part) for part in doc) for doc in existing_documents}

    def _find_row(doc_key):
        """Return the first row of df matching the document key as a dict, or None."""
        matches = df[
            (df["mandt"] == doc_key[0]) &
            (df["bukrs"] == doc_key[1]) &
            (df["belnr"] == doc_key[2]) &
            (df["gjahr"] == doc_key[3])
        ]
        if matches.empty:
            # Happens e.g. when a key column holds NaN, which never compares equal.
            return None
        return matches.iloc[0].to_dict()

    for change_index in range(num_changes):
        # Choose a random action type based on probabilities (lower insert probability for accounting)
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[0.25, 0.70, 0.05]  # More updates for accounting
        )[0]

        if action_type == "I":
            # Create a new accounting document
            new_id = generate_id("", 10, random.randint(10000, 99999) + change_index)
            gjahr = str(random.randint(2023, 2024))
            document_date = random_date(datetime.now() - timedelta(days=30), datetime.now())
            company_code = random.choice(company_codes)

            new_key = (str(client), str(company_code), new_id, gjahr)
            if new_key in used_keys:
                # Skip instead of emitting an insert for a key that already exists.
                continue
            used_keys.add(new_key)

            # One timestamp so a fresh row has identical insert/update times.
            now = datetime.now()
            changes.append({
                "mandt": client,
                "bukrs": company_code,
                "belnr": new_id,
                "gjahr": gjahr,
                "blart": random.choice(["SA", "KR", "DR"]),
                "bldat": document_date,
                "budat": document_date,
                "monat": str(document_date.month).zfill(2),
                "waers": random.choice(currencies),
                "xblnr": f"REF{random.randint(1000, 9999)}",
                "bktxt": f"Accounting Document {new_id}",
                "bstat": "",
                "stjah": gjahr,
                "stblg": "",
                "xnetb": "",
                "action_type": "I",
                "row_insert_timestamp": now,
                "row_update_timestamp": now,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Update an existing accounting document
            if not existing_documents:
                continue

            random_doc = random.choice(existing_documents)
            original_row = _find_row(random_doc)
            if original_row is None:
                continue

            # Modify text field
            original_row["bktxt"] = f"Updated: {original_row['bktxt']}"

            # Update CDC metadata
            original_row["action_type"] = "U"
            original_row["row_update_timestamp"] = datetime.now()
            original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

            changes.append(original_row)
        elif action_type == "D":
            # Delete an existing accounting document (very rare)
            if existing_documents and random.random() < 0.1:  # Even further reduce delete probability for accounting
                random_doc = random.choice(existing_documents)
                existing_documents.remove(random_doc)  # Remove to prevent duplicate deletes

                original_row = _find_row(random_doc)
                if original_row is None:
                    continue

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_accounting_item_changes(df, header_df, num_changes):
    """Generate random CDC change rows for accounting document line items.

    Each of the ``num_changes`` attempts picks an action with weights
    25% insert / 70% update / 5% delete. Deletes are additionally gated by a
    10% chance, and attempts that cannot be carried out are skipped, so the
    result may contain fewer than ``num_changes`` rows.

    Args:
        df: Existing accounting line items. Must provide the key columns
            ``mandt``, ``bukrs``, ``belnr``, ``gjahr`` and ``buzei``; updates
            also read ``wrbtr``, ``dmbtr`` and ``sgtxt``.
        header_df: Accounting headers; its ``bukrs``/``belnr``/``gjahr``
            combinations are the documents new items can be attached to.
        num_changes: Number of change attempts to make.

    Returns:
        A pandas DataFrame with one row per generated change. An empty
        DataFrame is returned when ``df`` or ``header_df`` is empty.
    """
    changes = []

    if df.empty or header_df.empty:
        print("No existing accounting item data or headers to modify")
        return pd.DataFrame()

    # Get existing accounting items to modify
    existing_items = df[["mandt", "bukrs", "belnr", "gjahr", "buzei"]].drop_duplicates().values.tolist()

    # Get all accounting documents for potential new items
    all_docs = header_df[["bukrs", "belnr", "gjahr"]].drop_duplicates().values.tolist()

    # Line numbers per document as originally loaded. Built once so inserts do
    # not rescan every item, and kept separate from existing_items so that a
    # deleted line number is never handed out again within this batch.
    positions_by_doc = {}
    for _mandt, bukrs, belnr, gjahr, buzei in existing_items:
        positions_by_doc.setdefault((bukrs, belnr, gjahr), []).append(buzei)

    # Highest line number in use per document, including lines inserted by this
    # call; filled lazily the first time a document receives a new item.
    last_position = {}

    def _find_row(item_key):
        """Return the first row of df matching the item key as a dict, or None."""
        matches = df[
            (df["mandt"] == item_key[0]) &
            (df["bukrs"] == item_key[1]) &
            (df["belnr"] == item_key[2]) &
            (df["gjahr"] == item_key[3]) &
            (df["buzei"] == item_key[4])
        ]
        if matches.empty:
            # Happens e.g. when a key column holds NaN, which never compares equal.
            return None
        return matches.iloc[0].to_dict()

    for _ in range(num_changes):
        # Choose a random action type based on probabilities
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[0.25, 0.70, 0.05]  # More updates for accounting
        )[0]

        if action_type == "I":
            # Create a new accounting item
            if not all_docs:
                continue

            random_doc = random.choice(all_docs)
            doc_bukrs, doc_belnr, doc_gjahr = random_doc
            doc_key = (doc_bukrs, doc_belnr, doc_gjahr)

            # Generate a new line item number: one past the highest number seen
            # for this document, starting at 001 for documents without items.
            if doc_key not in last_position:
                last_position[doc_key] = max(
                    (int(pos) for pos in positions_by_doc.get(doc_key, [])),
                    default=0,
                )
            last_position[doc_key] += 1
            new_pos = f"{last_position[doc_key]:03d}"

            amount = float(f"{random.uniform(100, 5000):.2f}")

            kostl = generate_id("", 10, random.randint(1, 999))
            zuonr = generate_id("", 18, random.randint(1, 999))
            hkont = generate_id("", 10, random.randint(1, 999))
            prctr = generate_id("", 10, random.randint(1, 999))

            # One timestamp so a fresh row has identical insert/update times.
            now = datetime.now()
            changes.append({
                "mandt": client,
                "bukrs": doc_bukrs,
                "belnr": doc_belnr,
                "gjahr": doc_gjahr,
                "buzei": new_pos,
                "koart": random.choice(["S", "K", "D"]),
                "shkzg": "S" if int(new_pos) % 2 == 0 else "H",  # Alternate debit/credit
                "gsber": random.choice(["1000", "2000", ""]),
                "kostl": kostl,
                "wrbtr": amount,
                "dmbtr": amount,
                "mwskz": random.choice(["V1", "V2", ""]),
                "sgtxt": f"New line item for doc {doc_belnr}",
                "zuonr": zuonr,
                "hkont": hkont,
                "prctr": prctr,
                "matnr": "",
                "werks": "",
                "action_type": "I",
                "row_insert_timestamp": now,
                "row_update_timestamp": now,
                "sequence_id": random.randint(1, MAX_SEQUENCE_ID)
            })
        elif action_type == "U":
            # Update an existing accounting item
            if not existing_items:
                continue

            random_item = random.choice(existing_items)
            original_row = _find_row(random_item)
            if original_row is None:
                continue

            # Modify amount and text
            adjustment = random.uniform(0.95, 1.05)  # Small adjustments ±5%
            original_row["wrbtr"] = float(f"{original_row['wrbtr'] * adjustment:.2f}")
            original_row["dmbtr"] = float(f"{original_row['dmbtr'] * adjustment:.2f}")
            original_row["sgtxt"] = f"Updated: {original_row['sgtxt']}"

            # Update CDC metadata
            original_row["action_type"] = "U"
            original_row["row_update_timestamp"] = datetime.now()
            original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

            changes.append(original_row)
        elif action_type == "D":
            # Delete an existing accounting item (very rare)
            if existing_items and random.random() < 0.1:  # Further reduce delete probability
                random_item = random.choice(existing_items)
                existing_items.remove(random_item)  # Remove to prevent duplicate deletes

                original_row = _find_row(random_item)
                if original_row is None:
                    continue

                # Update CDC metadata for deletion
                original_row["action_type"] = "D"
                original_row["row_update_timestamp"] = datetime.now()
                original_row["sequence_id"] = random.randint(1, MAX_SEQUENCE_ID)

                changes.append(original_row)

    return pd.DataFrame(changes)

def generate_random_sales_item_changes(df, header_df, material_df, num_changes):
    """Generate random CDC change rows for sales document items.

    Each of the ``num_changes`` attempts picks an action weighted by
    ``INSERT_PROBABILITY``, ``UPDATE_PROBABILITY`` and ``DELETE_PROBABILITY``.
    Update/delete attempts whose row cannot be found in ``df`` are skipped, so
    the result may contain fewer than ``num_changes`` rows.

    Args:
        df: Existing sales items. Must provide the key columns ``mandt``,
            ``vbeln`` and ``posnr``; updates also read ``menge`` and ``netwr``.
        header_df: Sales headers; supplies the ``vbeln`` values new items can
            be attached to, plus ``waerk`` and ``kunnr`` copied onto new items.
        material_df: Materials; its ``matnr`` values are used for new items.
        num_changes: Number of change attempts to make.

    Returns:
        A pandas DataFrame with one row per generated change. An empty
        DataFrame is returned when any of the three inputs is empty.
    """
    changes = []

    if df.empty or header_df.empty or material_df.empty:
        print("No existing sales item data, headers, or materials to modify")
        return pd.DataFrame()

    # Get existing sales items to modify
    existing_items = df[["mandt", "vbeln", "posnr"]].drop_duplicates().values.tolist()

    # Get all sales documents and materials for potential new items
    all_sales_docs = header_df["vbeln"].unique().tolist()
    all_materials = material_df["matnr"].unique().tolist()

    # Item numbers per sales document as originally loaded. Built once so
    # inserts do not rescan every item, and kept separate from existing_items
    # so that a deleted item number is never handed out again within this batch.
    positions_by_doc = {}
    for _mandt, vbeln, posnr in existing_items:
        positions_by_doc.setdefault(vbeln, []).append(posnr)

    # Highest item number in use per document, including items inserted by this
    # call; filled lazily the first time a document receives a new item.
    last_position = {}

    def _find_row(item_key):
        """Return the first row of df matching the item key as a dict, or None."""
        matches = df[
            (df["mandt"] == item_key[0]) &
            (df["vbeln"] == item_key[1]) &
            (df["posnr"] == item_key[2])
        ]
        if matches.empty:
            return None
        return matches.iloc[0].to_dict()

    for _ in range(num_changes):
        # Choose a random action type based on probabilities
        action_type = random.choices(
            ["I", "U", "D"],
            weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
        )[0]

        if action_type == "I":
            # Create a new sales item
            sales_doc = random.choice(all_sales_docs)

            # Generate a new position number: ten past the highest number seen
            # for this document, starting at 000010 for documents without items.
            if sales_doc not in last_position:
                last_position[sales_doc] = max(
                    (int(pos) for pos in positions_by_doc.get(sales_doc, [])),
                    default=0,
                )
            last_position[sales_doc] += 10
            new_pos = f"{last_position[sales_doc]:06d}"

            material = random.choice(all_materials)
            quantity = random.randint(1, 100)
            price = float(f"{random.uniform(10, 1000):.2f}")
            net_value = float(f"{quantity * price:.2f}")

            # Get header info for reference
            header_info = header_df[header_df["vbeln"] == sales_doc].iloc[0]

            # One timestamp so a fresh row has identical insert/update times.
            now = datetime.now()
            changes.append({
                "mandt": client,
                "vbeln": sales_doc,
                "posnr": new_pos,
                "matnr": material,
                "werks": random.choice(plants),
                "pstyv": random.choice(["TAN", "AGN", "TANN"]),
                "menge": quantity,
                "meins": random.choice(["EA", "PC", "KG", "L", "M"]),
                "netwr": net_value,
                "waerk": header_info["waerk"],
                "kwmeng": quantity,
                "vrkme": random.choice(["EA", "PC", "KG", "L", "M"]),
                "vstel": random.choice(["1000", "2000"]),
                "lgort": random.choice(storage_locations),
                "kunnr": header_info["kunnr"],
                "uepos": "",
                "charg": "",
                "action_type": "I",
                "row_insert_timestamp": now,
                "row_update_timestamp": now,
                "sequence_id": get_next_sequence_id()
            })
        elif action_type == "U":
            # Update an existing sales item
            if not existing_items:
                continue

            random_item = random.choice(existing_items)
            original_row = _find_row(random_item)
            if original_row is None:
                # Skip if the row doesn't exist anymore
                continue

            # Modify quantity and values
            old_quantity = original_row["menge"]
            new_quantity = int(old_quantity * random.uniform(0.8, 1.2))  # Adjust by ±20%
            if new_quantity < 1:
                new_quantity = 1

            # Update net value based on quantity change
            price_per_unit = original_row["netwr"] / old_quantity if old_quantity > 0 else 0
            original_row["netwr"] = float(f"{new_quantity * price_per_unit:.2f}")
            original_row["menge"] = new_quantity
            original_row["kwmeng"] = new_quantity

            # Update CDC metadata
            original_row["action_type"] = "U"
            original_row["row_update_timestamp"] = datetime.now()
            original_row["sequence_id"] = get_next_sequence_id()

            changes.append(original_row)
        elif action_type == "D":
            # Delete an existing sales item
            if not existing_items:
                continue

            random_item = random.choice(existing_items)
            existing_items.remove(random_item)  # Remove to prevent duplicate deletes

            original_row = _find_row(random_item)
            if original_row is None:
                # Skip if the row doesn't exist anymore
                continue

            # Update CDC metadata for deletion
            original_row["action_type"] = "D"
            original_row["row_update_timestamp"] = datetime.now()
            original_row["sequence_id"] = get_next_sequence_id()

            changes.append(original_row)

    return pd.DataFrame(changes)


StatementMeta(, 6d97781e-2d3d-404b-a7c0-9abbed3e7f98, 3, Finished, Available, Finished)