In [1]:
import os
from pathlib import Path
import random
from datetime import datetime, timedelta
import uuid
import pandas as pd
import json
import sys

# Value written to the "mandt" field of every record built by
# CDCGenerator._generate_insert.
client = "800"

# Relative weights handed to random.choices when an action type
# ("I", "U" or "D") is drawn for each simulated change.
DELETE_PROBABILITY = 0.05
UPDATE_PROBABILITY = 0.60
INSERT_PROBABILITY = 0.35

# Module-wide sequence counter. CDCGenerator._init_sequence_id overwrites it
# with the current time in milliseconds and get_next_sequence_id increments it.
last_sequence_id = 0


class TableConfig:
    """Settings that describe how changes are simulated for one table.

    Attributes:
        name: Table name; also used as the registry key in CDCGenerator.
        key_fields: Column names that together identify a record.
        weight: Relative share of the change budget given to this table.
        field_generators: Mapping of column name to a callable or a dict
            describing how a value is produced (empty dict when omitted).
        related_tables: Names of dependent tables processed together with
            this one (empty list when omitted).
    """

    def __init__(self, name, key_fields, weight=10, field_generators=None, related_tables=None):
        # Falsy arguments (None, empty containers) are replaced by fresh
        # containers so instances never share a mutable default.
        self.related_tables = related_tables if related_tables else []
        self.field_generators = field_generators if field_generators else {}
        self.weight = weight
        self.key_fields = key_fields
        self.name = name


class CDCGenerator:
    def __init__(self, base_seed_path="/lakehouse/default/Files/inbound-sap-seed",
                 base_cdc_path="/lakehouse/default/Files/inbound-sap-cdc"):
        self.base_seed_path = base_seed_path
        self.base_cdc_path = base_cdc_path
        self.table_configs = {}
        self.data_cache = {}
        self._init_sequence_id()

    def _init_sequence_id(self):
        global last_sequence_id
        last_sequence_id = int(datetime.now().timestamp() * 1000)

    def get_next_sequence_id(self):
        global last_sequence_id
        last_sequence_id += 1
        return last_sequence_id

    def register_table(self, table_config):
        self.table_configs[table_config.name] = table_config
        return self

    def get_parquet_files(self, table_name):
        table_path = f"{self.base_seed_path}/{table_name}"

        if not os.path.exists(table_path):
            print(f"Path not found: {table_path}")
            return []

        parquet_files = []

        for root, dirs, files in os.walk(table_path):
            for file in files:
                if file.endswith('.parquet'):
                    parquet_files.append(os.path.join(root, file))

        return parquet_files

    def read_latest_data(self, table_name):
        if table_name in self.data_cache:
            return self.data_cache[table_name]

        parquet_files = self.get_parquet_files(table_name)

        if not parquet_files:
            print(f"No parquet files found for {table_name}")
            return pd.DataFrame()

        def sort_key(file_path):
            parts = file_path.split('/')
            date_parts = []
            for part in parts:
                if part.isdigit() and (len(part) == 4 or len(part) == 2):
                    date_parts.append(part)
            
            if len(date_parts) >= 3:
                date_str = ''.join(date_parts[:3])
                return date_str + str(os.path.getctime(file_path))
            else:
                return str(os.path.getctime(file_path))
        
        latest_file = sorted(parquet_files, key=sort_key, reverse=True)[0]
        print(f"Reading latest data from: {latest_file}")

        df = pd.read_parquet(latest_file)
        self.data_cache[table_name] = df
        return df

    def write_cdc_data(self, df, table_name):
        if df.empty:
            print(f"No CDC data to write for {table_name}")
            return None

        current_time = datetime.now()
        year = current_time.strftime("%Y")
        month = current_time.strftime("%m")
        day = current_time.strftime("%d")
        timestamp = current_time.strftime("%Y%m%d_%H%M%S")

        output_path = f"{self.base_cdc_path}/{table_name}"
        Path(output_path).mkdir(parents=True, exist_ok=True)

        table_config = self.table_configs.get(table_name)
        metadata = {
            "table_name": table_name,
            "key_fields": table_config.key_fields if table_config else [],
            "cdc_timestamp": timestamp,
            "record_count": len(df),
            "insert_count": len(df[df["action_type"] == "I"]),
            "update_count": len(df[df["action_type"] == "U"]),
            "delete_count": len(df[df["action_type"] == "D"])
        }

        metadata_file = f"{output_path}/{table_name}_CDC_{timestamp}_metadata.json"
        with open(metadata_file, 'w') as f:
            json.dump(metadata, f, indent=2)

        output_file = f"{output_path}/{table_name}_CDC_{timestamp}.csv"
        df.to_parquet(output_file)

        print(f"Writing CDC data for {table_name} to {output_path}")
        return output_path

    def generate_changes(self, table_name, num_changes):
        if table_name not in self.table_configs:
            print(f"No configuration found for table {table_name}")
            return pd.DataFrame()

        table_config = self.table_configs[table_name]
        df = self.read_latest_data(table_name)

        if df.empty:
            print(f"No existing data for {table_name}")
            return pd.DataFrame()

        key_fields = table_config.key_fields
        existing_records = df[key_fields].drop_duplicates().values.tolist()

        changes = []

        for _ in range(num_changes):
            action_type = random.choices(
                ["I", "U", "D"],
                weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
            )[0]

            if action_type == "I":
                new_record = self._generate_insert(table_name, df, existing_records)
                if new_record:
                    changes.append(new_record)

            elif action_type == "U":
                updated_record = self._generate_update(table_name, df, existing_records)
                if updated_record:
                    changes.append(updated_record)

            elif action_type == "D":
                deleted_record = self._generate_delete(table_name, df, existing_records)
                if deleted_record:
                    key_values = [deleted_record[key] for key in key_fields]
                    try:
                        existing_records.remove(key_values)
                    except ValueError:
                        pass
                    changes.append(deleted_record)

        return pd.DataFrame(changes)

    def _generate_insert(self, table_name, df, existing_records):
        """Build one insert ("I") record for *table_name*.

        Each configured field generator is evaluated in order:

        * a callable is called as ``generator(self, table_name, df)``;
        * ``{"related_table": ..., "field": ...}`` picks an existing value
          from the related table's seed data;
        * ``{"values": [...]}`` picks one of the listed values;
        * ``{"range": [low, high]}`` draws a uniform float from the range.

        The ``updatable`` flag is not consulted here: it only restricts
        updates, a new row needs a value for every configured column.

        Args:
            table_name: Registered table to create a record for.
            df: Current seed data of the table (forwarded to callables).
            existing_records: Key value lists that already exist.

        Returns:
            The new record as a dict, or ``None`` when the generated key
            already exists in *existing_records*.
        """
        table_config = self.table_configs[table_name]

        new_record = {"mandt": client}

        for field, generator in table_config.field_generators.items():
            if callable(generator):
                new_record[field] = generator(self, table_name, df)
            elif not isinstance(generator, dict):
                continue
            elif "related_table" in generator:
                related_df = self.read_latest_data(generator["related_table"])
                if not related_df.empty:
                    field_values = related_df[generator["field"]].unique().tolist()
                    if field_values:
                        new_record[field] = random.choice(field_values)
            elif "values" in generator:
                if generator["values"]:
                    new_record[field] = random.choice(generator["values"])
            elif "range" in generator:
                min_val, max_val = generator["range"]
                new_record[field] = random.uniform(min_val, max_val)

        # An insert for a key that is already present would not be a valid
        # change record, so such a draw is discarded.
        new_key = [new_record.get(key) for key in table_config.key_fields]
        if new_key in existing_records:
            return None

        # One clock reading keeps the insert and update timestamps identical.
        now = datetime.now()
        new_record["action_type"] = "I"
        new_record["row_insert_timestamp"] = now
        new_record["row_update_timestamp"] = now
        new_record["sequence_id"] = self.get_next_sequence_id()

        return new_record

    def _generate_update(self, table_name, df, existing_records):
        if not existing_records:
            return None

        table_config = self.table_configs[table_name]
        key_fields = table_config.key_fields

        random_key = random.choice(existing_records)

        filter_condition = True
        for i, key_field in enumerate(key_fields):
            filter_condition = filter_condition & (df[key_field] == random_key[i])

        try:
            original_row = df[filter_condition].iloc[0].to_dict()
        except (IndexError, KeyError):
            return None

        for field, generator in table_config.field_generators.items():
            if field in key_fields:
                continue

            if callable(generator):
                original_row[field] = generator(self, table_name, df, original_row)
            elif isinstance(generator, dict) and "updatable" in generator and generator["updatable"]:
                if "values" in generator:
                    original_row[field] = random.choice(generator["values"])
                elif "range" in generator:
                    min_val, max_val = generator["range"]
                    original_row[field] = random.uniform(min_val, max_val)

        original_row["action_type"] = "U"
        original_row["row_update_timestamp"] = datetime.now()
        original_row["sequence_id"] = self.get_next_sequence_id()

        return original_row

    def _generate_delete(self, table_name, df, existing_records):
        if not existing_records:
            return None

        table_config = self.table_configs[table_name]
        key_fields = table_config.key_fields

        random_key = random.choice(existing_records)

        filter_condition = True
        for i, key_field in enumerate(key_fields):
            filter_condition = filter_condition & (df[key_field] == random_key[i])

        try:
            original_row = df[filter_condition].iloc[0].to_dict()
        except (IndexError, KeyError):
            return None

        original_row["action_type"] = "D"
        original_row["row_update_timestamp"] = datetime.now()
        original_row["sequence_id"] = self.get_next_sequence_id()

        return original_row

    def generate_transaction_group_changes(self, header_table, item_tables, num_transactions):
        header_df = self.read_latest_data(header_table)
        if header_df.empty:
            print(f"No header data for {header_table}")
            return {}

        header_config = self.table_configs[header_table]
        id_field = header_config.key_fields[1]

        all_changes = {header_table: []}
        for item_table in item_tables:
            all_changes[item_table] = []

        for _ in range(num_transactions):
            action_type = random.choices(
                ["I", "U", "D"],
                weights=[INSERT_PROBABILITY, UPDATE_PROBABILITY, DELETE_PROBABILITY]
            )[0]

            sequence_id = self.get_next_sequence_id()

            if action_type == "I":
                pass
            elif action_type == "U":
                if header_df.empty:
                    continue

                random_header = header_df.sample(1).iloc[0]
                doc_id = random_header[id_field]

                header_change = random_header.to_dict()
                header_change["action_type"] = "U"
                header_change["row_update_timestamp"] = datetime.now()
                header_change["sequence_id"] = sequence_id
                all_changes[header_table].append(header_change)

                for item_table in item_tables:
                    item_df = self.read_latest_data(item_table)
                    if item_df.empty:
                        continue

                    items = item_df[item_df[id_field] == doc_id]
                    for _, item in items.iterrows():
                        item_change = item.to_dict()
                        item_change["action_type"] = "U"
                        item_change["row_update_timestamp"] = datetime.now()
                        item_change["sequence_id"] = sequence_id
                        all_changes[item_table].append(item_change)

            elif action_type == "D":
                if header_df.empty:
                    continue

                random_header = header_df.sample(1).iloc[0]
                doc_id = random_header[id_field]

                header_change = random_header.to_dict()
                header_change["action_type"] = "D"
                header_change["row_update_timestamp"] = datetime.now()
                header_change["sequence_id"] = sequence_id
                all_changes[header_table].append(header_change)

                for item_table in item_tables:
                    item_df = self.read_latest_data(item_table)
                    if item_df.empty:
                        continue

                    items = item_df[item_df[id_field] == doc_id]
                    for _, item in items.iterrows():
                        item_change = item.to_dict()
                        item_change["action_type"] = "D"
                        item_change["row_update_timestamp"] = datetime.now()
                        item_change["sequence_id"] = sequence_id
                        all_changes[item_table].append(item_change)

        results = {}
        for table, changes in all_changes.items():
            if changes:
                results[table] = pd.DataFrame(changes)
            else:
                results[table] = pd.DataFrame()

        return results

    def run_simulation(self, changes_per_table=25):
        print(f"Starting CDC simulation with {changes_per_table} changes per table...")

        self._init_sequence_id()
        total_weight = sum(config.weight for config in self.table_configs.values())

        table_changes = {}
        for table_name, config in self.table_configs.items():
            weight = config.weight
            changes_for_table = max(1, int((weight / total_weight) * changes_per_table * 10))
            table_changes[table_name] = changes_for_table

        results = {}

        processed_tables = set()

        for table_name, config in self.table_configs.items():
            if table_name in processed_tables or not config.related_tables:
                continue

            item_tables = config.related_tables

            print(f"Generating transaction group changes for {table_name} and related tables...")
            group_changes = self.generate_transaction_group_changes(
                table_name, item_tables, table_changes[table_name]
            )

            for group_table, changes_df in group_changes.items():
                if not changes_df.empty:
                    self.write_cdc_data(changes_df, group_table)
                    results[group_table] = len(changes_df)

            processed_tables.add(table_name)
            processed_tables.update(item_tables)

        for table_name, config in self.table_configs.items():
            if table_name in processed_tables:
                continue

            print(f"Generating changes for {table_name}...")
            changes_df = self.generate_changes(table_name, table_changes[table_name])

            if not changes_df.empty:
                self.write_cdc_data(changes_df, table_name)
                results[table_name] = len(changes_df)

        total_changes = sum(results.values())
        results["total"] = total_changes

        print(f"CDC simulation complete. Generated {total_changes} total changes across all tables.")
        return results


def generate_material_number(generator, table_name, df, existing_record=None):
    """Return the record's ``matnr``, or a new ``MAT`` + 5-digit number if there is no record."""
    if existing_record is None:
        return f"MAT{random.randint(10000, 99999):05d}"
    return existing_record["matnr"]


def generate_customer_number(generator, table_name, df, existing_record=None):
    """Return the record's ``kunnr``, or a new ``CUST`` + 4-digit number if there is no record."""
    if existing_record is None:
        return f"CUST{random.randint(1000, 9999):04d}"
    return existing_record["kunnr"]


def generate_vendor_number(generator, table_name, df, existing_record=None):
    """Return the record's ``lifnr``, or a new ``VEND`` + 4-digit number if there is no record."""
    if existing_record is None:
        return f"VEND{random.randint(1000, 9999):04d}"
    return existing_record["lifnr"]


def generate_random_quantity(generator, table_name, df, existing_record=None):
    """Return a quantity for the ``menge`` field.

    Args:
        generator: Calling CDCGenerator (unused).
        table_name: Table the value is generated for (unused).
        df: Seed data of the table (unused).
        existing_record: Row being updated, or ``None`` for a new row.

    Returns:
        For an existing row, the old quantity scaled by a random factor
        between 0.8 and 1.2, truncated to an int and never below 1. For a new
        row, or when the old quantity is missing, ``None`` or NaN, a random
        int from 1 to 100.
    """
    if existing_record is not None:
        try:
            new_quantity = int(existing_record["menge"] * random.uniform(0.8, 1.2))
        except (KeyError, TypeError, ValueError, OverflowError):
            # No usable old quantity (absent, None, NaN, infinite or
            # non-numeric): generate a fresh one instead of crashing.
            return random.randint(1, 100)
        return max(1, new_quantity)
    return random.randint(1, 100)


def configure_sap_cdc_generator():
    """Create a CDCGenerator and register the ten ``sap_*`` table configs.

    Every table is keyed by "mandt" plus its own identifier column(s). Field
    generators are either callables (module functions or lambdas taking
    ``(generator, table_name, df, existing_record=None)``) or dicts with a
    ``values`` list, a ``range`` pair, or a ``related_table``/``field``
    reference. The order of the generators inside each dict is the order in
    which they are applied.

    Returns:
        The configured CDCGenerator (constructed with its default paths).
    """
    generator = CDCGenerator()

    # sap_mara: keyed by matnr; no related tables.
    generator.register_table(TableConfig(
        name="sap_mara",
        key_fields=["mandt", "matnr"],
        weight=10,
        field_generators={
            "matnr": generate_material_number,
            "mtart": {"values": ["ROH", "HALB", "FERT", "HAWA"], "updatable": True},
            "mbrsh": {"values": ["M", "W"], "updatable": False},
            "matkl": {"values": ["0100", "0200", "0300", "0400", "0500"], "updatable": True},
            "meins": {"values": ["EA", "PC", "KG", "L", "M"], "updatable": False},
            "bstme": {"values": ["EA", "PC", "KG", "L", "M"], "updatable": True},
            "volum": {"range": [0.1, 100], "updatable": True},
            "ntgew": {"range": [0.5, 45], "updatable": True}
        }
    ))

    # sap_marc: matnr is taken from existing sap_mara rows.
    generator.register_table(TableConfig(
        name="sap_marc",
        key_fields=["mandt", "matnr", "werks"],
        weight=15,
        field_generators={
            "matnr": {"related_table": "sap_mara", "field": "matnr"},
            "werks": {"values": ["1000", "2000", "3000", "4000"], "updatable": False},
            "lgpro": {"values": ["0001", "0002", "0003", "0004"], "updatable": True},
            "lgfsb": {"values": ["0001", "0002", "0003", "0004"], "updatable": True},
            "dismm": {"values": ["VB", "ND", "VM"], "updatable": True},
            "minbe": {"range": [0, 100], "updatable": True},
            "eisbe": {"range": [100, 500], "updatable": True}
        }
    ))

    # sap_kna1: keyed by kunnr.
    # NOTE(review): without an existing record, name1 calls
    # generate_customer_number again, so the number embedded in the name is
    # drawn independently of the record's own kunnr.
    generator.register_table(TableConfig(
        name="sap_kna1",
        key_fields=["mandt", "kunnr"],
        weight=8,
        field_generators={
            "kunnr": generate_customer_number,
            "land1": {"values": ["US", "DE", "FR", "GB", "IT", "ES", "JP", "CN"], "updatable": True},
            "name1": lambda g, t, df, r=None: f"Customer {r['kunnr'] if r else generate_customer_number(g, t, df)}",
            "telf1": lambda g, t, df,
                            r=None: f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            "kukla": {"values": ["01", "02", "03"], "updatable": True}
        }
    ))

    # sap_lfa1: keyed by lifnr; same name1/telf1 pattern as sap_kna1.
    generator.register_table(TableConfig(
        name="sap_lfa1",
        key_fields=["mandt", "lifnr"],
        weight=5,
        field_generators={
            "lifnr": generate_vendor_number,
            "land1": {"values": ["US", "DE", "FR", "GB", "IT", "ES", "JP", "CN"], "updatable": True},
            "name1": lambda g, t, df, r=None: f"Vendor {r['lifnr'] if r else generate_vendor_number(g, t, df)}",
            "telf1": lambda g, t, df,
                            r=None: f"{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"
        }
    ))

    # sap_vbak: header table; lists sap_vbap as its related table, so
    # run_simulation handles the pair as one transaction group.
    generator.register_table(TableConfig(
        name="sap_vbak",
        key_fields=["mandt", "vbeln"],
        weight=12,
        related_tables=["sap_vbap"],
        field_generators={
            "vbeln": lambda g, t, df, r=None: r["vbeln"] if r else f"OR{random.randint(100000, 999999):06d}",
            "auart": {"values": ["OR", "TA", "ZOR"], "updatable": True},
            "vkorg": {"values": ["1000", "2000"], "updatable": False},
            "vtweg": {"values": ["10", "20"], "updatable": False},
            "spart": {"values": ["00", "01"], "updatable": False},
            "netwr": {"range": [100, 10000], "updatable": True},
            "waerk": {"values": ["USD", "EUR", "GBP"], "updatable": False},
            "kunnr": {"related_table": "sap_kna1", "field": "kunnr"}
        }
    ))

    # sap_vbap: item table of sap_vbak (vbeln references sap_vbak).
    # NOTE(review): in the netwr lambda the inner `r["menge"] if r` branch is
    # unreachable, because the outer conditional already returns r["netwr"]
    # whenever r is truthy.
    generator.register_table(TableConfig(
        name="sap_vbap",
        key_fields=["mandt", "vbeln", "posnr"],
        weight=20,
        field_generators={
            "vbeln": {"related_table": "sap_vbak", "field": "vbeln"},
            "posnr": lambda g, t, df, r=None: r["posnr"] if r else f"{random.randint(1, 99):06d}",
            "matnr": {"related_table": "sap_mara", "field": "matnr"},
            "werks": {"values": ["1000", "2000", "3000", "4000"], "updatable": True},
            "menge": generate_random_quantity,
            "netwr": lambda g, t, df, r=None: r["netwr"] if r else random.uniform(10, 1000) * (
                r["menge"] if r else random.randint(1, 100))
        }
    ))

    # sap_ekko: header table; lists sap_ekpo as its related table.
    generator.register_table(TableConfig(
        name="sap_ekko",
        key_fields=["mandt", "ebeln"],
        weight=10,
        related_tables=["sap_ekpo"],
        field_generators={
            "ebeln": lambda g, t, df, r=None: r["ebeln"] if r else f"PO{random.randint(100000, 999999):06d}",
            "bukrs": {"values": ["1000", "2000", "3000"], "updatable": False},
            "bstyp": {"values": ["F", "L", "K"], "updatable": False},
            "lifnr": {"related_table": "sap_lfa1", "field": "lifnr"},
            "ekorg": {"values": ["1000", "2000"], "updatable": False},
            "waers": {"values": ["USD", "EUR", "GBP"], "updatable": False}
        }
    ))

    # sap_ekpo: item table of sap_ekko (ebeln references sap_ekko).
    generator.register_table(TableConfig(
        name="sap_ekpo",
        key_fields=["mandt", "ebeln", "ebelp"],
        weight=15,
        field_generators={
            "ebeln": {"related_table": "sap_ekko", "field": "ebeln"},
            "ebelp": lambda g, t, df, r=None: r["ebelp"] if r else f"{random.randint(1, 99):05d}",
            "matnr": {"related_table": "sap_mara", "field": "matnr"},
            "werks": {"values": ["1000", "2000", "3000", "4000"], "updatable": True},
            "lgort": {"values": ["0001", "0002", "0003", "0004"], "updatable": True},
            "menge": generate_random_quantity,
            "netpr": {"range": [10, 1000], "updatable": True}
        }
    ))

    # sap_bkpf: header table with a three-part document key (bukrs, belnr,
    # gjahr) besides mandt; lists sap_bseg as its related table.
    generator.register_table(TableConfig(
        name="sap_bkpf",
        key_fields=["mandt", "bukrs", "belnr", "gjahr"],
        weight=10,
        related_tables=["sap_bseg"],
        field_generators={
            "bukrs": {"values": ["1000", "2000", "3000"], "updatable": False},
            "belnr": lambda g, t, df, r=None: r["belnr"] if r else f"AC{random.randint(100000, 999999):06d}",
            "gjahr": lambda g, t, df, r=None: r["gjahr"] if r else str(random.randint(2023, 2024)),
            "blart": {"values": ["SA", "KR", "DR"], "updatable": True},
            "waers": {"values": ["USD", "EUR", "GBP"], "updatable": False},
            "bktxt": lambda g, t, df, r=None: f"Accounting Doc {r['belnr'] if r else 'New'}"
        }
    ))

    # sap_bseg: item table of sap_bkpf. bukrs, belnr and gjahr are each
    # picked from sap_bkpf independently of one another.
    # dmbtr is listed after wrbtr, so it copies whatever wrbtr value the
    # record holds at that point.
    generator.register_table(TableConfig(
        name="sap_bseg",
        key_fields=["mandt", "bukrs", "belnr", "gjahr", "buzei"],
        weight=15,
        field_generators={
            "bukrs": {"related_table": "sap_bkpf", "field": "bukrs"},
            "belnr": {"related_table": "sap_bkpf", "field": "belnr"},
            "gjahr": {"related_table": "sap_bkpf", "field": "gjahr"},
            "buzei": lambda g, t, df, r=None: r["buzei"] if r else f"{random.randint(1, 99):03d}",
            "koart": {"values": ["S", "K", "D"], "updatable": False},
            "shkzg": {"values": ["S", "H"], "updatable": True},
            "wrbtr": {"range": [100, 5000], "updatable": True},
            "dmbtr": lambda g, t, df, r=None: r["wrbtr"] if r and "wrbtr" in r else random.uniform(100, 5000)
        }
    ))

    return generator


# Entry point: runs when executed as a script or inside a Jupyter kernel.
if 'ipykernel' in sys.modules or __name__ == "__main__":
    if 'ipykernel' not in sys.modules:
        # Command-line run: the change budget comes from --changes.
        import argparse

        parser = argparse.ArgumentParser(description='Generate SAP CDC simulation data')
        parser.add_argument('--changes', type=int, default=25,
                            help='Number of changes per table (weighted by table importance)')

        args = parser.parse_args()

        cdc_generator = configure_sap_cdc_generator()
        results = cdc_generator.run_simulation(changes_per_table=args.changes)
    else:
        # Notebook run: no argument parsing, use a fixed budget.
        changes_per_table = 25

        cdc_generator = configure_sap_cdc_generator()
        results = cdc_generator.run_simulation(changes_per_table=changes_per_table)

    print("\nChange summary:")
    for table, count in results.items():
        if table == "total":
            continue
        print(f"  - {table}: {count} changes")
    print(f"Total changes: {results['total']}")


    

StatementMeta(, 4ba59e54-e416-410e-8d77-92d7228ab886, 3, Finished, Available, Finished)

Starting CDC simulation with 25 changes per table...
Generating transaction group changes for sap_vbak and related tables...
Reading latest data from: /lakehouse/default/Files/inbound-sap-seed/sap_vbak/sap_vbak_20250320_052654.parquet
Reading latest data from: /lakehouse/default/Files/inbound-sap-seed/sap_vbap/sap_vbap_20250320_052654.parquet
Writing CDC data for sap_vbak to /lakehouse/default/Files/inbound-sap-cdc/sap_vbak
Writing CDC data for sap_vbap to /lakehouse/default/Files/inbound-sap-cdc/sap_vbap
Generating transaction group changes for sap_ekko and related tables...
Reading latest data from: /lakehouse/default/Files/inbound-sap-seed/sap_ekko/sap_ekko_20250320_052654.parquet
Reading latest data from: /lakehouse/default/Files/inbound-sap-seed/sap_ekpo/sap_ekpo_20250320_052654.parquet
Writing CDC data for sap_ekko to /lakehouse/default/Files/inbound-sap-cdc/sap_ekko
Writing CDC data for sap_ekpo to /lakehouse/default/Files/inbound-sap-cdc/sap_ekpo
Generating transaction group ch