# Data Generation

## Factory Design
- 3 assembly lines producing the same goods
- Each assembly line has 4 machines:
  - Feeder: has a sensor that counts the raw materials fed
  - DrillCutter: has a sensor that measures temperature and vibration
  - Polisher: has a sensor that measures temperature (in °C) and vibration (in mm/s)
  - Inspector: has a sensor that counts good and defective products

- Each machine is controlled by one PLC that sends telemetry to the cloud every few seconds

## Synthetic Data Key Features
- event_time: when the machine reading was taken (in UTC)
- ingest_time: when the data reached Databricks/ADLS
- artificial latency: event_time is back-dated by a random delay so that it lags ingest_time, simulating network delay
- each PLC sends only the fields relevant to its sensors

### Folder Layout at Landing Zone in ADLS
```
landing/
└── factory_streaming/
    ├── feeder/
    │    ├── _delta_log/
    │    ├── part-00000-...snappy.parquet
    ├── drillcutter/
    │    ├── _delta_log/
    │    ├── part-00000-...
    ├── polisher/
    │    ├── _delta_log/
    │    ├── part-00000-...
    └── inspector/
         ├── _delta_log/
         ├── part-00000-...
```

In [0]:
import logging
import queue
import random
import threading
import time
import uuid
from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
from datetime import datetime, timedelta, timezone

from pyspark.sql import SparkSession

# ===========================================================
# Spark session + simulation settings
# ===========================================================
spark = SparkSession.builder.getOrCreate()

# --- Landing zone (ADLS Gen2) ------------------------------
# Each machine type writes to its own sub-folder below BASE_PATH.
STORAGE_ACC = "storageadlsmanufacture"
BASE_PATH = f"abfss://landing@{STORAGE_ACC}.dfs.core.windows.net/factory_streaming"

# --- Factory topology --------------------------------------
LINES = ["L1", "L2", "L3"]
MACHINE_TYPES = ["Feeder", "DrillCutter", "Polisher", "Inspector"]

# --- Timing and batching -----------------------------------
INTERVAL_S = 4               # seconds between two readings of the same PLC
LATENCY_MAX_S = 30           # upper bound of the artificial network delay
BUFFER_SIZE = 20             # records collected per path before a write

# --- Randomness --------------------------------------------
SEED = 2025
random.seed(SEED)            # seeds the module-level generator of `random`

# ===========================================================
# Sensor parameters (Normal distributions)
# ===========================================================
# Per machine type: mean (mu) and standard deviation (sigma) of the
# temperature and vibration readings.
NOMINALS = dict(
    DrillCutter=dict(
        temp_mu=450.0,
        temp_sigma=100.0,
        vib_mu=0.945,
        vib_sigma=0.133,
    ),
    Polisher=dict(
        temp_mu=20.0,
        temp_sigma=6.6,
        vib_mu=0.500,
        vib_sigma=0.104,
    ),
)

# ===========================================================
# Build PLC registry (3 lines × 4 machines)
# ===========================================================
# One entry per (line, machine type) pair; ids are assigned 1, 2, 3, ...
# in iteration order, so the next id is always "entries so far + 1".
PLCS = []
for line in LINES:
    for mtype in MACHINE_TYPES:
        PLCS.append({"plc_id": len(PLCS) + 1, "machine_type": mtype})

# Next unused id, kept for any later code that wants to register more PLCs.
plc_id_counter = len(PLCS) + 1

# ===========================================================
# Shared state: record queue + cooperative stop signal
# ===========================================================
# Thread-safe queue carrying (write_path, record) tuples from every PLC
# emitter to the single Spark writer.
record_queue = queue.Queue()

# Set once to ask every emitter and the writer to finish. Without such a
# signal the emitter loops never return, so the command can only be cancelled
# and the worker threads keep running afterwards.
stop_event = threading.Event()

# Wall-clock limit of one simulation run, in seconds. None means "run until
# the command is cancelled or an emitter fails".
RUN_SECONDS = None

logger = logging.getLogger("factory_simulator")


# ===========================================================
# PLC emitter function (simulates one PLC)
# ===========================================================
def plc_emitter(plc_id: int, mtype: str) -> None:
    """Emit one synthetic reading every INTERVAL_S seconds until stopped.

    Each reading is put on ``record_queue`` together with the Delta path of
    its machine type. Only the fields relevant to ``mtype`` are added to the
    common ones (snapshot_id, event_time, ingest_time, plc_id).

    Args:
        plc_id: Identifier stored in every record of this PLC.
        mtype: Machine type; selects the sensor fields and the target folder.
    """
    # Private generator per PLC: calling random.seed() here would re-seed the
    # module-level generator that all threads share.
    rng = random.Random(uuid.uuid4().int)
    write_path = f"{BASE_PATH}/{mtype.lower()}"

    while not stop_event.is_set():
        # One clock read, so event_time trails ingest_time by exactly the
        # simulated latency.
        ingest_time = datetime.now(timezone.utc)
        latency = timedelta(seconds=rng.randint(0, LATENCY_MAX_S))
        event_time = ingest_time - latency

        record = {
            "snapshot_id": str(uuid.uuid4()),
            "event_time": event_time.isoformat(),
            "ingest_time": ingest_time.isoformat(),
            "plc_id": plc_id
        }

        if mtype == "Feeder":
            record["blank_count"] = max(0, int(rng.gauss(5, 1)))
        elif mtype in NOMINALS:
            # DrillCutter and Polisher share the same fields and differ only
            # in their distribution parameters.
            p = NOMINALS[mtype]
            record["temp"] = round(rng.gauss(p["temp_mu"], p["temp_sigma"]), 3)
            record["vibration"] = round(abs(rng.gauss(p["vib_mu"], p["vib_sigma"])), 4)
        elif mtype == "Inspector":
            record["produced_count"] = max(0, int(rng.gauss(5, 1)))
            record["defective_count"] = max(0, int(rng.gauss(0.3, 0.6)))

        # Push record to queue with destination path
        record_queue.put((write_path, record))

        # Sleeps INTERVAL_S, but wakes immediately when a stop is requested.
        stop_event.wait(INTERVAL_S)


# ===========================================================
# Dedicated Spark writer (single thread for safety)
# ===========================================================
def _flush(path: str, buf: list) -> bool:
    """Append the buffered records to the Delta table at ``path``.

    The buffer is cleared only after a successful write. On failure the error
    is logged and the records stay in ``buf`` so a later flush retries them.

    Returns:
        True if the buffer is empty afterwards, False if the write failed.
    """
    if not buf:
        return True
    try:
        (
            spark.createDataFrame(buf)
            .write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .save(path)
        )
    except Exception:
        # Thread boundary: an uncaught error would end the writer silently
        # while the emitters keep filling the queue.
        logger.exception(
            "Delta append to %s failed; keeping %d record(s) for retry",
            path, len(buf),
        )
        return False
    buf.clear()
    return True


def writer_thread() -> None:
    """
    Pulls records from queue, groups them by write_path,
    and writes to Delta in batches (BUFFER_SIZE per batch).

    Whenever the queue stays empty for 2 seconds, all partially filled
    buffers are flushed. The function returns after such an idle flush once
    ``stop_event`` is set.
    """
    buffers = {}  # {path: [records]}

    while True:
        try:
            path, rec = record_queue.get(timeout=2)
        except queue.Empty:
            # Idle: flush whatever has been collected so far.
            for pending_path, pending in buffers.items():
                _flush(pending_path, pending)
            if stop_event.is_set():
                unwritten = sum(len(pending) for pending in buffers.values())
                if unwritten:
                    logger.warning(
                        "Writer stopped with %d unwritten record(s)", unwritten
                    )
                return
            continue

        buf = buffers.setdefault(path, [])
        buf.append(rec)

        # Flush if buffer full
        if len(buf) >= BUFFER_SIZE:
            _flush(path, buf)

        record_queue.task_done()


# ===========================================================
# Launch threads
# ===========================================================
writer = threading.Thread(target=writer_thread, name="delta-writer", daemon=True)
writer.start()

executor = ThreadPoolExecutor(max_workers=len(PLCS))
try:
    futures = [
        executor.submit(plc_emitter, plc["plc_id"], plc["machine_type"])
        for plc in PLCS
    ]
    # Blocks until RUN_SECONDS elapse, an emitter raises, or this thread is
    # interrupted (e.g. the command is cancelled).
    done, _pending = wait(futures, timeout=RUN_SECONDS, return_when=FIRST_EXCEPTION)
    for fut in done:
        fut.result()  # re-raise an emitter failure instead of dropping it
finally:
    # Always stop the workers, even on cancellation, so that no emitter keeps
    # producing records after this command has ended.
    stop_event.set()
    executor.shutdown(wait=True)
    # Bounded wait: the last flush runs Spark writes that may take a while.
    writer.join(timeout=120)
    if writer.is_alive():
        logger.warning("Writer thread is still flushing after 120 s")


{'application/vnd.databricks.toolz-hint+json': {'trigger': 'hungCommand'}}

Python Execution Stuck!


{'application/vnd.databricks.toolz-hint+json': {'trigger': 'hungCommand'}}

Python Execution Stuck!


{'application/vnd.databricks.toolz-hint+json': {'trigger': 'hungCommand'}}

Python Execution Stuck!


{'application/vnd.databricks.toolz-hint+json': {'trigger': 'hungCommand'}}

Python Execution Stuck!


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:139)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:139)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:136)
	at scala.collection.immutable.Range.foreach(Range.scala:192)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:721)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:441)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:441)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can