In [0]:
# 0) PREP: build a tiny “document event” dataset in driver memory & land as raw JSON on DBFS
#    (this simulates your external source)
import json, random, uuid
from datetime import datetime, timedelta

# --- configuration ---------------------------------------------------------
local_tmp = "/tmp/doc_events/"      # staging directory on the driver's local disk
dbfs_raw = "/tmp/raw/doc_events/"   # DBFS folder the raw JSON is landed in

# Begin with an empty landing folder: remove it (and anything inside it),
# then create it again.
dbutils.fs.rm(dbfs_raw, recurse=True)
dbutils.fs.mkdirs(dbfs_raw)

# Build 100 synthetic document events.
# Each event picks a random user (1-10), document (1-5), action and device
# profile, and is stamped at a whole-minute offset of 0..1440 minutes
# (both ends inclusive) after `start`.
actions = ["open", "edit", "save", "close"]
devices = [
    {"os": "Windows", "region": "NA"},
    {"os": "macOS", "region": "EU"},
    {"os": "Android", "region": "APAC"},
]
start = datetime(2025, 8, 1, 8)

rows = []
while len(rows) < 100:
    # The random draws are made in a fixed order: timestamp offset first,
    # then the fields in the order they appear in the record.
    offset_minutes = random.randint(0, 60 * 24)
    event_ts = start + timedelta(minutes=offset_minutes)
    event_id = str(uuid.uuid4())
    user_id = f"user_{random.randint(1,10)}"
    doc_id = f"doc_{random.randint(1,5)}"
    action = random.choice(actions)
    event_time = event_ts.isoformat()
    device = random.choice(devices)
    rows.append({
        "event_id": event_id,
        "user_id": user_id,
        "doc_id": doc_id,
        "action": action,
        "event_time": event_time,
        "device": device,
    })

# Stage the events on the driver's local disk as JSON Lines:
# one serialized event per line, each terminated by a newline.
import os
os.makedirs(local_tmp, exist_ok=True)   # no error if the directory already exists
local_file = local_tmp + "doc_events.json"
with open(local_file, "w") as out:
    out.writelines(json.dumps(event) + "\n" for event in rows)

# Publish the staged file into the DBFS raw folder. The "file:" prefix
# addresses the copy source on the driver's local filesystem.
staged_uri = "file:" + local_tmp + "doc_events.json"
landing_path = dbfs_raw + "doc_events.json"
dbutils.fs.cp(staged_uri, landing_path)
print("✅ Raw JSON landed at", dbfs_raw)