In [1]:
pip install faker confluent_kafka

Collecting faker
  Downloading faker-40.4.0-py3-none-any.whl.metadata (16 kB)
Collecting confluent_kafka
  Downloading confluent_kafka-2.13.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (32 kB)
Downloading faker-40.4.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading confluent_kafka-2.13.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faker, confluent_kafka
Successfully installed confluent_kafka-2.13.0 faker-40.4.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
import json
import os
import random
import time
import uuid
from datetime import datetime, timezone, timedelta
from faker import Faker
from confluent_kafka import Producer

# Seed every random source this notebook controls. Faker draws from its own
# generator, so seeding the stdlib `random` module alone does not make values
# such as fake.ipv4_public() repeatable. uuid4-based ids and wall-clock
# timestamps still differ between runs.
SEED = 7
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [3]:


# --- Throughput targets (events per second); override via environment ---
CLICK_RATE_PER_SEC = float(os.getenv("CLICK_RATE_PER_SEC", "50"))
ORDER_RATE_PER_SEC = float(os.getenv("ORDER_RATE_PER_SEC", "2"))

# --- Fault-injection knobs; override via environment ---
# Chance that a previously cached message is re-sent verbatim.
DUP_PROB = float(os.getenv("DUP_PROB", "0.01"))
# Chance that an event's timestamp is shifted into the past.
LATE_PROB = float(os.getenv("LATE_PROB", "0.02"))
# Largest minutes component of that backward shift (0-59 extra seconds are added).
LATE_MAX_MIN = int(os.getenv("LATE_MAX_MIN", "30"))
# Chance of emitting truncated JSON or a payload without event_id.
BAD_SCHEMA_PROB = float(os.getenv("BAD_SCHEMA_PROB", "0.005"))



In [16]:

# Serialized messages sent recently. maybe_duplicate() appends to this list,
# trims it when it grows large, and samples from it to replay exact duplicates.
recent_event_cache: list[str] = []

def now_iso():
    """Return the current UTC time as an ISO-8601 string carrying a +00:00 offset."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()

def maybe_late_ts():
    """Return an ISO-8601 UTC timestamp that is occasionally back-dated.

    With probability LATE_PROB the timestamp is moved into the past by a random
    1..LATE_MAX_MIN minutes plus a random 0..59 seconds. Otherwise the current
    time is returned via now_iso().
    """
    is_late = random.random() < LATE_PROB
    if not is_late:
        return now_iso()

    # Minutes are drawn before seconds, matching the order the random stream
    # has always been consumed in.
    lag_minutes = random.randint(1, LATE_MAX_MIN)
    lag_seconds = random.randint(0, 59)
    lag = timedelta(minutes=lag_minutes, seconds=lag_seconds)
    back_dated = datetime.now(timezone.utc) - lag
    return back_dated.isoformat()


In [5]:

def click_event():
    """Build one synthetic clickstream event.

    Returns:
        dict: keys event_id, event_type ("click"), event_ts, user_id,
        session_id, page, product_id, device ({"os", "app_ver"}) and ip, in
        that order. event_ts comes from maybe_late_ts(), so it is occasionally
        back-dated.
    """
    return {
        "event_id": str(uuid.uuid4()),
        "event_type": "click",
        # Assigned exactly once: maybe_late_ts() decides between "now" and a
        # late timestamp, so no throw-away now_iso() call is needed first.
        "event_ts": maybe_late_ts(),
        "user_id": f"u{random.randint(1, 200000)}",
        "session_id": str(uuid.uuid4()),
        "page": random.choice(["home", "search", "product", "cart", "checkout"]),
        "product_id": f"p{random.randint(1, 50000)}",
        "device": {
            "os": random.choice(["ios", "android", "web"]),
            "app_ver": f"1.{random.randint(0,9)}.{random.randint(0,9)}",
        },
        "ip": fake.ipv4_public(),
    }

In [6]:

def order_event():
    """Build one synthetic order lifecycle event.

    The lifecycle stage is drawn once, so event_type and status always describe
    the same stage (order_placed/PLACED, order_paid/PAID, order_shipped/SHIPPED).

    Returns:
        dict: keys event_id, event_type, event_ts, order_id, user_id,
        product_id, amount (rounded to 2 decimals), status and ip, in that
        order. event_ts comes from maybe_late_ts(), so it is occasionally
        back-dated.
    """
    # One draw for both fields; two independent draws made most events
    # self-contradictory (e.g. event_type "order_paid" with status "SHIPPED").
    event_type, status = random.choice(
        [
            ("order_placed", "PLACED"),
            ("order_paid", "PAID"),
            ("order_shipped", "SHIPPED"),
        ]
    )
    return {
        "event_id": str(uuid.uuid4()),
        "event_type": event_type,
        # Assigned exactly once; maybe_late_ts() already falls back to "now".
        "event_ts": maybe_late_ts(),
        "order_id": f"o{random.randint(1, 5000000)}",
        "user_id": f"u{random.randint(1, 200000)}",
        "product_id": f"p{random.randint(1, 50000)}",
        "amount": round(random.uniform(5, 500), 2),
        "status": status,
        "ip": fake.ipv4_public(),
    }

In [7]:

def maybe_make_bad_schema(payload: dict) -> str:
    """Serialize ``payload`` to JSON, occasionally corrupting it on purpose.

    A single random draw selects the outcome:
      * below BAD_SCHEMA_PROB / 2: a truncated, unparseable JSON fragment;
      * below BAD_SCHEMA_PROB: valid JSON with the "event_id" key left out;
      * otherwise: the payload serialized unchanged.

    The caller's dict is never modified.
    """
    roll = random.random()
    broken_json_cutoff = BAD_SCHEMA_PROB / 2

    if roll < broken_json_cutoff:
        return '{"event_id":'

    if roll < BAD_SCHEMA_PROB:
        # Build a filtered copy so the required field is missing downstream.
        without_id = {k: v for k, v in payload.items() if k != "event_id"}
        return json.dumps(without_id)

    return json.dumps(payload)



In [8]:

def maybe_duplicate(serialized: str) -> str:
    """Return ``serialized``, or occasionally a previously cached message instead.

    With probability DUP_PROB (and only when the cache is non-empty) a random
    entry of recent_event_cache is returned; in that case the incoming message
    is neither returned nor cached. Otherwise the incoming message is appended
    to the cache and returned unchanged.
    """
    replay = random.random() < DUP_PROB
    if replay and recent_event_cache:
        return random.choice(recent_event_cache)

    recent_event_cache.append(serialized)

    # Bound memory: once the cache passes max_cached entries, drop the oldest
    # drop_count of them in place.
    max_cached, drop_count = 10000, 5000
    if len(recent_event_cache) > max_cached:
        recent_event_cache[:drop_count] = []
    return serialized

In [9]:

def delivery_report(err, msg):
    """Delivery callback passed to produce(): print failures, stay silent on success.

    Args:
        err: None when delivery succeeded, otherwise the error to report.
        msg: the message the report refers to (unused here).
    """
    if err is None:
        return
    print(f"Delivery failed: {err}")

In [10]:

def send(topic: str, payload: dict):
    """Serialize ``payload`` (with fault injection) and enqueue it on ``topic``.

    The value goes through maybe_make_bad_schema() and maybe_duplicate(), so it
    may be malformed or a replay of an earlier message. The key is always taken
    from ``payload["event_id"]`` (a fresh UUID if that key is absent), even when
    the value sent is a replayed duplicate.

    Args:
        topic: Kafka topic to produce to.
        payload: event dict to serialize.

    Raises:
        BufferError: if the producer's local queue is still full after one
            poll-and-retry.
    """
    s = maybe_make_bad_schema(payload)
    s = maybe_duplicate(s)
    # use event_id as key when present
    key = payload.get("event_id", str(uuid.uuid4())).encode("utf-8")
    value = s.encode("utf-8")
    try:
        p.produce(topic, key=key, value=value, callback=delivery_report)
    except BufferError:
        # Local queue is full: serve delivery callbacks for up to a second to
        # free space, then retry once instead of crashing the generator.
        p.poll(1.0)
        p.produce(topic, key=key, value=value, callback=delivery_report)


In [11]:

def main():
    """Produce click and order events at the configured rates until interrupted.

    Clicks go to TOPIC_CLICK at CLICK_RATE_PER_SEC and orders to TOPIC_ORDER at
    ORDER_RATE_PER_SEC. The loop never returns on its own; it ends only when an
    exception (typically KeyboardInterrupt from interrupting the kernel)
    propagates. Whatever is still queued in the producer is flushed on the way
    out.

    Raises:
        ZeroDivisionError: if either configured rate is 0.
    """
    # Loop-invariant spacing between consecutive events of each kind.
    click_interval = 1.0 / CLICK_RATE_PER_SEC
    order_interval = 1.0 / ORDER_RATE_PER_SEC

    next_click = next_order = time.time()

    try:
        while True:
            t = time.time()

            if t >= next_click:
                send(TOPIC_CLICK, click_event())
                # Advance from the previous deadline rather than from `t`, so
                # loop and sleep latency does not lower the effective rate.
                # When more than one interval behind, restart from now instead
                # of bursting to catch up.
                next_click = max(next_click + click_interval, t)

            if t >= next_order:
                send(TOPIC_ORDER, order_event())
                next_order = max(next_order + order_interval, t)

            p.poll(0)  # serve callbacks
            # small sleep to avoid busy loop
            time.sleep(0.001)
    finally:
        # Give queued messages a bounded chance to be delivered before exit.
        remaining = p.flush(10)
        if remaining:
            print(f"{remaining} message(s) were not delivered before shutdown")

In [12]:

# Kafka connection settings; each one can be overridden through the environment.
BOOTSTRAP = os.getenv("KAFKA_BOOTSTRAP", "kafka:9092")  # value for bootstrap.servers
TOPIC_CLICK = os.getenv("TOPIC_CLICK", "clickstream_events")  # click events topic
TOPIC_ORDER = os.getenv("TOPIC_ORDER", "order_events")  # order events topic


In [13]:
# BOOTSTRAP comes from the KAFKA_BOOTSTRAP environment variable (default
# "kafka:9092") set in the configuration cell. It is not re-assigned here, so
# that override is honoured.
p = Producer({"bootstrap.servers": BOOTSTRAP})

# Re-running this cell rebinds the duplicate cache to a fresh, empty list of
# serialized payloads.
recent_event_cache = []


In [14]:
# Sanity check: display the bootstrap server address currently in effect.
BOOTSTRAP

'kafka:9092'

In [None]:
# Start producing. main() loops forever, so this cell keeps running until the
# kernel is interrupted.
main()