In [0]:
# Notebook text widget "produce_time_sec" (default "300"); its value is read back
# with dbutils.widgets.get() by the producing loop further down.
dbutils.widgets.text("produce_time_sec", "300", "How long to produce events:")

In [0]:
%pip install faker confluent-kafka

## Function: Sending messages to Kafka.

In [0]:
from confluent_kafka import Producer
import json
import random

# Kafka client settings. The '$ConnectionString' username together with a
# *.servicebus.windows.net host is the Azure Event Hubs Kafka-endpoint convention.
conf = {
    # Event Hubs serves its Kafka-compatible endpoint on port 9093. Without an
    # explicit port the client falls back to the Kafka default (9092) and the
    # connection fails.
    'bootstrap.servers': '<your-eventhub>.servicebus.windows.net:9093',
    'security.protocol': 'SASL_SSL',
    'sasl.mechanism': 'PLAIN',
    'sasl.username': '$ConnectionString',
    # Placeholder for the namespace connection string.
    # NOTE(review): load the real value from a secret store rather than pasting
    # it into the notebook, so it is never committed or shown in outputs.
    'sasl.password': '<your-endpoint>',
    'client.id': 'streaming-learning'
}

# Single module-level producer shared by produce_event() and the final flush.
producer = Producer(conf)

def delivery_callback(err, msg):
    """Report the outcome of one message delivery.

    Passed as the ``callback`` argument of ``producer.produce``; it is invoked
    with the delivery result when the producer is polled or flushed.

    Args:
        err: Delivery error, or ``None`` when the message was delivered.
        msg: The message object; only ``topic()`` and ``partition()`` are read,
            and only on success.
    """
    if err is not None:
        print(f"ERROR: Message delivery failed: {err}")
    else:
        print(f"SUCCESS: Message delivered to {msg.topic()} [{msg.partition()}]")

def produce_event(event, topic = 'streaming-learning'):
    """Serialize ``event`` to JSON and publish it on ``topic``.

    The payload is sent once, and occasionally a second time to simulate
    duplicate events. The function ends with ``producer.flush()``, so the
    module-level producer is flushed on every call.

    Args:
        event: JSON-serializable object to publish.
        topic: Destination topic name.
    """
    payload = json.dumps(event)

    def _send_once():
        # Queue the payload, then poll to trigger delivery report callbacks.
        producer.produce(topic, value=payload, callback=delivery_callback)
        producer.poll(0)

    _send_once()

    # Simulate 4% of duplicate events:
    if random.uniform(0, 1) > 0.96:
        _send_once()

    producer.flush()

In [0]:
# Smoke test: publish one dummy payload to the 'test' topic.
produce_event(event={"test": "toto"}, topic='test')

## Function: Generating events.

In [0]:
from faker import Faker
from collections import OrderedDict
import re
import random
import uuid

# Shared Faker instance used by generate_event().
fake = Faker()

# Candidate values handed to fake.random_elements() in generate_event().
# The numbers are presumably relative selection weights (they do not sum to 1);
# the None key injects an occasional missing value for clean-up exercises.
platforms = OrderedDict([
    ("ios", 0.5),
    ("android", 0.3),
    ("other", 0.1),
    (None, 0.01),
])
action_types = OrderedDict([
    ("view", 0.5),
    ("log", 0.1),
    ("click", 0.3),
    (None, 0.01),
])

def generate_event(user_id, timestamp):
    """Build one synthetic event for ``user_id``.

    Args:
        user_id: Identifier stored under the ``"user_id"`` key.
        timestamp: Base event time; an offset of -5..+4 is added so that
            events can appear out of order.

    Returns:
        dict with keys ``user_id``, ``platform``, ``event_id``, ``event_date``,
        ``action`` and ``uri``. ``event_id`` is ``None`` for roughly 2% of
        events; ``platform`` and ``action`` can be ``None`` because ``None``
        is one of the candidate values.
    """
    # Draw platform, action and URI in this order (same as before) so a seeded
    # Faker produces the same sequence.
    chosen_platform = fake.random_elements(elements=platforms, length=1)[0]
    chosen_action = fake.random_elements(elements=action_types, length=1)[0]
    generated_uri = fake.uri()
    # Swap the scheme + host part for the Databricks domain, keeping the path.
    rewritten_uri = re.sub(r'https?:\/\/.*?\/', "https://databricks.com/", generated_uri)

    # Add some noise to the timestamp to simulate out-of-order events:
    noisy_timestamp = timestamp + random.randrange(10) - 5

    # Event id, null for 2% of events to leave some errors to clean up:
    if random.uniform(0, 1) < 0.98:
        event_id = str(uuid.uuid4())
    else:
        event_id = None

    return dict(
        user_id=user_id,
        platform=chosen_platform,
        event_id=event_id,
        event_date=noisy_timestamp,
        action=chosen_action,
        uri=rewritten_uri,
    )

## Producing events to Kafka.

In [0]:
import time

# How long to produce messages, in seconds of wall-clock time:
produce_time_sec = int(dbutils.widgets.get("produce_time_sec"))
# How many new users join the website per second:
user_creation_rate = 2
# Maximum time a user stays on the website (after this, the user stops producing events):
user_max_duration_time = 120

# Active sessions, keyed by user id.
users = {}

# Bound the run by a wall-clock deadline instead of an iteration count: the time
# spent producing events inside a tick would otherwise be added on top of the
# one-second sleep and the run would last longer than requested.
deadline = time.monotonic() + produce_time_sec

while time.monotonic() < deadline:
    tick_start = time.monotonic()

    # Iterate over a snapshot of the keys so expired sessions can be deleted.
    for active_user_id in list(users):
        now = int(time.time())

        if users[active_user_id]["end_time"] < now:
            del users[active_user_id]
        # 30% chance to create an event: randrange(100) yields 0..99, so
        # exactly 30 of the 100 possible values pass this test.
        elif random.randrange(100) < 30:
            produce_event(generate_event(active_user_id, now))

    # Create new users:
    for _ in range(user_creation_rate):
        user_id = str(uuid.uuid4())
        now = int(time.time())

        # end_time is when the user will leave and the session stops:
        users[user_id] = {
            "id": user_id,
            "creation_time": now,
            "end_time": now + random.randrange(user_max_duration_time),
        }

    # Sleep only for what is left of this one-second tick.
    time.sleep(max(0.0, 1.0 - (time.monotonic() - tick_start)))

# Ensure all messages are delivered before exiting
producer.flush()