### Create a stream of website visit records

In [7]:
from kafka import KafkaProducer
import datetime
import random
import json
import time

# Name of the Kafka topic the generated website-visit records are sent to.
visits_topic = "spark.streaming.website.visits"

def string_serializer(value):
    """Encode a string as UTF-8 bytes for use as a Kafka key or value.

    Args:
        value: The text to encode, or None.

    Returns:
        The UTF-8 encoded bytes, or None when ``value`` is None.
    """
    # Pass None through untouched (mirrors string_deserializer) so that a
    # record sent without a key does not fail with AttributeError.
    if value is None:
        return None
    return value.encode('utf-8')
    
# Producer pointed at the broker on localhost:9092; keys and values are both
# passed through string_serializer before being sent.
visits_producer = KafkaProducer(
    bootstrap_servers=["localhost:9092"],
    key_serializer=string_serializer,
    value_serializer=string_serializer,
)

# Candidate values the generator draws from when building a visit record.
countries = [
    "USA",
    "India",
    "Brazil",
    "Australia",
    "Russia",
]
last_actions = [
    "Catalog",
    "FAQ",
    "Order",
    "ShoppingCart",
]

# Generate 100 sample visit records.
# (The previous range(1, 100) produced only 99.)
num_records = 100

try:
    for _ in range(num_records):

        # Build the record. random.choice avoids hard-coding the list lengths,
        # so adding or removing a country/action cannot cause an IndexError
        # or silently exclude an entry.
        json_record = {
            "country": random.choice(countries),
            "last_action": random.choice(last_actions),
            "visit_date": datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),
            "duration": random.randint(1, 20),
        }

        # Use country as the key. Records with the same key go to the same
        # partition, hence updates to a given country can be handled
        # sequentially.
        kafka_key = json_record["country"]
        kafka_value = json.dumps(json_record)
        print(kafka_value)
        visits_producer.send(visits_topic, key=kafka_key, value=kafka_value)

        # Sleep for 1-3 seconds to simulate a slow trickle of visits.
        time.sleep(random.randint(1, 3))
finally:
    # Always flush and close, even if the cell is interrupted part-way through
    # (e.g. KeyboardInterrupt), so buffered records are sent and the
    # producer's connection is released.
    visits_producer.flush()
    visits_producer.close()

{"country": "Russia", "last_action": "FAQ", "visit_date": "2026-01-01 17:45:48", "duration": 20}
{"country": "Brazil", "last_action": "Order", "visit_date": "2026-01-01 17:45:50", "duration": 10}
{"country": "India", "last_action": "Catalog", "visit_date": "2026-01-01 17:45:53", "duration": 16}
{"country": "Russia", "last_action": "Catalog", "visit_date": "2026-01-01 17:45:56", "duration": 4}
{"country": "Australia", "last_action": "ShoppingCart", "visit_date": "2026-01-01 17:45:59", "duration": 5}
{"country": "Brazil", "last_action": "Order", "visit_date": "2026-01-01 17:46:01", "duration": 12}
{"country": "India", "last_action": "Order", "visit_date": "2026-01-01 17:46:04", "duration": 9}
{"country": "Brazil", "last_action": "Order", "visit_date": "2026-01-01 17:46:05", "duration": 9}
{"country": "USA", "last_action": "FAQ", "visit_date": "2026-01-01 17:46:06", "duration": 1}
{"country": "USA", "last_action": "Catalog", "visit_date": "2026-01-01 17:46:09", "duration": 5}
{"country": 

KeyboardInterrupt: 

In [5]:
from kafka import KafkaConsumer
import uuid

def string_deserializer(value):
    """Decode UTF-8 bytes into a string.

    Args:
        value: The raw bytes to decode, or None.

    Returns:
        The decoded text, or None when ``value`` is None.
    """
    if value is None:
        return None
    return value.decode("utf-8")

# Use a fresh, random consumer group on every run. A new group has no
# committed offsets, so auto_offset_reset="earliest" makes the consumer start
# from the beginning of the topic.
group = f"visit-consumer-test-{uuid.uuid4()}"

consumer = KafkaConsumer(
    visits_topic,
    bootstrap_servers=["localhost:9092"],
    value_deserializer=string_deserializer,
    auto_offset_reset="earliest",
    enable_auto_commit=False,
    group_id=group,
)

try:
    # Poll a few times to allow partition assignment; stop at the first poll
    # that returns any records.
    for _ in range(5):
        msgs = consumer.poll(timeout_ms=2000)
        total = sum(len(v) for v in msgs.values())
        if total:
            break

    print("Group:", group)
    # Count of records in the last poll batch only (not the whole topic).
    print("Total messages:", total)

    # msgs maps each TopicPartition to the list of records fetched from it.
    for tp, records in msgs.items():
        for r in records:
            print(f"{tp.topic}:{tp.partition}:{r.offset} value={r.value}")
finally:
    # Release the connection and leave the throwaway group even if polling
    # or printing fails; previously the consumer was never closed.
    consumer.close()


Group: visit-consumer-test-ac09e616-399d-4b44-a18f-a49837f3d3fc
Total messages: 99
spark.streaming.website.visits:0:0 value={"country": "USA", "last_action": "Catalog", "visit_date": "2025-12-31 19:44:53", "duration": 19}
spark.streaming.website.visits:0:1 value={"country": "Brazil", "last_action": "Order", "visit_date": "2025-12-31 19:44:54", "duration": 9}
spark.streaming.website.visits:0:2 value={"country": "Brazil", "last_action": "Catalog", "visit_date": "2025-12-31 19:44:55", "duration": 5}
spark.streaming.website.visits:0:3 value={"country": "Brazil", "last_action": "FAQ", "visit_date": "2025-12-31 19:44:57", "duration": 8}
spark.streaming.website.visits:0:4 value={"country": "Russia", "last_action": "ShoppingCart", "visit_date": "2025-12-31 19:45:00", "duration": 11}
spark.streaming.website.visits:0:5 value={"country": "Australia", "last_action": "FAQ", "visit_date": "2025-12-31 19:45:03", "duration": 7}
spark.streaming.website.visits:0:6 value={"country": "Brazil", "last_actio