## Winton Kafka Streams

In [1]:
import os
import sys

# Location of a local winton-kafka-streams checkout. Set the
# WINTON_KAFKA_STREAMS_PATH environment variable to point somewhere else;
# the default is the previously hard-coded path, so existing setups keep working.
WINTON_KAFKA_STREAMS_PATH = os.environ.get(
    'WINTON_KAFKA_STREAMS_PATH', '/home/mario/git/winton-kafka-streams/')
sys.path.insert(0, WINTON_KAFKA_STREAMS_PATH)

In [2]:
import logging
import time
from collections import deque
import pandas as pd

from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder
from winton_kafka_streams.state.simple import SimpleStore
import winton_kafka_streams.kafka_config as kafka_config
import winton_kafka_streams.kafka_streams as kafka_streams
import winton_kafka_streams.state as kafka_store

Configuration

In [3]:
# Log at INFO level.
logging.basicConfig(level=logging.INFO)

# Kafka topics consumed by the topology.
car_events_topic, weather_topic = 'car', 'weather'

# Offset reset policy, set on the shared kafka_config module.
kafka_config.AUTO_OFFSET_RESET = 'latest'

# Window lengths, in minutes.
short_window, long_window = 1, 10

Loading points DB

In [4]:
# Road points table indexed by point_id; the id is also kept as a regular
# (last) column so it is still available after the frame is re-indexed.
points_db = (
    pd.read_csv('road_points.csv')
    .set_index('point_id')
    .pipe(lambda frame: frame.assign(point_id=frame.index))
)

JSON deserializer

In [5]:
import json

class ReadJson(BaseProcessor):
    """Processor that parses each record value as JSON before passing it on."""

    def process(self, key, value):
        """Decode ``value`` with ``json.loads`` and forward it under the same key."""
        decoded = json.loads(value)
        self.context.forward(key, decoded)

Weather state management

In [6]:
class UpdateState(BaseProcessor):
    """Processor that writes incoming records into the ``weather_store`` state store."""

    def initialise(self, name, context):
        """Run the base initialisation, then look up the ``weather_store`` store."""
        super().initialise(name, context)
        self.store = context.get_store('weather_store')

    def process(self, key, value):
        """Hand ``value`` to the store under the ASCII-decoded record key."""
        station = key.decode('ascii')
        self.store.update_weather(station, value)
         
class WeatherStore:
    """In-memory store holding the most recent weather record per station."""

    def __init__(self, name):
        """Create an empty store.

        :param name: name the store is created with; kept on ``self.name``.
        """
        self.name = name
        self.data = {}  # station -> latest record received for that station

    def update_weather(self, station, results):
        """Replace the record held for ``station`` with ``results``."""
        self.data[station] = results
        logging.debug(f"Weather info for {station} updated (got {len(self.data)} so far)!")

    def initialized(self):
        """Return True once at least one station record has been stored."""
        return len(self.data) > 0

    def get_current(self):
        """Return the stored records as a DataFrame indexed by ``station``.

        Every stored record must provide a ``station`` field, which becomes the
        index. An empty store yields an empty frame with a ``station`` index
        rather than raising ``KeyError`` on the missing column.
        """
        if not self.data:
            return pd.DataFrame(index=pd.Index([], name='station'))
        return pd.DataFrame(list(self.data.values())).set_index('station')

Car events processor

In [7]:
class ProcessLoopEvent(BaseProcessor):
    """Buffer incoming events into short windows and emit them on each punctuation.

    Events are collected in a deque of lists, one list per short window. The
    deque is bounded so that, taken together, its lists span one long window.
    """

    def initialise(self, name, context):
        """Schedule punctuation and create the bounded window buffer."""
        super(ProcessLoopEvent, self).initialise(name, context)
        # short_window is in minutes; the schedule interval is presumably in
        # seconds -- confirm against the context API.
        self.context.schedule(short_window * 60)
        # Number of short windows that make up one long window. Using
        # long_window directly is only right when short_window == 1.
        windows_per_long_window = max(1, long_window // short_window)
        self.datastore = deque(maxlen=windows_per_long_window)
        self.datastore.append([])

    def punctuate(self, timestamp):
        """Forward the current short window and the flattened long window as JSON.

        The forwarded value is a JSON-encoded pair
        ``(short_window_data, long_window_data)``. A fresh empty short window is
        then started; once the deque is full, appending drops the oldest one.
        """
        short_window_data = self.datastore[-1]
        long_window_data = [item for sublist in self.datastore for item in sublist]
        self.context.forward(None, json.dumps((short_window_data, long_window_data)))
        self.datastore.append([])

    def process(self, key, value):
        """Append ``value`` to the short window currently being filled."""
        self.datastore[-1].append(value)
                
class CalculateStatsAndJoin(BaseProcessor):
    """Compute per-point window statistics and join them with road points and weather."""

    def stats(self, df, label):
        """Aggregate ``df`` per ``point_id``.

        :param df: frame with ``point_id``, ``speed`` and ``gap_meters`` columns.
        :param label: number used as column prefix (formatted with ``%d``).
        :return: frame indexed by ``point_id`` with columns
            ``<label>_avg_speed``, ``<label>_num_cars``, ``<label>_avg_gap``
            and ``<label>_min_gap``.
        """
        result = df.groupby('point_id').agg({'speed': ['mean', 'count'], 'gap_meters': ['mean', 'min']})
        # Flatten the aggregated columns; the order matches the agg spec above.
        result.columns = ['%d_%s' % (label, stat) for stat in ['avg_speed', 'num_cars', 'avg_gap', 'min_gap']]
        return result

    def compute_model(self, short_window_data, long_window_data):
        """Join window statistics with ``points_db`` and the current weather.

        :param short_window_data: list of event records for the short window.
        :param long_window_data: list of event records for the long window.
        :return: a status string (``"empty loop windows"`` or ``"initializing"``)
            when the inputs are not ready, otherwise a dict mapping ``point_id``
            to the number of cars seen in the long window.
        """
        if len(short_window_data) == 0 or len(long_window_data) == 0:
            return "empty loop windows"

        weather_store = self.context.get_store('weather_store')
        if not weather_store.initialized():
            return "initializing"

        short_stats = self.stats(pd.DataFrame(short_window_data), short_window)
        long_stats = self.stats(pd.DataFrame(long_window_data), long_window)
        joined_windows = long_stats.join(short_stats, how='left')
        # Re-index by weather station to attach the weather columns, then go
        # back to point_id (kept as a column in points_db) for the final result.
        result = joined_windows.join(points_db).set_index('nearest_weather_station') \
            .join(weather_store.get_current()).set_index('point_id')

        # Build the column name from long_window, using the same '%d_%s' pattern
        # as stats(), so the lookup still works if the window length changes.
        return result['%d_num_cars' % long_window].to_dict()

    def process(self, key, value):
        """Decode the JSON pair of windows and print the computed model."""
        value = json.loads(value)
        print(self.compute_model(value[0], value[1]))

Build topology and stream!

In [8]:
# Declare the two sources, the weather state store and the processor chain.
# The trailing names on each call presumably identify the upstream node(s) or,
# for the state store, the processors that use it -- confirm against TopologyBuilder.
with TopologyBuilder() as topology_builder:
    (
        topology_builder
        .source('loop-event-json', [car_events_topic])
        .source('weather-event-json', [weather_topic])
        .state_store('weather_store', WeatherStore, 'weather', 'stats')
        .processor('weather-event', ReadJson, 'weather-event-json')
        .processor('weather', UpdateState, 'weather-event')
        .processor('loop-event', ReadJson, 'loop-event-json')
        .processor('loops-windows', ProcessLoopEvent, 'loop-event')
        .processor('stats', CalculateStatsAndJoin, 'loops-windows')
    )

# Create the streams instance from the topology and start it.
wks = kafka_streams.KafkaStreams(topology_builder, kafka_config)
wks.start()

INFO:winton_kafka_streams.processor._stream_thread(Thread-4):Topics for consumer are: ['car', 'weather']
INFO:winton_kafka_streams.processor._stream_thread(Thread-4):State transition from NOT_RUNNING to RUNNING.
INFO:winton_kafka_streams.kafka_streams:State transition from CREATED to RUNNING.


In [9]:
# Uncomment and run when finished to close the KafkaStreams instance:
# wks.close()

INFO:winton_kafka_streams.kafka_streams:State transition from RUNNING to REBALANCING.
INFO:winton_kafka_streams.processor._stream_thread(Thread-4):State transition from ASSIGNING_PARTITIONS to RUNNING.
INFO:winton_kafka_streams.kafka_streams:State transition from REBALANCING to RUNNING.


initializing
{1: 21039, 3: 20502, 2: 20634}
{1: 41678, 3: 40893, 2: 40893}
{1: 62428, 3: 61589, 2: 61548}
{1: 82941, 3: 81956, 2: 81875}
{1: 103451, 3: 102630, 2: 102429}
