## Winton Kafka Streams

    sudo pip3 install git+https://github.com/wintoncode/winton-kafka-streams

In [None]:
import sys
# Put a local checkout of winton-kafka-streams first on sys.path; presumably
# so the imports below resolve to it ahead of any installed copy.
# NOTE(review): the path is machine-specific — adjust before running elsewhere.
sys.path.insert(0, '/home/mario/git/winton-kafka-streams/')

In [None]:
import logging
import time
from collections import deque
import pandas as pd

from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder
from winton_kafka_streams.state.simple import SimpleStore
import winton_kafka_streams.kafka_config as kafka_config
import winton_kafka_streams.kafka_streams as kafka_streams
import winton_kafka_streams.state as kafka_store

Configuration

In [None]:
# Root-logger setup: emit INFO and above.
logging.basicConfig(level=logging.INFO)

# NOTE(review): presumably maps to Kafka's `auto.offset.reset`, i.e. start
# from the newest messages when no committed offset exists — confirm.
kafka_config.AUTO_OFFSET_RESET = 'latest'

# Names of the Kafka topics carrying car-loop events and weather updates.
car_events_topic, weather_topic = 'car', 'weather'

JSON deserializer

In [None]:
import json

class ReadJson(BaseProcessor):
    """Processor that parses each incoming value as JSON before forwarding it."""

    def process(self, key, value):
        """Parse ``value`` with ``json.loads`` and forward it under the same ``key``."""
        decoded = json.loads(value)
        self.context.forward(key, decoded)

Weather state management

In [None]:
class UpdateState(BaseProcessor):
    """Processor that writes every incoming record into the ``weather_store`` store."""

    def initialise(self, name, context):
        """Run the base-class initialisation, then fetch ``weather_store`` from ``context``."""
        super().initialise(name, context)
        self.store = context.get_store('weather_store')

    def process(self, key, value):
        """Store ``value`` under the ASCII-decoded ``key`` (``key`` must support ``.decode``)."""
        station = key.decode('ascii')
        self.store.update_weather(station, value)
         
class WeatherStore:
    """In-memory store keeping the most recent weather record per station."""

    def __init__(self, name):
        """Create an empty store.

        Args:
            name: Identifier supplied by whoever constructs the store. It is
                kept on ``self.name`` so an instance can be told apart when
                debugging (the value was previously discarded).
        """
        self.name = name
        # Maps station key -> latest record passed to ``update_weather``.
        self.data = {}

    def update_weather(self, station, results):
        """Replace the stored record for ``station`` with ``results``.

        Args:
            station: Key identifying the weather station.
            results: Record to store; ``get_current`` expects each record to
                be a mapping that includes a ``'station'`` field.
        """
        self.data[station] = results
        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled.
        logging.debug(
            "Weather info for %s updated (got %d so far)!", station, len(self.data)
        )

    def initialized(self):
        """Return ``True`` once at least one station record has been stored."""
        return bool(self.data)

    def get_current(self):
        """Return all stored records as a DataFrame indexed by ``'station'``.

        Returns:
            A ``pandas.DataFrame`` with one row per stored record, indexed by
            the records' ``'station'`` field.

        Raises:
            KeyError: Raised by pandas when the records have no ``'station'``
                column, which includes the case of an empty store; check
                ``initialized()`` first.
        """
        return pd.DataFrame(list(self.data.values())).set_index('station')

Car events processor

In [None]:
class ProcessLoopEvent(BaseProcessor):
    """Bucket incoming events and periodically forward windowed snapshots.

    ``datastore`` is a deque of at most ten lists. Incoming values are added
    to the newest list; each punctuation forwards the newest list together
    with the concatenation of every retained list, then opens a fresh list.
    """

    def initialise(self, name, context):
        """Run the base initialisation, request punctuation and create the buckets."""
        super().initialise(name, context)
        # NOTE(review): presumably an interval of one second, matching the
        # '1s' / '10s' keys emitted by ``punctuate`` — confirm the unit.
        self.context.schedule(1)
        # Bounded history seeded with one empty bucket; once ten buckets are
        # held, appending another drops the oldest.
        self.datastore = deque([[]], maxlen=10)

    def punctuate(self, timestamp):
        """Forward the newest bucket and all retained buckets as a JSON string."""
        latest_bucket = self.datastore[-1]
        all_events = []
        for bucket in self.datastore:
            all_events.extend(bucket)
        payload = json.dumps({'1s': latest_bucket, '10s': all_events})
        self.context.forward(None, payload)
        # Start a new bucket for events arriving after this punctuation.
        self.datastore.append([])

    def process(self, key, value):
        """Add ``value`` to the newest bucket; ``key`` is not used."""
        current_bucket = self.datastore[-1]
        current_bucket.append(value)
                
class CalculateStatsAndJoin(BaseProcessor):
    """Compute per-point statistics for both windows and join road and weather data."""

    def stats(self, df, label):
        """Aggregate ``df`` per ``point_id``.

        Args:
            df: DataFrame with ``point_id``, ``speed`` and ``gap_meters`` columns.
            label: Integer used as the prefix of the output column names.

        Returns:
            DataFrame indexed by ``point_id`` with columns
            ``<label>_avg_speed``, ``<label>_num_cars``, ``<label>_avg_gap``
            and ``<label>_min_gap``.
        """
        aggregations = {'speed': ['mean', 'count'], 'gap_meters': ['mean', 'min']}
        per_point = df.groupby('point_id').agg(aggregations)
        # Replace the two-level (column, function) header with flat names,
        # listed in the same order as the aggregations above.
        stat_names = ['avg_speed', 'num_cars', 'avg_gap', 'min_gap']
        per_point.columns = ['%d_%s' % (label, stat_name) for stat_name in stat_names]
        return per_point

    def compute_model(self, small, large):
        """Join window statistics with road points and current weather.

        Args:
            small: Records of the short window (the ``'1s'`` payload entry).
            large: Records of the long window (the ``'10s'`` payload entry).

        Returns:
            A status string when either window is empty or the weather store
            has no data yet; otherwise a dict mapping ``point_id`` to the
            ``10_num_cars`` value.
        """
        if not (len(small) and len(large)):
            return "empty loop windows"

        weather_ready = self.context.get_store('weather_store').initialized()
        if not weather_ready:
            return "initializing"

        short_window_stats = self.stats(pd.DataFrame(small), 1)
        long_window_stats = self.stats(pd.DataFrame(large), 10)
        windows = long_window_stats.join(short_window_stats, how='left')

        # NOTE(review): the road-points CSV is re-read from a hard-coded,
        # machine-specific path on every call.
        road_points = pd.read_csv('/home/mario/pydata2017/generator/road_points.csv')
        road_points = road_points.set_index('point_id')
        # Keep point_id as a column too, so it survives re-indexing by station.
        road_points['point_id'] = road_points.index

        by_station = windows.join(road_points).set_index('nearest_weather_station')
        weather_now = self.context.get_store('weather_store').get_current()
        enriched = by_station.join(weather_now).set_index('point_id')

        car_counts = enriched['10_num_cars']
        return car_counts.to_dict()

    def process(self, key, value):
        """Parse the JSON payload and print the model result; ``key`` is not used."""
        windows = json.loads(value)
        print(self.compute_model(windows['1s'], windows['10s']))

Build topology and stream!

In [None]:
# Declare the processing graph: two sources (car events, weather), one state
# store, and the processors chained by name.
# NOTE(review): presumably the trailing string arguments name the upstream
# node (for processors) or the processors given access (for the store) —
# confirm against the winton-kafka-streams API.
with TopologyBuilder() as topology_builder:
    (
        topology_builder
        .source('loop-event-json', [car_events_topic])
        .source('weather-event-json', [weather_topic])
        .state_store('weather_store', WeatherStore, 'weather', 'stats')
        .processor('weather-event', ReadJson, 'weather-event-json')
        .processor('weather', UpdateState, 'weather-event')
        .processor('loop-event', ReadJson, 'loop-event-json')
        .processor('loops-windows', ProcessLoopEvent, 'loop-event')
        .processor('stats', CalculateStatsAndJoin, 'loops-windows')
    )

# Create the streams runner from the topology and configuration, then start it.
wks = kafka_streams.KafkaStreams(topology_builder, kafka_config)
wks.start()

In [None]:
# wks.close()