## Raw client (from Confluent)

In [None]:
from confluent_kafka import Consumer, KafkaError, Message
import pandas as pd
import json

Consumer configuration

In [None]:
# Settings passed to the confluent_kafka Consumer constructor.
client_configuration = {
    'metadata.broker.list': 'localhost:9092',
    'group.id': 'confluent_client_consumer',
    'auto.offset.reset': 'latest',
}

# Topics the consumer subscribes to.
car_events_topic, weather_topic = 'car', 'weather'

# Window lengths, expressed in minutes.
short_window, long_window = 1, 10

Weather state object - holds the newest weather state

In [None]:
class WeatherState:
    """Keeps the most recent value stored under each key.

    ``update`` overwrites whatever was previously stored for a key, so only
    the last value per key is retained.
    """

    def __init__(self):
        # key -> last value passed to update() for that key
        self.state = {}

    def update(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value."""
        self.state[key] = value

    def get_current_data(self):
        """Return the stored values as a DataFrame indexed by ``'station'``.

        Each stored value becomes one row, so values are expected to be
        mappings that include a ``'station'`` entry.

        When nothing has been stored yet, an empty frame whose index is named
        ``'station'`` is returned; building the frame from an empty list and
        calling ``set_index('station')`` would raise ``KeyError``.
        """
        records = list(self.state.values())
        if not records:
            return pd.DataFrame(index=pd.Index([], name='station'))
        return pd.DataFrame(records).set_index('station')


weather_state = WeatherState()

LoopEventsWindows - performs windowing on loop events

In [None]:
from collections import deque
import pandas as pd
import time

class LoopEventsWindows:
    """Buffers loop events and slices them into trailing time windows.

    Events are appended in arrival order to a deque. Window lengths and the
    events' timestamp field are both compared against ``time.time()``, so
    they are treated as seconds on the same clock.
    """

    def __init__(self, windows):
        """
        Parameters
        ----------
        windows : sequence of numbers
            Window lengths; the largest one also decides how long events are
            kept in the buffer.
        """
        self.deque = deque()
        self.windows = windows
        self.timestamp_field = 'timestamp_out_2'
        self.ts = 0

    def indicate(self, key, event):
        """Remember the timestamp of ``event`` in ``self.ts``.

        ``key`` is accepted but not used. The timestamp is read through
        ``self.timestamp_field`` so it stays consistent with the field used
        for eviction and windowing.
        """
        self.ts = event[self.timestamp_field]

    def loop_event(self, event):
        """Append ``event`` to the end of the buffer."""
        self.deque.append(event)

    def generate_windows(self):
        """Evict expired events and return the current windows.

        Returns ``None`` when no events remain after eviction, otherwise the
        dict produced by :meth:`get_windows`.
        """
        # One clock reading for both eviction and slicing, so the two steps
        # agree on what "now" is.
        now = time.time()
        self.clean_deque(now - max(self.windows))
        if len(self.deque) == 0:
            return None
        return self.get_windows(now)

    def clean_deque(self, min_time):
        """Drop events from the front whose timestamp is below ``min_time``.

        Stops at the first event that is recent enough, so it relies on the
        buffer being ordered by timestamp.
        """
        while len(self.deque) > 0 and self.deque[0][self.timestamp_field] < min_time:
            self.deque.popleft()

    def get_windows(self, current_time=None):
        """Return ``{str(window): DataFrame}`` for every configured window.

        Each frame holds the buffered events whose timestamp lies in
        ``[current_time - window, current_time]``.

        Parameters
        ----------
        current_time : float, optional
            Reference time for the windows; defaults to ``time.time()``.

        With an empty buffer every window maps to an empty DataFrame (there
        is no timestamp column to filter on).
        """
        if current_time is None:
            current_time = time.time()
        data = pd.DataFrame(list(self.deque))
        if data.empty:
            return {str(window): pd.DataFrame() for window in self.windows}
        ts_field = data[self.timestamp_field]
        return {
            str(window): data[
                (ts_field >= current_time - window) &
                (ts_field <= current_time)
            ] for window in self.windows}
    
# Window lengths are configured in minutes; convert them to seconds here.
window_manager = LoopEventsWindows(
    [minutes * 60 for minutes in (long_window, short_window)]
)

Data join and model calculation (runs in a separate timer thread, once every `short_window` minutes, i.e. every 1 minute with the configuration above)

In [None]:
import threading

def stats(windows, label):
    """Aggregate one window's events per ``point_id``.

    Parameters
    ----------
    windows : dict
        Maps ``str(window length)`` to a DataFrame of events; the frame is
        looked up under ``str(label * 60)``.
    label : int
        Window label, used both for the lookup and as the column prefix.

    Returns
    -------
    DataFrame
        Indexed by ``point_id`` with columns ``<label>_avg_speed``,
        ``<label>_num_cars``, ``<label>_avg_gap`` and ``<label>_min_gap``.
    """
    window_events = windows[str(label * 60)]
    aggregations = {'speed': ['mean', 'count'], 'gap_meters': ['mean', 'min']}
    per_point = window_events.groupby('point_id').agg(aggregations)

    # Flatten the two-level columns; order matches the aggregation spec above.
    stat_names = ('avg_speed', 'num_cars', 'avg_gap', 'min_gap')
    per_point.columns = ['%d_%s' % (label, name) for name in stat_names]
    return per_point

def compute_model():
    """Join windowed car statistics with road points and weather, then print.

    The function re-schedules itself on a ``threading.Timer`` every
    ``short_window * 60`` seconds. The next run is scheduled before any work
    is done, so a failure further down does not cancel future runs.

    Steps:
      1. Get the current event windows; do nothing if there are none.
      2. Compute per-point statistics for the short and long windows and
         left-join the short ones onto the long ones.
      3. Join with ``road_points.csv`` (indexed by ``point_id``), re-index by
         ``nearest_weather_station`` and join the current weather data.
      4. Print the long-window car count per point.
    """
    threading.Timer(short_window * 60, compute_model).start()

    windows = window_manager.generate_windows()
    if not windows:
        return

    short_stats = stats(windows, short_window)
    long_stats = stats(windows, long_window)
    per_point = long_stats.join(short_stats, how='left')

    road_points = pd.read_csv('road_points.csv').set_index('point_id')
    # Keep point_id as a column too: the index is replaced by the station
    # below and restored from this column afterwards.
    road_points['point_id'] = road_points.index

    by_station = per_point.join(road_points).set_index('nearest_weather_station')
    result = by_station.join(weather_state.get_current_data()).set_index('point_id')

    # here goes the ML
    # Column name is derived from long_window so it keeps matching the names
    # produced by stats() if the window length is changed.
    print(result['%d_num_cars' % long_window].to_dict())


compute_model()

Streaming start

In [None]:
def load_key(key):
    """Decode a raw message key (bytes) to text."""
    return key.decode('utf-8')


def load_value(value):
    """Decode a raw message value (bytes) and parse it as JSON."""
    return json.loads(value.decode('utf-8'))


consumer = Consumer(client_configuration)
consumer.subscribe([weather_topic, car_events_topic])

try:
    while True:
        msg = consumer.poll()
        if msg is None:
            # poll() returned without a message; nothing to process.
            continue
        if msg.error():
            if msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                time.sleep(0.1)
            # An error message carries no payload for us; never fall through
            # to the topic handling below.
            continue
        if msg.topic() == weather_topic:
            weather_state.update(load_key(msg.key()), load_value(msg.value()))
        elif msg.topic() == car_events_topic:
            window_manager.loop_event(load_value(msg.value()))
except KeyboardInterrupt:
    # Interrupting the kernel is the only way out of the loop; treat it as a
    # normal stop request.
    pass
finally:
    # Always release the consumer, however the loop ends.
    consumer.close()

In [None]:
# consumer.close()