# Standalone Python Function for Unified Data Batching and Aggregation with RAPIDS cuDF

## Installation

In [1]:
# Install the `kafka` distribution, which provides the `kafka` module that the
# script cell imports `KafkaConsumer` from.
# NOTE(review): the install is unpinned; the recorded output below resolved to
# kafka-1.3.5 -- confirm this is the intended client release before re-running.
!pip install kafka

Collecting kafka
[?25l  Downloading https://files.pythonhosted.org/packages/21/71/73286e748ac5045b6a669c2fe44b03ac4c5d3d2af9291c4c6fc76438a9a9/kafka-1.3.5-py2.py3-none-any.whl (207kB)
[K    100% |████████████████████████████████| 215kB 20.1MB/s ta 0:00:01
[?25hInstalling collected packages: kafka
Successfully installed kafka-1.3.5


## Script

In [None]:
from kafka import KafkaConsumer
import os
import glob
from datetime import datetime, timedelta
import time
import itertools
import json

# Select the DataFrame factory
import cudf as pd
# import pandas as pd  # CPU alternative: uncomment this and comment out the cudf import above

# Basic configuration
metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']
batch_len = 100   # number of buffered events that triggers an aggregation
batch = list()    # raw JSON strings waiting to be aggregated
sink = 'output'   # directory that receives the aggregated Parquet files

# Kafka configuration
topic = ''
servers = []
offset = 'earliest'

def handler(event):
    '''
        Buffer one event and, once `batch_len` events are buffered,
        aggregate them per company and write the result to Parquet.

        `event` may be the raw JSON payload (str or bytes) or any object
        exposing the payload as a `.body` attribute.

        When the batch is full, the buffered JSON lines are loaded into a
        DataFrame through the module imported as `pd`, grouped by `company`,
        reduced to min / max / mean for every name in `metric_names`, and
        saved as `<sink>/<unix timestamp>.parquet`. The batch is emptied only
        after the file has been written, so a failed write keeps the events.

        Returns None.
    '''
    # Accept both event objects (`.body`) and plain payloads, because the
    # Kafka loop hands over the already-deserialized message value.
    payload = getattr(event, 'body', event)
    if isinstance(payload, (bytes, bytearray)):
        payload = payload.decode('utf-8')

    # Aggregate event JSON objects
    batch.append(payload)

    # Verify that there are enough events to perform aggregations
    if len(batch) < batch_len:
        return

    # Create a DataFrame from the batch of event JSON objects
    # (one JSON document per line)
    df = pd.read_json('\n'.join(batch), lines=True)
    df = df.reset_index(drop=True)

    # Perform aggregations
    df = df.groupby(['company']).agg(
        {name: ['min', 'max', 'mean'] for name in metric_names})

    # Flatten the (metric, statistic) column pairs into '<metric>_<statistic>'
    df.columns = [f'{metric}_{stat}' for metric, stat in list(df.columns)]

    # Save to Parquet
    os.makedirs(sink, exist_ok=True)
    filepath = os.path.join(sink, f'{time.time()}.parquet')
    df.to_parquet(filepath)

    # Reset the batch in place so every reference to the list sees it emptied
    batch.clear()

# Kafka handling
# Subscribe to `topic` on the configured brokers. `auto_offset_reset` takes
# the value of the `offset` setting (not the literal string 'offset'), and
# message values are decoded from UTF-8 bytes to str.
consumer = KafkaConsumer(
     topic,
     bootstrap_servers=servers,
     auto_offset_reset=offset,
     value_deserializer=lambda x: x.decode('utf-8'))

# Feed every consumed record's decoded value to the batching handler
for record in consumer:
    handler(record.value)