# Nuclio Function for Unified Data Batching and Aggregation with RAPIDS cuDF

In [1]:
# nuclio: ignore
import nuclio

## Environment

### Base Configuration

In [2]:
%%nuclio config

# Kafka trigger (disabled example -- uncomment and fill in to consume from Kafka)
# NOTE(review): the example below was normalized to the `key = value` form and
# the `spec.triggers.<name>.` prefix used by the active settings in this cell;
# verify the attribute names against the Nuclio Kafka trigger reference before
# enabling it.
# spec.triggers.hakafka.kind = "kafka"
# spec.triggers.hakafka.url = "1.1.1.1"
# spec.triggers.hakafka.attributes.topic = "haproxy"
# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]
# spec.triggers.hakafka.attributes.sasl.enable = true
# spec.triggers.hakafka.attributes.sasl.user = ""
# spec.triggers.hakafka.attributes.sasl.password = ""

# HTTP trigger named "hahttp", served on port 31001.
# NOTE(review): maxWorkers is presumably kept at 1 so that every event is
# appended to the same in-memory batch inside the handler -- confirm before
# raising it.
spec.triggers.hahttp.kind="http"
spec.triggers.hahttp.maxWorkers=1
spec.triggers.hahttp.attributes.port=31001

# Base image the function is built on (RAPIDS runtime, CUDA 10.0, CentOS 7)
spec.build.baseImage = "rapidsai/rapidsai:cuda10.0-runtime-centos7"

%nuclio: setting spec.triggers.hahttp.kind to 'http'
%nuclio: setting spec.triggers.hahttp.maxWorkers to 1
%nuclio: setting spec.triggers.hahttp.attributes.port to 31001
%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'


### Environment Variables

In [3]:
# Directory in which the function stores its aggregated Parquet files
%nuclio env SINK_PATH=./sink
# Number of buffered events that triggers an aggregation
%nuclio env INTERVAL=2
# Comma-separated names of the event fields to aggregate (min/max/mean)
%nuclio env METRIC_NAMES=cpu_utilization,latency,packet_loss,throughput

%nuclio: setting 'SINK_PATH' environment variable
%nuclio: setting 'INTERVAL' environment variable
%nuclio: setting 'METRIC_NAMES' environment variable


## Function

In [4]:
import os
import glob
from datetime import datetime, timedelta
import time
import cudf
import itertools
import json

# Directory that receives the aggregated Parquet files (SINK_PATH, default
# './sink'); it is created at load time if it does not exist yet.
sink = os.getenv('SINK_PATH', './sink')
os.makedirs(sink, exist_ok=True)

# Names of the event fields to aggregate, given as a comma-separated list.
# METRIC_NAMES is mandatory: a missing variable raises KeyError at load time.
# Whitespace around each name and empty entries (as in "a, b,") are dropped,
# because every name is later used verbatim as a column key.
metric_names = [
    name.strip()
    for name in os.environ['METRIC_NAMES'].split(',')
    if name.strip()
]

# In-memory buffer of event bodies, and the number of buffered events that
# triggers an aggregation (INTERVAL, default 100).
batch = list()
interval = int(os.getenv('INTERVAL', 100))

def _as_json_line(body):
    """Return an event body as JSON text suitable for one line of a JSON Lines document."""
    # NOTE(review): depending on the trigger and content type the runtime may
    # presumably deliver bytes or an already-decoded object instead of str;
    # confirm against the Nuclio Python runtime documentation.
    if isinstance(body, (bytes, bytearray)):
        body = bytes(body).decode('utf-8')
    if isinstance(body, str):
        # A trailing newline would turn into an empty line once bodies are joined.
        return body.strip()
    # Anything else (e.g. a dict) is serialized back to JSON text.
    return json.dumps(body)


def handler(context, event):
    """Buffer one event and aggregate the buffer once it holds `interval` events.

    Each call appends the event body to the module-level `batch`. When the
    batch reaches `interval` entries, the buffered JSON objects are loaded
    into a cuDF DataFrame, grouped by `company`, and the min/max/mean of every
    column listed in `metric_names` is written to a new Parquet file under
    `sink`. The batch is then emptied.

    Args:
        context: Nuclio function context (not used by this handler).
        event: Nuclio event; `event.body` must hold one JSON object, given as
            str, bytes, or an already-decoded JSON-serializable object.

    Returns:
        None.

    Note:
        The batch is only reset after the Parquet file has been written, so
        if loading, aggregating or writing raises, the buffered events are
        kept and the exception propagates to the caller.
    """
    global batch

    # Buffer the body as JSON text so the batch can be joined into JSON Lines.
    batch.append(_as_json_line(event.body))

    # Not enough events yet: nothing more to do for this invocation.
    if len(batch) < interval:
        return

    # Create a cuDF DataFrame from the batch of event JSON objects
    df = cudf.read_json('\n'.join(batch), lines=True)
    df = df.reset_index(drop=True)

    # Perform aggregations
    df = df.groupby(['company']).\
                agg({k: ['min', 'max', 'mean'] for k in metric_names})

    # Flatten the (metric, aggregation) column pairs into "<metric>_<aggregation>"
    df.columns = [f'{e[0]}_{e[1]}' for e in list(df.columns)]

    # Save to Parquet, using the current timestamp as the file name
    filepath = os.path.join(sink, f'{time.time()}.parquet')
    df.to_parquet(filepath)

    # Reset batch
    batch = list()

## Test

In [5]:
# nuclio: ignore
# A single event never reaches the batch threshold when INTERVAL > 1, so send
# `interval` copies of the sample event: this guarantees that at least one
# aggregation runs and one Parquet file is written to the sink.
# NOTE(review): `context` is not defined in this notebook; it is presumably
# injected into the namespace by the nuclio Jupyter extension -- confirm.
sample_body = '{"company":"Rios__Pope_and_Baird","cpu_utilization":70.6942165035,"cpu_utilization_is_error":false,"latency":3.1373003261,"latency_is_error":false,"packet_loss":0.0,"packet_loss_is_error":false,"throughput":249.7207880994,"throughput_is_error":false,"timestamp":1563795193534}'
for _ in range(interval):
    out = handler(context, nuclio.Event(body=sample_body))

# The handler returns nothing, so show the files it produced instead.
sorted(glob.glob(os.path.join(sink, '*.parquet')))

## Deploy the Function

Deploy the Nuclio function, provided a Nuclio cluster is available.

In [6]:
# Build and deploy this notebook's function as "cudf_batch_and_agg" in the
# "nvidia" project. The deploy log reports the function under the normalized
# name "cudf-batch-and-agg".
# NOTE(review): `-c` presumably creates the project if it does not exist yet --
# confirm with `%nuclio help deploy`.
%nuclio deploy -p nvidia -n cudf_batch_and_agg -c

[nuclio.deploy] 2019-08-07 12:29:26,987 (info) Building processor image


INFO:(info) Building processor image


[nuclio.deploy] 2019-08-07 12:29:30,024 (info) Pushing image


INFO:(info) Pushing image


[nuclio.deploy] 2019-08-07 12:29:30,026 (info) Build complete


INFO:(info) Build complete


[nuclio.deploy] 2019-08-07 12:29:34,085 (info) Function deploy complete


INFO:(info) Function deploy complete


[nuclio.deploy] 2019-08-07 12:29:34,096 done updating cudf-batch-and-agg, function address: 3.120.15.118:31001


INFO:done updating cudf-batch-and-agg, function address: 3.120.15.118:31001


%nuclio: function deployed
