# 2a. Stream to Parquet
--------------------------------------------------------------------

Store the input stream to a set of parquet files. The purpose is to store the input stream to a log of raw events.

![Model deployment with streaming Real-time operational Pipeline](../../assets/images/model-deployment-with-streaming.png)

In [1]:
import nuclio

In [2]:
%run config.py

In [3]:
import os
os.environ["MDWS_S2P_BATCH_SIZE"] = '1024'
os.environ["MDWS_S2P_TARGET_PATH"] = MDWS_S2P_TARGET_PATH
os.environ["MDWS_S2P_INPUT_URL"] = path.join(WEB_API_USERS, STREAM_CONFIGS['generated-stream']['path']) + "@stream2pq"

In [4]:
%nuclio env BATCH_SIZE = ${MDWS_S2P_BATCH_SIZE}
%nuclio env TARGET_PATH = ${MDWS_S2P_TARGET_PATH}

%nuclio: setting 'BATCH_SIZE' environment variable
%nuclio: setting 'TARGET_PATH' environment variable


In [5]:
# Define function spec
%nuclio config kind = "nuclio"

%nuclio: setting kind to 'nuclio'


In [6]:
%%nuclio cmd -c

python -m pip install pandas
python -m pip install pyarrow

In [7]:
%%nuclio config
spec.readinessTimeoutSeconds = 200
spec.triggers.v3io_stream.kind = "v3ioStream"
spec.triggers.v3io_stream.disabled = false
spec.triggers.v3io_stream.url = "${MDWS_S2P_INPUT_URL}"
spec.triggers.v3io_stream.maxWorkers = 10
spec.triggers.v3io_stream.password = "${V3IO_ACCESS_KEY}"
spec.triggers.v3io_stream.attributes.pollingIntervalMs = 500
spec.triggers.v3io_stream.attributes.seekTo = "earliest"
spec.triggers.v3io_stream.attributes.readBatchSize = 64

%nuclio: setting spec.readinessTimeoutSeconds to 200
%nuclio: setting spec.triggers.v3io_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.v3io_stream.disabled to False
%nuclio: setting spec.triggers.v3io_stream.url to 'http://v3io-webapi:8081/users/iguazio/examples/model-deployment-with-streaming/generated-stream@stream2pq'
%nuclio: setting spec.triggers.v3io_stream.maxWorkers to 10
%nuclio: setting spec.triggers.v3io_stream.password to '4868d73d-1bd9-486e-91bd-d89ad83e3a56'
%nuclio: setting spec.triggers.v3io_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.v3io_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.v3io_stream.attributes.readBatchSize to 64


In [8]:
%nuclio mount /User ~/

mounting volume path /User as ~/


In [9]:
# nuclio: start-code

In [10]:
import os
import pandas as pd
import numpy as np
import json
import datetime

In [11]:
def init_context(context):
    setattr(context, 'batch', [])
    setattr(context, 'batch_size', int(os.getenv('BATCH_SIZE', 1024)))
    setattr(context, 'batch_count',int(os.getenv('BATCH_COUNT', 0)))
    
    pq_partitions = os.getenv('PQ_PARTITIONS')
    if pq_partitions:
        setattr(context, 'pq_partitions', pq_partitions.split(','))
    else:
        setattr(context, 'pq_partitions', pq_partitions)
    
    setattr(context, 'target_path', os.getenv('TARGET_PATH'))
    os.makedirs(context.target_path, exist_ok=True)

In [12]:
def handler(context, event):
    if type(event.body) is dict:
        event_dict = event.body
    else:
        event_dict = json.loads(event.body)
        
    context.logger.info_with('Got invoked',
                             trigger_kind=event.trigger.kind,
                             event_body=event_dict)
    
    # add the incoming event to the current batch
    context.batch.append(event_dict)
    
    #check if batch size reached
    if context.batch_size == len(context.batch):
        context.logger.info_with('Writing batch',
                                 batch_count=context.batch_count,
                                 batch_size=len(context.batch))
        write_batch(context)
        context.logger.info_with('Written batch',
                                 batch_count=context.batch_count,
                                 batch_size=len(context.batch))
        
def write_batch(context):
    file_name = str(context.worker_id)+'_'+str(context.batch_count)
    df = pd.DataFrame.from_records(context.batch)
    df.to_parquet(path=os.path.join(context.target_path, file_name), partition_cols=context.pq_partitions)
    # post write cleanup and counter update
    context.batch = []
    context.batch_count += 1

In [13]:
# nuclio: end-code

## Test Locally

In [14]:
init_context(context)
#reduce the batch size to 10
context.batch_size = 10

# trigger with 9 events:

nine_events = [b'{"user_id" : 1 , "event_type": "spin"}',
              b'{"user_id" : 2 , "event_type": "spin"}',
              b'{"user_id" : 3 , "event_type": "spin"}',
              b'{"user_id" : 4 , "event_type": "spin"}',
              b'{"user_id" : 5 , "event_type": "spin"}',
              b'{"user_id" : 6 , "event_type": "spin"}',
              b'{"user_id" : 7 , "event_type": "spin"}',
              b'{"user_id" : 8 , "event_type": "spin"}',
              b'{"user_id" : 9 , "event_type": "spin"}']

for e in nine_events:
    event = nuclio.Event(body=e)
    handler(context, event)

Python> 2020-08-04 22:00:58,014 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 1, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,015 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 2, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,016 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 3, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,017 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 4, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,017 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 5, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,018 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 6, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,019 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 7, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,019 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 8, 'event_type': 'spin'}}


In [15]:
# check weather a parquet has been created
!ls -l ${TARGET_PATH}

total 0


In [16]:
# trigger the tenth event which should trigger the creation of the parquet file.
tenth_event = b'{"user_id" : 10 , "event_type": "spin"}'
event = nuclio.Event(body=tenth_event)
handler(context, event)

Python> 2020-08-04 22:00:58,667 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 10, 'event_type': 'spin'}}
Python> 2020-08-04 22:00:58,667 [info] Writing batch: {'batch_count': 0, 'batch_size': 10}
Python> 2020-08-04 22:00:58,741 [info] Written batch: {'batch_count': 1, 'batch_size': 0}


In [17]:
# check weather a parquet has been created
!ls -l ${TARGET_PATH}

total 3
-rw-r--r-- 1 51 nogroup 2268 Aug  4 22:00 None_0


In [18]:
# cleanup
!rm ${TARGET_PATH}/*

## Deploy  function

In [19]:
%nuclio deploy -p {PROJECT_NAME} -n {V3IO_USERNAME}-stream-to-parquet

[nuclio] 2020-08-04 22:01:02,688 (info) Build complete
[nuclio] 2020-08-04 22:01:08,796 (info) Function deploy complete
[nuclio] 2020-08-04 22:01:08,802 done creating iguazio-stream-to-parquet, function address: 3.14.105.22:32249
%nuclio: function deployed
