# Parquet Generator

In [8]:
import os

import nuclio
import pandas as pd

In [2]:
# Environment variables for the function. init_context() reads BATCH_SIZE,
# TARGET_PATH, OUTPUT_STREAM_PATH and V3IO_ACCESS_KEY.
# NOTE(review): init_context() also reads CONTAINER (no default) and BATCH_COUNT
# (default 0); neither is set in this cell — confirm CONTAINER is supplied elsewhere.
# NOTE(review): SHARDS_COUNT and SHARD_KEY are set here, but the lines that read
# them in init_context() are commented out.
%nuclio env BATCH_SIZE = 1024
%nuclio env TARGET_PATH = /v3io/bigdata/rapid-prototype/events-pq-arthur
%nuclio env FILE_NAME = test_arthur
%nuclio env OUTPUT_STREAM_PATH = ${V3IO_USERNAME}/examples/rapid-prototype/enriched-events-stream
%nuclio env SHARDS_COUNT = 8
%nuclio env SHARD_KEY = user_id
%nuclio env V3IO_ACCESS_KEY = ${V3IO_ACCESS_KEY}


# Optional: %nuclio env PQ_PARTITIONS = part1,part2

%nuclio: setting 'BATCH_SIZE' environment variable
%nuclio: setting 'TARGET_PATH' environment variable
%nuclio: setting 'FILE_NAME' environment variable
%nuclio: setting 'OUTPUT_STREAM_PATH' environment variable
%nuclio: setting 'SHARDS_COUNT' environment variable
%nuclio: setting 'SHARD_KEY' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable


In [3]:
# Define function spec
%nuclio config kind = "nuclio"

%nuclio: setting kind to 'nuclio'


In [4]:
%%nuclio cmd

python -m pip install pandas
python -m pip install pyarrow
python -m pip install fastparquet
python -m pip install v3io




In [5]:
# Generate a synthetic events parquet file for the function to replay.
# Columns: user_id ("0", "1", ...) and event_type ("spin0", "spin1", ...).
n_events = 10000000

# Honour the FILE_NAME variable from the %nuclio env cell; fall back to the
# name this cell used to hard-code.
file_name = os.getenv('FILE_NAME', 'test_arthur')
target_path = os.getenv('TARGET_PATH')
if not target_path:
    # Fail with a clear message instead of a TypeError from os.path.join(None, ...).
    raise EnvironmentError("TARGET_PATH is not set; run the '%nuclio env' cell first")
os.makedirs(target_path, exist_ok=True)

# Build the columns directly instead of materialising ten million dicts first;
# the resulting frame has the same columns and values.
user_ids = pd.Series(range(n_events)).astype(str)
df = pd.DataFrame({"user_id": user_ids, "event_type": "spin" + user_ids})
df.to_parquet(path=os.path.join(target_path, file_name), compression='gzip')


NameError: name 'os' is not defined

In [None]:
%%nuclio config
spec.readinessTimeoutSeconds = 200

In [None]:
%nuclio mount /User ~/

In [None]:
# nuclio: start-code

In [None]:
import os
import pandas as pd
import numpy as np
import json
import datetime
import fastparquet
import v3io.dataplane


In [9]:
def init_context(context):
    """Attach runtime settings and a v3io client to the function context.

    Values come from environment variables:
      * V3IO_ACCESS_KEY     -> access key passed to the v3io dataplane client
      * BATCH_SIZE          -> context.batch_size (int, default 1024)
      * BATCH_COUNT         -> context.batch_count (int, default 0)
      * CONTAINER           -> context.container
      * OUTPUT_STREAM_PATH  -> context.output_stream_path
      * TARGET_PATH         -> context.target_path (directory is created if missing)

    context.batch starts as an empty list. SHARDS_COUNT / SHARD_KEY are not
    loaded here.
    """
    env = os.getenv

    context.batch = []

    # Build the client first so a bad BATCH_SIZE/BATCH_COUNT still fails after it.
    access_key = env('V3IO_ACCESS_KEY')
    client = v3io.dataplane.Client(endpoint='http://v3io-webapi:8081', access_key=access_key)

    context.batch_size = int(env('BATCH_SIZE', 1024))
    context.batch_count = int(env('BATCH_COUNT', 0))

    context.v3io_client = client
    context.container = env('CONTAINER')
    context.output_stream_path = env('OUTPUT_STREAM_PATH')

    context.target_path = env('TARGET_PATH')
    os.makedirs(context.target_path, exist_ok=True)
    

In [None]:
def handler(context, event):
    """Replay the rows of a parquet file into the output v3io stream.

    The event body is either a dict or a JSON document with:
      * ``file``          - path handed to ``fastparquet.ParquetFile`` (required)
      * ``partition_key`` - optional value forwarded to ``event_to_record``

    Each row group is split into chunks of ``chunk_size`` rows; every row is
    serialised with ``event_to_record`` and each chunk is sent with a single
    ``put_records`` call to ``context.container`` / ``context.output_stream_path``.

    Raises:
        KeyError: if the body has no ``file`` entry.
    """
    body = event.body
    event_dict = body if isinstance(body, dict) else json.loads(body)

    parquet = event_dict["file"]
    # The key is optional: a missing or empty value means "no partition key".
    # NOTE(review): the same value is attached to every record; if it is meant to
    # name a column (e.g. SHARD_KEY=user_id) the per-row value should be used — confirm.
    partition_key = event_dict.get("partition_key") or None

    chunk_size = 1000  # rows sent per put_records call
    pfile = fastparquet.ParquetFile(parquet)

    for group_idx, df in enumerate(pfile.iter_row_groups()):
        for start in range(0, df.shape[0], chunk_size):
            frame = df[start:start + chunk_size]
            records = [
                event_to_record(row.to_dict(), partition_key)
                for _, row in frame.iterrows()
            ]

            # Errors are not raised by the client; the status is inspected below.
            resp = context.v3io_client.put_records(container=context.container,
                                                   path=context.output_stream_path,
                                                   records=records,
                                                   raise_for_status=v3io.dataplane.RaiseForStatus.never)

            if resp.status_code >= 300:
                # Surface failed writes instead of dropping them silently.
                print(f'put_records failed: status={resp.status_code} '
                      f'row_group={group_idx} start_row={start} records={len(records)}')
            elif group_idx % 10 == 0:
                # Progress line for every tenth row group:
                # status, index label of the last row sent, records in the chunk.
                print(f'{resp.status_code} {frame.index[-1]} {len(records)}')
        
    
        
        
    
def _json_default(value):
    """Fallback encoder for values json cannot serialise natively."""
    # numpy scalars expose .item() returning the native Python value;
    # anything else (timestamps, dates, ...) is stored as its string form.
    if hasattr(value, 'item'):
        return value.item()
    return str(value)


def event_to_record(event_dict, partition_key=None):
    """Turn an event dict into a stream record for ``put_records``.

    Args:
        event_dict: the event payload; serialised to JSON into ``data``.
        partition_key: optional key; when given it is stored as a string
            under ``partition_key``.

    Returns:
        dict with ``data`` and, only when a key was supplied, ``partition_key``.
        When no key is supplied the entry is omitted instead of sending the
        literal string ``'None'`` for every record.
    """
    record = {'data': json.dumps(event_dict, default=_json_default)}
    if partition_key is not None:
        record['partition_key'] = str(partition_key)
    return record
        
    

In [6]:
# nuclio: end-code

## Test Locally

In [9]:
%%time
file_name = "test_arthur"
target_path = os.getenv('TARGET_PATH')
init_context(context)

path=os.path.join(target_path, file_name)
print(path)
context.file = path
#event = nuclio.Event(body={"file" :  context.file, "partition_key" : None})


handler(context, event)

NameError: name 'init_context' is not defined

# check whether a parquet has been created
!ls -l ~/examples/rapid-prototype/events-pq

In [15]:
# trigger the tenth event which should trigger the creation of the parquet file.
# NOTE(review): this payload has no "file" key, which `handler` in this notebook
# reads unconditionally (event_dict["file"]), and the recorded output below
# ("Writing batch" / "Written batch") is not produced by that handler. This cell
# looks like a leftover from a different function — confirm, then update or remove.
tenth_event = b'{"user_id" : 10 , "event_type": "spin"}'
event = nuclio.Event(body=tenth_event)
handler(context, event)

Python> 2020-06-21 14:38:14,748 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 10, 'event_type': 'spin'}}
Python> 2020-06-21 14:38:14,749 [info] Writing batch: {'batch_count': 0, 'batch_size': 10}
Python> 2020-06-21 14:38:14,874 [info] Written batch: {'batch_count': 1, 'batch_size': 0}


In [16]:
# check whether a parquet has been created
!ls -l ~/examples/rapid-prototype/events-pq/

total 0
-rw-r--r-- 1 iguazio iguazio 2268 Jun 21 14:38 None_0


In [17]:
# cleanup
!rm ~/examples/rapid-prototype/events-pq/*

## Deploy function

In [12]:
%nuclio deploy -p rapid-prototype -n stream-to-parquet

[nuclio] 2020-06-22 14:22:17,910 (info) Build complete
[nuclio] 2020-06-22 14:22:23,976 (info) Function deploy complete
[nuclio] 2020-06-22 14:22:23,983 done creating stream-to-parquet, function address: 192.168.226.12:30583
%nuclio: function deployed
