# Nuclio - Generator

## Set up the environment

In [1]:
# nuclio: ignore
import nuclio

### Define environment variables

In [2]:
%%nuclio env

# Iguazio access
FRAMESD=${V3IO_FRAMESD}
V3IO_USERNAME=${V3IO_USERNAME}
V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

# Function variables
BATCH_SIZE=100
DATA_STREAM=customers_stream
london_locations=03311311313011311011000321002320,03311311311233323013031101320003,03311311313010023000032330133111,03311311311222300331010333220231
ACCURACY=20
BASE_ACCURACY=14

%nuclio: setting 'FRAMESD' environment variable
%nuclio: setting 'V3IO_USERNAME' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable
%nuclio: setting 'BATCH_SIZE' environment variable
%nuclio: setting 'DATA_STREAM' environment variable
%nuclio: setting 'london_locations' environment variable
%nuclio: setting 'ACCURACY' environment variable
%nuclio: setting 'BASE_ACCURACY' environment variable


%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line


### Install packages

In [3]:
%%nuclio cmd

# General
pip install pandas
# pip install json

# DB
pip install v3io_frames

# Function
pip install faker



## Function code

### Imports

In [4]:
import os
import itertools
import random

# Data handling
import pandas as pd

# DB
import v3io_frames as v3f

# Function
import faker
from faker.providers import BaseProvider

### Helper classes definitions

In [5]:
class LocationProvider(BaseProvider):
    '''
    Faker provider that produces a random location code under a base location.

    A location is a string of digits; the generated code keeps a prefix of the
    base location and is padded with random digits in the range 0-3.

    Background on QuadTree geohashing:
        @{http://tech.taskrabbit.com/blog/2015/06/09/elasticsearch-geohash-vs-geotree/}
        @{http://mapzen.github.io/leaflet-spatial-prefix-tree/}
        @{http://blog.notdot.net/2009/11/Damn-Cool-Algorithms-Spatial-indexing-with-Quadtrees-and-Hilbert-Curves}
    '''
    def location(self, location_base: str, base_acc: int=10, acc: int=20):
        '''
        Build a location code of length `acc` that shares a prefix with `location_base`.

        :param location_base: location code whose leading characters are kept
        :param base_acc: number of leading characters of `location_base` to keep
        :param acc: target length of the returned code
        :return: the kept prefix followed by random digits (0-3); when the prefix
                 is already `acc` characters or longer it is returned unchanged
        '''
        prefix = location_base[:base_acc]
        # A non-positive count yields an empty range, so nothing is appended.
        missing = acc - len(prefix)
        suffix = ''.join(str(random.randint(0, 3)) for _ in range(missing))
        return prefix + suffix

### Init context

In [40]:
def init_context(context):
    """Populate the nuclio context with the clients and settings the handler uses.

    Attributes set on ``context``:
        v3f: v3io_frames client.
        customers_table: customers KV table name (env ``CUSTOMERS``,
            default ``customers``).
        customers_stream: output stream name (env ``CUSTOMERS_STREAM``, then
            ``DATA_STREAM``, default ``customers_stream``).
        faker: Faker instance with ``LocationProvider`` registered.
        locations: non-empty list of base location codes parsed from the
            comma-separated env ``london_locations``.
        customers: ``range(n)`` where ``n`` is the number of rows read from
            the customers table.
        location_accuracy_params: ``[BASE_ACCURACY, ACCURACY]`` as ints
            (defaults 14 and 20).

    Side effects:
        Deletes the customers stream and re-creates it with one shard and
        48 retention hours.

    Raises:
        KeyError: neither ``V3IO_FRAMESD`` nor ``FRAMESD`` is set.
        ValueError: ``london_locations`` contains no non-empty entry.
    """
    ##########
    # Setup  #
    ##########

    # DB contexts
    # The notebook's `%%nuclio env` cell exports the frames address as FRAMESD,
    # while the Jupyter environment exposes V3IO_FRAMESD; accept either.
    framesd_address = os.getenv('V3IO_FRAMESD') or os.getenv('FRAMESD')
    if not framesd_address:
        raise KeyError('V3IO_FRAMESD')
    v3c_frames = v3f.Client('http://' + framesd_address)
    context.v3f = v3c_frames

    # DB tables
    customers_table = os.getenv('CUSTOMERS', 'customers')
    context.customers_table = customers_table

    # CUSTOMERS_STREAM wins when set; DATA_STREAM is the name the env cell defines.
    customers_stream = os.getenv('CUSTOMERS_STREAM',
                                 os.getenv('DATA_STREAM', 'customers_stream'))
    context.customers_stream = customers_stream

    # Function
    fakers = faker.Faker()
    fakers.add_provider(LocationProvider)
    context.faker = fakers

    # ''.split(',') returns [''], which would make every generated location
    # purely random; drop blank entries and fail fast when nothing is left.
    raw_locations = os.getenv('london_locations', '')
    locations = [loc.strip() for loc in raw_locations.split(',') if loc.strip()]
    if not locations:
        raise ValueError(
            "environment variable 'london_locations' must list at least one base location")
    context.locations = locations

    # Only the row count is used: customer ids are drawn from 0..n-1.
    # NOTE(review): assumes the table's ids are exactly 0..n-1 - confirm.
    customers_frame = v3c_frames.read('kv', customers_table)
    context.customers = range(customers_frame.shape[0])

    context.location_accuracy_params = [
        int(os.getenv('BASE_ACCURACY', 14)),
        int(os.getenv('ACCURACY', 20)),
    ]

    ###########
    # Actions #
    ###########
    # Start from a fresh stream on every initialisation.
    # NOTE(review): delete may fail when the stream does not exist yet - confirm
    # against the installed v3io_frames version.
    v3c_frames.delete('stream', customers_stream)
    v3c_frames.create('stream', customers_stream, attrs={'retention_hours':48,'shards':1})

### Helper functions

In [7]:
def create_customer(context):
    """Generate one random customer-location record.

    Picks a random id from ``context.customers`` and a random base location
    from ``context.locations``, then asks ``context.faker.location`` for a
    location code using ``context.location_accuracy_params`` as the remaining
    positional arguments.

    :param context: object exposing ``customers``, ``locations``, ``faker``
                    and ``location_accuracy_params``
    :return: dict with keys ``'id'`` and ``'location'``
    """
    # Draw the id before the base location, keeping the random-call order stable.
    chosen_id = random.choice(context.customers)
    base_location = random.choice(context.locations)
    generated = context.faker.location(base_location, *context.location_accuracy_params)
    return {'id': chosen_id, 'location': generated}

In [8]:
def create_batch(context, batch_size: int):
    """Build a DataFrame of `batch_size` random customer records.

    :param context: passed through to ``create_customer`` for every record
    :param batch_size: number of records to generate; zero or a negative
                       value produces an empty batch
    :return: DataFrame indexed by ``id`` with a ``location`` column
    """
    records = [create_customer(context) for _ in range(batch_size)]
    if not records:
        # With no rows the frame has no 'id' column and set_index would raise
        # KeyError; return an empty frame with the same layout instead.
        return pd.DataFrame(columns=['location'], index=pd.Index([], name='id'))
    return pd.DataFrame.from_records(records).set_index(['id'])

### Handler

In [37]:
def handler(context, event):
    """Generate one batch of customer locations and write it to the stream.

    The batch size is read from the ``BATCH_SIZE`` environment variable on
    every invocation (default 100). The event itself is not inspected.

    :param context: object carrying ``v3f`` (frames client) and
                    ``customers_stream`` (target stream name)
    :param event: triggering event; unused
    """
    # Create customers
    batch_size = int(os.getenv('BATCH_SIZE', 100))
    batch = create_batch(context, batch_size)

    # Send to stream (the frames are passed as a one-element list)
    frames = [batch]
    context.v3f.write('stream', context.customers_stream, frames)

In [10]:
%nuclio show

%nuclio: notebook generator exported
Config:
apiVersion: nuclio.io/v1
kind: Function
metadata:
  annotations:
    nuclio.io/generated_by: function generated at 22-04-2019 by iguazio from /User/tutorials/demos/location_based_recommendation/generator.ipynb
  labels: {}
  name: generator
spec:
  build:
    commands:
    - pip install pandas
    - pip install v3io --upgrade
    - pip install v3io_frames
    - pip install faker
    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDE5LTA0LTIyIDEyOjIwCgppbXBvcnQgb3MKaW1wb3J0IGl0ZXJ0b29scwppbXBvcnQgcmFuZG9tCgppbXBvcnQgcGFuZGFzIGFzIHBkCgppbXBvcnQgdjNpb19mcmFtZXMgYXMgdjNmCmltcG9ydCB2M2lvCmltcG9ydCB2M2lvLmRhdGFwbGFuZQoKaW1wb3J0IGZha2VyCmZyb20gZmFrZXIucHJvdmlkZXJzIGltcG9ydCBCYXNlUHJvdmlkZXIKCmNsYXNzIExvY2F0aW9uUHJvdmlkZXIoQmFzZVByb3ZpZGVyKToKICAgICcnJwogICAgQ3JlYXRlcyBsb2NhdGlvbnMgd2l0aGluIGJhc2VfbG9jYXRpb24KCiAgICBVc2VzIFF1YWRUcmVlIGZvciBHZW9oYXNoaW5nCiAgICAgICAgQHtodHRwOi8vdGVjaC50YXNrcmFiYml0LmNvbS9ibG9nLzIwM

In [50]:
%nuclio deploy -n generator -p recommendation_engine -c

[nuclio.deploy] 2019-04-23 11:25:27,434 (info) Building processor image
[nuclio.deploy] 2019-04-23 11:25:32,485 (warn) Docker command outputted to stderr - this may result in errors
[nuclio.deploy] 2019-04-23 11:25:32,486 (warn) Create function failed failed, setting function status
[nuclio.deploy] 2019-04-23 11:25:32,486 
Error - exit status 1
    .../nuclio/nuclio/pkg/cmdrunner/cmdrunner.go:131

Call stack:
stdout:
Sending build context to Docker daemon  22.12MB
Step 1/12 : FROM python:3.6-alpine
 ---> 1d981af1e3b4
Step 2/12 : ARG NUCLIO_LABEL
 ---> Using cache
 ---> ee4f45c7960f
Step 3/12 : ARG NUCLIO_ARCH
 ---> Using cache
 ---> a2b8c70d8fa4
Step 4/12 : ARG NUCLIO_BUILD_LOCAL_HANDLER_DIR
 ---> Using cache
 ---> 6eb96eee7768
Step 5/12 : RUN pip install pandas
 ---> Running in 5523986c52d3
Collecting pandas
  Downloading https://files.pythonhosted.org/packages/b2/4c/b6f966ac91c5670ba4ef0b0b5613b5379e3c7abdfad4e7b89a87d73bae13/pandas-0.24.2.tar.gz (11.8MB)
    Complete output from com

%nuclio: error: cannot deploy 


In [41]:
# nuclio: ignore
init_context(context)

In [49]:
# nuclio: ignore
event = nuclio.Event(body='')
handler(context, event)

In [47]:
list(v3f.Client('http://' + os.environ['V3IO_FRAMESD']).read('stream', table='customers_stream', seek='earliest', shard_id='0', iterator=True))

[                             stream_time      id              location
 seq_number                                                            
 1          2019-04-22 12:59:40.308844346  2356.0  03311311311222021131
 2          2019-04-22 12:59:40.308844346  1213.0  03311311313010113033
 3          2019-04-22 12:59:40.308844346  1340.0  03311311313011323302
 4          2019-04-22 12:59:40.308844346    29.0  03311311313011123223
 5          2019-04-22 12:59:40.308844346  1782.0  03311311311233120102
 6          2019-04-22 12:59:40.308844346  1542.0  03311311313010132123
 7          2019-04-22 12:59:40.308844346  1381.0  03311311313010303132
 8          2019-04-22 12:59:40.308844346  2666.0  03311311311222012332
 9          2019-04-22 12:59:40.308844346  1259.0  03311311313011023310
 10         2019-04-22 12:59:40.308844346  1775.0  03311311311233120123
 11         2019-04-22 12:59:40.308844346  1423.0  03311311311233032003
 12         2019-04-22 12:59:40.308844346  1825.0  0331131131301