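# Nuclio - Generator

Nuclio function that generates batches of synthetic customers, each with a random location under one of the configured base locations, and writes them to a V3IO stream.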

## Set up the environment

In [2]:
# nuclio: ignore
import nuclio

### Define environment variables

In [35]:
%%nuclio env

# Iguazio access
V3IO_FRAMESD=${V3IO_FRAMESD}
V3IO_USERNAME=${V3IO_USERNAME}
V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

# Function variables
BATCH_SIZE=100
DATA_STREAM=customers_stream
london_locations=03311311313011311011000321002320,03311311311233323013031101320003,03311311313010023000032330133111,03311311311222300331010333220231
ACCURACY=20
BASE_ACCURACY=14

%nuclio: setting 'V3IO_FRAMESD' environment variable
%nuclio: setting 'V3IO_USERNAME' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable
%nuclio: setting 'BATCH_SIZE' environment variable
%nuclio: setting 'DATA_STREAM' environment variable
%nuclio: setting 'london_locations' environment variable
%nuclio: setting 'ACCURACY' environment variable
%nuclio: setting 'BASE_ACCURACY' environment variable


%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line


### Base image

In [20]:
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


### Install packages

In [21]:
%%nuclio cmd

# General
pip install pandas
# pip install json

# DB
pip install v3io_frames

# Function
pip install faker



## Function code

### Imports

In [22]:
import os
import itertools
import random

# Data handling
import pandas as pd

# DB
import v3io_frames as v3f

# Function
import faker
from faker.providers import BaseProvider

### Helper classes definitions

In [23]:
class LocationProvider(BaseProvider):
    '''
    Faker provider that produces location keys underneath a base location key.

    Keys are strings; every generated character is a digit between 0 and 3.

    Uses QuadTree for Geohashing
        @{http://tech.taskrabbit.com/blog/2015/06/09/elasticsearch-geohash-vs-geotree/}
        @{http://mapzen.github.io/leaflet-spatial-prefix-tree/}
        @{http://blog.notdot.net/2009/11/Damn-Cool-Algorithms-Spatial-indexing-with-Quadtrees-and-Hilbert-Curves}
    '''

    def location(self, location_base: str, base_acc: int=10, acc: int=20):
        '''
        Return a key that starts with a prefix of location_base and is padded
        with random digits.

        :param location_base: key whose leading characters are kept
        :param base_acc: number of leading characters of location_base to keep
        :param acc: target length of the returned key
        :return: the kept prefix followed by random digits (0-3) up to length
                 acc; when the prefix is already acc characters or longer it is
                 returned as is
        '''
        prefix = location_base[:base_acc]
        # A non-positive count yields an empty range, so nothing is appended
        missing_digits = acc - len(prefix)
        suffix = ''.join(str(random.randint(0, 3)) for _ in range(missing_digits))
        return prefix + suffix

### Init context

In [24]:
def init_context(context):
    '''
    Build the state shared by handler invocations and reset the output stream.

    Attributes stored on context:
        v3f: v3io_frames client created for 'http://' + $V3IO_FRAMESD
        customers_table: table name ($CUSTOMERS, default 'customers')
        customers_stream: stream name ($CUSTOMERS_STREAM, else $DATA_STREAM,
            else 'customers_stream')
        faker: Faker instance with LocationProvider registered
        locations: stripped, non-blank entries of the comma separated
            $london_locations
        customers: range(number of rows read from customers_table)
        location_accuracy_params: [$BASE_ACCURACY (default 14),
            $ACCURACY (default 20)] as ints

    Side effects:
        Deletes the stream and creates it again (48 retention hours, 1 shard).

    Raises:
        KeyError: V3IO_FRAMESD is not set
        ValueError: london_locations has no non-blank entry
    '''
    ##########
    # Setups #
    ##########

    # DB Contexts
    v3c_frames = v3f.Client('http://' + os.environ['V3IO_FRAMESD'])
    context.v3f = v3c_frames

    # DB Tables
    customers_table = os.getenv('CUSTOMERS', 'customers')
    context.customers_table = customers_table

    # The environment cell of this notebook configures the stream name as
    # DATA_STREAM; CUSTOMERS_STREAM keeps priority so existing deployments
    # that set it behave as before.
    customers_stream = os.getenv('CUSTOMERS_STREAM',
                                 os.getenv('DATA_STREAM', 'customers_stream'))
    context.customers_stream = customers_stream

    # Function
    fakers = faker.Faker()
    fakers.add_provider(LocationProvider)
    context.faker = fakers

    # Drop blank entries (unset variable, trailing comma, stray spaces):
    # a blank base location would produce keys with no base prefix at all.
    raw_locations = os.getenv('london_locations', '')
    locations = [entry.strip() for entry in raw_locations.split(',') if entry.strip()]
    if not locations:
        raise ValueError(
            "environment variable 'london_locations' must contain at least one location key")
    context.locations = locations

    customers_df = v3c_frames.read('kv', customers_table)
    # Candidate ids are 0..row_count-1
    # NOTE(review): assumes the ids stored in the table are exactly that range - confirm
    context.customers = range(customers_df.shape[0])

    location_accuracy_params = [int(os.getenv('BASE_ACCURACY', 14)), int(os.getenv('ACCURACY', 20))]
    context.location_accuracy_params = location_accuracy_params

    ###########
    # Actions #
    ###########
    # Start from a fresh stream on every init
    # NOTE(review): behaviour of delete when the stream does not exist yet is
    # not visible here - confirm against the installed v3io_frames version
    v3c_frames.delete('stream', customers_stream)
    v3c_frames.create('stream', customers_stream, attrs={'retention_hours':48,'shards':1})

### Helper functions

In [25]:
def create_customer(context):
    '''
    Build one synthetic customer record.

    Reads context.customers, context.locations, context.faker and
    context.location_accuracy_params.

    :param context: object carrying the attributes listed above
    :return: dict with keys 'id' (random element of context.customers) and
             'location' (result of context.faker.location for a random base
             location and the accuracy parameters)
    '''
    # Draw order matters for reproducibility with a seeded RNG: id first, then base location
    chosen_id = random.choice(context.customers)
    base_location = random.choice(context.locations)
    generated_location = context.faker.location(base_location, *context.location_accuracy_params)
    return {'id': chosen_id, 'location': generated_location}

In [26]:
def create_batch(context, batch_size: int):
    '''
    Create batch_size customers and return them as a DataFrame indexed by 'id'.

    :param context: passed through to create_customer for every record
    :param batch_size: number of customer records to generate
    :return: DataFrame built from the generated records, with the 'id' column
             moved into the index
    '''
    records = [create_customer(context) for _ in range(batch_size)]
    return pd.DataFrame.from_records(records).set_index(['id'])

### Handler

In [27]:
def handler(context, event):
    '''
    Function entry point: generate one batch of customers and write it to the stream.

    :param context: object carrying v3f (frames client) and customers_stream
    :param event: triggering event; not read by this function
    :return: None
    '''
    # Batch size is read from the environment on every invocation (default 100)
    batch_size = int(os.getenv('BATCH_SIZE', 100))
    batch = create_batch(context, batch_size)

    # The batch is handed to the frames client as a one-element list
    context.v3f.write('stream', context.customers_stream, [batch])

In [37]:
%nuclio deploy -n generator -p recommendation_engine -c

[nuclio.deploy] 2019-04-25 05:17:19,481 (info) Building processor image
[nuclio.deploy] 2019-04-25 05:17:25,540 (info) Pushing image
[nuclio.deploy] 2019-04-25 05:17:25,541 (info) Build complete
[nuclio.deploy] 2019-04-25 05:17:29,583 (info) Function deploy complete
[nuclio.deploy] 2019-04-25 05:17:29,589 done updating generator, function address: 3.121.211.71:32689
%nuclio: function deployed


In [28]:
# nuclio: ignore
init_context(context)

In [29]:
# nuclio: ignore
event = nuclio.Event(body='')
handler(context, event)

In [30]:
%nuclio show

%nuclio: notebook generator exported
Config:
apiVersion: nuclio.io/v1
kind: Function
metadata:
  annotations:
    nuclio.io/generated_by: function generated at 25-04-2019 by iguazio from /User/tutorials/demos/location_based_recommendation/generator.ipynb
  labels: {}
  name: generator
spec:
  build:
    baseImage: python:3.6-jessie
    commands:
    - pip install pandas
    - pip install v3io_frames
    - pip install faker
    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDE5LTA0LTI1IDA1OjExCgppbXBvcnQgb3MKaW1wb3J0IGl0ZXJ0b29scwppbXBvcnQgcmFuZG9tCgppbXBvcnQgcGFuZGFzIGFzIHBkCgppbXBvcnQgdjNpb19mcmFtZXMgYXMgdjNmCgppbXBvcnQgZmFrZXIKZnJvbSBmYWtlci5wcm92aWRlcnMgaW1wb3J0IEJhc2VQcm92aWRlcgoKY2xhc3MgTG9jYXRpb25Qcm92aWRlcihCYXNlUHJvdmlkZXIpOgogICAgJycnCiAgICBDcmVhdGVzIGxvY2F0aW9ucyB3aXRoaW4gYmFzZV9sb2NhdGlvbgoKICAgIFVzZXMgUXVhZFRyZWUgZm9yIEdlb2hhc2hpbmcKICAgICAgICBAe2h0dHA6Ly90ZWNoLnRhc2tyYWJiaXQuY29tL2Jsb2cvMjAxNS8wNi8wOS9lbGFzdGljc2VhcmNoLWdlb2hhc2gtdnMtZ

In [31]:
# nuclio: ignore
list(v3f.Client('http://' + os.environ['V3IO_FRAMESD']).read('stream', table='customers_stream', seek='earliest', shard_id='0', iterator=True))

[                id              location                   stream_time
 seq_number                                                            
 1           1815.0  03311311313011320100 2019-04-25 05:11:22.745050066
 2            722.0  03311311311222113320 2019-04-25 05:11:22.745050066
 3           1124.0  03311311311222022112 2019-04-25 05:11:22.745050066
 4           2708.0  03311311313010023013 2019-04-25 05:11:22.745050066
 5           1988.0  03311311311222032200 2019-04-25 05:11:22.745050066
 6           2393.0  03311311313010211323 2019-04-25 05:11:22.745050066
 7            916.0  03311311311222032132 2019-04-25 05:11:22.745050066
 8           1583.0  03311311313010303000 2019-04-25 05:11:22.745050066
 9            102.0  03311311311233313231 2019-04-25 05:11:22.745050066
 10           738.0  03311311313011223131 2019-04-25 05:11:22.745050066
 11          2025.0  03311311311233201122 2019-04-25 05:11:22.745050066
 12          2301.0  03311311311222011030 2019-04-25 05:11:22.74