In [1]:
# nuclio: ignore
import nuclio

In [2]:
import requests
import json

SCHEMA = '.%23schema'
PARTITION_BY_FIELDS = ['year', 'month', 'day', 'hour']

def get_request_url(container_name, table_name, v3io_api_endpoint_host, v3io_api_endpoint_port):
    return 'https://{}:{}/{}/{}/{}'.format(v3io_api_endpoint_host, v3io_api_endpoint_port, container_name, table_name
                                          , SCHEMA)

def get_request_headers():
    return {
        'Content-Type': 'application/json',
    }


def send_request(logger, url, headers, username, password):
    try:
        auth = requests.auth.HTTPBasicAuth(username, password)
        response = requests.get(url, headers=headers, auth=auth, timeout=10, verify=False)
        logger.debug(response.status_code)
        logger.debug(response.content)
        return response.content

    except Exception as e:
        logger.error('ERROR: {0}'.format(str(e)))

def get_table_schema(context, 
                 container: str = 'parquez',
                 kv_table: str = 'faker'):
    """Open a file/object archive into a target directory
    
    :param target_dir:   target directory
    :param archive_url:  source archive path/url
    
    :returns: content dir
    """
        
    kv_table = KVTable(logger, conf, args.real_time_table_name)
    kv_table.import_table_schema()
    
    context.logger.info(f'extracted archive to {target_dir}')
    context.log_artifact('content', local_path=target_dir)
    
class KVTable(object):
    def __init__(self, logger, conf, name='table'):
        self.name = name
        self.logger = logger
        self.schema = "init_schema"
        self.conf = conf

    def import_table_schema(self):
        url = get_request_url(self.conf.v3io_container, self.name, self.conf.v3io_api_endpoint_host,
                              self.conf.v3io_api_endpoint_port)
        headers = get_request_headers()
        schema = send_request(self.logger, url, headers, self.conf.username, self.conf.password)
        self.logger.info('KV table schema {}'.format(schema))
        self.schema = schema
        return schema

    def get_schema_fields_and_types(self):
        js = json.loads(self.schema)
        fields = js['fields']
        parsed_schema = ""
        for ls in fields:
            field = ls['name']
            if field not in PARTITION_BY_FIELDS:
                field_type = ls['type']
                if field_type == 'long':
                    field_type = 'bigint'
                parsed_schema += field + ' ' + field_type + ',\n'
        parsed_schema = parsed_schema[:-2]
        self.logger.debug('schema_fields_and_types {}'.format(parsed_schema))
        return parsed_schema

    def get_schema_fields(self):
        js = json.loads(self.schema)
        fields = js['fields']
        parsed_schema = ""
        for ls in fields:
            field = ls['name']
            if field not in PARTITION_BY_FIELDS:
                parsed_schema += field + ' ,\n'
        parsed_schema = parsed_schema[:-2]
        self.logger.debug('schema_fields {}'.format(parsed_schema))
        return parsed_schema

    def get_parquet_table_name(self):
        parquet_name = self.name + '_'+self.conf.compression
        self.logger.debug('parquet table name {}'.format(parquet_name))
        return parquet_name
    

    
    

In [3]:
# nuclio: end-code

In [4]:
import mlrun

In [5]:
# create job function object from notebook code
fn = mlrun.code_to_function('kv_schema', kind='job', with_doc=True,
                            handler=get_table_schema, image='mlrun/mlrun')

# add metadata (for templates and reuse)
fn.spec.default_handler = 'get_table_schema'
fn.spec.description = "this function returns kv table schema"
fn.metadata.categories = ['fileutils']
fn.metadata.labels = {'author': 'me'}

In [6]:
print(fn.to_yaml())


kind: job
metadata:
  name: kv-schema
  tag: ''
  project: ''
  labels:
    author: me
  categories:
  - fileutils
spec:
  command: ''
  args: []
  image: mlrun/mlrun
  volumes: []
  volume_mounts: []
  env: []
  default_handler: get_table_schema
  entry_points:
    get_request_url:
      name: get_request_url
      doc: ''
      parameters:
      - name: container_name
      - name: table_name
      - name: v3io_api_endpoint_host
      - name: v3io_api_endpoint_port
      outputs: []
      lineno: 9
    get_request_headers:
      name: get_request_headers
      doc: ''
      parameters: []
      outputs: []
      lineno: 13
    send_request:
      name: send_request
      doc: ''
      parameters:
      - name: logger
      - name: url
      - name: headers
      - name: username
      - name: password
      outputs: []
      lineno: 19
    get_table_schema:
      name: get_table_schema
      doc: Open a file/object archive into a target directory
      parameters:
      - name: conte

In [7]:
# save to a file (and can be pushed to a git)
fn.export('function.yaml')

[mlrun] 2020-05-06 08:27:53,276 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f28ce011630>

In [8]:
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

In [9]:
# load from local file
xfn = mlrun.import_function('./function.yaml')

# load function from MLRun functions hub 
# xfn = mlrun.import_function('hub://open_archive')

# get function doc
xfn.doc()

function: kv-schema
this function returns kv table schema
default handler: get_table_schema
entry points:
  get_request_url: 
    {'name': 'container_name'}
    {'name': 'table_name'}
    {'name': 'v3io_api_endpoint_host'}
    {'name': 'v3io_api_endpoint_port'}
  get_request_headers: 
  send_request: 
    {'name': 'logger'}
    {'name': 'url'}
    {'name': 'headers'}
    {'name': 'username'}
    {'name': 'password'}
  get_table_schema: Open a file/object archive into a target directory
    {'name': 'context'}
    {'name': 'container', 'type': 'str', 'default': 'parquez'}
    {'name': 'kv_table', 'type': 'str', 'default': 'faker'}
  __init__: 
    {'name': 'self'}
    {'name': 'logger'}
    {'name': 'conf'}
    {'name': 'name', 'default': 'table'}
  import_table_schema: 
    {'name': 'self'}
  get_schema_fields_and_types: 
    {'name': 'self'}
  get_schema_fields: 
    {'name': 'self'}
  get_parquet_table_name: 
    {'name': 'self'}


In [10]:
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

In [11]:
# load from local file
xfn = mlrun.import_function('./function.yaml')

# load function from MLRun functions hub 
# xfn = mlrun.import_function('hub://open_archive')

# get function doc
xfn.doc()

function: kv-schema
this function returns kv table schema
default handler: get_table_schema
entry points:
  get_request_url: 
    {'name': 'container_name'}
    {'name': 'table_name'}
    {'name': 'v3io_api_endpoint_host'}
    {'name': 'v3io_api_endpoint_port'}
  get_request_headers: 
  send_request: 
    {'name': 'logger'}
    {'name': 'url'}
    {'name': 'headers'}
    {'name': 'username'}
    {'name': 'password'}
  get_table_schema: Open a file/object archive into a target directory
    {'name': 'context'}
    {'name': 'container', 'type': 'str', 'default': 'parquez'}
    {'name': 'kv_table', 'type': 'str', 'default': 'faker'}
  __init__: 
    {'name': 'self'}
    {'name': 'logger'}
    {'name': 'conf'}
    {'name': 'name', 'default': 'table'}
  import_table_schema: 
    {'name': 'self'}
  get_schema_fields_and_types: 
    {'name': 'self'}
  get_schema_fields: 
    {'name': 'self'}
  get_parquet_table_name: 
    {'name': 'self'}


In [None]:
# configute it: mount on iguazio fabric, set as interactive (return stdout)
xfn.apply(mlrun.mount_v3io())

# create and run the task
images_path = '/User/mlrun/examples/images'
get_table_schema = mlrun.NewTask('download',  
    params={'target_dir': images_path},
    inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'})