# Nuclio - Generator function

In [1]:
# nuclio: ignore
import nuclio

### Setups

In [2]:
%%nuclio cmd -c

# Utils
pip install pyarrow
pip install pyyaml --upgrade
pip install pandas
pip install pytimeparse

# Igz DB
pip install v3io_frames --upgrade

# Function
pip install -i https://test.pypi.org/simple/ v3io-generator
pip install faker

## Function

In [37]:
import os
import datetime
import time
import yaml
import pandas as pd
import itertools

# MLRun
from mlrun import get_or_create_ctx

# DB Connection
import v3io_frames as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

### Helper functions

In [38]:
def _create_deployment():
    context.logger.info('Creating deployment')
    # Create meta-data factory
    dep_gen = deployment_generator.deployment_generator()
    faker=dep_gen.get_faker()

    # Design meta-data
    dep_gen.add_level(name='company',number=2,level_type=faker.company)
    dep_gen.add_level('data_center',number=2,level_type=faker.street_name)
    dep_gen.add_level('device',number=2,level_type=faker.msisdn)

    # Create meta-data
    deployment_df = dep_gen.generate_deployment()
    return deployment_df

In [39]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [40]:
def _get_deployment_from_kv(client, path):
    context.logger.info(f'Retrieving deployment from {path}')
    # Read the devices table from our KV store
    deployment_df = client.read(backend='kv', table=path)
    
    # Reset index to column
    deployment_df.index.name = 'device'
    deployment_df = deployment_df.reset_index()
    return deployment_df

In [41]:
def _save_deployment_to_kv(path, df, client=None):
    context.logger.info(f'Saving deployment to {path}')
    # Save deployment to our KV store
    client.write(backend='kv', table=path ,dfs=df, index_cols=['device'])

In [42]:
def get_or_create_deployment(path, client=None):
    deployment_df = None
    if client and _is_deployment_exist(path):
        # Get deployment from KV
        deployment_df = _get_deployment_from_kv(client, path)
    else:
        # Create deployment
        deployment_df = _create_deployment()
        
        context.logger.info(deployment_df)
        if client:
            _save_deployment_to_kv(path, deployment_df, client)

    return deployment_df

In [451]:
def set_indexes(df):
    df = df.set_index(['time', 'company', 'data_center', 'device'])
    return df

In [452]:
def save_metrics_to_tsdb(client, metrics: pd.DataFrame, metrics_table):
    client.write('tsdb', metrics_table, metrics)

In [546]:
def save_metrics_to_parquet(metrics, metrics_table):
#     df = pd.concat(itertools.chain(metrics))
    
    # Need to fix timestamps from ns to ms if we write to parquet
    df = metrics.reset_index()
    df['time'] = df.loc[:, 'time'].astype('datetime64[ms]')
    
    # Fix indexes
    df = set_indexes(df)
    
    # Save parquet
    first_timestamp = df.index[0][0].strftime('%Y%m%dT%H%M%S')
    last_timestamp = df.index[-1][0].strftime('%Y%m%dT%H%M%S')
    filename = first_timestamp + '-' + last_timestamp + '.parquet'
    filedir = os.path.join(os.getcwd(), metrics_table)
    filepath = os.path.join(filedir, filename)
    os.makedirs(filedir, exist_ok=True)
    with open(filepath, 'wb+') as f:
        df.to_parquet(f)
    
    return filepath

### Handler

In [560]:
v3c = v3f.Client(address='framesd:8081', container='bigdata')

def generator(context,
              metrics_configuration_file,
              initial_timestamp,
              use_tsdb,
              metrics_table,
              deployment_table,
              secs_to_generate = 180000):
    
    # load MLRUN runtime context (will be set by the runtime framework e.g. KubeFlow)
    context.logger.info('Loading parameters')
    metrics_configuration = yaml.safe_load(metrics_configuration_file.get())

    # Set V3IO Connection if needed
    client = None
    if use_tsdb:
        client = v3f.Client(address='framesd:8081', container='bigdata')
        client.create('tsdb', metrics_table, attrs={'rate': '1/s'}, if_exists=True)
    
    # Generate or create deployment
    deployment_df = get_or_create_deployment(deployment_table, client)
    context.logger.info(f'Deployment:\n{deployment_df}')

#     ## Set base initial values 
    deployment_df['cpu_utilization'] = 70
    deployment_df['latency'] = 0
    deployment_df['packet_loss'] = 0
    deployment_df['throughput'] = 290

    context.logger.info(f'Metrics Configuration:\n{metrics_configuration}')
    
#     # Create metrics generator
    start_time = datetime.datetime.fromtimestamp(initial_timestamp)
    end_time = (datetime.datetime.fromtimestamp(initial_timestamp)+datetime.timedelta(seconds=secs_to_generate))
    context.logger.info(f'Generating data from {start_time} to {end_time}')
    
    met_gen = metrics_generator.Generator_df(metrics_configuration, 
                                             user_hierarchy=deployment_df, 
                                             initial_timestamp=start_time)
    
    # Create metrics generator based on YAML configuration and deployment
    metrics = met_gen.generate_range(start_time=start_time,
                                     end_time=end_time,
                                     as_df=True,
                                     as_iterator=True)
    
    metrics = pd.concat(itertools.chain(metrics))
    metrics = metrics.reset_index()
    metrics = metrics.rename(columns={'timestamp': 'time'})
    context.logger.info(f'Generated metrics:\nSample: {metrics.head(1)}')
    
    # Save Generated metrics
    if use_tsdb:
        # Prepare dataframe for TSDB
        metrics = set_indexes(metrics)
        
        # Save to TSDB
        client.write('tsdb', metrics_table, metrics)
        context.logger.info(f'Saved data to TSDB: {metrics_table}')
    else:
        # Prepare dataframe for parquet
        metrics['time'] = metrics.loc[:, 'time'].astype('datetime64[ms]')
        metrics = set_indexes(metrics)
        
        # Prepare filename
        first_timestamp = metrics.index[0][0].strftime('%Y%m%dT%H%M%S')
        last_timestamp = metrics.index[-1][0].strftime('%Y%m%dT%H%M%S')
        filename = first_timestamp + '-' + last_timestamp + '.parquet'
        filedir = os.path.join(os.getcwd(), metrics_table)
        filepath = os.path.join(filedir, filename)
        
        # Save to Parquet
        os.makedirs(filedir, exist_ok=True)
        with open(filepath, 'wb+') as f:
            metrics.to_parquet(f)
            
        context.log_artifact('metrics', src_path=data, target_path=os.path.join(metrics_table, os.path.basename(data)), upload=True)
        context.logger.info(f'Saved data to Parquet: {data}')

In [561]:
# nuclio: end-code

In [562]:
# nuclio: ignore
from mlrun import new_function, code_to_function, mount_v3io, NewTask

In [563]:
# nuclio: ignore
params = {
    'use_tsdb': False,
    'deployment_table': 'netops_deployment',
    'metrics_table': 'netops_metrics',
    'initial_timestamp': (datetime.datetime.now()-datetime.timedelta(days=1)).timestamp(),
    'secs_to_generate': 10
}
inputs = {
    'metrics_configuration_file': os.path.join(os.getcwd(), 
                                              'configurations', 
                                              'metrics_configuration.yaml')
}

fn = new_function(runtime='', interactive=True)
task = NewTask(handler=generator, 
               params=params, 
               inputs=inputs,
               out_path='/User/mlrun-db/data/')
fn.run(task)

[mlrun] 2019-11-21 14:06:31,256 starting run generator uid=902e55017b8646d88e158b371482d8ec  -> 
Python> 2019-11-21 14:06:31,375 [info] Creating deployment
Python> 2019-11-21 14:06:33,525 [info]                       company       data_center         device
0  Daniel_PLC                  Andrew_Plaza      2447259508927
1  Daniel_PLC                  Andrew_Plaza      8127169230630
2  Daniel_PLC                  Rachel_Place      3576355064076
3  Daniel_PLC                  Rachel_Place      4475363768023
4  Johnson__Farmer_and_Robles  Rhonda_Underpass  2637820334710
5  Johnson__Farmer_and_Robles  Rhonda_Underpass  3323474148397
6  Johnson__Farmer_and_Robles  Alicia_Forest     2240355152545
7  Johnson__Farmer_and_Robles  Alicia_Forest     1888282991036
[mlrun] 2019-11-21 14:06:31,361 Loading parameters
[mlrun] 2019-11-21 14:06:33,538 Deployment:
                      company       data_center         device
0  Daniel_PLC                  Andrew_Plaza      2447259508927
1  Daniel_PLC    

name 'data' is not defined


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...82d8ec,0,,error,generator,repo=http://github.com/mlrun/demoscommit=a6a1680459beaaec4b527712ff0dd15c1109f917kind=handlerowner=adminhost=jupyter-h4pye88pz3-itkjk-6bbcdb955c-j8xtr,metrics_configuration_file,use_tsdb=Falsedeployment_table=netops_deploymentmetrics_table=netops_metricsinitial_timestamp=1574258791.255954secs_to_generate=10,,


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 902e55017b8646d88e158b371482d8ec 
[mlrun] 2019-11-21 14:06:33,807 run executed, status=error


RunError: name 'data' is not defined