# Generate simulated infrastructure telemetry 

In [1]:
# Install required packages if needed (only once)
!pip install pytimeparse
!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade
!pip install faker
!pip install pyarrow --upgrade

Looking in indexes: https://test.pypi.org/simple/
Requirement already up-to-date: v3io-generator in /User/.pythonlibs/lib/python3.6/site-packages (0.0.27.dev0)
Requirement already up-to-date: pyarrow in /User/.pythonlibs/lib/python3.6/site-packages (0.12.1)


In [16]:
import os
import time
import yaml
import pandas as pd
import datetime
import itertools

# DB Connection
import v3io_frames as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

General definitions

In [3]:
# Notebook configuration, exposed as environment variables:
#   SAVE_TO_KV       - passed as the save flag to get_or_create_deployment() below
#   DEPLOYMENT_TABLE - devices table name passed to get_or_create_deployment() below
%env SAVE_TO_KV = True                 
%env DEPLOYMENT_TABLE = netops_devices 

env: SAVE_TO_KV=True
env: DEPLOYMENT_TABLE=netops_devices


## Create Metadata
The following section creates a list of devices that are scattered across multiple data centers.

In [4]:
def _create_deployment():
    """Build a fresh simulated deployment table.

    Declares a three-level hierarchy (company, data_center, device) on a
    ``deployment_generator`` instance, taking the level values from the
    generator's own faker, and returns the result of
    ``generate_deployment()``.
    """
    print('creating deployment')

    generator = deployment_generator.deployment_generator()
    fake = generator.get_faker()

    # Hierarchy levels in declaration order: (level name, faker provider)
    levels = (
        ('company', fake.company),
        ('data_center', fake.street_name),
        ('device', fake.msisdn),
    )
    for level_name, provider in levels:
        generator.add_level(level_name, number=2, level_type=provider)

    return generator.generate_deployment()

In [5]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [6]:
def _get_deployment_from_kv(path):
    print(f'Retrieving deployment from {path}')
    # Read the devices table from our KV store
    deployment_df = client.read(backend='kv', table=path)
    
    # Reset index to column
    deployment_df.index.name = 'device'
    deployment_df = deployment_df.reset_index()
    return deployment_df

In [7]:
def _save_deployment_to_kv(path, df, client=v3f.Client('framesd:8081')):
    # Save deployment to our KV store
    client.write(backend='kv', table='netops_devices',dfs=df, index_cols=['device'])

In [8]:
def get_or_create_deployment(path, save_to_cloud=False, client=None):
    """Return the deployment table, creating it when it does not exist yet.

    Args:
        path: Name of the devices table.
        save_to_cloud: Whether a newly created deployment is saved to the KV
            store. Accepts a bool or a textual flag such as 'True'/'False'
            (e.g. a value read from an environment variable).
        client: Optional frames client used when saving. When omitted, a
            client for 'framesd:8081' is created only if a save is needed.

    Returns:
        The deployment DataFrame, either loaded from KV or newly generated.
    """
    # Environment variables arrive as strings, and any non-empty string
    # (including 'False') is truthy, so interpret textual flags explicitly.
    if isinstance(save_to_cloud, str):
        save_to_cloud = save_to_cloud.strip().lower() in ('1', 'true', 'yes', 'y', 'on')

    if _is_deployment_exist(path):
        # Get deployment from KV
        return _get_deployment_from_kv(path)

    # Create deployment
    deployment_df = _create_deployment()

    if save_to_cloud:
        if client is None:
            client = v3f.Client('framesd:8081')
        _save_deployment_to_kv(path, deployment_df, client)

    return deployment_df

In [9]:
# Create our DB client (a v3io-frames client for the 'framesd:8081' address);
# the cells below use it for every KV and TSDB call.
client = v3f.Client('framesd:8081')

In [10]:
# SAVE_TO_KV is an environment *string*; convert it to a real boolean so that
# a value such as 'False' is not treated as truthy.
save_to_kv = os.environ['SAVE_TO_KV'].strip().lower() in ('1', 'true', 'yes', 'y', 'on')

# Reuse the notebook's client instead of relying on the function's default one
deployment_df = get_or_create_deployment(os.environ['DEPLOYMENT_TABLE'],
                                         save_to_cloud=save_to_kv,
                                         client=client)
deployment_df

Retrieving deployment from netops_devices


Unnamed: 0,device,company,data_center
0,63085034865,Henderson-Lopez,Spencer_Greens
1,236137499649,Henderson-Lopez,Rogers_Route
2,2190949003307,Kelly-Arroyo,Mclaughlin_Gateway
3,3440493824847,Kelly-Arroyo,Nicholas_Courts
4,3669819894699,Kelly-Arroyo,Nicholas_Courts
5,9909774422784,Henderson-Lopez,Spencer_Greens
6,7124057837347,Henderson-Lopez,Rogers_Route
7,5804822255480,Kelly-Arroyo,Mclaughlin_Gateway


Read the devices table back from our KV store to verify that the deployment was saved.

In [11]:
# Verify the table was written by reading the 'netops_devices' KV table back
client.read(backend='kv', table='netops_devices')

Unnamed: 0_level_0,company,data_center
__name,Unnamed: 1_level_1,Unnamed: 2_level_1
3440493824847,Kelly-Arroyo,Nicholas_Courts
3669819894699,Kelly-Arroyo,Nicholas_Courts
9909774422784,Henderson-Lopez,Spencer_Greens
7124057837347,Henderson-Lopez,Rogers_Route
5804822255480,Kelly-Arroyo,Mclaughlin_Gateway
236137499649,Henderson-Lopez,Rogers_Route
2190949003307,Kelly-Arroyo,Mclaughlin_Gateway
63085034865,Henderson-Lopez,Spencer_Greens


## Add initial values

In [12]:
# Give every device the same starting value for each metric
# (columns are added in the order listed here).
for metric_name, initial_value in (('cpu_utilization', 70),
                                   ('latency', 0),
                                   ('packet_loss', 0),
                                   ('throughput', 290)):
    deployment_df[metric_name] = initial_value
deployment_df.head()

Unnamed: 0,device,company,data_center,cpu_utilization,latency,packet_loss,throughput
0,63085034865,Henderson-Lopez,Spencer_Greens,70,0,0,290
1,236137499649,Henderson-Lopez,Rogers_Route,70,0,0,290
2,2190949003307,Kelly-Arroyo,Mclaughlin_Gateway,70,0,0,290
3,3440493824847,Kelly-Arroyo,Nicholas_Courts,70,0,0,290
4,3669819894699,Kelly-Arroyo,Nicholas_Courts,70,0,0,290


## Generate simulated metrics per device
The metrics schema (which describes the simulated values) is read from `metrics_configuration.yaml`.

In [31]:
# Load metrics configuration from YAML file.
# safe_load builds only plain Python objects and, unlike a bare yaml.load(f),
# does not require an explicit Loader argument.
with open('metrics_configuration.yaml', 'r') as f:
    metrics_configuration = yaml.safe_load(f)

# Create metrics generator based on YAML configuration
met_gen = metrics_generator.Generator_df(metrics_configuration,
                                         user_hierarchy=deployment_df,
                                         initial_timestamp=time.time())

# Take "now" once so the requested range is exactly one hour long
range_start = datetime.datetime.now()
metrics = met_gen.generate_range(start_time=range_start,
                                 end_time=range_start + datetime.timedelta(hours=1),
                                 as_df=True,
                                 as_iterator=True)

In [32]:
# `metrics` is already an iterable of frames, so it can be handed to pd.concat
# directly; wrapping a single iterable in itertools.chain adds nothing.
df = pd.concat(metrics)
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-04-24 16:29:53.996772,Henderson-Lopez,Spencer_Greens,63085034865,80.30036,False,0.0,False,0.211627,False,268.036926,False,False
2019-04-24 16:29:53.996772,Henderson-Lopez,Rogers_Route,236137499649,65.564479,False,3.117734,False,0.0,False,238.819943,False,False
2019-04-24 16:29:53.996772,Kelly-Arroyo,Mclaughlin_Gateway,2190949003307,63.920317,False,2.8191,False,1.572495,False,227.147022,False,False
2019-04-24 16:29:53.996772,Kelly-Arroyo,Nicholas_Courts,3440493824847,53.639535,False,0.0,False,2.567153,False,257.215786,False,False
2019-04-24 16:29:53.996772,Kelly-Arroyo,Nicholas_Courts,3669819894699,74.254264,False,0.0,False,0.403987,False,265.800801,False,False


## Save to Iguazio Time-series Database

In [33]:
# Reset the TSDB table by deleting it (comment this line out if you want to keep the existing table)
client.delete(backend='tsdb', table='netops_metrics_jupyter')

In [34]:
# Create a new TSDB table; the 'rate' attribute specifies the estimated sample rate
client.create(backend='tsdb', table='netops_metrics_jupyter', attrs={'rate': '1/m'})

In [35]:
# Write the dataframe into the time-series DB. Note that the company, data_center and device
# indexes are automatically converted to search-optimized labels.
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=df)

## Verify that the data was written

In [21]:
# Read back per-device averages of each metric in 5-minute steps and show the first rows
client.read(
    backend='tsdb',
    query=(
        'select avg(cpu_utilization) as cpu_utilization, '
        'avg(latency) as latency, '
        'avg(packet_loss) as packet_loss, '
        'avg(throughput) as throughput '
        'from netops_metrics_jupyter '
        'group by company, data_center, device'
    ),
    start="now-1d",
    end='now+1d',
    multi_index=True,
    step='5m',
).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput
time,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-24 16:21:13,Henderson-Lopez,Rogers_Route,7124057837347,70.885992,2.017412,0.967461,252.622477
2019-04-24 16:26:13,Henderson-Lopez,Rogers_Route,7124057837347,81.921424,35.154002,9.659135,212.575115
2019-04-24 16:31:13,Henderson-Lopez,Rogers_Route,7124057837347,79.872988,25.377096,12.417467,188.326686
2019-04-24 16:36:13,Henderson-Lopez,Rogers_Route,7124057837347,72.426217,7.671809,0.69765,251.799112
2019-04-24 16:41:13,Henderson-Lopez,Rogers_Route,7124057837347,84.985113,47.19801,21.325391,141.247529
2019-04-24 16:46:13,Henderson-Lopez,Rogers_Route,7124057837347,69.994465,1.745857,0.543987,253.48661
2019-04-24 16:51:13,Henderson-Lopez,Rogers_Route,7124057837347,72.876036,1.504754,0.672011,253.601863
2019-04-24 16:56:13,Henderson-Lopez,Rogers_Route,7124057837347,69.033523,2.238023,0.733741,250.953048
2019-04-24 17:01:13,Henderson-Lopez,Rogers_Route,7124057837347,70.975776,2.280567,0.88861,244.832079
2019-04-24 17:06:13,Henderson-Lopez,Rogers_Route,7124057837347,69.775006,2.047805,0.845098,248.65506


### Save the generated dataset to parquet for future reproducibility

In [22]:
# Create the output directory if it doesn't exist.
# exist_ok=True makes re-running this cell a no-op instead of an error.
os.makedirs('data', exist_ok=True)

mkdir: cannot create directory 'data': File exists


In [23]:
import pyarrow as pa
from pyarrow import parquet as pq

In [24]:
# Persist the generated dataframe as a versioned parquet file (on the Iguazio file system)
version = '1.0'
filepath = f'data/netops_metrics.v{version}.parquet'
pq.write_table(table=pa.Table.from_pandas(df), where=filepath)

### Reading the data from parquet into the time-series DB
If we want to reproduce the same results, we can rebuild the TSDB table from the saved parquet file.

In [25]:
# Reset the TSDB table: delete it and create it again
# (comment these lines out if you want to keep the existing table)
client.delete(backend='tsdb', table='netops_metrics_jupyter')
client.create(backend='tsdb', table='netops_metrics_jupyter', attrs={'rate': '1/m'})

In [26]:
# Read the parquet file at `filepath` into a pandas DataFrame and display its first rows
pqdf = pq.read_table(filepath).to_pandas()
pqdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-04-24 16:24:57.639091,Henderson-Lopez,Spencer_Greens,63085034865,76.223322,False,4.80143,False,0.0,False,283.648942,False,False
2019-04-24 16:24:57.639091,Henderson-Lopez,Rogers_Route,236137499649,95.17578,False,0.0,False,1.699464,False,265.224289,False,False
2019-04-24 16:24:57.639091,Kelly-Arroyo,Mclaughlin_Gateway,2190949003307,70.94711,False,1.409136,False,0.0,False,274.725084,False,False
2019-04-24 16:24:57.639091,Kelly-Arroyo,Nicholas_Courts,3440493824847,89.953635,False,5.472598,False,0.0,False,263.175821,False,False
2019-04-24 16:24:57.639091,Kelly-Arroyo,Nicholas_Courts,3669819894699,71.81394,False,5.022275,False,1.61714,False,243.377758,False,False


In [29]:
# Write the dataframe loaded from parquet into the time-series DB
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=pqdf)

In [30]:
# Verify the table was written: per-device averages in 5-minute steps, first rows only
client.read(
    backend='tsdb',
    query=(
        'select avg(cpu_utilization) as cpu_utilization_avg, '
        'avg(latency) as latency_avg, '
        'avg(packet_loss) as packet_loss_avg, '
        'avg(throughput) as throughput_avg '
        'from netops_metrics_jupyter '
        'group by company, data_center, device'
    ),
    start="now-1d",
    end='now+1d',
    multi_index=True,
    step='5m',
).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput
time,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-24 16:24:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,70.849387,2.147264,0.677893,246.597086
2019-04-24 16:29:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,84.817562,46.930369,22.344594,149.586233
2019-04-24 16:34:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,70.633385,1.493488,0.88461,251.194946
2019-04-24 16:39:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,83.232115,47.085941,21.721325,176.920127
2019-04-24 16:44:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,71.467319,5.493702,2.827508,242.738542
2019-04-24 16:49:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,82.285018,28.064059,11.715909,201.513149
2019-04-24 16:54:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,75.989181,21.781918,10.659431,200.802553
2019-04-24 16:59:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,69.636836,2.377856,0.606287,245.767407
2019-04-24 17:04:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,69.289693,1.585281,0.866739,238.51694
2019-04-24 17:09:28,Kelly-Arroyo,Nicholas_Courts,3669819894699,83.253099,43.376949,20.5346,150.155566
