# Generate simulated infrastructure telemetry 

In [1]:
# Install the required packages if needed (only once).
# v3io-generator is fetched from the TestPyPI index passed with the -i option.
!pip install pytimeparse
!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade
!pip install faker
!pip install pyarrow --upgrade

Looking in indexes: https://test.pypi.org/simple/
Requirement already up-to-date: v3io-generator in /User/.pythonlibs/lib/python3.6/site-packages (0.0.27.dev0)
Requirement already up-to-date: pyarrow in /User/.pythonlibs/lib/python3.6/site-packages (0.12.1)


In [2]:
import os
import time
import yaml
import pandas as pd
import datetime

# DB Connection
import v3io_frames as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

General definitions

In [3]:
%env SAVE_TO_KV = True                 
%env DEPLOYMENT_TABLE = netops_devices 

env: SAVE_TO_KV=True
env: DEPLOYMENT_TABLE=netops_devices


## Create Metadata
The following section creates a list of devices that are spread across multiple data centers.

In [4]:
def _create_deployment():
    """Generate a fresh simulated deployment table.

    Declares three hierarchy levels (company, data_center, device), each with
    ``number=2``, whose values come from providers of the generator's faker.

    Returns:
        The object returned by the generator's ``generate_deployment()``.
    """
    print('creating deployment')

    # Meta-data factory and the faker instance that supplies level values
    generator = deployment_generator.deployment_generator()
    fake = generator.get_faker()

    # Hierarchy design, outermost level first
    generator.add_level(name='company', number=2, level_type=fake.company)
    generator.add_level('data_center', number=2, level_type=fake.street_name)
    generator.add_level('device', number=2, level_type=fake.msisdn)

    # Materialize the meta-data and hand it straight back
    return generator.generate_deployment()

In [5]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [6]:
def _get_deployment_from_kv(path):
    """Load the devices table from the KV store as a flat DataFrame.

    The read goes through the notebook-level ``client`` variable, which has
    to be defined before this function is called.

    Args:
        path: KV table to read.

    Returns:
        The table with its index named ``device`` and moved into a regular
        column.
    """
    print(f'Retrieving deployment from {path}')

    devices = client.read(backend='kv', table=path)

    # Name the index so that reset_index() exposes it as a 'device' column
    devices.index.name = 'device'
    return devices.reset_index()

In [7]:
def _save_deployment_to_kv(path, df, client=v3f.Client('framesd:8081')):
    # Save deployment to our KV store
    client.write(backend='kv', table='netops_devices',dfs=df, index_cols=['device'])

In [8]:
def get_or_create_deployment(path, save_to_cloud=False, client=None):
    """Return the devices deployment, loading it if present, generating it otherwise.

    Args:
        path: Name of the deployment table. It is checked with
            ``_is_deployment_exist`` and used as the KV table name.
        save_to_cloud: When truthy, a newly generated deployment is written to
            the KV store. Ordinary truthiness applies, so pass a real bool:
            a non-empty string such as ``'False'`` counts as true.
        client: Frames client used for saving. When omitted, a client for
            ``framesd:8081`` is created, and only if a save is performed.

    Returns:
        The deployment DataFrame.
    """
    if _is_deployment_exist(path):
        # The table already exists: reuse it instead of generating new devices
        return _get_deployment_from_kv(path)

    deployment_df = _create_deployment()

    if save_to_cloud:
        if client is None:
            # Built on demand; a client in the signature would be created once,
            # when the function is defined, whether or not anything is saved.
            client = v3f.Client('framesd:8081')
        _save_deployment_to_kv(path, deployment_df, client)

    return deployment_df

In [9]:
# Create our DB client for the 'framesd:8081' address.
# The cells below use it for both the KV and the TSDB backends.
client = v3f.Client('framesd:8081')

In [10]:
# Environment variables are always strings, and any non-empty string (even
# 'False') is truthy, so turn the flag into a real boolean before passing it on.
save_to_kv = os.environ.get('SAVE_TO_KV', '').strip().lower() in ('true', '1', 'yes')

deployment_df = get_or_create_deployment(os.environ['DEPLOYMENT_TABLE'], save_to_kv)
deployment_df

creating deployment


Unnamed: 0,company,data_center,device
0,Adkins_PLC,Zachary_Pine,2354411119784
1,Adkins_PLC,Zachary_Pine,4881766997694
2,Adkins_PLC,Shannon_Plains,1646715041160
3,Adkins_PLC,Shannon_Plains,9457511034773
4,Mullins__Martinez_and_Ponce,Ruth_Estate,7847035326081
5,Mullins__Martinez_and_Ponce,Ruth_Estate,8151267286992
6,Mullins__Martinez_and_Ponce,Becker_Passage,6808461982302
7,Mullins__Martinez_and_Ponce,Becker_Passage,1909254331266


Read the table back from our KV store to make sure it was saved.

In [11]:
# Verify the table was written by reading it back from the KV store
client.read(backend='kv', table='netops_devices')

Unnamed: 0_level_0,company,data_center
__name,Unnamed: 1_level_1,Unnamed: 2_level_1
1909254331266,Mullins__Martinez_and_Ponce,Becker_Passage
6808461982302,Mullins__Martinez_and_Ponce,Becker_Passage
1646715041160,Adkins_PLC,Shannon_Plains
8151267286992,Mullins__Martinez_and_Ponce,Ruth_Estate
7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate
9457511034773,Adkins_PLC,Shannon_Plains
2354411119784,Adkins_PLC,Zachary_Pine
4881766997694,Adkins_PLC,Zachary_Pine


## Add initial values

In [12]:
# Starting value of every simulated metric, applied to all devices
initial_values = {
    'cpu_utilization': 70,
    'latency': 0,
    'packet_loss': 0,
    'throughput': 290,
}
for metric_name, start_value in initial_values.items():
    deployment_df[metric_name] = start_value

deployment_df.head()

Unnamed: 0,company,data_center,device,cpu_utilization,latency,packet_loss,throughput
0,Adkins_PLC,Zachary_Pine,2354411119784,70,0,0,290
1,Adkins_PLC,Zachary_Pine,4881766997694,70,0,0,290
2,Adkins_PLC,Shannon_Plains,1646715041160,70,0,0,290
3,Adkins_PLC,Shannon_Plains,9457511034773,70,0,0,290
4,Mullins__Martinez_and_Ponce,Ruth_Estate,7847035326081,70,0,0,290


## Generate simulated metrics per device
The metrics schema (which describes the simulated values) is read from `metrics_configuration.yaml`.

In [13]:
# Load metrics configuration from YAML file.
# safe_load only builds plain Python objects; yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1 and rejected by PyYAML 6.
with open('metrics_configuration.yaml', 'r') as f:
    metrics_configuration = yaml.safe_load(f)

# Create metrics generator based on YAML configuration
met_gen = metrics_generator.Generator_df(metrics_configuration, user_hierarchy=deployment_df, initial_timestamp=time.time())

# Take "now" once so that the generated range spans exactly one hour
range_start = datetime.datetime.now()
range_end = range_start + datetime.timedelta(hours=1)

metrics = met_gen.generate_range(start_time=range_start,
                                 end_time=range_end,
                                 as_df=True,
                                 as_iterator=False)

In [14]:
# Index the metrics by timestamp and by the device hierarchy columns
index_columns = ['timestamp', 'company', 'data_center', 'device']
df = metrics.set_index(index_columns)
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-03-28 09:59:14.364434,Adkins_PLC,Zachary_Pine,2354411119784,90.284373,False,0.0,False,0.0,False,251.251525,False
2019-03-28 09:59:14.364434,Adkins_PLC,Zachary_Pine,4881766997694,71.510566,False,0.0,False,4.124334,False,251.354978,False
2019-03-28 09:59:14.364434,Adkins_PLC,Shannon_Plains,1646715041160,53.691249,False,0.0,False,0.0,False,236.631635,False
2019-03-28 09:59:14.364434,Adkins_PLC,Shannon_Plains,9457511034773,69.96179,False,0.0,False,0.282288,False,247.288894,False
2019-03-28 09:59:14.364434,Mullins__Martinez_and_Ponce,Ruth_Estate,7847035326081,72.89032,False,6.041336,False,0.0,False,238.51562,False


## Save to Iguazio Time-series Database

In [15]:
# Reset the TSDB table (comment out the line below if you want to keep an existing table)
client.delete(backend='tsdb', table='netops_metrics_jupyter')

In [16]:
# Create a new TSDB table; the estimated sample rate must be specified
# (here '1/m' - presumably one sample per minute, confirm against the TSDB docs)
client.create(backend='tsdb', table='netops_metrics_jupyter', attrs={'rate': '1/m'})

In [17]:
# Write the dataframe into the time-series DB. Note: the company, data_center and device
# index levels are automatically converted to search-optimized labels.
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=df)

## Verify that the data was written

In [18]:
# Average of each metric per company / data center / device, in 5-minute steps,
# over a window reaching one day before and one day after now
avg_metrics_query = (
    'select avg(cpu_utilization) as cpu_utilization, '
    'avg(latency) as latency, '
    'avg(packet_loss) as packet_loss, '
    'avg(throughput) as throughput '
    'from netops_metrics_jupyter '
    'group by company, data_center, device'
)
client.read(backend='tsdb', query=avg_metrics_query,
            start="now-1d", end='now+1d', multi_index=True, step='5m').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput
time,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-03-28 09:59:08,Adkins_PLC,Shannon_Plains,1646715041160,81.43697,30.039956,13.867534,207.47367
2019-03-28 10:04:08,Adkins_PLC,Shannon_Plains,1646715041160,76.86865,19.447706,9.659274,205.161321
2019-03-28 10:09:08,Adkins_PLC,Shannon_Plains,1646715041160,70.019396,2.38527,0.972046,250.606702
2019-03-28 10:14:08,Adkins_PLC,Shannon_Plains,1646715041160,68.434147,1.495817,0.651946,248.795556
2019-03-28 10:19:08,Adkins_PLC,Shannon_Plains,1646715041160,72.088775,2.056602,0.861172,247.352472
2019-03-28 10:24:08,Adkins_PLC,Shannon_Plains,1646715041160,68.550028,7.207673,1.933722,253.721338
2019-03-28 10:29:08,Adkins_PLC,Shannon_Plains,1646715041160,84.771881,49.406426,24.389951,141.29335
2019-03-28 10:34:08,Adkins_PLC,Shannon_Plains,1646715041160,70.714797,1.957401,1.740338,241.703977
2019-03-28 10:39:08,Adkins_PLC,Shannon_Plains,1646715041160,85.252051,43.178003,21.327192,150.525951
2019-03-28 10:44:08,Adkins_PLC,Shannon_Plains,1646715041160,72.922613,1.947467,1.064005,249.031715


### Save the generated dataset to parquet for future reproducibility

In [19]:
# Create the output directory if it doesn't exist; exist_ok makes re-running this cell a no-op
os.makedirs('data', exist_ok=True)

In [20]:
import pyarrow as pa
from pyarrow import parquet as pq

In [21]:
# Write the dataframe to a versioned parquet file (on the Iguazio file system)
version = '1.0'
filepath = 'data/netops_metrics.v{}.parquet'.format(version)

# Convert to an Arrow table first, then persist it
metrics_table = pa.Table.from_pandas(df)
pq.write_table(metrics_table, filepath)

### Reading the data from parquet into the time-series DB
If we want to reproduce the same results, we can rebuild the TSDB from the saved parquet file.

In [22]:
# Reset the TSDB table: delete it, then create it again with the same estimated sample rate
# (comment out both lines below if you want to keep the existing table)
client.delete(backend='tsdb', table='netops_metrics_jupyter')
client.create(backend='tsdb', table='netops_metrics_jupyter', attrs={'rate': '1/m'})

In [23]:
# Load the parquet file into memory as a pandas DataFrame and preview its first rows
parquet_table = pq.read_table(filepath)
pqdf = parquet_table.to_pandas()
pqdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-03-28 09:59:14.364434,Adkins_PLC,Zachary_Pine,2354411119784,90.284373,False,0.0,False,0.0,False,251.251525,False
2019-03-28 09:59:14.364434,Adkins_PLC,Zachary_Pine,4881766997694,71.510566,False,0.0,False,4.124334,False,251.354978,False
2019-03-28 09:59:14.364434,Adkins_PLC,Shannon_Plains,1646715041160,53.691249,False,0.0,False,0.0,False,236.631635,False
2019-03-28 09:59:14.364434,Adkins_PLC,Shannon_Plains,9457511034773,69.96179,False,0.0,False,0.282288,False,247.288894,False
2019-03-28 09:59:14.364434,Mullins__Martinez_and_Ponce,Ruth_Estate,7847035326081,72.89032,False,6.041336,False,0.0,False,238.51562,False


In [24]:
# Write the dataframe restored from parquet into the time-series DB
# (comment out the line below to skip the write)
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=pqdf)

In [25]:
# Verify the table is written: average each metric per device in 5-minute steps
verification_query = (
    'select avg(cpu_utilization) as cpu_utilization, '
    'avg(latency) as latency, '
    'avg(packet_loss) as packet_loss, '
    'avg(throughput) as throughput '
    'from netops_metrics_jupyter '
    'group by company, data_center, device'
)
client.read(backend='tsdb', query=verification_query,
            start="now-1d", end='now+1d', multi_index=True, step='5m').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput
time,device,company,data_center,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-03-28 09:59:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,68.995294,1.807755,1.025134,249.993116
2019-03-28 10:04:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,71.617829,3.255674,0.485394,247.175631
2019-03-28 10:09:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,71.52502,2.050859,1.132809,250.298139
2019-03-28 10:14:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,69.60051,1.893953,0.601989,252.101578
2019-03-28 10:19:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,70.053661,2.110228,1.112451,250.925287
2019-03-28 10:24:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,71.608371,2.261748,0.833482,249.507487
2019-03-28 10:29:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,86.952528,49.111288,21.561853,158.706648
2019-03-28 10:34:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,69.75309,2.082396,0.639576,249.750646
2019-03-28 10:39:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,75.405442,3.943662,2.777239,236.636893
2019-03-28 10:44:10,7847035326081,Mullins__Martinez_and_Ponce,Ruth_Estate,83.441318,46.951835,20.621891,155.23332
