# Generate simulated infrastructure telemetry 

In [1]:
# Install required packages if needed (only once).
# Note: v3io-generator is installed from the TestPyPI index (test.pypi.org),
# not from the default package index.
!pip install pytimeparse
!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade
!pip install faker
!pip install pyarrow --upgrade

Collecting pytimeparse
  Downloading https://files.pythonhosted.org/packages/1b/b4/afd75551a3b910abd1d922dbd45e49e5deeb4d47dc50209ce489ba9844dd/pytimeparse-1.1.8-py2.py3-none-any.whl
Installing collected packages: pytimeparse
Successfully installed pytimeparse-1.1.8
Looking in indexes: https://test.pypi.org/simple/
Collecting v3io-generator
  Downloading https://test-files.pythonhosted.org/packages/6c/f6/ba9045111de98747af2c94e10f3dbf74311e6bd3a033c7ea1ca84e084e82/v3io_generator-0.0.27.dev0-py3-none-any.whl
Installing collected packages: v3io-generator
Successfully installed v3io-generator-0.0.27.dev0
Collecting pyarrow
[?25l  Downloading https://files.pythonhosted.org/packages/dd/77/5865b367a6792da2f811ae49391c1f85c29b29663555aac0a118fe8e153e/pyarrow-0.15.1-cp36-cp36m-manylinux1_x86_64.whl (59.0MB)
[K    100% |████████████████████████████████| 59.0MB 593kB/s eta 0:00:01��█████████████████████████▋   | 52.7MB 62.3MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully in

In [2]:
import os
import time
import yaml
import pandas as pd
import datetime
import itertools
import sys
sys.path.append('../../')
# DB Connection
import v3io_frames_local as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

General definitions

In [3]:
%env SAVE_TO_KV = True                 
%env DEPLOYMENT_TABLE = netops_devices 

env: SAVE_TO_KV=True
env: DEPLOYMENT_TABLE=netops_devices


## Create Metadata
The following section creates a list of devices that are scattered across multiple data centers.

In [4]:
def _create_deployment():
    """Build a fresh simulated deployment table.

    The hierarchy has three levels -- company, data_center and device -- each
    requested with ``number=2`` and named by a faker provider.

    Returns:
        Whatever ``generate_deployment()`` produces for that hierarchy.
    """
    print('creating deployment')

    # Meta-data factory and the faker instance that supplies the level names
    generator = deployment_generator.deployment_generator()
    fake = generator.get_faker()

    # Hierarchy design as (level name, faker provider), outermost level first
    hierarchy = (
        ('company', fake.company),
        ('data_center', fake.street_name),
        ('device', fake.msisdn),
    )
    for level_name, provider in hierarchy:
        generator.add_level(name=level_name, number=2, level_type=provider)

    # Materialize the meta-data
    return generator.generate_deployment()

In [5]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [6]:
def _get_deployment_from_kv(path):
    print(f'Retrieving deployment from {path}')
    # Read the devices table from our KV store
    deployment_df = client.read(backend='kv', table=path)
    
    # Reset index to column
    deployment_df.index.name = 'device'
    deployment_df = deployment_df.reset_index()
    return deployment_df

In [7]:
def _save_deployment_to_kv(path, df, client=v3f.Client('framesd:8081')):
    # Save deployment to our KV store
    client.write(backend='kv', table='netops_devices',dfs=df, index_cols=['device'])

In [8]:
def get_or_create_deployment(path, save_to_cloud=False, client=v3f.Client('framesd:8081')):
    """Return the deployment table, generating it when it is not stored yet.

    Args:
        path: Table path; checked for existence and used when reading/saving.
        save_to_cloud: When truthy, a newly generated deployment is also
            written to the KV store.
        client: Frames client handed to the save step.

    Returns:
        The deployment dataframe, either loaded from KV or freshly generated.
    """
    # Already stored: just load it back from KV
    if _is_deployment_exist(path):
        return _get_deployment_from_kv(path)

    # Nothing stored yet: generate a new deployment and optionally persist it
    new_deployment = _create_deployment()
    if save_to_cloud:
        _save_deployment_to_kv(path, new_deployment, client)
    return new_deployment

In [9]:
# Create our DB client (v3io frames), pointed at the 'framesd:8081' endpoint.
# This notebook-level client is used by the read/write/create cells below.
client = v3f.Client('framesd:8081')

In [10]:
# SAVE_TO_KV arrives from the environment as a string, and any non-empty string
# (including 'False') is truthy, so parse it into a real boolean first.
save_to_kv = os.environ['SAVE_TO_KV'].strip().lower() in ('true', '1', 'yes')

deployment_df = get_or_create_deployment(os.environ['DEPLOYMENT_TABLE'], save_to_kv)
deployment_df

creating deployment


Unnamed: 0,company,data_center,device
0,Douglas-Holmes,Lopez_Summit,4752014671680
1,Douglas-Holmes,Lopez_Summit,7236123187473
2,Douglas-Holmes,Paul_Lane,1483265418393
3,Douglas-Holmes,Paul_Lane,3232046314748
4,Roberts_PLC,John_Curve,7873407333830
5,Roberts_PLC,John_Curve,7026488238274
6,Roberts_PLC,Gregory_Tunnel,3565281248497
7,Roberts_PLC,Gregory_Tunnel,4401357069985


Read the table back from our KV store to make sure we have a backup.

In [11]:
# Verify the table is written: read 'netops_devices' back from the KV backend
client.read(backend='kv', table='netops_devices')

Unnamed: 0_level_0,company,data_center
device,Unnamed: 1_level_1,Unnamed: 2_level_1
4752014671680,Douglas-Holmes,Lopez_Summit
7236123187473,Douglas-Holmes,Lopez_Summit
4401357069985,Roberts_PLC,Gregory_Tunnel
7873407333830,Roberts_PLC,John_Curve
3232046314748,Douglas-Holmes,Paul_Lane
1483265418393,Douglas-Holmes,Paul_Lane
7026488238274,Roberts_PLC,John_Curve
3565281248497,Roberts_PLC,Gregory_Tunnel


## Add initial values

In [12]:
# Starting value of every simulated metric, applied to all devices alike
initial_metric_values = {
    'cpu_utilization': 70,
    'latency': 0,
    'packet_loss': 0,
    'throughput': 290,
}
for metric_name, start_value in initial_metric_values.items():
    deployment_df[metric_name] = start_value

deployment_df.head()

Unnamed: 0,company,data_center,device,cpu_utilization,latency,packet_loss,throughput
0,Douglas-Holmes,Lopez_Summit,4752014671680,70,0,0,290
1,Douglas-Holmes,Lopez_Summit,7236123187473,70,0,0,290
2,Douglas-Holmes,Paul_Lane,1483265418393,70,0,0,290
3,Douglas-Holmes,Paul_Lane,3232046314748,70,0,0,290
4,Roberts_PLC,John_Curve,7873407333830,70,0,0,290


## Generate simulated metrics per device
The metrics schema (which describes the simulated values) is read from `metrics_configuration.yaml`.

In [13]:
# Load metrics configuration from YAML file.
# safe_load builds only plain Python objects and avoids the warning that
# yaml.load() emits when it is called without an explicit Loader.
with open('configurations/metrics_configuration.yaml', 'r') as f:
    metrics_configuration = yaml.safe_load(f)

# Capture "now" once so the generated range spans exactly one hour
range_start = datetime.datetime.now()
range_end = range_start + datetime.timedelta(hours=1)

# Create metrics generator based on YAML configuration
met_gen = metrics_generator.Generator_df(metrics_configuration, user_hierarchy=deployment_df, initial_timestamp=time.time())
metrics = met_gen.generate_range(start_time=range_start,
                                 end_time=range_end,
                                 as_df=True,
                                 as_iterator=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Stack every batch yielded by the metrics iterator into a single dataframe
df = pd.concat(batch for batch in metrics)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,data_center,company,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-10 18:30:34.637784,Lopez_Summit,Douglas-Holmes,4752014671680,86.381861,False,10.498757,False,0.0,False,261.468386,False,False
2020-01-10 18:30:34.637784,Lopez_Summit,Douglas-Holmes,7236123187473,64.0984,False,0.0,False,0.002797,False,263.853408,False,False
2020-01-10 18:30:34.637784,Paul_Lane,Douglas-Holmes,1483265418393,67.333629,False,0.0,False,0.0,False,235.924007,False,False
2020-01-10 18:30:34.637784,Paul_Lane,Douglas-Holmes,3232046314748,59.778777,False,1.036221,False,0.254123,False,270.566887,False,False
2020-01-10 18:30:34.637784,John_Curve,Roberts_PLC,7873407333830,58.683553,False,14.00314,False,0.0,False,249.515045,False,False


## Save to Iguazio Time-series Database

In [16]:
# uncomment the line below if you want to reset the TSDB table 
#client.delete(backend='tsdb', table='netops_metrics_jupyter')

In [18]:
# Create a new TSDB table; an estimated sample rate must be specified
# (rate='1/m' -- presumably one sample per minute; confirm against the frames docs)
client.create(backend='tsdb', table='netops_metrics_jupyter', rate='1/m')

In [19]:
# Write the dataframe into the time-series DB. Note: the company, data_center and
# device indexes are automatically converted to search-optimized labels.
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=df)

## Verify that the data was written

In [20]:
# Read back the average of each metric, grouped by company, data_center and device,
# in 5-minute steps over a window from one day ago to one day ahead; show the first 10 rows.
client.read(backend='tsdb', query='select avg(cpu_utilization), avg(latency) , avg(packet_loss) , avg(throughput)  from netops_metrics_jupyter group by company, data_center, device',
            start="now-1d", end='now+1d', multi_index=True, step='5m').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,avg(cpu_utilization),avg(latency),avg(packet_loss),avg(throughput)
time,data_center,device,company,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-10 18:26:54,Lopez_Summit,4752014671680,Douglas-Holmes,71.524653,2.126854,0.738952,260.575183
2020-01-10 18:31:54,Lopez_Summit,4752014671680,Douglas-Holmes,81.253237,46.227904,19.211925,151.857232
2020-01-10 18:36:54,Lopez_Summit,4752014671680,Douglas-Holmes,69.719633,6.717394,1.570689,243.595904
2020-01-10 18:41:54,Lopez_Summit,4752014671680,Douglas-Holmes,84.152484,42.802822,16.644697,178.531496
2020-01-10 18:46:54,Lopez_Summit,4752014671680,Douglas-Holmes,70.73222,5.450933,1.705209,248.906017
2020-01-10 18:51:54,Lopez_Summit,4752014671680,Douglas-Holmes,81.235213,38.144399,18.141146,180.622731
2020-01-10 18:56:54,Lopez_Summit,4752014671680,Douglas-Holmes,69.181855,1.964802,0.767973,249.567418
2020-01-10 19:01:54,Lopez_Summit,4752014671680,Douglas-Holmes,68.759067,1.934528,0.557567,250.804524
2020-01-10 19:06:54,Lopez_Summit,4752014671680,Douglas-Holmes,70.789387,2.277926,0.697254,245.583328
2020-01-10 19:11:54,Lopez_Summit,4752014671680,Douglas-Holmes,82.547525,45.029108,17.251231,160.645969


### Save the generated dataset to parquet for future reproducibility

In [21]:
# Create the output directory if it doesn't exist.
# exist_ok=True keeps this cell safe to re-run, whereas a plain `mkdir data`
# reports an error when the directory is already there.
os.makedirs('data', exist_ok=True)

In [22]:
import pyarrow as pa
from pyarrow import parquet as pq

In [23]:
# Write the dataframe into a versioned parquet file (on the Iguazio file system)
version = '1.0'
filepath = 'data/netops_metrics.v{}.parquet'.format(version)

# Convert to an Arrow table first, then serialize it to parquet
arrow_table = pa.Table.from_pandas(df)
pq.write_table(arrow_table, filepath)

### Reading the data from parquet into the time-series DB
If we want to reproduce the same results, we can rebuild the TSDB from the saved parquet file.

In [25]:
# Uncomment the line below if you want to reset the TSDB table
#client.delete(backend='tsdb', table='netops_metrics_jupyter')
# Create the TSDB table (same table name and rate as used earlier in this notebook)
client.create(backend='tsdb', table='netops_metrics_jupyter', rate='1/m')

In [26]:
# Load the parquet file back into memory as a pandas dataframe and preview it
parquet_table = pq.read_table(filepath)
pqdf = parquet_table.to_pandas()
pqdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,data_center,company,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-10 18:30:34.637784,Lopez_Summit,Douglas-Holmes,4752014671680,86.381861,False,10.498757,False,0.0,False,261.468386,False,False
2020-01-10 18:30:34.637784,Lopez_Summit,Douglas-Holmes,7236123187473,64.0984,False,0.0,False,0.002797,False,263.853408,False,False
2020-01-10 18:30:34.637784,Paul_Lane,Douglas-Holmes,1483265418393,67.333629,False,0.0,False,0.0,False,235.924007,False,False
2020-01-10 18:30:34.637784,Paul_Lane,Douglas-Holmes,3232046314748,59.778777,False,1.036221,False,0.254123,False,270.566887,False,False
2020-01-10 18:30:34.637784,John_Curve,Roberts_PLC,7873407333830,58.683553,False,14.00314,False,0.0,False,249.515045,False,False


In [27]:
# Write the dataframe restored from parquet into the time-series DB
client.write(backend='tsdb', table='netops_metrics_jupyter', dfs=pqdf)

In [28]:
# Verify the table is written: read back per-device metric averages (aliased as
# <metric>_avg) in 5-minute steps, from one day ago to one day ahead; show the first 10 rows.
client.read(backend='tsdb', query='select avg(cpu_utilization) as cpu_utilization_avg, avg(latency) as latency_avg, avg(packet_loss) as packet_loss_avg, avg(throughput) as throughput_avg from netops_metrics_jupyter group by company, data_center, device',
            start="now-1d", end='now+1d', multi_index=True, step='5m').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization_avg,latency_avg,packet_loss_avg,throughput_avg
time,device,company,data_center,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-10 18:27:28,7873407333830,Roberts_PLC,John_Curve,67.119293,2.296569,0.488892,253.353706
2020-01-10 18:32:28,7873407333830,Roberts_PLC,John_Curve,67.815791,2.438452,0.844108,248.996348
2020-01-10 18:37:28,7873407333830,Roberts_PLC,John_Curve,82.328461,42.378662,17.249981,164.405605
2020-01-10 18:42:28,7873407333830,Roberts_PLC,John_Curve,69.946635,3.466523,1.596973,249.954939
2020-01-10 18:47:28,7873407333830,Roberts_PLC,John_Curve,82.255952,39.612626,17.36734,173.745552
2020-01-10 18:52:28,7873407333830,Roberts_PLC,John_Curve,72.24934,3.553545,0.597744,247.889233
2020-01-10 18:57:28,7873407333830,Roberts_PLC,John_Curve,85.210647,42.958575,18.035617,167.655121
2020-01-10 19:02:28,7873407333830,Roberts_PLC,John_Curve,70.103942,2.172345,1.073567,251.999156
2020-01-10 19:07:28,7873407333830,Roberts_PLC,John_Curve,69.764409,1.4916,0.726639,247.435701
2020-01-10 19:12:28,7873407333830,Roberts_PLC,John_Curve,66.886423,2.615169,0.865065,251.157981
