In [1]:
import tempfile

# A temporary directory that holds the files produced by this demo.
# It is created here and removed explicitly by `cleanup()`.
tempdir_context = tempfile.TemporaryDirectory()


def cleanup():
    """Remove the demo's temporary directory and everything inside it.

    Uses the public `TemporaryDirectory.cleanup()` method rather than
    invoking the context-manager protocol (`__exit__`) by hand.
    """
    tempdir_context.cleanup()
    
class Demo:
    """Parameters for this demo."""

    # the profile name under which waylay credentials are stored
    waylay_client_profile = 'staging'

    # a prefix for all resources created by this demo
    resource_prefix = 'demo_etl_import'

    # path of the temporary directory in which the demo stores its files;
    # read from the public `name` attribute instead of calling `__enter__()` by hand
    temp_dir = tempdir_context.name
    

In [2]:
import waylay
# display the version of the installed waylay SDK
waylay.__version__

'v0.1.3+11.ge7a6a8c.dirty'

In [3]:
# create a client using the credentials stored under the configured profile name
waylay_client = waylay.WaylayClient.from_profile(Demo.waylay_client_profile)
# the ETL tool of the timeseries service is used for all import preparations in this notebook
etl_tool = waylay_client.timeseries.etl_tool
# point the tool at the demo's temporary directory
# (presumably the location where prepared import files are written -- confirm in the SDK docs)
etl_tool.temp_dir = Demo.temp_dir

# conversion of (large) files and streams

The Python SDK supports importing large sets of time series from files and streams in CSV format.

The SDK converts the file to a local ETL import file and uploads this file to Waylay for an asynchronous import.

This section illustrates the various CSV import formats that are supported by the SDK (without actually uploading the series).

In [4]:
import pandas as pd
import csv
import io
import gzip

def preview_csv_text(csv_text: str, lines=10):
    """Display the first rows of CSV text as a dataframe.

    Args:
        csv_text: CSV content whose first row is the header.
        lines: maximum number of data rows to show (non-negative).
    """
    with io.StringIO(csv_text) as csv_stream:
        # `nrows` stops parsing after the requested rows, instead of
        # parsing the complete input and discarding all but the head
        display(pd.read_csv(csv_stream, nrows=lines))
                
def preview_import_file(path: str, lines=10):
    """Display the first rows of a gzipped CSV file as a dataframe.

    Args:
        path: location of the gzip-compressed CSV file.
        lines: maximum number of data rows to show (non-negative).
    """
    with gzip.open(path, 'rt') as csv_stream:
        # `nrows` limits decompression and parsing to the previewed rows,
        # which matters for large import or export files
        display(pd.read_csv(csv_stream, nrows=lines))
        

### A simple CSV file

In [5]:
# a minimal CSV input: a `timestamp` column and a single value column named after the metric
csv_text=(
    "timestamp,temperature\n"
    "2021-03-01T00:00+00:00,-1\n"
    "2021-03-01T03:00+00:00,-2\n"
    "2021-03-01T06:00+00:00,3\n"
    "2021-03-01T09:00+00:00,10\n"
    "2021-03-01T12:00+00:00,15\n"
    "2021-03-01T15:00+00:00,21\n"
    "2021-03-01T18:00+00:00,14\n"
    "2021-03-01T21:00+00:00,8\n"
)
# show the raw input as a table
preview_csv_text(csv_text)

Unnamed: 0,timestamp,temperature
0,2021-03-01T00:00+00:00,-1
1,2021-03-01T03:00+00:00,-2
2,2021-03-01T06:00+00:00,3
3,2021-03-01T09:00+00:00,10
4,2021-03-01T12:00+00:00,15
5,2021-03-01T15:00+00:00,21
6,2021-03-01T18:00+00:00,14
7,2021-03-01T21:00+00:00,8


In [6]:
# arguments to `prepare_import` can be a
#  - (csv) file location (with .gz extension if gzipped)
#  - csv text stream
#  - iterable of string tuples, e.g. [('timestamp','value'), (t1,v1), (t2,v2)]
#  - pandas dataframe

# the input has no resource column, so the resource id is passed as an argument
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    resource=f'{Demo.resource_prefix}_01',
)

# show the content of the prepared (gzipped csv) import file
preview_import_file(etl_import.import_file.path)        

# read the etl file back in as a dataframe
etl_tool.read_etl_import_dataframe(etl_import)
# # this would upload this import file as waylay series
# waylay_client.timeseries.etl_tool.initiate_import(etl_import)

# # this would check the status of the import job
# waylay_client.timeseries.etl_tool.check_import(etl_import)


100%|██████████| 1.00/1.00 [00:00<00:00, 2.68kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,demo_etl_import_01,temperature,2021-03-01T00:00:00Z,-1
1,demo_etl_import_01,temperature,2021-03-01T03:00:00Z,-2
2,demo_etl_import_01,temperature,2021-03-01T06:00:00Z,3
3,demo_etl_import_01,temperature,2021-03-01T09:00:00Z,10
4,demo_etl_import_01,temperature,2021-03-01T12:00:00Z,15
5,demo_etl_import_01,temperature,2021-03-01T15:00:00Z,21
6,demo_etl_import_01,temperature,2021-03-01T18:00:00Z,14
7,demo_etl_import_01,temperature,2021-03-01T21:00:00Z,8


resource,demo_etl_import_01
metric,temperature
timestamp,Unnamed: 1_level_2
2021-03-01 00:00:00+00:00,-1.0
2021-03-01 03:00:00+00:00,-2.0
2021-03-01 06:00:00+00:00,3.0
2021-03-01 09:00:00+00:00,10.0
2021-03-01 12:00:00+00:00,15.0
2021-03-01 15:00:00+00:00,21.0
2021-03-01 18:00:00+00:00,14.0
2021-03-01 21:00:00+00:00,8.0


### multiple metric columns

In [7]:
# wide format: every column next to `timestamp` holds the values of one metric
csv_text=(
    "timestamp,temperature,humidity\n"
    "2021-03-01T00:00Z,-1,203\n"
    "2021-03-01T03:00Z,-2,201\n"
    "2021-03-01T06:00Z,3,221\n"
    "2021-03-01T09:00Z,10,223\n"
    "2021-03-01T12:00Z,15,243\n"
    "2021-03-01T15:00Z,21,183\n"
    "2021-03-01T18:00Z,14,203\n"
    "2021-03-01T21:00Z,8,200\n"
)
preview_csv_text(csv_text)
# the resource id is still passed as an argument; the metric names come from the column headers
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    resource=f'{Demo.resource_prefix}_01',
)
# preview all 16 rows (8 timestamps for each of the 2 metrics) of the import file
preview_import_file(etl_import.import_file.path, 16)  

Unnamed: 0,timestamp,temperature,humidity
0,2021-03-01T00:00Z,-1,203
1,2021-03-01T03:00Z,-2,201
2,2021-03-01T06:00Z,3,221
3,2021-03-01T09:00Z,10,223
4,2021-03-01T12:00Z,15,243
5,2021-03-01T15:00Z,21,183
6,2021-03-01T18:00Z,14,203
7,2021-03-01T21:00Z,8,200


100%|██████████| 2.00/2.00 [00:00<00:00, 5.68kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,demo_etl_import_01,temperature,2021-03-01T00:00:00Z,-1
1,demo_etl_import_01,temperature,2021-03-01T03:00:00Z,-2
2,demo_etl_import_01,temperature,2021-03-01T06:00:00Z,3
3,demo_etl_import_01,temperature,2021-03-01T09:00:00Z,10
4,demo_etl_import_01,temperature,2021-03-01T12:00:00Z,15
5,demo_etl_import_01,temperature,2021-03-01T15:00:00Z,21
6,demo_etl_import_01,temperature,2021-03-01T18:00:00Z,14
7,demo_etl_import_01,temperature,2021-03-01T21:00:00Z,8
8,demo_etl_import_01,humidity,2021-03-01T00:00:00Z,203
9,demo_etl_import_01,humidity,2021-03-01T03:00:00Z,201


### metric specified in column

In [8]:
# long format: the `metric` column names the metric, the `value` column holds its value
csv_text=(
    "timestamp,metric,value\n"
    "2021-03-01T00:00Z,temperature,-1\n"
    "2021-03-01T00:00Z,humidity,203\n"
    "2021-03-01T03:00Z,temperature,-2\n"
    "2021-03-01T03:00Z,humidity,201\n"
    "2021-03-01T06:00Z,temperature,3\n"
    "2021-03-01T06:00Z,humidity,221\n"
    "2021-03-01T09:00Z,temperature,10\n"
    "2021-03-01T09:00Z,humidity,223\n"
    "2021-03-01T12:00Z,temperature,15\n"
    "2021-03-01T12:00Z,humidity,243\n"
    "2021-03-01T15:00Z,temperature,21\n"
    "2021-03-01T15:00Z,humidity,183\n"
    "2021-03-01T18:00Z,temperature,14\n"
    "2021-03-01T18:00Z,humidity,203\n"
    "2021-03-01T21:00Z,temperature,8\n"
    "2021-03-01T21:00Z,humidity,200\n"
)
preview_csv_text(csv_text)
# `value_column` names the column that contains the values
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    resource=f'{Demo.resource_prefix}_01',
    value_column='value'
)
# a limit of 100 exceeds the row count, so the complete import file is shown
preview_import_file(etl_import.import_file.path, 100)  

Unnamed: 0,timestamp,metric,value
0,2021-03-01T00:00Z,temperature,-1
1,2021-03-01T00:00Z,humidity,203
2,2021-03-01T03:00Z,temperature,-2
3,2021-03-01T03:00Z,humidity,201
4,2021-03-01T06:00Z,temperature,3
5,2021-03-01T06:00Z,humidity,221
6,2021-03-01T09:00Z,temperature,10
7,2021-03-01T09:00Z,humidity,223
8,2021-03-01T12:00Z,temperature,15
9,2021-03-01T12:00Z,humidity,243


100%|██████████| 2.00/2.00 [00:00<00:00, 3.73kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,demo_etl_import_01,temperature,2021-03-01T00:00:00Z,-1
1,demo_etl_import_01,temperature,2021-03-01T03:00:00Z,-2
2,demo_etl_import_01,temperature,2021-03-01T06:00:00Z,3
3,demo_etl_import_01,temperature,2021-03-01T09:00:00Z,10
4,demo_etl_import_01,temperature,2021-03-01T12:00:00Z,15
5,demo_etl_import_01,temperature,2021-03-01T15:00:00Z,21
6,demo_etl_import_01,temperature,2021-03-01T18:00:00Z,14
7,demo_etl_import_01,temperature,2021-03-01T21:00:00Z,8
8,demo_etl_import_01,humidity,2021-03-01T00:00:00Z,203
9,demo_etl_import_01,humidity,2021-03-01T03:00:00Z,201


### resource in column

In [9]:
# long format with an additional `resource` column that identifies the resource of each row
csv_text=(
    "timestamp,resource,metric,value\n"
    "2021-03-01T00:00Z,r1,temperature,-1\n"
    "2021-03-01T00:00Z,r1,humidity,203\n"
    "2021-03-01T03:00Z,r2,temperature,-2\n"
    "2021-03-01T03:00Z,r2,humidity,201\n"
    "2021-03-01T06:00Z,r1,temperature,3\n"
    "2021-03-01T06:00Z,r1,humidity,221\n"
    "2021-03-01T09:00Z,r2,temperature,10\n"
    "2021-03-01T09:00Z,r2,humidity,223\n"
    "2021-03-01T12:00Z,r1,temperature,15\n"
    "2021-03-01T12:00Z,r1,humidity,243\n"
    "2021-03-01T15:00Z,r2,temperature,21\n"
    "2021-03-01T15:00Z,r2,humidity,183\n"
    "2021-03-01T18:00Z,r1,temperature,14\n"
    "2021-03-01T18:00Z,r1,humidity,203\n"
    "2021-03-01T21:00Z,r2,temperature,8\n"
    "2021-03-01T21:00Z,r2,humidity,200\n"
)
preview_csv_text(csv_text)
# no `resource` argument here: the resource ids are taken from the input itself
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    value_column='value'
)
# a limit of 100 exceeds the row count, so the complete import file is shown
preview_import_file(etl_import.import_file.path, 100)  

Unnamed: 0,timestamp,resource,metric,value
0,2021-03-01T00:00Z,r1,temperature,-1
1,2021-03-01T00:00Z,r1,humidity,203
2,2021-03-01T03:00Z,r2,temperature,-2
3,2021-03-01T03:00Z,r2,humidity,201
4,2021-03-01T06:00Z,r1,temperature,3
5,2021-03-01T06:00Z,r1,humidity,221
6,2021-03-01T09:00Z,r2,temperature,10
7,2021-03-01T09:00Z,r2,humidity,223
8,2021-03-01T12:00Z,r1,temperature,15
9,2021-03-01T12:00Z,r1,humidity,243


100%|██████████| 4.00/4.00 [00:00<00:00, 5.61kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,r1,temperature,2021-03-01T00:00:00Z,-1
1,r1,temperature,2021-03-01T06:00:00Z,3
2,r1,temperature,2021-03-01T12:00:00Z,15
3,r1,temperature,2021-03-01T18:00:00Z,14
4,r1,humidity,2021-03-01T00:00:00Z,203
5,r1,humidity,2021-03-01T06:00:00Z,221
6,r1,humidity,2021-03-01T12:00:00Z,243
7,r1,humidity,2021-03-01T18:00:00Z,203
8,r2,temperature,2021-03-01T03:00:00Z,-2
9,r2,temperature,2021-03-01T09:00:00Z,10


### resource in column, multiple metric columns

In [10]:
# wide format with a `resource` column: one row per resource and timestamp, one column per metric
csv_text=(
    "resource,timestamp,temperature,humidity\n"
    "r1,2021-03-01T00:00Z,-1,203\n"
    "r2,2021-03-01T00:00Z,-2,201\n"
    "r1,2021-03-01T06:00Z,3,221\n"
    "r2,2021-03-01T06:00Z,10,223\n"
    "r1,2021-03-01T12:00Z,15,243\n"
    "r2,2021-03-01T12:00Z,21,183\n"
    "r1,2021-03-01T18:00Z,14,203\n"
    "r2,2021-03-01T18:00Z,8,200"
)
preview_csv_text(csv_text)
# `resource_column` names the column that contains the resource ids
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    resource_column='resource',
)
preview_import_file(etl_import.import_file.path, 16)  

Unnamed: 0,resource,timestamp,temperature,humidity
0,r1,2021-03-01T00:00Z,-1,203
1,r2,2021-03-01T00:00Z,-2,201
2,r1,2021-03-01T06:00Z,3,221
3,r2,2021-03-01T06:00Z,10,223
4,r1,2021-03-01T12:00Z,15,243
5,r2,2021-03-01T12:00Z,21,183
6,r1,2021-03-01T18:00Z,14,203
7,r2,2021-03-01T18:00Z,8,200


100%|██████████| 4.00/4.00 [00:00<00:00, 6.39kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,r1,temperature,2021-03-01T00:00:00Z,-1
1,r1,temperature,2021-03-01T06:00:00Z,3
2,r1,temperature,2021-03-01T12:00:00Z,15
3,r1,temperature,2021-03-01T18:00:00Z,14
4,r1,humidity,2021-03-01T00:00:00Z,203
5,r1,humidity,2021-03-01T06:00:00Z,221
6,r1,humidity,2021-03-01T12:00:00Z,243
7,r1,humidity,2021-03-01T18:00:00Z,203
8,r2,temperature,2021-03-01T00:00:00Z,-2
9,r2,temperature,2021-03-01T06:00:00Z,10


### filtering and renaming resources and metrics

In [11]:
# NOTE: reuses the `csv_text` of the preceding cell (wide format with a `resource` column).
# `resources` and `metrics` restrict the import to the listed resource ids and metric names.
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    resource_column='resource',
    resources=['r1'],
    metrics=['temperature']
)
preview_import_file(etl_import.import_file.path, 16)  

100%|██████████| 1.00/1.00 [00:00<00:00, 3.95kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,r1,temperature,2021-03-01T00:00:00Z,-1
1,r1,temperature,2021-03-01T06:00:00Z,3
2,r1,temperature,2021-03-01T12:00:00Z,15
3,r1,temperature,2021-03-01T18:00:00Z,14


In [14]:
from waylay.service.timeseries.parser import MetricSpec, ResourceSpec
# NOTE: reuses the `csv_text` defined in an earlier cell (wide format with a `resource` column).
# Spec objects select and rename at the same time: the first argument is the name used
# in the import file, the second the key under which the item appears in the input.
etl_import = etl_tool.prepare_import(
    io.StringIO(csv_text),
    name='remap',
    resource_column='resource',
    resources=[
        ResourceSpec('resource_1','r1'), 
        ResourceSpec('resource_2','r2')
    ],
    metrics=[
        MetricSpec('temp','temperature',unit='C',value_type='integer')
    ]
)
preview_import_file(etl_import.import_file.path, 16) 

# read the prepared import file back as raw csv rows (lists of strings)
with etl_tool.read_etl_import_csv(etl_import) as csv_reader:
    display(list(csv_reader))

# read the prepared import file back as a dataframe
etl_tool.read_etl_import_dataframe(etl_import)


100%|██████████| 2.00/2.00 [00:00<00:00, 4.68kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,resource_1,temp,2021-03-01T00:00:00Z,-1
1,resource_1,temp,2021-03-01T06:00:00Z,3
2,resource_1,temp,2021-03-01T12:00:00Z,15
3,resource_1,temp,2021-03-01T18:00:00Z,14
4,resource_2,temp,2021-03-01T00:00:00Z,-2
5,resource_2,temp,2021-03-01T06:00:00Z,10
6,resource_2,temp,2021-03-01T12:00:00Z,21
7,resource_2,temp,2021-03-01T18:00:00Z,8


[['resource', 'metric', 'timestamp', 'value'],
 ['resource_1', 'temp', '2021-03-01T00:00:00Z', '-1'],
 ['resource_1', 'temp', '2021-03-01T06:00:00Z', '3'],
 ['resource_1', 'temp', '2021-03-01T12:00:00Z', '15'],
 ['resource_1', 'temp', '2021-03-01T18:00:00Z', '14'],
 ['resource_2', 'temp', '2021-03-01T00:00:00Z', '-2'],
 ['resource_2', 'temp', '2021-03-01T06:00:00Z', '10'],
 ['resource_2', 'temp', '2021-03-01T12:00:00Z', '21'],
 ['resource_2', 'temp', '2021-03-01T18:00:00Z', '8']]

resource,resource_1,resource_2
metric,temp,temp
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2
2021-03-01 00:00:00+00:00,-1,-2
2021-03-01 06:00:00+00:00,3,10
2021-03-01 12:00:00+00:00,15,21
2021-03-01 18:00:00+00:00,14,8


In [15]:
# list the metric specifications that are part of the prepared import
list(etl_import.spec.iter_metrics())

[MetricSpec(name='temp', key='temperature', value_parser=None, description=None, value_type='integer', metric_type=None, unit='C')]

In [16]:
import sys
class HugeDataSet:
    """A synthetic, re-iterable data source in long CSV-like format.

    Iterating yields a header row followed by `size` data rows of the form
    `[timestamp, resource, metric, value]`. Row `i` is stamped `i` seconds
    after the epoch and cycles through `resources` resource ids,
    `metrics` metric names and `values` distinct values.
    """

    def __init__(self, size:int, resources=4, metrics=12, values=1000):
        self.size, self.resources = size, resources
        self.metrics, self.values = metrics, values

    def __iter__(self):
        # take a snapshot of the cycle lengths before the first row is produced
        n_resources, n_metrics, n_values = self.resources, self.metrics, self.values
        yield ['timestamp', 'resource', 'metric', 'value']
        for second in range(self.size):
            yield [
                pd.Timestamp(second, unit='s'),
                f'res_{second % n_resources}',
                f'metric_{second % n_metrics}',
                second % n_values,
            ]



In [20]:
# an iterable of rows (header first) is accepted as input, just like a csv stream;
# only the rows of resource `res_3` are kept
etl_import = etl_tool.prepare_import(
    HugeDataSet(200),
    value_column='value',
    resources=['res_3']
)
preview_import_file(etl_import.import_file.path, 16)  

100%|██████████| 3.00/3.00 [00:00<00:00, 1.05kseries/s]


Unnamed: 0,resource,metric,timestamp,value
0,res_3,metric_3,1970-01-01T00:00:03,3
1,res_3,metric_3,1970-01-01T00:00:15,15
2,res_3,metric_3,1970-01-01T00:00:27,27
3,res_3,metric_3,1970-01-01T00:00:39,39
4,res_3,metric_3,1970-01-01T00:00:51,51
5,res_3,metric_3,1970-01-01T00:01:03,63
6,res_3,metric_3,1970-01-01T00:01:15,75
7,res_3,metric_3,1970-01-01T00:01:27,87
8,res_3,metric_3,1970-01-01T00:01:39,99
9,res_3,metric_3,1970-01-01T00:01:51,111


In [21]:
import timeit

def prepare():
    """Prepare (without uploading) an import of one million generated rows.

    Displays the location and the size in bytes of the resulting import file.
    """
    etl_import = etl_tool.prepare_import(
        # one million rows, written as an integer literal:
        # the float form `10e5` is easily misread as 10**5
        HugeDataSet(1_000_000),
        value_column='value'
    )
    import_path = etl_import.import_file.path
    display(str(import_path))
    display(import_path.stat().st_size)

# time a single run of the preparation
timeit.timeit(prepare, number=1)

100%|██████████| 12.0/12.0 [00:27<00:00, 2.26s/series]


'/var/folders/07/zn347xhn33z8m79l8xtz1hn80000gp/T/tmp0oe674y4/import-20210222.234632-timeseries.csv.gz'

3641431

29.345280841999994

In [22]:
import timeit

def prepare():
    """Prepare (without uploading) an import of one million generated rows.

    Displays the location and the size in bytes of the resulting import file.
    """
    etl_import = etl_tool.prepare_import(
        # one million rows, written as an integer literal:
        # the float form `10e5` is easily misread as 10**5
        HugeDataSet(1_000_000),
        value_column='value'
    )
    import_path = etl_import.import_file.path
    display(str(import_path))
    display(import_path.stat().st_size)

# time a single run of the preparation
timeit.timeit(prepare, number=1)

100%|██████████| 12.0/12.0 [00:27<00:00, 2.26s/series]


'/var/folders/07/zn347xhn33z8m79l8xtz1hn80000gp/T/tmp0oe674y4/import-20210222.234706-timeseries.csv.gz'

3641431

29.275765514

In [23]:
import timeit

def prepare():
    """Prepare (without uploading) a filtered import from one million generated rows.

    The generated rows are spread over 1000 resources and 2 metrics, and only
    resource `res_0` is kept. Displays the location and the size in bytes of
    the resulting import file.
    """
    etl_import = etl_tool.prepare_import(
        # one million rows, written as an integer literal:
        # the float form `10e5` is easily misread as 10**5
        HugeDataSet(1_000_000, resources=1000, metrics=2),
        value_column='value',
        resources=['res_0']
    )
    import_path = etl_import.import_file.path
    display(str(import_path))
    display(import_path.stat().st_size)

# time a single run of the preparation
timeit.timeit(prepare, number=1)

100%|██████████| 1.00/1.00 [00:01<00:00, 1.96s/series]


'/var/folders/07/zn347xhn33z8m79l8xtz1hn80000gp/T/tmp0oe674y4/import-20210222.234737-timeseries.csv.gz'

2882

4.19732158299999

In [17]:
# name of an existing export object, stored under 'etl-export'
# NOTE(review): this name is specific to the account and date of the original run
export_name = 'staging.waylay.io-20210222-timeseries.csv.gz'
# download target inside the demo's temporary directory
local_export_file = f'{Demo.temp_dir}/{export_name}'
# look up the object; the result is not used further
waylay_client.storage.object.stat('etl-export',export_name)
# download the object's content to the local file
waylay_client.storage.content.get('etl-export',export_name, to_file=local_export_file)

100%|██████████| 79.8M/79.8M [00:02<00:00, 31.7MB/s]


PosixPath('/var/folders/07/zn347xhn33z8m79l8xtz1hn80000gp/T/tmp1g3ozms8/staging.waylay.io-20210222-timeseries.csv.gz')

In [19]:
# a file location is accepted as input: prepare an import from the downloaded export file,
# restricted to one metric of one resource
# NOTE(review): the resource id embeds a personal e-mail address, which also shows up in
# the saved outputs -- consider anonymising before sharing this notebook
etl_import = etl_tool.prepare_import(
    local_export_file,
    value_column='value',
    metrics=['activeScore'],
    resources=['thomas@waylay.io@fitbitsimulator.00574']
)

100%|██████████| 1.00/1.00 [00:09<00:00, 9.39s/series]


In [18]:
# preview the downloaded export file itself (not the filtered import prepared from it)
preview_import_file(local_export_file, 16)

Unnamed: 0,resource,metric,timestamp,value
0,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T14:27:02.391Z,-1
1,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T14:48:34.300Z,2
2,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T14:49:10.541Z,2
3,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T15:44:41.791Z,1
4,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:02:42.641Z,-1
5,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:19:57.726Z,-1
6,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:27:47.142Z,-1
7,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:46:20.470Z,-1
8,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:48:38.891Z,2
9,thomas@waylay.io@fitbitsimulator.00574,waylay.resourcemessage.metric.activeScore,2021-02-21T16:48:49.463Z,-1


In [20]:
# remove the temporary directory, together with the files this demo stored in it
cleanup()
