In [1]:
## dependencies: see /env/tensorflow_sklearn.environment.yml and /env/tensorflow_sklearn.requirements.txt

# HVAC occupancy detection

This notebook illustrates how to interact with the Waylay Platform APIs for an HVAC data science use case. 

## References
* The [kaggle](https://www.kaggle.com) notebook [HVAC Occupancy Detection with ML and DL Methods](https://www.kaggle.com/turksoyomer/hvac-occupancy-detection-with-ml-and-dl-methods/notebook), and related [dataset](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+), on which this example is based.
* The [Waylay api documentation](https://docs.waylay.io/api/)
* The [Waylay python SDK](https://docs.waylay.io/api/sdk/python/)
* [Setup instructions](https://github.com/waylayio/demo-general/tree/master/python-sdk) for a python notebook using the Waylay Python SDK.


## Parameters
Please review and adapt the following parameters for this demo.

In [2]:
from random import random
from datetime import datetime

# Per-run identifier: the current date followed by a zero-padded random number,
# so that resources created by separate runs of this demo do not collide.
run_id = "{:%Y-%m-%d}-{:04.0f}".format(datetime.now(), 1000 * random())

# Name of the Waylay credentials profile used to create the client.
WAYLAY_PROFILE = 'demo'
# Location of the zipped occupancy data set.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip'
# Id of the Waylay resource that receives the demo data.
RESOURCE_ID = 'hvac-demo-' + run_id

RESOURCE_ID

'hvac-demo-2021-04-29-0578'

## Setup

In [3]:
import pandas as pd
import waylay
from datetime import datetime
import tenacity

# display the version of the Waylay SDK this notebook is running against
waylay.__version__

'v0.2.0'

In [4]:
# Create the SDK client from the credentials profile named by WAYLAY_PROFILE.
# if the profile does not exist, this will interactively request for credentials, and let you optionally store it.
waylay_client = waylay.WaylayClient.from_profile(WAYLAY_PROFILE)

## Data retrieval

### download the data set
We download the dataset (a zipped set of csv files), inspect its content, and read out the csv files into a pandas data structure.

In [5]:
import os
import os.path
import zipfile
from urllib.request import urlretrieve

# make sure the working directories used by this notebook exist
for work_dir in ('input', 'output'):
    os.makedirs(work_dir, exist_ok=True)

# fetch the zipped data set from DATA_URL, unless a local copy is already present
archive_is_cached = os.path.isfile('input/occupancy.zip')
if not archive_is_cached:
    urlretrieve(DATA_URL, 'input/occupancy.zip')

# list the files contained in the archive
with zipfile.ZipFile('input/occupancy.zip') as occ_zip:
    for member_name in occ_zip.namelist():
        print(member_name)

datatest.txt
datatest2.txt
datatraining.txt


In [6]:
# Load the three csv members of the archive into data frames:
# 'datatraining.txt' -> datatraining, 'datatest.txt' -> datavalidation, 'datatest2.txt' -> datatest.
with zipfile.ZipFile('input/occupancy.zip') as occ_zip:
    datatraining, datavalidation, datatest = (
        pd.read_csv(occ_zip.open(member_name))
        for member_name in ('datatraining.txt', 'datatest.txt', 'datatest2.txt')
    )
    


In [7]:
# summary statistics (count, mean, std, quantiles) of the numeric training columns
datatraining.describe()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,8143.0,8143.0,8143.0,8143.0,8143.0,8143.0
mean,20.619084,25.731507,119.519375,606.546243,0.003863,0.21233
std,1.016916,5.531211,194.755805,314.320877,0.000852,0.408982
min,19.0,16.745,0.0,412.75,0.002674,0.0
25%,19.7,20.2,0.0,439.0,0.003078,0.0
50%,20.39,26.2225,0.0,453.5,0.003801,0.0
75%,21.39,30.533333,256.375,638.833333,0.004352,0.0
max,23.18,39.1175,1546.333333,2028.5,0.006476,1.0


In [8]:
# peek at the first rows, including the `date` timestamp column
datatraining.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


### convert to etl format
To upload bulk data into waylay, the data should be converted into an optimized format.
The `timeseries.etl_tool.prepare_import` method helps you to create these _import files_.

In this case, we provide the tool with additional information:
 * `timestamp_timezone='UTC'` as timestamps do not contain a timezone component
 * `resource=RESOURCE_ID` as the resource id is not provided in the input
 * `timestamp_column='date'`, as timestamps are in the `date` column. In this case this is not required, as `date` will be recognised as a timestamp column if not specified otherwise.
 * `temp_dir='output'` because we want the resulting import file to reside in that directory

The first two instructions are required for this dataset. Try omitting them to see which errors are raised.

In [9]:
# Convert the training data frame into a Waylay ETL import file, keeping the original timestamps.
etl_import = waylay_client.timeseries.etl_tool.prepare_import(
    datatraining, 
    timestamp_timezone='UTC',  # the timestamps in the data carry no timezone component
    resource=RESOURCE_ID,  # the input has no resource column, so one resource id is set for all rows
    timestamp_column='date',  # column that holds the timestamps
    temp_dir='output'  # directory in which the import file is created
)
# display the prepared import (file location and series settings)
etl_import

100%|██████████| 6.00/6.00 [00:00<00:00, 15.7series/s]


WaylayETLSeriesImport(import_file=ETLFile(directory='output', prefix='import-20210429.072137'), settings=SeriesSettings(metrics=['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'Occupancy'], metric_column=None, metric=None, resources=['hvac-demo-2021-04-29-0578'], resource_column=None, resource='hvac-demo-2021-04-29-0578', value_column=None, timestamp_column='date', timestamp_offset=None, timestamp_first=None, timestamp_last=None, timestamp_interval=None, timestamp_constructor=None, timestamp_timezone='UTC'), storage_bucket='etl-import')

Because it is easier to work with recent data, we instruct the tool to shift timestamps
(with `timestamp_offset`, `timestamp_first` or `timestamp_last`).

In [10]:
# Prepare the import again, now with an explicit file name and with timestamps shifted to end now.
# This rebinds `etl_import`; the cells below use this second import.
etl_import = waylay_client.timeseries.etl_tool.prepare_import(
    datatraining,
    name=RESOURCE_ID,  # used as the prefix of the generated import file
    timestamp_timezone='UTC',
    resource=RESOURCE_ID,
    timestamp_column='date',
    # NOTE(review): datetime.utcnow() returns a naive datetime and is deprecated since Python 3.12;
    # confirm that the SDK accepts a timezone-aware datetime before switching to datetime.now(timezone.utc).
    timestamp_last=datetime.utcnow(), # shift all timestamps so that last one is now
    temp_dir='output'
)
etl_import

100%|██████████| 6.00/6.00 [00:00<00:00, 16.8series/s]


WaylayETLSeriesImport(import_file=ETLFile(directory='output', prefix='hvac-demo-2021-04-29-0578'), settings=SeriesSettings(metrics=['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'Occupancy'], metric_column=None, metric=None, resources=['hvac-demo-2021-04-29-0578'], resource_column=None, resource='hvac-demo-2021-04-29-0578', value_column=None, timestamp_column='date', timestamp_offset=None, timestamp_first=None, timestamp_last=datetime.datetime(2021, 4, 29, 7, 21, 37, 941060), timestamp_interval=None, timestamp_constructor=None, timestamp_timezone='UTC'), storage_bucket='etl-import')

The resulting file is a `gzip` compressed csv file in fully normalized _waylay timeseries ETL_ format

In [11]:
import gzip

# Read the generated import file back (gzip-compressed csv, opened in text mode)
# to inspect the normalized resource/metric/timestamp/value rows.
with gzip.open(etl_import.import_file.path, mode='rt') as etl_csv:
    etl_series_df = pd.read_csv(etl_csv)

etl_series_df.head()

Unnamed: 0,resource,metric,timestamp,value
0,hvac-demo-2021-04-29-0578,Temperature,2021-04-23T15:39:37.941060Z,23.18
1,hvac-demo-2021-04-29-0578,Temperature,2021-04-23T15:40:36.941060Z,23.15
2,hvac-demo-2021-04-29-0578,Temperature,2021-04-23T15:41:37.941060Z,23.15
3,hvac-demo-2021-04-29-0578,Temperature,2021-04-23T15:42:37.941060Z,23.15
4,hvac-demo-2021-04-29-0578,Temperature,2021-04-23T15:43:37.941060Z,23.1


### create or update waylay resource
Timeseries in waylay are best associated with a Waylay resource. This documents the entity that is represented by the timeseries data.

In [12]:
from waylay.service.timeseries import Resource, Metric

# Human-readable description of where the data comes from.
hvac_description = (
    "Experimental data used for binary classification (room occupancy) "
    "from Temperature,Humidity,Light and CO2.\n"
    "Ground-truth occupancy was obtained from time stamped pictures that were taken every minute.\n"
    "See https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+#"
)

# One metric definition per data column; all of them are gauges.
hvac_metrics = [
    Metric(name="Temperature", value_type="float", metric_type="gauge", unit="°C"),
    Metric(
        name="Humidity", value_type="float", metric_type="gauge", unit="%",
        description="Relative Humidity",
    ),
    Metric(name="Light", value_type="float", metric_type="gauge", unit="Lux"),
    Metric(name="CO2", value_type="float", metric_type="gauge", unit="ppm"),
    Metric(
        name="HumidityRatio", value_type="float", metric_type="gauge", unit="kgwater-vapor/kg-air",
        description="Derived quantity from temperature and relative humidity.",
    ),
    Metric(
        name="Occupancy", value_type="integer", metric_type="gauge", unit="boolean",
        description="0 for not occupied, 1 for occupied status",
    ),
]

# Resource metadata documenting the entity behind the uploaded timeseries.
hvac_resource_info = Resource(
    id=RESOURCE_ID,
    name=RESOURCE_ID,
    description=hvac_description,
    metrics=hvac_metrics,
)
hvac_resource_info.to_dict()

{'id': 'hvac-demo-2021-04-29-0578',
 'description': 'Experimental data used for binary classification (room occupancy) from Temperature,Humidity,Light and CO2.\nGround-truth occupancy was obtained from time stamped pictures that were taken every minute.\nSee https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+#',
 'name': 'hvac-demo-2021-04-29-0578',
 'metrics': [{'name': 'Temperature',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': '°C'},
  {'name': 'Humidity',
   'description': 'Relative Humidity',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': '%'},
  {'name': 'Light',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': 'Lux'},
  {'name': 'CO2', 'valueType': 'float', 'metricType': 'gauge', 'unit': 'ppm'},
  {'name': 'HumidityRatio',
   'description': 'Derived quantity from temperature and relative humidity.',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': 'kgwater-vapor/kg-air'},
  {'name': 'Occupancy',
   'description':

In [13]:
# use `update` (PATCH method) to upsert the resource
# the body is the dict representation of the resource metadata defined above
hvac_resource_resp = waylay_client.api.resource.update(RESOURCE_ID, body=hvac_resource_info.to_dict())

# validate it is stored correctly
# (fetches the resource back from the platform and displays it)
waylay_client.api.resource.get(RESOURCE_ID)

{'id': 'hvac-demo-2021-04-29-0578',
 'name': 'hvac-demo-2021-04-29-0578',
 'metrics': [{'name': 'Temperature',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': '°C'},
  {'name': 'Humidity',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': '%',
   'description': 'Relative Humidity'},
  {'name': 'Light',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': 'Lux'},
  {'name': 'CO2', 'valueType': 'float', 'metricType': 'gauge', 'unit': 'ppm'},
  {'name': 'HumidityRatio',
   'valueType': 'float',
   'metricType': 'gauge',
   'unit': 'kgwater-vapor/kg-air',
   'description': 'Derived quantity from temperature and relative humidity.'},
  {'name': 'Occupancy',
   'valueType': 'integer',
   'metricType': 'gauge',
   'unit': 'boolean',
   'description': '0 for not occupied, 1 for occupied status'}],
 'description': 'Experimental data used for binary classification (room occupancy) from Temperature,Humidity,Light and CO2.\nGround-truth occupancy was obtained fro

### upload the etl-import data

The next step will upload the import file to the `etl-import/upload` storage folder.

Any upload in this folder will initiate the following etl process:

* the file is moved from `etl-import/upload` to a timestamped folder in `etl-import/busy`
* the etl process is kicked off, reading data from this _busy_ folder.
* on completion, the file (and a result statement) is moved to a folder in `etl-import/done`

If anything goes wrong, either:
* the files are moved to an `ignored` folder if they do not comply with the requirements for an import.
* the files are moved to a `failed` folder if the etl process raised a fatal error
* note that, even if moved to the `done` folder, the processing results might still contain errors. Typically this is caused by parsing errors, e.g. if timestamps are not of the correct format.

The import files that are created by  _timeseries.etl_tool.prepare_import()_ should not run into `ignored` or parsing errors ...


The following reporting utilities allow you to follow up this process:
* `etl_tool.check_import(etl_import)` checks the status of a specific import job.
* `etl_tool.list_import(name_filter, status_filter)` queries the status of all jobs, optionally filtering on name and/or status.

In [14]:
# Upload the prepared import file to the `etl-import/upload` folder, which starts the etl process.
# `etl_import` is rebound to the value returned by the tool; it is passed to `check_import` below.
etl_import = waylay_client.timeseries.etl_tool.initiate_import(etl_import)

Uploading content to etl-import/upload/hvac-demo-2021-04-29-0578-timeseries.csv.gz ...
... done.


In [15]:
from tenacity import Retrying, stop_after_attempt, wait_fixed, TryAgain
from IPython.core.display import HTML, Markdown

# Poll the import job until it reports status 'done':
# at most 10 attempts, with a fixed wait of 5 between attempts (seconds, per tenacity's wait_fixed).
# NOTE(review): if the job is still not 'done' after the last attempt, tenacity is expected
# to raise a RetryError — confirm this is the desired way for the cell to fail.
for attempt in Retrying(stop=stop_after_attempt(10),wait=wait_fixed(5)):
    with attempt:

        # fetch the current state of the import job
        import_job = waylay_client.timeseries.etl_tool.check_import(etl_import)

        # show the job state on every attempt, so progress is visible in the notebook
        display(HTML(import_job.to_html()))

        # any status other than 'done' triggers another attempt
        if import_job.status != 'done':
            raise TryAgain
            


In [16]:
# Render links to the data view of the demo resource on both Waylay consoles.
# Each link's host matches its label: console.waylay.io for the enterprise platform
# and console-io.waylay.io for the io platform.
HTML(
    "You can view the resource and its data on the Waylay console:"
    '<ul>'
    f'<li><a target="console" href="https://console.waylay.io/resources/{RESOURCE_ID}/data">console.waylay.io</a> (enterprise platform)</li>'
    f'<li><a target="console" href="https://console-io.waylay.io/resources/{RESOURCE_ID}/data">console-io.waylay.io</a> (io platform)</li>'
    '</ul>'
)

### query the timeseries data

In [17]:
# Query definition: one series per imported metric, all read from the demo resource.
query = {
    'resource': RESOURCE_ID,
    'data': [{'metric': metric_name} for metric_name in etl_import.settings.metrics],
}
# test query
# (executed without saving it, limited to data up to the current UTC time)
query_params = {'until': datetime.utcnow().isoformat()}
waylay_client.analytics.query.execute(body=query, params=query_params)

resource,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578
metric,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2021-04-27 22:02:37.941000+00:00,19.50,27.033333,0.0,454.666667,0.003785,0.0
2021-04-27 22:03:37.941000+00:00,19.50,27.000000,0.0,456.000000,0.003781,0.0
2021-04-27 22:04:37.941000+00:00,19.50,27.000000,0.0,461.000000,0.003781,0.0
2021-04-27 22:05:36.941000+00:00,19.50,27.000000,0.0,458.000000,0.003781,0.0
2021-04-27 22:06:36.941000+00:00,19.50,27.000000,0.0,460.000000,0.003781,0.0
...,...,...,...,...,...,...
2021-04-29 07:17:37.941000+00:00,21.05,36.097500,433.0,787.250000,0.005579,1.0
2021-04-29 07:18:36.941000+00:00,21.05,35.995000,433.0,789.500000,0.005563,1.0
2021-04-29 07:19:36.941000+00:00,21.10,36.095000,433.0,798.500000,0.005596,1.0
2021-04-29 07:20:37.941000+00:00,21.10,36.260000,433.0,820.333333,0.005621,1.0


In [18]:
# save query
# The name is derived from the resource id, so each demo run stores its own query.
query_name = 'example_' + RESOURCE_ID
saved_query = {'name': query_name, 'query': query}
waylay_client.analytics.query.create(body=saved_query)


{'data': [{'metric': 'Temperature'},
  {'metric': 'Humidity'},
  {'metric': 'Light'},
  {'metric': 'CO2'},
  {'metric': 'HumidityRatio'},
  {'metric': 'Occupancy'}],
 'resource': 'hvac-demo-2021-04-29-0578'}

In [19]:
# test saved query
# (fetches the data for the query stored under `query_name`, without sending the definition again)
waylay_client.analytics.query.data(query_name)

resource,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578,hvac-demo-2021-04-29-0578
metric,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2021-04-27 22:02:37.941000+00:00,19.50,27.033333,0.0,454.666667,0.003785,0.0
2021-04-27 22:03:37.941000+00:00,19.50,27.000000,0.0,456.000000,0.003781,0.0
2021-04-27 22:04:37.941000+00:00,19.50,27.000000,0.0,461.000000,0.003781,0.0
2021-04-27 22:05:36.941000+00:00,19.50,27.000000,0.0,458.000000,0.003781,0.0
2021-04-27 22:06:36.941000+00:00,19.50,27.000000,0.0,460.000000,0.003781,0.0
...,...,...,...,...,...,...
2021-04-29 07:17:37.941000+00:00,21.05,36.097500,433.0,787.250000,0.005579,1.0
2021-04-29 07:18:36.941000+00:00,21.05,35.995000,433.0,789.500000,0.005563,1.0
2021-04-29 07:19:36.941000+00:00,21.10,36.095000,433.0,798.500000,0.005596,1.0
2021-04-29 07:20:37.941000+00:00,21.10,36.260000,433.0,820.333333,0.005621,1.0


In [20]:
# Render one link per Waylay console that opens the saved query in its analytics view.
HTML(
    'Use the query in the console on either'
    '<ul>'
    + ''.join(
        f'<li><a target="console" href="https://{console_host}/analytics/queries?query={query_name}">{console_host}</a> ({platform} platform)</li>'
        for console_host, platform in (
            ('console.waylay.io', 'enterprise'),
            ('console-io.waylay.io', 'io'),
        )
    )
    + '</ul>'
)













##### cleanup

In [21]:
from waylay import RestResponseError
def cleanup():
    """Remove the demo resource and the saved query created by this notebook.

    Each removal prints the service response, or a short confirmation
    message when that response is falsy. Processing stops at the first
    ``RestResponseError``; the error is printed instead of being raised.
    """
    try:
        #print(waylay_client.data.series.remove(RESOURCE_ID)  or f'removed series   {RESOURCE_ID}')
        print(waylay_client.api.resource.remove(RESOURCE_ID) or f'removed resource {RESOURCE_ID}')
        print(waylay_client.analytics.query.remove(query_name) or f'removed query {query_name}')
    except RestResponseError as exc:
        # Report against the notebook-level RESOURCE_ID: an undefined name here
        # would replace the REST error with a NameError.
        print(f'stopped processing resource {RESOURCE_ID} because of:')
        print(exc)

In [22]:
# remove the resource and the saved query that this run created
cleanup()

removed resource hvac-demo-2021-04-29-0578
Response(url='https://ts-analytics-io.waylay.io/config/query/example_hvac-demo-2021-04-29-0578?api_version=0.19', method='DELETE', body={'messages': []}, headers=Headers({'server': 'envoy', 'date': 'Thu, 29 Apr 2021 07:21:59 GMT', 'content-type': 'application/json', 'content-length': '16', 'server-timing': 'config; dur=18.627643585205078; env=on-demand; method=DELETE; tenant=dc3481e5-5149-445b-b8e9-ab518cc3ba34; domain=bouncy-turkey.waylay.io', 'access-control-allow-origin': '*', 'x-envoy-upstream-service-time': '22'}), status_code=200, client_response=<Response [200 OK]>)
