In [16]:
%reload_ext autoreload
%autoreload 2

In [17]:
# | default_exp npg.etl

In [18]:
# | export
import json
import zipfile
from typing import Callable

# import more_itertools as mit
import pandas as pd
import requests as rq
from fastcore.all import *

import jupyter_playground.core as core
from jupyter_playground.core import DownloadContent, cache

if in_jupyter():
    from tqdm.notebook import tqdm

## Getting Dataset metadata and attachments

In [19]:
# | export
# Portal subdomain: interpolated as https://{host}.opendatasoft.com when querying the API.
HOST = "northernpowergrid"
# Identifier of the dataset whose attachments are listed and downloaded.
DATASET = "primary-operational-metering"


def _always_true(_):
    """Predicate that accepts any item; used as the keep-everything default filter."""
    return True


def get_dataset_attachments(
    host: str,
    dataset: str,
    filter: Callable = _always_true,
    timeout: float = 30.0,
):
    """List the attachments of a dataset published on an Opendatasoft portal.

    Sends a GET request to the v2 catalog ``attachments`` endpoint of
    ``https://{host}.opendatasoft.com`` and keeps the entries of the
    ``"attachments"`` array in the JSON response that pass ``filter``.

    Args:
        host: Portal subdomain, e.g. ``"northernpowergrid"``.
        dataset: Dataset identifier as it appears in the catalog URL.
        filter: Predicate called with each attachment entry; entries for which
            it returns a falsy value are dropped. Defaults to keeping all.
            (The name shadows the builtin but is kept for compatibility.)
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        An ``L`` holding the attachment entries that passed ``filter``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"https://{host}.opendatasoft.com/api/v2/catalog/datasets/{dataset}/attachments"
    res = rq.get(url, timeout=timeout)
    res.raise_for_status()

    return L(a for a in res.json()["attachments"] if filter(a))


# Keep only the attachments whose href ends in "_zip", then reduce each entry to its href.
# NOTE(review): this cell is marked for export, so the request presumably also runs
# whenever the exported module is imported — confirm that is intended.
files = get_dataset_attachments(
    host=HOST,
    dataset=DATASET,
    filter=lambda attachment: attachment["href"].endswith("_zip"),
).map(lambda attachment: attachment["href"])
# Display just the last 10 characters of every href to keep the output compact.
files.map(lambda href: href[-10:])

For reference:

In [None]:
# Rebuild the attachments endpoint by hand: API root first, then the dataset-specific path.
base = "https://{host}.opendatasoft.com/api/v2/".format(host=HOST)
suffix = "catalog/datasets/{dataset_id}/attachments".format(dataset_id=DATASET)
f"{base}{suffix}"

'https://northernpowergrid.opendatasoft.com/api/v2/catalog/datasets/primary-operational-metering/attachments'

## Create Parser and Extraction Steps

In [None]:
# | export
# The pipeline starts with `core.attachment_download` as its only step;
# later cells add further steps through `pipe.append_func`.
pipe = core.IncrementalPipeline("npg_etl", funcs=[core.attachment_download])
pipe

IncrementalPipeline(name='npg_etl', _funcs=['attachment_download'])

In [None]:
# | eval: false

# Try the pipeline as defined so far on the first attachment only.
files[:1].map(pipe)

1it [00:03,  3.37s/it]


(#1) [32d8b228a35b2302ed5871f407311777]

In [None]:
# | export


@cache.cache
def parse_attachment_content(
    Download: DownloadContent, max_file_size: Union[int, float] = 4e9
) -> DownloadContent:
    """Unzip a downloaded attachment and return its single member as text.

    Args:
        Download: Object whose ``content`` attribute holds the raw bytes of a
            zip archive.
        max_file_size: Largest uncompressed member size (bytes) that will be
            read; truncated to ``int`` before comparing.

    Returns:
        A ``DownloadContent`` wrapping the member decoded as UTF-8 (undecodable
        bytes are dropped), with the first ``"[,"`` replaced by ``"["``.

    Raises:
        ValueError: If a member fails the archive integrity check, if the
            archive does not contain exactly one entry, or if that entry is
            larger than ``max_file_size``.
        zipfile.BadZipFile: If the bytes are not a zip archive at all (raised
            by ``zipfile.ZipFile`` itself).
    """
    # NOTE(review): `io` and `Union` are not imported explicitly in this notebook;
    # presumably they arrive through `from fastcore.all import *` — confirm.
    size_limit = int(max_file_size)
    with zipfile.ZipFile(io.BytesIO(Download.content)) as archive:
        # testzip() returns the name of the first corrupt member, or None when
        # every member checks out.
        corrupt_member = archive.testzip()
        if corrupt_member is not None:
            raise ValueError(
                f"Zip archive member {corrupt_member!r} failed its integrity check"
            )
        members = archive.infolist()
        if len(members) != 1:
            raise ValueError(
                "Expecting only one file but found the following:", members
            )
        member = members[0]
        # Check the declared uncompressed size before reading it into memory.
        if member.file_size > size_limit:
            raise ValueError(f"{member} exceeds max_file_size ({size_limit})")
        file_txt = (
            zipfile.Path(archive, at=member.filename)
            .read_text(encoding="utf-8", errors="ignore")
            # Only the first "[," is rewritten (presumably a stray comma that
            # breaks the JSON — confirm); str.replace is much quicker than a regex.
            .replace("[,", "[", 1)
        )
    return DownloadContent(file_txt)


# Add the parser as the next step of the pipeline.
pipe.append_func(parse_attachment_content)

IncrementalPipeline(name='npg_etl', _funcs=['attachment_download', 'parse_attachment_content'])

In [None]:
# | eval: false

# Re-run on the first attachment, now with the parsing step included.
files[:1].map(pipe)

2it [00:12,  6.16s/it]


(#1) [4baf216cf69414089e228916246be3eb]

In [None]:
# | export

# Columns that identify one series: used as the groupby key and cast to "category" at the end.
_cat_cols = ["substation", "circuit", "unit", "description"]


# @cache.cache
def build_attachment_dataframe(text: DownloadContent) -> pd.DataFrame:
    """Build one long, typed DataFrame from the parsed attachment JSON.

    ``text.content`` must be a JSON document with a ``"timeseries"`` list. Each
    entry carries the ``_cat_cols`` fields plus a ``"values"`` list of records
    exposing ``timestamp`` and ``value``. Every reading becomes one row.

    Args:
        text: Object whose ``content`` attribute is the JSON string.

    Returns:
        A DataFrame with the ``_cat_cols`` columns (as ``category``),
        ``timestamp`` (datetime), ``value`` (downcast float) and ``yyyy_mm``,
        restricted to the month that has the most rows.

    Raises:
        ValueError: If the ``"timeseries"`` list is empty.
    """
    series = json.loads(text.content)["timeseries"]
    if not series:
        # Fail with a clear message instead of an incidental pandas error.
        raise ValueError("Attachment contains no timeseries entries")

    df = (
        # One row per series; the readings stay nested in the "values" column.
        pd.json_normalize(series)
        .rename({"values": "data"}, axis=1)
        # Blank or whitespace-only units are labelled "deg".
        .assign(unit=lambda d: d.unit.replace(r"^\s*$", "deg", regex=True))
        .set_index(_cat_cols)
        .groupby(_cat_cols, sort=False)  # no sort for performance
        # Expand the nested readings of each series into rows. The column is
        # addressed by name so extra fields in the JSON cannot shift it.
        # NOTE(review): only the first row of a group is expanded; if the same
        # key combination can appear twice, the later entries are ignored.
        .apply(lambda d: pd.json_normalize(d["data"].iat[0]))
        .reset_index(-1, drop=True)  # discard the per-series row counter
        .reset_index(drop=False)  # bring the key columns back
        .assign(  # fix up types
            timestamp=lambda d: pd.to_datetime(d.timestamp),
            value=lambda d: pd.to_numeric(d.value, downcast="float"),
            yyyy_mm=lambda d: d.timestamp.dt.strftime("%Y-%m"),
        )
    )
    month_counts = df.yyyy_mm.value_counts()
    if len(month_counts) != 1:
        # Log files that spill into a neighbouring month, for reference.
        print(text, repr(month_counts))

    df[_cat_cols] = df[_cat_cols].astype("category")  # categorise
    # Keep only rows from the dominant month; the index is not renumbered.
    return df[df.yyyy_mm == month_counts.idxmax()]


# Add the DataFrame builder as the next step of the pipeline.
pipe.append_func(build_attachment_dataframe)

IncrementalPipeline(name='npg_etl', _funcs=['attachment_download', 'parse_attachment_content', 'build_attachment_dataframe'])

In [None]:
# | hide
import nbdev

# Run nbdev's export step (presumably writes the exported cells to the module
# named by `default_exp` at the top of this notebook — see the nbdev docs).
nbdev.nbdev_export()

Re-import `pipe` from the exported module so that future calls to the cache are based on the module source code and not the notebook.


In [None]:
# | eval: false

# Rebinds `pipe` to the instance defined in the exported module.
from jupyter_playground.npg.etl import pipe

# Run the pipeline on the last attachment only and take the single result.
df = files[-1:].map(pipe)[0]

df

2it [00:15,  8.12s/it]

16e9515bba143d2ad61d80406e56fbdb 2022-12    20136044
2023-01           6
Name: yyyy_mm, dtype: int64


3it [07:35, 151.73s/it]


Unnamed: 0,substation,circuit,unit,description,timestamp,value,yyyy_mm
0,LINTON 132/25kV (ULGHAM CROSSING),LINTON - TRX1,KW,active power (kw),2022-12-01 00:00:00+00:00,400.0,2022-12
1,LINTON 132/25kV (ULGHAM CROSSING),LINTON - TRX1,KW,active power (kw),2022-12-01 00:30:00+00:00,0.0,2022-12
2,LINTON 132/25kV (ULGHAM CROSSING),LINTON - TRX1,KW,active power (kw),2022-12-01 01:00:00+00:00,0.0,2022-12
3,LINTON 132/25kV (ULGHAM CROSSING),LINTON - TRX1,KW,active power (kw),2022-12-01 01:30:00+00:00,0.0,2022-12
4,LINTON 132/25kV (ULGHAM CROSSING),LINTON - TRX1,KW,active power (kw),2022-12-01 02:00:00+00:00,0.0,2022-12
...,...,...,...,...,...,...,...
20136045,WAVERLEY BUSINESS PARK,WAVERLEY BUSINESS PARK - WHITTLE WAY,A,current (amp),2022-12-31 21:30:00+00:00,7.0,2022-12
20136046,WAVERLEY BUSINESS PARK,WAVERLEY BUSINESS PARK - WHITTLE WAY,A,current (amp),2022-12-31 22:00:00+00:00,8.0,2022-12
20136047,WAVERLEY BUSINESS PARK,WAVERLEY BUSINESS PARK - WHITTLE WAY,A,current (amp),2022-12-31 22:30:00+00:00,7.0,2022-12
20136048,WAVERLEY BUSINESS PARK,WAVERLEY BUSINESS PARK - WHITTLE WAY,A,current (amp),2022-12-31 23:00:00+00:00,8.0,2022-12


Write out the dataset as Parquet, partitioned by `description` and month (`yyyy_mm`).

In [None]:
@pipe.decorate_func
def writes_file(df):
    """Write ``df`` to the ``npg.parquet`` dataset, partitioned by description and month."""
    # NOTE(review): re-running may add files to existing partitions rather than
    # replace them, depending on the parquet engine — confirm before re-running.
    partition_columns = ["description", "yyyy_mm"]
    df.to_parquet("npg.parquet", partition_cols=partition_columns)

Finally, run the pipeline over all the files. Note the file that contains 6 rows belonging to the following month.

In [None]:
# |eval: false
# Push every attachment through the pipeline; the outer bar counts attachments.
# tqdm_position=1 presumably puts the pipeline's own progress bar on the row
# below the outer one — confirm against `core`.
for f in tqdm(files):
    pipe(f, tqdm_position=1)

  0%|          | 0/12 [00:00<?, ?it/s]



16e9515bba143d2ad61d80406e56fbdb 2022-12    20136044
2023-01           6
Name: yyyy_mm, dtype: int64




In [None]:
# | hide
from nbdev.showdoc import *