In [41]:
import logging
from datetime import datetime
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
from tqdm import tqdm


def datadir(*args):
    return Path.cwd().parent.joinpath("data", *args).resolve()


path_heeten_data = datadir("GridFlexHeetenDataset.csv")  # 61.49GB
target_time_range = [
    ["2018-08-01 00:00:00+00:00", "2019-08-01 00:00:00+00:00"],
    ["2019-08-01 00:00:00+00:00", "2020-08-01 00:00:00+00:00"],
]  # The data is from 2018-08-01T01:59:00+02:00 to 2020-08-31T23:58:00+02:00
target_measurement = [
    "UNC_KW",
    "TOTAL_KW",
    "EXPORT_KW",
    "IMPORT_KW",
    "PV_KW",
]

In [66]:
_cat_house = CategoricalDtype(
    categories=[f"House{i}" for i in range(1, 78)], ordered=True
)  # We disregard "HouseTest", so "HouseTest" will resolve to NaN
_cat_appliance = CategoricalDtype(
    categories=["SMARTMETER", "PVMETER", "BATTERY"], ordered=True
)
_cat_measurement = CategoricalDtype(
    categories=[
        "BATTERY_EXPORT_KW",
        "BATTERY_IMPORT_KW",
        "BATTERY_KW",
        "BATTERY_TARGET_KW",
        "BATTERY_TARGET_MODE",
        "CHARGE_MODE",
        "CURRENT_PHASE_1",
        "CURRENT_PHASE_2",
        "CURRENT_PHASE_3",
        "EXPORT_KW",
        "EXPORT_KWH",
        "GAS_USAGE_M3",
        "IMPORT_KW",
        "IMPORT_KWH",
        "MAX_BATTERY_KW",
        "MIN_BATTERY_KW",
        "MOMENTARY_EXPORT_KW",
        "MOMENTARY_IMPORT_KW",
        "MOMENTARY_PV_KW",
        "OPERATIONAL_STATE",
        "PV_KW",
        "PV_KWH",
        "REQ_CHARGE_MODE",
        "STATE_OF_CHARGE",
        "TOTAL_KW",
        "TOTAL_KWH",
        "UNC_KW",
    ],
    ordered=True,
)
dtype = {
    "house": _cat_house,
    "appliance": _cat_appliance,
    "measurement": _cat_measurement,
    "value": "float64",
}
# FIXME: solve mixed time offsets
# CET: +01:00 (https://en.wikipedia.org/wiki/Central_European_Time)
# CEST: +02:00 (https://en.wikipedia.org/wiki/Central_European_Summer_Time)
small_df = pd.read_csv(
    path_heeten_data,
    nrows=10,
    index_col=["timestamp"],
    parse_dates=["timestamp"],
    dtype=dtype,
)
small_df
# small_df.groupby("house", observed=True).describe()
# small_df.loc["2018-08-01 00:00:00+00:00":"2018-08-01 12:08:00+00:00"]
# small_df.query("'2018-08-01 02:00:00+02:00' <= index <= '2018-08-01 02:05:00+02:00'")  # " <= index <= ".join(map(lambda x: "'" + x + "'", target_time_range[0]))

Unnamed: 0_level_0,house,appliance,measurement,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-01 01:59:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
2018-08-01 02:00:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
2018-08-01 02:01:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:02:00+02:00,House6,BATTERY,BATTERY_KW,-0.041
2018-08-01 02:03:00+02:00,House6,BATTERY,BATTERY_KW,-0.034
2018-08-01 02:04:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:05:00+02:00,House6,BATTERY,BATTERY_KW,-0.025
2018-08-01 02:06:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:07:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:08:00+02:00,House6,BATTERY,BATTERY_KW,-0.028


In [118]:
dtype = {
    "timestamp": str,
    "house": _cat_house,
    "appliance": _cat_appliance,
    "measurement": _cat_measurement,
    "value": "float64",
}
small_df = pd.read_csv(
    path_heeten_data,
    nrows=10,
    # index_col=["timestamp"],
    # parse_dates=["timestamp"],
    dtype=dtype,
)

# small_df.reindex(pd.to_datetime(small_df["timestamp"]))
small_df.set_index(pd.to_datetime(small_df["timestamp"]), inplace=True)
small_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 2018-08-01 01:59:00+02:00 to 2018-08-01 02:08:00+02:00
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   timestamp    10 non-null     object  
 1   house        10 non-null     category
 2   appliance    10 non-null     category
 3   measurement  10 non-null     category
 4   value        10 non-null     float64 
dtypes: category(3), float64(1), object(1)
memory usage: 4.3+ KB


In [117]:
small_df

Unnamed: 0,timestamp,house,appliance,measurement,value
0,2018-08-01T01:59:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
1,2018-08-01T02:00:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
2,2018-08-01T02:01:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
3,2018-08-01T02:02:00+02:00,House6,BATTERY,BATTERY_KW,-0.041
4,2018-08-01T02:03:00+02:00,House6,BATTERY,BATTERY_KW,-0.034
5,2018-08-01T02:04:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
6,2018-08-01T02:05:00+02:00,House6,BATTERY,BATTERY_KW,-0.025
7,2018-08-01T02:06:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
8,2018-08-01T02:07:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
9,2018-08-01T02:08:00+02:00,House6,BATTERY,BATTERY_KW,-0.028


In [4]:
df = pd.read_csv(
    path_heeten_data,
    nrows=20000,
    index_col=["timestamp"],
    parse_dates=["timestamp"],
    dtype=dtype,
    names=["timestamp", "house", "appliance", "measurement", "value"],
    skiprows=7826*10000#7827*10000
)

In [72]:
small_df.index.values[0]
datetime.fromisoformat(df.index.values[0])

TypeError: fromisoformat: argument must be str

In [81]:
a = df.index.values[1]
print(a)

2018-10-19 05:15:00+02:00


datetime.datetime(2018, 10, 19, 5, 15, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)))

In [87]:
datetime.strptime('2018-10-19 05:14:00+02:00', "%Y-%m-%d %H:%M:%S%z")
datetime.fromisoformat('2018-10-19 05:14:00+02:00')

datetime.datetime(2018, 10, 19, 5, 14, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)))

In [97]:
a.strftime("%Y-%m-%d %H:%M:%S%z")

'2018-10-19 05:15:00+0200'

In [96]:
df.index.to_series()[0]

  df.index.to_series()[0]


Timestamp('2018-10-19 05:14:00+0200', tz='UTC+02:00')

In [110]:
pd.to_datetime?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mto_datetime[0m[1;33m([0m[1;33m
[0m    [0marg[0m[1;33m:[0m [1;34m'DatetimeScalarOrArrayConvertible | DictConvertible'[0m[1;33m,[0m[1;33m
[0m    [0merrors[0m[1;33m:[0m [1;34m'DateTimeErrorChoices'[0m [1;33m=[0m [1;34m'raise'[0m[1;33m,[0m[1;33m
[0m    [0mdayfirst[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0myearfirst[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mutc[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mformat[0m[1;33m:[0m [1;34m'str | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mexact[0m[1;33m:[0m [1;34m'bool | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0munit[0m[1;33m:[0m [1;34m'str | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minfer_d

In [115]:
try:
    df.sort_index().loc[
                    target_time_range[0][0] : target_time_range[0][1]
                ]
except Exception as e:
    logging.exception(df.sort_index(), exc_info=False)

ERROR:root:                            house   appliance measurement  value
timestamp                                                       
2018-10-01 00:00:00+02:00  House1  SMARTMETER   EXPORT_KW    0.0
2018-10-01 00:01:00+02:00  House1  SMARTMETER   EXPORT_KW    0.0
2018-10-01 00:03:00+02:00  House1  SMARTMETER   EXPORT_KW    0.0
2018-10-01 00:04:00+02:00  House1  SMARTMETER   EXPORT_KW    0.0
2018-10-01 00:05:00+02:00  House1  SMARTMETER   EXPORT_KW    0.0
...                           ...         ...         ...    ...
2018-10-31 23:53:00+01:00     NaN  SMARTMETER   EXPORT_KW    0.0
2018-10-31 23:54:00+01:00     NaN  SMARTMETER   EXPORT_KW    0.0
2018-10-31 23:55:00+01:00     NaN  SMARTMETER   EXPORT_KW    0.0
2018-10-31 23:57:00+01:00     NaN  SMARTMETER   EXPORT_KW    0.0
2018-10-31 23:58:00+01:00     NaN  SMARTMETER   EXPORT_KW    0.0

[20000 rows x 4 columns]


In [None]:
output_path_2018 = datadir("Heeten_2018")
output_path_2019 = datadir("Heeten_2019")

# small_df.to_csv(output_path_2018, mode="a", header=False) if output_path_2018.is_file() else small_df.to_csv(output_path_2018)
with pd.read_csv(
    path_heeten_data,
    chunksize=10_000,
    index_col=["timestamp"],
    parse_dates=["timestamp"],
    dtype=dtype,
) as reader:
    for chunk in tqdm(reader):
        if chunk["measurement"].isin(target_measurement).any():
            # remove rows containing NaN (i.e. "HouseTest") in `house` column
            chunk.dropna(subset=["house"], inplace=True)
            # picks up row containing target_measurement in `measurement` column
            chunk.query("measurement in @target_measurement", inplace=True)

            if chunk.empty:
                continue

            # TODO:
            # - async?
            try:
                chunk.sort_index().loc[
                    target_time_range[0][0] : target_time_range[0][1]
                ].to_parquet(output_path_2018, partition_cols=["house", "measurement"])
                chunk.sort_index().loc[
                    target_time_range[1][0] : target_time_range[1][1]
                ].to_parquet(output_path_2019, partition_cols=["house", "measurement"])
            except Exception as e:
                logging.
            # Write to 2018
            # if output_path_2018.is_file():
            #     chunk.sort_index().loc[
            #         target_time_range[0][0] : target_time_range[0][1]
            #     ].to_csv(output_path_2018, mode="a", header=False)
            # else:
            #     chunk.sort_index().loc[
            #         target_time_range[0][0] : target_time_range[0][1]
            #     ].to_csv(output_path_2018)

            # Write to 2019
            # if output_path_2019.is_file():
            #     chunk.sort_index().loc[
            #         target_time_range[1][0] : target_time_range[1][1]
            #     ].to_csv(output_path_2019, mode="a", header=False)
            # else:
            #     chunk.sort_index().loc[
            #         target_time_range[1][0] : target_time_range[1][1]
            #     ].to_csv(output_path_2019)

## Read parquet

In [None]:
# output_path_2018 = datadir("Heeten_2018")
# df = pd.read_parquet(output_path_2018, columns=["house", "appliance", "measurement", "value"])
df.info()