In [4]:
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
from tqdm import tqdm


def datadir(*args):
    """Return the resolved path of ``data/<args...>`` next to the working directory.

    The ``data`` directory is looked up in the parent of the current working
    directory; every positional argument is appended to it as a further path
    component. Called without arguments, the ``data`` directory itself is
    returned.
    """
    data_root = Path.cwd().parent / "data"
    return data_root.joinpath(*args).resolve()


# Raw GridFlex Heeten export; the file is about 61.49GB on disk.
path_heeten_data = datadir("GridFlexHeetenDataset.csv")

# Two back-to-back one-year windows, each given as [start, end] in UTC.
# The raw data itself runs from 2018-08-01T01:59:00+02:00 to
# 2020-08-31T23:58:00+02:00.
target_time_range = [
    [f"{year}-08-01 00:00:00+00:00", f"{year + 1}-08-01 00:00:00+00:00"]
    for year in (2018, 2019)
]

# Measurement types that are kept when the raw file is filtered.
target_measurement = ["UNC_KW", "TOTAL_KW", "EXPORT_KW", "IMPORT_KW", "PV_KW"]

In [2]:
# Ordered categorical dtypes for the three label columns of the CSV.
# House1 ... House77 are the only accepted house labels: "HouseTest" is left
# out on purpose, so "HouseTest" will resolve to NaN.
_cat_house = CategoricalDtype(
    categories=["House" + str(number) for number in range(1, 78)],
    ordered=True,
)
_cat_appliance = CategoricalDtype(
    ["SMARTMETER", "PVMETER", "BATTERY"],
    ordered=True,
)
# Measurement labels, listed alphabetically; that is also the category order.
_cat_measurement = CategoricalDtype(
    [
        "BATTERY_EXPORT_KW", "BATTERY_IMPORT_KW", "BATTERY_KW",
        "BATTERY_TARGET_KW", "BATTERY_TARGET_MODE",
        "CHARGE_MODE",
        "CURRENT_PHASE_1", "CURRENT_PHASE_2", "CURRENT_PHASE_3",
        "EXPORT_KW", "EXPORT_KWH",
        "GAS_USAGE_M3",
        "IMPORT_KW", "IMPORT_KWH",
        "MAX_BATTERY_KW", "MIN_BATTERY_KW",
        "MOMENTARY_EXPORT_KW", "MOMENTARY_IMPORT_KW", "MOMENTARY_PV_KW",
        "OPERATIONAL_STATE",
        "PV_KW", "PV_KWH",
        "REQ_CHARGE_MODE",
        "STATE_OF_CHARGE",
        "TOTAL_KW", "TOTAL_KWH",
        "UNC_KW",
    ],
    ordered=True,
)

# Column name -> dtype mapping handed to `pd.read_csv`.
dtype = dict(
    house=_cat_house,
    appliance=_cat_appliance,
    measurement=_cat_measurement,
    value="float64",
)
# Load only the first ten rows as a small sample, indexed by the parsed
# `timestamp` column and typed through the `dtype` mapping.
small_df = pd.read_csv(
    path_heeten_data,
    dtype=dtype,
    parse_dates=["timestamp"],
    index_col=["timestamp"],
    nrows=10,
)
print(small_df)

                            house appliance measurement  value
timestamp                                                     
2018-08-01 01:59:00+02:00  House6   BATTERY  BATTERY_KW -0.037
2018-08-01 02:00:00+02:00  House6   BATTERY  BATTERY_KW -0.037
2018-08-01 02:01:00+02:00  House6   BATTERY  BATTERY_KW -0.033
2018-08-01 02:02:00+02:00  House6   BATTERY  BATTERY_KW -0.041
2018-08-01 02:03:00+02:00  House6   BATTERY  BATTERY_KW -0.034
2018-08-01 02:04:00+02:00  House6   BATTERY  BATTERY_KW -0.033
2018-08-01 02:05:00+02:00  House6   BATTERY  BATTERY_KW -0.025
2018-08-01 02:06:00+02:00  House6   BATTERY  BATTERY_KW -0.028
2018-08-01 02:07:00+02:00  House6   BATTERY  BATTERY_KW -0.028
2018-08-01 02:08:00+02:00  House6   BATTERY  BATTERY_KW -0.028


In [77]:
# Per-house summary statistics of the sample; `observed=True` restricts the
# result to the house categories that actually occur in it.
(
    small_df
    .groupby("house", observed=True)
    .describe()
)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
house,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
House6,10.0,-0.0324,0.005082,-0.041,-0.03625,-0.033,-0.028,-0.025


In [4]:
# Label-slice the sample on its tz-aware index. The bounds are given in UTC,
# so the first sample row (01:59+02:00, i.e. 23:59 UTC on the previous day)
# falls outside the slice.
small_df.loc[
    slice("2018-08-01 00:00:00+00:00", "2018-08-01 12:08:00+00:00")
]

Unnamed: 0_level_0,house,appliance,measurement,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-01 02:00:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
2018-08-01 02:01:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:02:00+02:00,House6,BATTERY,BATTERY_KW,-0.041
2018-08-01 02:03:00+02:00,House6,BATTERY,BATTERY_KW,-0.034
2018-08-01 02:04:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:05:00+02:00,House6,BATTERY,BATTERY_KW,-0.025
2018-08-01 02:06:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:07:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:08:00+02:00,House6,BATTERY,BATTERY_KW,-0.028


In [None]:
output_path_2018 = datadir("Heeten_2018.csv")
output_path_2019 = datadir("Heeten_2019.csv")
output_paths = (output_path_2018, output_path_2019)

# Pair every target window with its output file. The windows are treated as
# half-open, [start, end): the two windows share the boundary
# 2019-08-01 00:00 UTC, and an inclusive label slice would write a row at
# exactly that instant into both files.
window_outputs = [
    (pd.Timestamp(window_start), pd.Timestamp(window_end), output_path)
    for (window_start, window_end), output_path in zip(
        target_time_range, output_paths
    )
]

# Rows are appended chunk by chunk, so output left behind by an earlier
# (possibly interrupted) run has to go first; otherwise re-running this cell
# would silently duplicate every row.
for output_path in output_paths:
    if output_path.is_file():
        output_path.unlink()

# The timestamp column is read as plain text and parsed below with
# `utc=True` instead of `parse_dates`. The raw timestamps carry a local UTC
# offset; a chunk that mixes offsets (presumably around DST changes - not
# visible in the ten-row sample) would otherwise come back with an object
# index that cannot be compared against the UTC window bounds.
# Consequence: timestamps are written to the output files in UTC (+00:00).
with pd.read_csv(
    path_heeten_data,
    chunksize=10_000,
    index_col=["timestamp"],
    dtype=dtype,
) as reader:
    for chunk in tqdm(reader):
        # Keep rows of a known house ("HouseTest" was read as NaN) that carry
        # one of the target measurements.
        wanted = chunk["house"].notna() & chunk["measurement"].isin(
            target_measurement
        )
        if not wanted.any():
            continue

        # Parse only the surviving rows, then sort once per chunk so rows are
        # written in timestamp order within the chunk.
        rows = chunk.loc[wanted]
        rows = rows.set_axis(
            pd.to_datetime(rows.index, utc=True), axis=0
        ).sort_index()

        for window_start, window_end, output_path in window_outputs:
            in_window = (rows.index >= window_start) & (rows.index < window_end)
            if not in_window.any():
                # Nothing for this window: do not open the file at all. A
                # window that never receives a row gets no output file.
                continue
            rows.loc[in_window].to_csv(
                output_path,
                mode="a",
                # The first write of this run creates the file with a header.
                header=not output_path.is_file(),
            )