In [113]:
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype


def datadir(*args):
    """Return the resolved path ``<parent of cwd>/data/<args...>``.

    Parameters
    ----------
    *args
        Path components appended below the ``data`` directory.

    Returns
    -------
    pathlib.Path
        The joined path after ``Path.resolve()``.
    """
    data_root = Path.cwd().parent / "data"
    return data_root.joinpath(*args).resolve()


# Location of the raw dataset CSV (61.49GB on disk).
heeten_data_path = datadir("GridFlexHeetenDataset.csv")

# Boundaries of the consecutive target windows, written with a +00:00 offset.
# The data is from 2018-08-01T01:59:00+02:00 to 2020-08-31T23:58:00+02:00
_window_bounds = [
    "2018-08-01 00:00:00+00:00",
    "2019-08-01 00:00:00+00:00",
    "2020-08-01 00:00:00+00:00",
]
# Pair each boundary with the next one -> [[start, end], [start, end]].
target_time_range = [
    [start, end] for start, end in zip(_window_bounds, _window_bounds[1:])
]

# Measurement labels of interest.
target_measurement = ["UNC_KW", "TOTAL_KW", "EXPORT_KW", "IMPORT_KW", "PV_KW"]

In [129]:
# Ordered categorical dtypes for the three label columns of the CSV.
# read_csv converts any label that is not listed in `categories` to NaN.
# NOTE(review): the recorded output of this cell shows NaN for every `house`
# value, and the raw data also contains a "HouseTest" label that is not part
# of these categories -- confirm the categories match the raw labels.
_cat_house = CategoricalDtype(
    categories=[f"House{n}" for n in range(1, 78)],  # House1 .. House77
    ordered=True,
)
_cat_appliance = CategoricalDtype(
    categories=["SMARTMETER", "PVMETER", "BATTERY"], ordered=True
)
_cat_measurement = CategoricalDtype(
    categories=[
        "BATTERY_EXPORT_KW",
        "BATTERY_IMPORT_KW",
        "BATTERY_KW",
        "BATTERY_TARGET_KW",
        "BATTERY_TARGET_MODE",
        "CHARGE_MODE",
        "CURRENT_PHASE_1",
        "CURRENT_PHASE_2",
        "CURRENT_PHASE_3",
        "EXPORT_KW",
        "EXPORT_KWH",
        "GAS_USAGE_M3",
        "IMPORT_KW",
        "IMPORT_KWH",
        "MAX_BATTERY_KW",
        "MIN_BATTERY_KW",
        "MOMENTARY_EXPORT_KW",
        "MOMENTARY_IMPORT_KW",
        "MOMENTARY_PV_KW",
        "OPERATIONAL_STATE",
        "PV_KW",
        "PV_KWH",
        "REQ_CHARGE_MODE",
        "STATE_OF_CHARGE",
        "TOTAL_KW",
        "TOTAL_KWH",
        "UNC_KW",
    ],
    ordered=True,
)

# Column name -> dtype mapping handed to read_csv.
dtype = {
    "house": _cat_house,
    "appliance": _cat_appliance,
    "measurement": _cat_measurement,
    "value": "float64",
}

# Read only the first 10 rows as a cheap preview of the file, with the
# timestamp column parsed and used as the index.
small_df = pd.read_csv(
    heeten_data_path,
    nrows=10,
    index_col=["timestamp"],
    parse_dates=["timestamp"],
    dtype=dtype,
)
# Last expression of the cell: shown with the notebook's rich DataFrame
# display instead of a plain-text print().
small_df


                          house appliance measurement  value
timestamp                                                   
2018-08-01 01:59:00+02:00   NaN   BATTERY  BATTERY_KW -0.037
2018-08-01 02:00:00+02:00   NaN   BATTERY  BATTERY_KW -0.037
2018-08-01 02:01:00+02:00   NaN   BATTERY  BATTERY_KW -0.033
2018-08-01 02:02:00+02:00   NaN   BATTERY  BATTERY_KW -0.041
2018-08-01 02:03:00+02:00   NaN   BATTERY  BATTERY_KW -0.034
2018-08-01 02:04:00+02:00   NaN   BATTERY  BATTERY_KW -0.033
2018-08-01 02:05:00+02:00   NaN   BATTERY  BATTERY_KW -0.025
2018-08-01 02:06:00+02:00   NaN   BATTERY  BATTERY_KW -0.028
2018-08-01 02:07:00+02:00   NaN   BATTERY  BATTERY_KW -0.028
2018-08-01 02:08:00+02:00   NaN   BATTERY  BATTERY_KW -0.028


In [135]:
# IPython help lookup (`?` suffix) for DataFrame.dropna -- interactive scratch, not part of the analysis.
small_df.dropna?

[1;31mSignature:[0m
[0msmall_df[0m[1;33m.[0m[0mdropna[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mhow[0m[1;33m:[0m [1;34m'AnyAll | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mthresh[0m[1;33m:[0m [1;34m'int | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0msubset[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mignore_index[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame | None'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Remove missing values.

See the :ref:`User Guide <missing_da

In [121]:
# Stream the CSV in chunks and keep only the rows of interest: rows whose
# measurement is one of `target_measurement` and whose `house` is not NaN.
# No dtype is passed to this read, so labels are compared as plain strings.
# NOTE(review): the source file is ~61GB, so the collected selection may still
# be large -- consider writing each filtered chunk to disk instead of keeping
# everything in memory.
target_chunks = []
with pd.read_csv(heeten_data_path, chunksize=1_000) as reader:
    for chunk in reader:
        is_target = chunk["measurement"].isin(target_measurement)
        if not is_target.any():
            continue
        # dropna() returns a new frame (it does not modify `chunk`), so the
        # result has to be kept explicitly.
        kept = chunk.loc[is_target].dropna(subset=["house"])
        if not kept.empty:
            target_chunks.append(kept)

# pd.concat raises on an empty list, so fall back to an empty frame.
target_df = (
    pd.concat(target_chunks, ignore_index=True) if target_chunks else pd.DataFrame()
)


                         timestamp      house   appliance      measurement  \
5289000  2018-08-31T12:31:00+02:00    House76  SMARTMETER  CURRENT_PHASE_3   
5289001  2018-08-31T12:32:00+02:00    House76  SMARTMETER  CURRENT_PHASE_3   
5289002  2018-08-31T12:33:00+02:00    House76  SMARTMETER  CURRENT_PHASE_3   
5289003  2018-08-31T12:34:00+02:00    House76  SMARTMETER  CURRENT_PHASE_3   
5289004  2018-08-31T12:35:00+02:00    House76  SMARTMETER  CURRENT_PHASE_3   
...                            ...        ...         ...              ...   
5289995  2018-08-01T07:07:00+02:00  HouseTest  SMARTMETER        EXPORT_KW   
5289996  2018-08-01T07:08:00+02:00  HouseTest  SMARTMETER        EXPORT_KW   
5289997  2018-08-01T07:09:00+02:00  HouseTest  SMARTMETER        EXPORT_KW   
5289998  2018-08-01T07:10:00+02:00  HouseTest  SMARTMETER        EXPORT_KW   
5289999  2018-08-01T07:11:00+02:00  HouseTest  SMARTMETER        EXPORT_KW   

         value  
5289000    1.0  
5289001    1.0  
5289002    1

KeyboardInterrupt: 

In [123]:
# IPython help lookup (`?` suffix) for DataFrame.dropna -- same lookup as in an earlier cell; interactive scratch only.
small_df.dropna?

[1;31mSignature:[0m
[0msmall_df[0m[1;33m.[0m[0mdropna[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mhow[0m[1;33m:[0m [1;34m'AnyAll | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mthresh[0m[1;33m:[0m [1;34m'int | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0msubset[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mignore_index[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame | None'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Remove missing values.

See the :ref:`User Guide <missing_da

In [77]:
# Summary statistics per house for the preview rows. With a categorical
# grouper, observed=True restricts the result to categories that actually
# occur in the data.
by_house = small_df.groupby("house", observed=True)
by_house.describe()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
house,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
House6,10.0,-0.0324,0.005082,-0.041,-0.03625,-0.033,-0.028,-0.025


In [96]:
# Label-based slice on the timestamp index. The bounds are written with a
# +00:00 offset while the index is displayed at +02:00: the recorded output
# starts at 02:00:00+02:00, so the 01:59:00+02:00 row lies before the lower bound.
utc_window = slice("2018-08-01 00:00:00+00:00", "2018-08-01 12:08:00+00:00")
small_df.loc[utc_window]

Unnamed: 0_level_0,house,appliance,measurement,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-01 02:00:00+02:00,House6,BATTERY,BATTERY_KW,-0.037
2018-08-01 02:01:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:02:00+02:00,House6,BATTERY,BATTERY_KW,-0.041
2018-08-01 02:03:00+02:00,House6,BATTERY,BATTERY_KW,-0.034
2018-08-01 02:04:00+02:00,House6,BATTERY,BATTERY_KW,-0.033
2018-08-01 02:05:00+02:00,House6,BATTERY,BATTERY_KW,-0.025
2018-08-01 02:06:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:07:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
2018-08-01 02:08:00+02:00,House6,BATTERY,BATTERY_KW,-0.028
