In [4]:
import h5py
import pandas as pd
from pandas_profiling import ProfileReport

import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold

import seaborn as sns

# Apply seaborn's "white" theme first, then matplotlib's seaborn style sheet
# (same order as before).
sns.set_theme(style="white")
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the plain "seaborn" alias, so use
# whichever name this matplotlib installation actually provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [6]:
def load_df(
    path="data/raw/2019_data_15min.hdf5", levels=["NO_PV", None, "HOUSEHOLD"]
) -> pd.DataFrame:
    """Converts hdf5 file to dataframe and perform preliminary processing

    For every selected household the dataset ``table`` below
    ``levels[0]/<household>/levels[2]`` is read, its ``index`` column is
    interpreted as epoch seconds and converted to a UTC datetime index, and
    all remaining columns are prefixed with ``"<household>_"``. The
    per-household frames are then joined column-wise.

    Args:
        path (str, optional): path to file. Defaults to 'data/raw/2019_data_15min.hdf5'.
        levels (list, optional): levels in tree as
            ``[top-level group, households, sub-group]``. ``households`` may be
            ``None`` (use every key below the top-level group), a list of
            household names, or a single household name.
            Defaults to ['NO_PV', None,'HOUSEHOLD'].

    Returns:
        pd.DataFrame: master dataframe

    Raises:
        ValueError: raised by ``pd.concat`` when no household is selected.
    """
    df_all = []

    # The context manager guarantees the file handle is released, even when a
    # household or sub-group is missing and a KeyError propagates.
    with h5py.File(path, "r") as f:
        if levels[1] is None:
            # access all households
            households = list(f[levels[0]].keys())
        elif isinstance(levels[1], str):
            # a bare name would otherwise be iterated character by character
            households = [levels[1]]
        else:
            households = levels[1]

        for household in households:
            # ``[:]`` copies the table into memory, so the frame stays valid
            # after the file has been closed.
            df_household = pd.DataFrame(
                f[levels[0]][household][levels[2]]["table"][:]
            )
            df_household["index"] = pd.to_datetime(
                df_household["index"], unit="s", utc=True
            )
            df_household.set_index("index", inplace=True)
            df_household = df_household.add_prefix(f"{household}_")
            df_all.append(df_household)

    return pd.concat(df_all, axis=1)


# Load one frame per yearly file, keeping the per-year names.
yearly_paths = [
    "../data/raw/2018_data_15min.hdf5",
    "../data/raw/2019_data_15min.hdf5",
    "../data/raw/2020_data_15min.hdf5",
]
df_2018, df_2019, df_2020 = [load_df(raw_path) for raw_path in yearly_paths]

# Stack the three years row-wise into the master frame.
data = pd.concat([df_2018, df_2019, df_2020], axis=0)
print(f"data.shape: {data.shape}")

In [7]:
# Columns whose name ends in "_S_TOT" (one per household prefix).
s_tot_households = [col for col in data.columns if col.endswith("_S_TOT")]

# Row-wise sum over those columns, moved up by 4 * 24 rows so that each row
# carries the sum observed 96 rows later (presumably one day ahead at
# 15-minute resolution -- confirm against the data).
total_s_tot = data[s_tot_households].sum(axis=1)
data["TARGET"] = total_s_tot.shift(periods=-4 * 24)

# The shift leaves NaN targets in the trailing rows; keep only rows that
# still have a target.
data = data.dropna(subset=["TARGET"])
data.index.names = ["date"]
data.to_pickle("../data/preprocessed/master_df.pkl")

In [8]:
# Preview the first rows of the master frame.
data.head()

In [None]:
% % script false - -no-raise-error
profile = ProfileReport(data, correlations=None,
                        dark_mode=True, minimal=True)
profile.to_file("../docs/raw_pandas_profiling.html")


## Correlations

In [11]:
# Pairwise correlations, computed on a reproducible 1000-row sample.
corr_sample = data.sample(n=1000, replace=False, axis=0, random_state=1)
corr = corr_sample.corr()

# Boolean mask covering the upper triangle (diagonal included), so only the
# lower triangle of the matrix is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(20, 20))
# Draw the masked heatmap with square cells on the axes created above.
sns.heatmap(corr, mask=mask, square=True, ax=ax)

In [12]:
# Absolute correlation of every feature with the target, strongest first.
# TARGET's correlation with itself is trivially 1, so it is dropped here;
# otherwise it would always rank first and be treated as a "feature".
highest_corr_features = (
    corr["TARGET"].drop(labels="TARGET").abs().sort_values(ascending=False)
)
highest_corr_featurenames = highest_corr_features.index.to_list()
highest_corr_features.head(5)

In [13]:
# Scatter the first six entries of highest_corr_featurenames against the target.

# Reproducible random sample with n observations.
sample = data.sample(n=1000, replace=False, axis=0, random_state=1)

fig, ax = plt.subplots(2, 3, figsize=(20, 10))
# ax.flat walks the 2x3 grid row by row, so position k is ax[k // 3, k % 3].
for rank, panel in enumerate(ax.flat):
    sns.scatterplot(
        x=sample[highest_corr_featurenames[rank]],
        y=sample["TARGET"],
        ax=panel,
    )

plt.suptitle("Distribution of most correlated features with target on sample")
plt.show()

## Analyze distributions of observations over time

In [14]:
# Number of non-null TARGET values per distinct value of the "date" index.
target_by_date = data.groupby(level="date")["TARGET"]
counts = target_by_date.agg(["count"])
# Keep the column labels as a flat index.
counts.columns = counts.columns.to_flat_index()
counts.head()

In [15]:
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
# Top panel: density-normalised histogram with a KDE overlay. histplot is the
# replacement for the deprecated (and, in newer seaborn, removed) distplot;
# kde=True / stat="density" reproduce distplot's default look.
sns.histplot(x=counts["count"], kde=True, stat="density", ax=ax[0])
# Bottom panel: the same counts plotted against the "date" index.
sns.lineplot(data=counts, x="date", y="count", ax=ax[1])
plt.show()

## Analyze distribution of target

In [16]:
plt.figure(figsize=(20, 10))

# Density-normalised histogram of the target with a KDE overlay. histplot is
# the replacement for the deprecated (and, in newer seaborn, removed)
# distplot; kde=True / stat="density" reproduce distplot's default look.
sns.histplot(x=data["TARGET"], kde=True, stat="density")

# Basic summary statistics of the target.
print(f"Mean of target:{data['TARGET'].mean()}")
print(f"Minimum value of target:{data['TARGET'].min()}")
print(f"Maximum value of target:{data['TARGET'].max()}")

## Analyze distribution of selected features

In [17]:
# Four columns of household "SFH10" to inspect.
selected_features = ["SFH10_S_1", "SFH10_S_2", "SFH10_S_3", "SFH10_S_TOT"]

# Reproducible random sample with n observations.
sample = data.sample(n=1000, replace=False, axis=0, random_state=1)

fig, ax = plt.subplots(2, 2, figsize=(20, 10))

# One violin per feature; ax.flat walks the 2x2 grid row by row, matching the
# order of selected_features.
for panel, feature in zip(ax.flat, selected_features):
    sns.violinplot(y=sample[feature], ax=panel)

## Analyze quasi-constant features

In [18]:
# adopted from here: https://www.kaggle.com/prashant111/comprehensive-guide-on-feature-selection?scriptVersionId=47174422&cellId=30
# Fit a variance filter with threshold 0.05 on the full frame; fit() returns
# the estimator itself, which is then shown as the cell output.
sel = VarianceThreshold(threshold=0.05).fit(data)
sel

In [19]:
# Columns the fitted selector keeps ...
kept_columns = set(data.columns[sel.get_support()])
# ... and, in original column order, the ones it does not keep.
dropped_columns = [col for col in data.columns if col not in kept_columns]

print(len(dropped_columns))

dropped_columns

## Analyze mean and standard deviation over time

In [20]:
# Mean and standard deviation of every column per (year, month) of the index.
monthly_groups = data.groupby(by=[data.index.year, data.index.month])
data_stats = monthly_groups.agg(["mean", "std"])
# Collapse the (column, statistic) column labels into "<column>_<statistic>".
data_stats.columns = ["_".join(tups) for tups in data_stats.columns.to_flat_index()]
# Discard the (year, month) group index in favour of a plain 0..n-1 counter.
data_stats = data_stats.reset_index(drop=True)

In [21]:
# Display the per-(year, month) mean/std table (one row per group).
data_stats

In [22]:
# Monthly mean and standard deviation of the target, one line each.
selected_features = ["TARGET_mean", "TARGET_std"]
data_stats_selected = data_stats[selected_features].reset_index(drop=True)

load_ax = sns.lineplot(data=data_stats_selected)
load_ax.set_ylabel("Load")
load_ax.figure.suptitle("Load over time")
plt.show()