# 5. Sensor Data Semi-Supervised Classification

In [1]:
from datetime import datetime
import random

from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Location of the semicolon-separated CSV export loaded in section 5.1 (relative path).
ANON_HIGH_RES_DATASET = "../data/Anonymized High Resolution Dataset (8 Days).csv"

## 5.1 Import datasets

In [3]:
# Load the raw export; fields in the file are delimited by ';'.
df_anon_high_res = pd.read_csv(ANON_HIGH_RES_DATASET, sep=';')

## 5.2 Preprocess data

### 5.2.1 Prepare data

**Rename columns**

In [4]:
# Raw export header -> snake_case column name used throughout the notebook.
_COLUMN_RENAMES = {
    "TS": "timestamp",
    "Tag": "sensor_tag",
    "Value": "value",
    "Sensor Type": "sensor_type",
    "Equipment Type": "equipment_type",
}

df_anon_high_res = df_anon_high_res.rename(columns=_COLUMN_RENAMES)

**Set categorical values**

In [5]:
# Rebuild each tag as "<first word, upper-cased>_<second word, zero-padded to 2 characters>".
tag_words = df_anon_high_res["sensor_tag"].str.split(" ")
df_anon_high_res["sensor_tag"] = tag_words.apply(
    lambda words: f"{words[0].upper()}_{words[1].zfill(2)}"
)

# Upper-case the two descriptive label columns.
for label_column in ("sensor_type", "equipment_type"):
    df_anon_high_res[label_column] = df_anon_high_res[label_column].str.upper()

In [6]:
# Manually assigned sensor type for each sensor number (1-69). These labels override the
# "sensor_type" value that came with the export; a tag that is not listed keeps its value.
_SENSOR_NUMBERS_BY_TYPE = {
    "FLOW": [1, 2, 16, *range(25, 30), 40, *range(47, 61), *range(64, 70)],
    "COUNTER": [3],
    "UNKNOWN": [4],
    "MEASUREMENT": [*range(5, 14)],
    "SETTING": [14, *range(31, 39)],
    "TEMPERATURE": [15, 24, 30, 42, 44],
    "ACTUAL": [17, 19],
    "SPEED": [18, 20, 61],
    "PERCENTAGE": [21, 62],
    "CURRENT": [22],
    "FREQUENCY": [23, 45, 46],
    "PRESSURE": [39, 41, 43],
    "TIME": [63],
}

# Invert into {"SENSOR_01": "FLOW", ...} so the whole column can be relabelled in one pass.
SENSOR_TYPE_BY_TAG = {
    f"SENSOR_{number:02d}": sensor_type
    for sensor_type, numbers in _SENSOR_NUMBERS_BY_TYPE.items()
    for number in numbers
}

# Guard against typos in the table above: every sensor 1..69 must appear exactly once.
assert sorted(SENSOR_TYPE_BY_TAG) == [f"SENSOR_{number:02d}" for number in range(1, 70)]
assert sum(len(numbers) for numbers in _SENSOR_NUMBERS_BY_TYPE.values()) == 69

# One lookup per row instead of 69 separate full-column comparisons; rows whose tag has no
# entry fall back to their existing sensor_type.
df_anon_high_res["sensor_type"] = (
    df_anon_high_res["sensor_tag"]
    .map(SENSOR_TYPE_BY_TAG)
    .fillna(df_anon_high_res["sensor_type"])
)

In [7]:
# Manual override: rows of SENSOR_15 get equipment type "TANK", replacing the exported value.
df_anon_high_res.loc[df_anon_high_res["sensor_tag"] == "SENSOR_15", "equipment_type"] = "TANK"

**Set data types**

In [8]:
# Parse the timestamps first, then cast the remaining columns in one astype call.
df_anon_high_res["timestamp"] = pd.to_datetime(df_anon_high_res["timestamp"])
df_anon_high_res = df_anon_high_res.astype(
    {
        "sensor_tag": "category",
        "value": "float",
        "sensor_type": "category",
        "equipment_type": "category",
    }
)

### 5.2.2 Clean data

**Remove duplicate rows**

In [9]:
# Rows are considered duplicates when they share both timestamp and sensor_tag.
# keep=False removes every row of such a group instead of retaining one representative.
# NOTE(review): confirm that dropping all copies (rather than keeping the first) is intended.
print(f"Rows (Before): {df_anon_high_res.shape[0]}")
df_anon_high_res = df_anon_high_res.drop_duplicates(subset = ['timestamp', 'sensor_tag'], keep = False)
print(f"Rows (After): {df_anon_high_res.shape[0]}")

Rows (Before): 1416864
Rows (After): 1416381


**Remove missing data**

In [10]:
# Drop every reading whose "value" is missing and report the row count before and after.
print(f"Rows (Before): {df_anon_high_res.shape[0]}")
df_anon_high_res = df_anon_high_res.dropna(subset="value")
print(f"Rows (After): {df_anon_high_res.shape[0]}")

Rows (Before): 1416381
Rows (After): 1393345


**Remove unused sensors**

In [11]:
# Sensor tags removed from the analysis; the trailing comment on each entry states the reason.
exclude = [
    "SENSOR_04", # No variation
    "SENSOR_05", # Irregular lab measurements
    "SENSOR_06", # Irregular lab measurements
    "SENSOR_07", # Irregular lab measurements
    "SENSOR_08", # Irregular lab measurements
    "SENSOR_09", # Irregular lab measurements
    "SENSOR_10", # Irregular lab measurements
    "SENSOR_11", # Irregular lab measurements
    "SENSOR_12", # Irregular lab measurements
    "SENSOR_13", # Irregular lab measurements
    "SENSOR_14", # Ignore this sensor (Arjen)
    "SENSOR_17", # No data points
    "SENSOR_18", # No data points
    "SENSOR_19", # No data points
    "SENSOR_20", # No data points
    "SENSOR_46", # No variation
    "SENSOR_50", # No variation
    "SENSOR_54", # No variation
]

# Keep only the rows whose tag is not listed above. This filters rows only: the categorical
# dtype of "sensor_tag" still carries the excluded tags as (now unused) categories.
df_anon_high_res = df_anon_high_res[~df_anon_high_res["sensor_tag"].isin(exclude)]

**Reset index**

In [12]:
# Renumber the rows 0..n-1 after the row removals above; the old index is discarded.
df_anon_high_res = df_anon_high_res.reset_index(drop=True)

**Sort rows**

In [13]:
# Order rows by sensor and chronologically within each sensor. The index labels are not
# renumbered by this call.
df_anon_high_res = df_anon_high_res.sort_values(["sensor_tag", "timestamp"])

### 5.2.3 Interpolate data

In [14]:
%%time
# Sensors that are forward-filled (last reading carried forward) when upsampling; every other
# sensor is interpolated linearly between readings.
# NOTE(review): the previous inline list named SENSOR_32 three times; the occurrence between
# SENSOR_38 and SENSOR_40 may have been meant as SENSOR_39 - confirm before adding it here.
FORWARD_FILL_SENSORS = frozenset({
    "SENSOR_01",
    "SENSOR_02",
    "SENSOR_28",
    "SENSOR_29",
    "SENSOR_31",
    "SENSOR_32",
    "SENSOR_33",
    "SENSOR_34",
    "SENSOR_35",
    "SENSOR_36",
    "SENSOR_37",
    "SENSOR_38",
    "SENSOR_40",
})

# Work on a timestamp-indexed copy so that df_anon_high_res is only rebound once the whole
# cell has succeeded (a failure half-way no longer leaves it without its timestamp column).
df_by_time = df_anon_high_res.set_index("timestamp")

# Time window covered by *every* sensor: it starts at the latest first reading and ends at the
# earliest last reading. Both bounds start at the global extremes and are tightened per sensor.
df_anon_high_res_min_date = df_by_time.index.min()
df_anon_high_res_max_date = df_by_time.index.max()

# Per-sensor results are collected and concatenated once after the loop; growing a DataFrame
# with pd.concat inside the loop re-copies all previously accumulated rows on every iteration.
interpolated_frames = []

for sensor_tag in tqdm(df_by_time["sensor_tag"].unique()):
    df_subset = df_by_time[df_by_time["sensor_tag"] == sensor_tag]

    # Tighten the common window
    df_anon_high_res_min_date = max(df_anon_high_res_min_date, df_subset.index.min())
    df_anon_high_res_max_date = min(df_anon_high_res_max_date, df_subset.index.max())

    # Upsample to one row per second and fill the gaps
    resampled = df_subset["value"].resample("1s")
    if sensor_tag in FORWARD_FILL_SENSORS:
        # Same result as interpolate("ffill"), which newer pandas releases deprecate.
        values = resampled.asfreq().ffill()
    else:
        values = resampled.interpolate("linear")

    # Re-attach the per-sensor labels; .iloc[0] reads the first row by position (plain [0] on
    # a datetime-indexed Series relies on a deprecated positional fallback).
    df_subset_interpolated = values.to_frame(name="value").assign(
        sensor_tag=sensor_tag,
        sensor_type=df_subset["sensor_type"].iloc[0],
        equipment_type=df_subset["equipment_type"].iloc[0],
    )

    # Keep the column order of the source frame
    interpolated_frames.append(df_subset_interpolated[list(df_by_time.columns)])

if interpolated_frames:
    df_anon_high_res_interpolated = pd.concat(interpolated_frames, axis=0)
else:
    df_anon_high_res_interpolated = pd.DataFrame(columns=df_by_time.columns)

# Set data types
df_anon_high_res_interpolated["sensor_tag"] = df_anon_high_res_interpolated["sensor_tag"].astype("category")
df_anon_high_res_interpolated["value"] = df_anon_high_res_interpolated["value"].astype("float")
df_anon_high_res_interpolated["sensor_type"] = df_anon_high_res_interpolated["sensor_type"].astype("category")
df_anon_high_res_interpolated["equipment_type"] = df_anon_high_res_interpolated["equipment_type"].astype("category")

# Restore the timestamp column and trim to the window shared by all sensors (bounds inclusive)
print(f"Rows (Before): {df_anon_high_res_interpolated.shape[0]}")
df_anon_high_res_interpolated = df_anon_high_res_interpolated.reset_index(names="timestamp")
df_anon_high_res_interpolated = df_anon_high_res_interpolated[
    (df_anon_high_res_interpolated["timestamp"] >= df_anon_high_res_min_date)
    & (df_anon_high_res_interpolated["timestamp"] <= df_anon_high_res_max_date)
]
df_anon_high_res = df_by_time.reset_index(names="timestamp")
print(f"Rows (After): {df_anon_high_res_interpolated.shape[0]}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:21<00:00,  2.33it/s]


Rows (Before): 35246354
Rows (After): 35239419
CPU times: user 18.8 s, sys: 8.69 s, total: 27.5 s
Wall time: 27.5 s


## 5.3 Explore data

### 5.3.1 Inspect rows

In [15]:
# First ten rows of the cleaned (non-interpolated) readings.
df_anon_high_res.head(10)

Unnamed: 0,timestamp,sensor_tag,value,sensor_type,equipment_type
0,2022-01-24 00:00:15,SENSOR_01,0.0,FLOW,PUMP
1,2022-01-24 00:02:15,SENSOR_01,0.0,FLOW,PUMP
2,2022-01-24 00:04:15,SENSOR_01,0.0,FLOW,PUMP
3,2022-01-24 00:06:15,SENSOR_01,0.0,FLOW,PUMP
4,2022-01-24 00:08:15,SENSOR_01,0.0,FLOW,PUMP
5,2022-01-24 00:10:15,SENSOR_01,0.0,FLOW,PUMP
6,2022-01-24 00:12:15,SENSOR_01,0.0,FLOW,PUMP
7,2022-01-24 00:14:15,SENSOR_01,0.0,FLOW,PUMP
8,2022-01-24 00:16:15,SENSOR_01,0.0,FLOW,PUMP
9,2022-01-24 00:18:15,SENSOR_01,0.0,FLOW,PUMP


In [16]:
# First ten rows of the interpolated readings.
df_anon_high_res_interpolated.head(10)

Unnamed: 0,timestamp,sensor_tag,value,sensor_type,equipment_type
102,2022-01-24 00:01:57,SENSOR_01,0.0,FLOW,PUMP
103,2022-01-24 00:01:58,SENSOR_01,0.0,FLOW,PUMP
104,2022-01-24 00:01:59,SENSOR_01,0.0,FLOW,PUMP
105,2022-01-24 00:02:00,SENSOR_01,0.0,FLOW,PUMP
106,2022-01-24 00:02:01,SENSOR_01,0.0,FLOW,PUMP
107,2022-01-24 00:02:02,SENSOR_01,0.0,FLOW,PUMP
108,2022-01-24 00:02:03,SENSOR_01,0.0,FLOW,PUMP
109,2022-01-24 00:02:04,SENSOR_01,0.0,FLOW,PUMP
110,2022-01-24 00:02:05,SENSOR_01,0.0,FLOW,PUMP
111,2022-01-24 00:02:06,SENSOR_01,0.0,FLOW,PUMP


### 5.3.2 Inspect data types

In [17]:
# Column dtypes, non-null counts and memory footprint of the cleaned frame.
df_anon_high_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1318459 entries, 0 to 1318458
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   timestamp       1318459 non-null  datetime64[ns]
 1   sensor_tag      1318459 non-null  category      
 2   value           1318459 non-null  float64       
 3   sensor_type     1318459 non-null  category      
 4   equipment_type  1318459 non-null  category      
dtypes: category(3), datetime64[ns](1), float64(1)
memory usage: 23.9 MB


In [18]:
# Column dtypes and memory footprint of the interpolated frame.
df_anon_high_res_interpolated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35239419 entries, 102 to 35246301
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   timestamp       datetime64[ns]
 1   sensor_tag      category      
 2   value           float64       
 3   sensor_type     category      
 4   equipment_type  category      
dtypes: category(3), datetime64[ns](1), float64(1)
memory usage: 907.4 MB


### 5.3.3 Inspect basic statistics

In [19]:
# Summary statistics for every column of the cleaned frame (include="all" covers the
# non-numeric columns as well).
df_anon_high_res.describe(include="all")

  df_anon_high_res.describe(include="all")


Unnamed: 0,timestamp,sensor_tag,value,sensor_type,equipment_type
count,1318459,1318459,1318459.0,1318459,1318459
unique,300063,51,,10,4
top,2022-01-24 17:48:02,SENSOR_03,,FLOW,PUMP
freq,44,86327,,616221,523361
first,2022-01-24 00:00:04,,,,
last,2022-01-31 23:59:57,,,,
mean,,,1392.77,,
std,,,4411.07,,
min,,,-447.6077,,
25%,,,37.91175,,


In [20]:
# Summary statistics for every column of the interpolated frame (include="all" covers the
# non-numeric columns as well).
df_anon_high_res_interpolated.describe(include="all")

  df_anon_high_res_interpolated.describe(include="all")


Unnamed: 0,timestamp,sensor_tag,value,sensor_type,equipment_type
count,35239419,35239419,35239420.0,35239419,35239419
unique,690969,51,,10,4
top,2022-01-24 00:01:57,SENSOR_01,,FLOW,PUMP
freq,51,690969,,18656163,15892287
first,2022-01-24 00:01:57,,,,
last,2022-01-31 23:58:05,,,,
mean,,,536.1748,,
std,,,2709.614,,
min,,,-447.6077,,
25%,,,1.0,,


### 5.3.4 Inspect missing data

In [21]:
# Percentage of missing values in each column of the cleaned frame.
df_anon_high_res_percent_missing = (
    df_anon_high_res.isnull().sum().mul(100).div(len(df_anon_high_res))
)

# One-column table, indexed by column name.
df_anon_high_res_missing_data = df_anon_high_res_percent_missing.to_frame(name="PERCENT_MISSING")
df_anon_high_res_missing_data

Unnamed: 0,PERCENT_MISSING
timestamp,0.0
sensor_tag,0.0
value,0.0
sensor_type,0.0
equipment_type,0.0


In [22]:
# Percentage of missing values in each column of the interpolated frame.
df_anon_high_res_interpolated_percent_missing = (
    df_anon_high_res_interpolated.isnull().sum().mul(100).div(len(df_anon_high_res_interpolated))
)

# One-column table, indexed by column name.
df_anon_high_res_interpolated_missing_data = df_anon_high_res_interpolated_percent_missing.to_frame(
    name="PERCENT_MISSING"
)

df_anon_high_res_interpolated_missing_data

Unnamed: 0,PERCENT_MISSING
timestamp,0.0
sensor_tag,0.0
value,0.0
sensor_type,0.0
equipment_type,0.0


### 5.3.5 Inspect data overview

In [23]:
def display_overview(df, subset=None, start=None, end=None):
    """Display one row of summary statistics per sensor for a time window.

    Args:
        df: Long-format frame with "timestamp", "sensor_tag", "value", "sensor_type" and
            "equipment_type" columns.
        subset: Optional collection of sensor tags to include. ``None`` includes every tag
            present in ``df``.
        start: Inclusive lower bound on "timestamp". Defaults to the earliest timestamp in ``df``.
        end: Inclusive upper bound on "timestamp". Defaults to the latest timestamp in ``df``.

    Returns:
        None. The overview table is rendered with ``display``.

    Notes:
        TIMESTAMP_INTERVAL is the mean spacing between consecutive readings, i.e. the covered
        time span divided by the number of gaps (rows - 1). It is NaT when a sensor has fewer
        than two rows in the window.
    """
    tags = [tag for tag in df["sensor_tag"].unique() if subset is None or tag in subset]
    start = start or df["timestamp"].min()
    end = end or df["timestamp"].max()

    # The time-window mask is identical for every tag, so build it once.
    in_window = (df["timestamp"] >= start) & (df["timestamp"] <= end)

    # Collect plain records and build the frame once instead of concatenating inside the loop.
    records = []
    for tag in tags:
        rows = df[in_window & (df["sensor_tag"] == tag)]
        values = rows["value"]
        n_rows = len(rows)

        if n_rows > 1:
            # n_rows readings are separated by n_rows - 1 gaps.
            interval = (rows["timestamp"].max() - rows["timestamp"].min()) / (n_rows - 1)
        else:
            interval = pd.NaT

        records.append({
            "SENSOR_TAG": tag,
            "SENSOR_TYPE": rows["sensor_type"].iloc[0] if n_rows > 0 else np.nan,
            "EQUIPMENT_TYPE": rows["equipment_type"].iloc[0] if n_rows > 0 else np.nan,
            "VALUE_MEAN": values.mean(),
            "VALUE_STD": values.std(),
            "VALUE_MIN": values.min(),
            "VALUE_25": values.quantile(0.25),
            "VALUE_50": values.quantile(0.50),
            "VALUE_75": values.quantile(0.75),
            "VALUE_MAX": values.max(),
            "TIMESTAMP_INTERVAL": interval,
        })

    overview = pd.DataFrame(records)

    with pd.option_context("display.max_rows", 70, "display.max_columns", 20):
        display(overview)


def plot_overview(df, subset=None, start=None, end=None, kind="line", normalize=False):
    """Plot each sensor's values over time in its own panel, stacked with a shared x-axis.

    Args:
        df: Long-format frame with "timestamp", "sensor_tag", "value", "sensor_type" and
            "equipment_type" columns.
        subset: Optional collection of sensor tags to plot. ``None`` plots every tag in ``df``.
        start: Inclusive lower bound on "timestamp". Defaults to the earliest timestamp in ``df``.
        end: Inclusive upper bound on "timestamp". Defaults to the latest timestamp in ``df``.
        kind: "line" or "scatter".
        normalize: When true, each sensor's values are rescaled with ``MinMaxScaler`` before
            plotting.

    Raises:
        ValueError: If ``kind`` is neither "line" nor "scatter", or if no sensor tag is selected.
    """
    # Validate before any figure is created so a bad argument does not leave an empty figure.
    if kind not in ("line", "scatter"):
        raise ValueError("'kind' must be either 'line' or 'scatter'")

    tags = [tag for tag in df["sensor_tag"].unique() if subset is None or tag in subset]
    if not tags:
        raise ValueError("no sensor tags to plot")

    colors = plt.get_cmap("tab20").colors
    start = start or df["timestamp"].min()
    end = end or df["timestamp"].max()

    # The time-window mask is identical for every tag, so build it once.
    in_window = (df["timestamp"] >= start) & (df["timestamp"] <= end)

    # squeeze=False always yields a 2-D array of axes, so a single selected tag works as well.
    fig, axes = plt.subplots(
        len(tags),
        1,
        figsize=(24, len(tags)),
        sharex=True,
        sharey=False,
        squeeze=False,
    )
    axes = axes[:, 0]

    for idx, (tag, ax) in enumerate(zip(tags, axes)):
        rows = df[in_window & (df["sensor_tag"] == tag)]

        # Legend label "<number>_<sensor type>_<equipment type>", taken from the first row only.
        if len(rows) > 0:
            first_row = rows.iloc[0]
            label = f"{str(tag).split('_')[1]}_{first_row['sensor_type']}_{first_row['equipment_type']}"
        else:
            # No readings in the window: keep the panel, labelled with the bare tag.
            label = str(tag)

        if normalize and len(rows) > 0:
            scaler = MinMaxScaler()
            values = scaler.fit_transform(rows["value"].to_numpy().reshape(-1, 1))
        else:
            values = rows["value"]

        color = colors[idx % len(colors)]
        if kind == "line":
            ax.plot(rows["timestamp"], values, color=color, label=label, marker=".")
        else:
            ax.scatter(rows["timestamp"], values, color=color, label=label, marker=".")
        ax.legend(loc="upper right")

    fig.tight_layout()
    fig.subplots_adjust(wspace=0, hspace=0.07)
    # As before, margins are only set on the bottom panel (the x-axis is shared by all panels).
    axes[-1].margins(x=0, y=0)
    plt.show()


def plot_correlation(df, subset=None, start=None, end=None, freq=None):
    """Draw a heatmap of pairwise correlations between sensor value series.

    Args:
        df: Long-format frame with "timestamp", "sensor_tag", "value", "sensor_type" and
            "equipment_type" columns.
        subset: Optional collection of sensor tags to include. ``None`` includes every tag.
        start: Inclusive lower bound on "timestamp". Defaults to the earliest timestamp in ``df``.
        end: Inclusive upper bound on "timestamp". Defaults to the latest timestamp in ``df``.
        freq: Optional pandas frequency string. When given, values are averaged per bucket of
            that frequency before correlating; otherwise they are used per timestamp.

    Raises:
        ValueError: If the selection contains no rows.
    """
    tags = [tag for tag in df["sensor_tag"].unique() if subset is None or tag in subset]
    start = start or df["timestamp"].min()
    end = end or df["timestamp"].max()

    selected = df["sensor_tag"].isin(tags) & (df["timestamp"] >= start) & (df["timestamp"] <= end)
    rows = df.loc[selected, ["timestamp", "sensor_tag", "sensor_type", "equipment_type", "value"]]
    if rows.empty:
        raise ValueError("no data to correlate for the selected sensors and time window")

    # Column label "<number>_<sensor type>_<equipment type>", built with vectorised string
    # operations rather than a Python-level call per row.
    labels = (
        rows["sensor_tag"].astype(str).str.split("_").str[1]
        + "_"
        + rows["sensor_type"].astype(str)
        + "_"
        + rows["equipment_type"].astype(str)
    )

    # Wide layout: one row per timestamp, one column per labelled sensor.
    pivot = rows.assign(sensor_tag=labels).pivot_table(
        index="timestamp", columns="sensor_tag", values="value"
    )
    if freq:
        pivot = pivot.groupby(pd.Grouper(freq=freq)).mean()

    _, ax = plt.subplots(figsize=(24, 24))

    sns.heatmap(
        pivot.corr(),
        vmax=1.0,
        cmap=sns.diverging_palette(220, 10, as_cmap=True),
        cbar=False,
        square=True,
        annot=True,
        ax=ax,
        linewidths=0.1,
        linecolor="white",
    )

    plt.tight_layout()
    ax.set_xlabel(None)
    ax.set_ylabel(None)
    plt.show()

In [None]:
%%time
# Per-sensor summary statistics of the interpolated data for 2022-01-24 (all sensors).
display_overview(
    df_anon_high_res_interpolated,
    subset=None,
    start="2022-01-24 00:00:00",
    end="2022-01-24 23:59:59",
)

In [None]:
%%time
# Scatter panels for all sensors, un-normalised values, 10:30-11:30 on 2022-01-24.
plot_overview(
    df_anon_high_res_interpolated,
    subset=None,
    start="2022-01-24 10:30:00",
    end="2022-01-24 11:30:00",
    kind="scatter",
    normalize=False,
)

In [None]:
%%time
# Correlation heatmap across all sensors for 2022-01-24; freq=None means no time bucketing.
plot_correlation(
    df_anon_high_res_interpolated,
    subset=None,
    start="2022-01-24 00:00:00",
    end="2022-01-24 23:59:59",
    freq=None,
)

## 5.4 Modeling

### 5.4.1 Pivot data

In [24]:
%%time
# Wide layout (one column per sensor, one row per timestamp), then averaged per 10-minute bucket.
df_anon_high_res_pivot = (
    df_anon_high_res_interpolated
    .pivot_table(values="value", index="timestamp", columns="sensor_tag")
    .groupby(pd.Grouper(freq="10min"))
    .mean()
)

CPU times: user 15.4 s, sys: 2.83 s, total: 18.2 s
Wall time: 18.2 s


### 5.4.2 Normalize data

In [25]:
%%time
# Fit a min-max scaler on the pivoted data and wrap the scaled array back into a frame that
# keeps the original row index and sensor columns.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_anon_high_res_pivot)
df_anon_high_res_normal = pd.DataFrame(
    scaled_values,
    index=df_anon_high_res_pivot.index,
    columns=df_anon_high_res_pivot.columns,
)

CPU times: user 3.97 ms, sys: 16 µs, total: 3.98 ms
Wall time: 3 ms


### 5.4.3 Segment data

In [None]:
def segment_data(df):
    """Placeholder for the segmentation step; not implemented yet, so it returns None."""
    pass

### 5.4.4 Label data

### 5.4.5 Classify data

# 99. Scratch