In [None]:
# setup darts
!pip install -r requirements.txt

In [None]:
# Download data from data source to local
import io
import os
import requests
import shutil
import zipfile

import numpy as np
import pandas as pd

In [None]:
# Local folder layout: one sub-folder per use case below the shared data root.
data_dir = "data"
fc_dir = os.path.join(data_dir, "forecasting")
anom_dir = os.path.join(data_dir, "anomaly_detection")
pdm_dir = os.path.join(data_dir, "pdm")
for dir_path in [data_dir, fc_dir, anom_dir, pdm_dir]:
    # `exist_ok=True` makes the cell safe to re-run and avoids the
    # check-then-create race of `os.path.exists` followed by `os.mkdir`.
    os.makedirs(dir_path, exist_ok=True)

## Data Download and Preprocessing

### Electricity Dataset

In [None]:
def download_weather_data():
    """Download the yearly weather csv files and reshape them into one wide dataframe.

    The csv files for the years 2015-2022 are read directly from the download URL
    and concatenated. Only the records of the station "Zch_Stampfenbachstrasse"
    are kept, and the long table is pivoted so that every measured parameter
    becomes one column named "<Parameter> [<Einheit>]".

    Returns
    -------
    pd.DataFrame
        One row per timestamp ("Datum") and one column per parameter. The index is
        converted to UTC and made tz-naive, and is restricted to the range
        2015-01-01 00:00 to 2022-08-31 00:00 (both inclusive).
    """
    # read the yearly csv files straight from the url and stack them
    base_url = "https://data.stadt-zuerich.ch/dataset/ugz_meteodaten_stundenmittelwerte/download/"
    filenames = [f"ugz_ogd_meteo_h1_{year}.csv" for year in range(2015, 2023)]
    df = pd.concat([pd.read_csv(base_url + fname) for fname in filenames])
    # retain only one weather station
    df = df.loc[df["Standort"] == "Zch_Stampfenbachstrasse"]
    # build the column labels; `assign` returns a new frame, so we never write
    # into the filtered slice (which would trigger a SettingWithCopyWarning)
    df = df.assign(param_name=df["Parameter"] + " [" + df["Einheit"] + "]")
    # pivot the df to get all measurements as columns
    df = df.pivot(index="Datum", columns="param_name", values="Wert")
    # convert the time index to UTC, then drop the time zone information
    df.index = pd.DatetimeIndex(pd.to_datetime(df.index, utc=True)).tz_localize(
        None
    )
    # extract the required time range
    in_range = (pd.Timestamp("2015-01-01") <= df.index) & (
        df.index <= pd.Timestamp("2022-08-31")
    )
    return df.loc[in_range]

In [None]:
def download_electricity_data():
    """Download the electricity csv, enrich it with weather features and store it.

    The raw csv is downloaded into `fc_dir` (unless it is already there), restricted
    to 2015-01-01 .. 2022-08-31, concatenated column-wise with the output of
    `download_weather_data()`, interpolated, rounded and written next to the raw
    file as "<name>_proc.csv". The raw csv is deleted afterwards. Nothing is done
    if the processed file already exists.

    Raises
    ------
    ValueError
        If the download request is not answered with status code 200.
    """
    # URL of the file
    energy_url = "https://data.stadt-zuerich.ch/dataset/ewz_stromabgabe_netzebenen_stadt_zuerich/download/ewz_stromabgabe_netzebenen_stadt_zuerich.csv"

    file_path = os.path.join(fc_dir, energy_url.split("/")[-1])
    # only touch the file extension, never other parts of the path
    path_root, path_ext = os.path.splitext(file_path)
    processed_path = path_root + "_proc" + path_ext

    if os.path.exists(processed_path):
        print("File already downloaded.")
        return

    if not os.path.exists(file_path):
        # Send a GET request to download the csv file; without a timeout the
        # call could block forever on an unresponsive server
        response = requests.get(energy_url, timeout=60)

        # Check if the request was successful
        if response.status_code != 200:
            raise ValueError("Failed to download.")
        # Save the csv file to the local drive
        with open(file_path, "wb") as file:
            file.write(response.content)
        print("File downloaded successfully.")

    print("Processing file..")
    df = pd.read_csv(file_path, index_col=0)
    # convert time index to UTC and drop the time zone information
    df.index = pd.DatetimeIndex(pd.to_datetime(df.index, utc=True)).tz_localize(None)
    # extract pre-determined period
    df = df.loc[(pd.Timestamp("2015-01-01") <= df.index) & (df.index <= pd.Timestamp("2022-08-31"))]
    # download and preprocess the weather information
    df_weather = download_weather_data()
    # add weather data as additional features
    df = pd.concat([df, df_weather], axis=1)
    # interpolate weather data
    df = df.interpolate()
    # raining duration is given in minutes -> we divide by 4 from hourly to quarter-hourly records
    df["RainDur [min]"] = df["RainDur [min]"] / 4

    # round Electricity cols to 4 decimals, other columns to 2 decimals
    cols_precise = ["Value_NE5", "Value_NE7"]
    df = df.round(decimals={col: (4 if col in cols_precise else 2) for col in df.columns})

    # export the dataset and remove the raw download
    df.index.name = "Timestamp"
    df.to_csv(processed_path)
    os.remove(file_path)
    print("File successfully processed.")

### Anomaly Detection - ECG

In [None]:
def download_ecg():
    """Download the ECG archive and keep only the "842.test.csv" record.

    The zip archive is downloaded into `anom_dir` (unless it is already there) and
    extracted. The file "multivariate/SVDB/842.test.csv" is then re-written as
    "842.test.csv" directly inside `anom_dir`, and both the extracted folder and
    the zip archive are removed. Nothing is done if the processed csv already
    exists.

    Raises
    ------
    ValueError
        If the download request is not answered with status code 200. Raising here
        (as `download_electricity_data` does) avoids a confusing follow-up error
        when trying to open a zip archive that was never written.
    """
    # URL of the zip file
    zip_url = "https://my.hidrive.com/api/sharelink/download?id=lmCmAjUP"

    file_path = os.path.join(anom_dir, "svdb.zip")
    unzipped_path = os.path.join(anom_dir, "multivariate")
    processed_path = os.path.join(anom_dir, "842.test.csv")
    if not os.path.exists(processed_path):
        if not os.path.exists(file_path):
            # Send a GET request to download the zip file; without a timeout the
            # call could block forever on an unresponsive server
            response = requests.get(zip_url, timeout=60)

            # Check if the request was successful
            if response.status_code == 200:
                # Save the zip file to the local drive
                with open(file_path, "wb") as file:
                    file.write(response.content)
                print("Zip file downloaded successfully.")
            else:
                raise ValueError("Failed to download.")
        else:
            print("Zip file already downloaded.")

        # Extract the zip file
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(anom_dir)
        # keep a single record and drop everything else that was extracted
        df = pd.read_csv(os.path.join(unzipped_path, "SVDB", "842.test.csv"))
        df.to_csv(processed_path, index=False)
        shutil.rmtree(unzipped_path, ignore_errors=True)
        os.remove(file_path)
        print("Zip file extracted successfully.")
    else:
        print("File already downloaded.")

### Predictive Maintenance - Wind Turbine Data

In [None]:
# Sampling interval of the wind turbine data as a pandas offset (10 minutes);
# used for resampling, timestamp rounding and the anomaly window length below.
FREQ = pd.tseries.frequencies.to_offset("10min")

def preprocess_data(df):
    """Preprocess the turbine signals.

    - drops duplicate time stamps per turbine (wrong times from daylight saving,
      no chance to identify correct times)
    - removes the time zone information and sorts by turbine and time
    - applies `compute_on_off_timer` to the signals of every turbine separately
      (this resamples to a contiguous 10 minute index, flags missing dates with
      the feature "missing_date" and computes the generator on/off timer)

    Parameters
    ----------
    df : pd.DataFrame
        Raw signals with at least the columns "Turbine_ID" and "Timestamp"
        (plus whatever `compute_on_off_timer` requires). It is not modified.

    Returns
    -------
    pd.DataFrame
        The processed signals of all turbines with "Timestamp" as a regular column.
    """
    # drop duplicate time stamps (wrong times from daylight saving, no chance to
    # identify correct times)
    df = df.drop_duplicates(["Turbine_ID", "Timestamp"], keep="first").reset_index(drop=True)
    # remove time zone information
    df["Timestamp"] = pd.DatetimeIndex(df["Timestamp"]).tz_localize(None)
    df = df.sort_values(by=["Turbine_ID", "Timestamp"])
    df = df.set_index("Timestamp")
    # Iterate over the groups explicitly instead of using `groupby(...).apply(...)`:
    # newer pandas versions no longer pass the grouping column to the applied
    # function, which would silently drop "Turbine_ID" from the result.
    per_turbine = [compute_on_off_timer(group) for _, group in df.groupby("Turbine_ID")]
    if not per_turbine:
        # no turbine data at all -> nothing to resample or concatenate
        return df.reset_index()
    return pd.concat(per_turbine).reset_index()


def compute_on_off_timer(df):
    """Add a signed counter of 10 minute steps since the generator crossed 1200 RPM.

    The counter tells a model for how long the turbine has been active/inactive,
    which should ultimately help with modelling the generator temperature.

    - counts upwards (1, 2, ...) while "Gen_RPM_Avg" is at or above 1200 RPM
    - counts downwards (-1, -2, ...) while "Gen_RPM_Avg" is below 1200 RPM
    - saturates at +/- 4 hours (4 * 6 (10 minutes steps) = 24)

    Before counting, the frame is forward-filled, put on a contiguous `FREQ` index
    and forward-filled again; rows that contained missing values after the
    re-indexing are flagged in the boolean column "missing_date".

    Returns a new frame with the additional columns "missing_date", "state_on"
    and "timer_on_off".
    """
    max_steps = 4 * 6

    # close gaps in the signals, then make the time index contiguous
    frame = df.ffill().asfreq(FREQ)
    frame["missing_date"] = frame.isna().any(axis=1)
    frame = frame.ffill()

    # proxy for the generator state: "on" if RPM >= 1200
    rpm = frame["Gen_RPM_Avg"]
    frame["state_on"] = rpm >= 1200
    frame["state_off"] = rpm < 1200

    # steps since the rotation speed last went above the threshold (capped)
    on_runs = (~frame["state_on"]).cumsum()
    frame["timer_on"] = frame.groupby(on_runs)["state_on"].cumsum().clip(upper=max_steps)

    # steps since the rotation speed last went below the threshold (negative, capped)
    off_runs = (~frame["state_off"]).cumsum()
    steps_off = frame.groupby(off_runs)["state_off"].cumsum()
    frame["timer_off"] = (-steps_off).clip(lower=-max_steps)

    # merge both counters into a single signed timer
    generator_on = frame["timer_on"] > 0.
    frame["timer_on_off"] = np.where(generator_on, frame["timer_on"], frame["timer_off"])

    # the helper columns are no longer needed
    return frame.drop(columns=["timer_on", "timer_off", "state_off"])


def preprocess_anomalies(df):
    """Preprocess the failure logbook into anomaly windows.

    - removes the time zone information from "Timestamp" and rounds it to `FREQ`
    - adds "start" (equal to the rounded timestamp) and "end", which lies
      6 * 24 - 1 steps of `FREQ` after "start"
    - sorts the records by "Turbine_ID" and "Timestamp"

    Parameters
    ----------
    df : pd.DataFrame
        Failure records with at least the columns "Turbine_ID" and "Timestamp".
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        A new, sorted frame with a fresh 0..n-1 index.
    """
    # work on a copy so that the caller's dataframe is left untouched
    df = df.copy()
    df["Timestamp"] = pd.DatetimeIndex(df["Timestamp"]).tz_localize(None).round(FREQ.freqstr)
    df["start"] = df["Timestamp"]
    df["end"] = df["start"] + (6 * 24 - 1) * FREQ
    df = df.sort_values(by=["Turbine_ID", "Timestamp"]).reset_index(drop=True)
    return df


def download_wind_turbine():
    """Download the wind turbine Excel files and convert them to preprocessed csv.

    Two files are handled: the SCADA signals (preprocessed with `preprocess_data`)
    and the failure logbook (preprocessed with `preprocess_anomalies`). For each
    of them the Excel file is downloaded into `pdm_dir` unless the Excel file or
    its csv counterpart already exists. Any Excel file found afterwards is read,
    preprocessed, written as csv next to it and then deleted.

    A failed download is only reported with a printed message; the function then
    continues with the next file.
    """
    # URL of the files
    url_data = "https://www.edp.com/sites/default/files/2023-04/Wind-Turbine-SCADA-signals-2016.xlsx"
    url_failures = "https://www.edp.com/sites/default/files/2023-04/Historical-Failure-Logbook-2016.xlsx"

    for url, preproc_fn in zip([url_data, url_failures], [preprocess_data, preprocess_anomalies]):
        file_path = os.path.join(pdm_dir, url.split("/")[-1])
        # swap only the file extension; a plain substring replace would also
        # rewrite any "xlsx" occurring elsewhere in the path
        csv_path = os.path.splitext(file_path)[0] + ".csv"
        if not (os.path.exists(file_path) or os.path.exists(csv_path)):
            # Send a GET request to download the Excel file; without a timeout
            # the call could block forever on an unresponsive server
            response = requests.get(url, timeout=60)

            # Check if the request was successful
            if response.status_code == 200:
                # Save the Excel file to the local drive
                with open(file_path, "wb") as file:
                    file.write(response.content)
                print("File downloaded successfully.")
            else:
                print("Failed to download.")
        else:
            print("File already downloaded.")

        # Convert the Excel file (if there is one) to a preprocessed csv
        if os.path.exists(file_path):
            print("Converting to csv..")
            df = pd.read_excel(file_path)
            df = preproc_fn(df)
            df.to_csv(csv_path, index=False)
            print("Successfully converted to csv.")
            os.remove(file_path)

In [None]:
# Run all three downloads in order, announcing each one before it starts.
for banner, fetch in (
    ("Downloading Electricity Data..", download_electricity_data),
    ("\nDownloading ECG Data..", download_ecg),
    ("\nDownloading Wind Turbine Data..", download_wind_turbine),
):
    print(banner)
    fetch()