In [56]:
import numpy as np
import matplotlib.pyplot as plt
import os
# from datetime import datetime
from torch.utils.data import Dataset, DataLoader
import torch
# from distutils.util import strtobool
import random

# from scipy.stats import norm
# import torch.optim as optim
# import torch.nn as nn
from tqdm import tqdm
import pandas as pd

# This is just until temporary implementation
import os 
import sys
cwd = os.getcwd()
sys.path.insert(0,cwd + '/../timetransformers')

from utils import convert_tsf_to_dataframe

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [102]:
if not os.path.exists("../data"):
    # If the directory does not exist, create it
    os.makedirs("../data")
    print(f"Directory data created.")
else:
    print(f"Directory data already exists.")


def download_single_datafile(dataset_name, dataset_id):
    os.system(f"zenodo_get {dataset_id}")
    os.system(f"mv {dataset_name}.zip ../data/{dataset_name}.zip")
    print(f"Downloaded {dataset_name}.tsf")

    # Unzip the dataset
    os.system(f"unzip -o ../data/{dataset_name}.zip -d ../data/")
    print(f"Unzipped {dataset_name}.tsf")

    # Remove the zip file
    os.system(f"rm ../data/{dataset_name}.zip")
    os.system(f"rm md5sums.txt")

    # Convert the tsf file to a pandas dataframe
    return convert_tsf_to_dataframe(f"../data/{dataset_name}.tsf")[0]


def download_data(dataset_dict):
    df_list = []
    for dataset_name, dataset_id in dataset_dict.items():
        df_list.append(download_single_datafile(dataset_name, dataset_id))

    return df_list


# List of datasets and zenondo IDs
datasets_to_load = {
    # "oikolab_weather_dataset": "10.5281/zenodo.5184708",
    "covid_deaths_dataset": "10.5281/zenodo.4656009",
    "us_births_dataset": "10.5281/zenodo.4656049",
    # "solar_4_seconds_dataset": "10.5281/zenodo.4656027",
    # "wind_4_seconds_dataset": "10.5281/zenodo.4656032",
    # "weather_dataset": "10.5281/zenodo.4654822",
    # "hospital_dataset": "10.5281/zenodo.4656014",
    # "electricity_hourly_dataset": "10.5281/zenodo.4656140",
    # "traffic_hourly_dataset": "10.5281/zenodo.4656132",
    # "rideshare_dataset_without_missing_values": "10.5281/zenodo.5122232",
    # "bitcoin_dataset_without_missing_values": "10.5281/zenodo.5122101",
    # "australian_electricity_demand_dataset": "10.5281/zenodo.4659727",
    # "sunspot_dataset_without_missing_values": "10.5281/zenodo.4654722",
    # "london_smart_meters_dataset_with_missing_values": "10.5281/zenodo.4656072",
}

# "covid_deaths_dataset": "10.5281/zenodo.4656009",
# "m4_monthly_dataset": "10.5281/zenodo.4656480",

dfs = download_data(datasets_to_load)

Directory data already exists.


Title: COVID-19 Deaths Dataset
Keywords: covid19, forecasting, daily series
Publication date: 2020-08-21
DOI: 10.5281/zenodo.4656009
Total size: 0.0 MB

Link: https://zenodo.org/record/4656009/files/covid_deaths_dataset.zip   size: 0.0 MB

Checksum is correct. (65c233f20dba0b1e63d1fac2448b7c08)
All files have been downloaded.


Downloaded covid_deaths_dataset.tsf
Archive:  ../data/covid_deaths_dataset.zip
  inflating: ../data/covid_deaths_dataset.tsf  
Unzipped covid_deaths_dataset.tsf


Title: US Births Dataset
Keywords: births, forecasting, daily series
Publication date: 2020-06-22
DOI: 10.5281/zenodo.4656049
Total size: 0.0 MB

Link: https://zenodo.org/record/4656049/files/us_births_dataset.zip   size: 0.0 MB


Downloaded us_births_dataset.tsf
Archive:  ../data/us_births_dataset.zip
  inflating: ../data/us_births_dataset.tsf  
Unzipped us_births_dataset.tsf



Checksum is correct. (6680000cb82f7db6ced6b480bde89410)
All files have been downloaded.


In [108]:
# def convert_df_to_numpy(df):
#     # Filter the DataFrame
#     # data_list = list(df["type"].unique())
#     # filtered_df = df[df["type"].isin(data_list)]

#     # Select the 'series_value' column from the filtered DataFrame
#     selected_series_values = df["series_value"]

#     # T_means_ = df[df["obs_or_fcst"] == ("T_MEAN", "PRCP_SUM")]["series_value"]
#     selected_series_values = selected_series_values.to_numpy()


#     def fill_nans(array):
#         array = pd.Series(array)
#         array.ffill(inplace=True)  # Forward fill
#         array.bfill(inplace=True)
#         return array.to_numpy()


#     N_data = selected_series_values.shape[0]
#     training_data = selected_series_values[0].astype(float)

#     for i in range(1, N_data):
#         new_data = fill_nans(selected_series_values[i].to_numpy().astype(float))
#         training_data = np.vstack((training_data, new_data))

#     return training_data

# training_data_list = []

# for df in dfs:
#     training_data_list.append(convert_df_to_numpy(df))


def convert_df_to_numpy(dfs):
    training_data = []

    for df in dfs:
        # Select the 'series_value' column from the filtered DataFrame
        selected_series_values = df["series_value"]

        # T_means_ = df[df["obs_or_fcst"] == ("T_MEAN", "PRCP_SUM")]["series_value"]
        selected_series_values = selected_series_values.to_numpy()

        def fill_nans(array):
            array = pd.Series(array)
            array.ffill(inplace=True)  # Forward fill
            array.bfill(inplace=True)
            return array.to_numpy()

        N_data = selected_series_values.shape[0]

        for i in range(N_data):
            new_data = fill_nans(selected_series_values[i].to_numpy().astype(float))
            training_data.append(new_data)

    return training_data


training_data_list = convert_df_to_numpy(dfs)
len(training_data_list)

267

In [118]:
def normalize_data(data, mean, std):
    if mean == 0 and std == 0:
        return data
    else:
        return (data - mean) / std


class TimeSeriesDataset(Dataset):
    def __init__(self, data, max_sequence_length):
        self.max_sequence_length = max_sequence_length
        self.means = np.array([np.mean(data[i]) for i in range(len(data))])
        self.std = np.array([np.std(data[i]) for i in range(len(data))])
        self.data = [
            normalize_data(data[i], self.means[i], self.std[i])
            for i in range(len(data))
        ]
        self.probs = (
            np.array([len(self.data[i]) for i in range(len(self.data))])
            / self.__len__()
        )

    def __len__(self):
        l = 0
        for i in range(len(self.data)):
            l += len(self.data[i])
        return l

    def __getitem__(self, idx):
        # I will just randomly select one of the time series
        # and then randomly select a subsequence of length max_sequence_length
        new_idx = np.random.choice(a=len(self.probs), p=self.probs)
        series = self.data[new_idx]
        if len(series) > self.max_sequence_length:
            # Randomly select a starting point for the sequence
            start_index = random.randint(0, len(series) - self.max_sequence_length - 1)

            # Slice the series to get a random subsequence of length max_sequence_length
            train_series = torch.tensor(
                series[start_index : start_index + self.max_sequence_length],
                dtype=torch.float32,
            ).unsqueeze(-1)
            true_series = torch.tensor(
                series[start_index + 1 : start_index + self.max_sequence_length + 1],
                dtype=torch.float32,
            )
            mask = torch.ones_like(train_series, dtype=torch.bool)

            return (
                train_series,
                true_series,
                mask,
            )

        else:
            train_series = torch.tensor(
                series[: self.max_sequence_length - 1],
                dtype=torch.float32,
            )
            true_series = series[1 : 1 + self.max_sequence_length]

            mask = torch.ones(len(true_series) - 1, dtype=torch.bool)

            # Calculate the number of padding elements needed
            padding_length = self.max_sequence_length - len(train_series)

            # Create padding tensors
            train_series_padding = torch.zeros(padding_length)
            true_series_padding = torch.zeros(padding_length)
            mask_padding = torch.zeros(padding_length)

            # Concatenate the original tensors with their respective paddings
            # FIXME: This currently does not work.......
            train_series = torch.cat([train_series, train_series_padding])
            true_series = torch.cat([true_series, true_series_padding])
            mask = torch.cat([mask, mask_padding])

            return (
                train_series.unsqueeze(-1),
                true_series,
                mask,
            )


max_seq_length = 512
batch_size = 64

dataset = TimeSeriesDataset(training_data_list, max_seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


for batch in dataloader:
    train, true, mask = batch
    plt.plot(train[0])
    plt.plot(true[0])
    plt.plot(mask[0])
    plt.show()

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray