In [1]:
import joblib
import sys
import torch
import math
import numpy as np
# from tqdm.notebook import tqdm
from tqdm import tqdm
import os
import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe

import warnings
warnings.filterwarnings("ignore")

### Prepare the UEA classification datasets

In [6]:
# UEA multivariate classification datasets to convert.
# 'InsectWingbeat' is deliberately left out (kept below as a comment).
dataset_names=[
    'ArticularyWordRecognition', 
    'AtrialFibrillation', 
    'BasicMotions', 
    'CharacterTrajectories', 
    'Cricket', 
    'DuckDuckGeese', 
    'ERing', 
    'EigenWorms', 
    'Epilepsy', 
    'EthanolConcentration', 
    'FaceDetection', 
    'FingerMovements', 
    'HandMovementDirection', 
    'Handwriting', 
    'Heartbeat', 
    # 'InsectWingbeat',
    'JapaneseVowels', 
    'LSST', 
    'Libras', 
    'MotorImagery', 
    'NATOPS', 
    'PEMS-SF', 
    'PenDigits', 
    'PhonemeSpectra', 
    'RacketSports', 
    'SelfRegulationSCP1', 
    'SelfRegulationSCP2', 
    'SpokenArabicDigits', 
    'StandWalkJump', 
    'UWaveGestureLibrary'
]


# Change the dataset root to the path of your dataset.
# `~` is expanded here, once, because the paths are later used in plain
# f-strings: `os.makedirs` does not expand `~` and would otherwise create a
# literal "~" directory inside the current working directory.
dataset_root = os.path.expanduser('~/data/UEA_multivariate')
save_root = os.path.expanduser('~/data/VQShape/uea')

def interpolate_uts(x, new_len):
    """
    Resample a univariate time series to ``new_len`` points with linear interpolation.

    :param x: NumPy array holding the series; it is viewed as one flat sequence.
    :param new_len: Number of points in the resampled series.
    :return: torch tensor with every size-1 dimension squeezed out.
    """
    # The series is wrapped in two leading singleton dimensions before
    # interpolation and squeezed back afterwards.
    batched = torch.from_numpy(x).view(1, 1, -1)
    resampled = torch.nn.functional.interpolate(batched, size=new_len, mode='linear')
    return resampled.squeeze()

def dataframe_to_list(df):
    """
    Flatten a pandas DataFrame into a single list of its cells.

    Cells are collected row by row (all columns of the first row, then all
    columns of the second row, and so on).

    :param df: pandas DataFrame.
    :return: Flat list with one entry per DataFrame cell, in row-major order.
    """
    return [cell for _, row in df.iterrows() for cell in row.tolist()]

def load_single(dataset_root, dataset, split=None, seq_len=512, eps=1e-5):
    """
    Load one split of a UEA dataset as normalized, fixed-length univariate series.

    Every cell of the loaded nested DataFrame is treated as one univariate
    series: it is linearly resampled to ``seq_len`` points, cast to float32 and
    standardized as ``(x - mean) / sqrt(var + eps)`` using torch's default
    variance estimator.

    :param dataset_root: Directory containing one sub-directory per dataset.
    :param dataset: Dataset name; the file read is
        ``{dataset_root}/{dataset}/{dataset}_{split}.ts``.
    :param split: Split suffix of the ``.ts`` file (e.g. ``'TRAIN'`` or ``'TEST'``).
        When omitted, the notebook-level variable ``flag`` is used, which is
        what this function always did before the parameter existed.
    :param seq_len: Number of points each series is resampled to.
    :param eps: Constant added to the variance before the square root.
    :return: NumPy array with one row per univariate series.
    """
    if split is None:
        # Backward compatibility: older call sites rely on the loop variable
        # `flag` being defined at notebook level.
        split = flag

    df, _ = load_from_tsfile_to_dataframe(f"{dataset_root}/{dataset}/{dataset}_{split}.ts")
    series_cells = dataframe_to_list(df)
    del df  # the nested frame is no longer needed once flattened

    normalized = []
    for cell in series_cells:
        series = interpolate_uts(cell.values.flatten(), new_len=seq_len).float()
        normalized.append((series - series.mean()) / (series.var() + eps).sqrt())
    return torch.stack(normalized, dim=0).numpy()


# Load the UEA datasets and write each univariate TS into a csv file
for flag in ['TRAIN', 'TEST']:
    for dataset in dataset_names:
        print(dataset, end=": ")
        save_dir = f"{save_root}/{flag}/{dataset}"
        os.makedirs(save_dir, exist_ok=True)
        # Pass the split explicitly instead of relying on the global `flag`.
        data = load_single(dataset_root, dataset, split=flag)
        print(data.shape)

        # One headerless, single-column CSV per univariate series.
        for i, x in enumerate(tqdm(data)):
            df = pd.DataFrame(x)
            df.to_csv(f"{save_dir}/{i}.csv", index=False, header=False)

ArticularyWordRecognition
(2475, 512)
AtrialFibrillation
(30, 512)
BasicMotions
(240, 512)
CharacterTrajectories
(4266, 512)
Cricket
(648, 512)
DuckDuckGeese
(67250, 512)
ERing
(120, 512)
EigenWorms
(768, 512)
Epilepsy
(411, 512)
EthanolConcentration
(783, 512)
FaceDetection
(848160, 512)
FingerMovements
(8848, 512)
HandMovementDirection
(1600, 512)
Handwriting
(450, 512)
Heartbeat
(12444, 512)
JapaneseVowels
(3240, 512)
LSST
(14754, 512)
Libras
(360, 512)
MotorImagery
(17792, 512)
NATOPS
(4320, 512)
PEMS-SF
(257121, 512)
PenDigits
(14988, 512)
PhonemeSpectra
(36465, 512)
RacketSports
(906, 512)
SelfRegulationSCP1
(1608, 512)
SelfRegulationSCP2
(1400, 512)
SpokenArabicDigits
(85787, 512)
StandWalkJump
(48, 512)
UWaveGestureLibrary
(360, 512)
ArticularyWordRecognition
(2700, 512)
AtrialFibrillation
(30, 512)
BasicMotions
(240, 512)
CharacterTrajectories
(4308, 512)
Cricket
(432, 512)
DuckDuckGeese
(67250, 512)
ERing
(1080, 512)
EigenWorms
(786, 512)
Epilepsy
(414, 512)
EthanolConcentrat

### Prepare the Forecasting datasets

In [7]:
import pandas as pd
import glob

# List the available forecasting CSVs. `glob` returns matches in arbitrary
# order, so sort them to make the displayed listing reproducible.
sorted(glob.glob("../data/PILE/forecasting/autoformer/*"))

['../data/PILE/forecasting/autoformer/weather.csv',
 '../data/PILE/forecasting/autoformer/ETTm2.csv',
 '../data/PILE/forecasting/autoformer/ETTh1.csv',
 '../data/PILE/forecasting/autoformer/electricity.csv',
 '../data/PILE/forecasting/autoformer/national_illness.csv',
 '../data/PILE/forecasting/autoformer/traffic.csv',
 '../data/PILE/forecasting/autoformer/exchange_rate.csv',
 '../data/PILE/forecasting/autoformer/ETTh2.csv',
 '../data/PILE/forecasting/autoformer/ETTm1.csv']

In [58]:
from tqdm import tqdm
import os
import numpy as np

# Export sliding windows from the leading (1 - test_size) fraction of the
# 'traffic' dataset, one headerless single-column CSV per (column, start, length).
dataset = 'traffic'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'
test_size = 0.2  # fraction of trailing rows that is not exported
step_size = 48   # stride between consecutive window starts
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:int(data.shape[0]*(1-test_size)), :]

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Start indices depend on the window length: a window is only exported when
    # all `l` rows exist. Bounding the starts by `step_size` instead would let
    # slicing silently truncate the trailing windows while the file name still
    # claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


512: 100%|██████████| 862/862 [01:22<00:00, 10.49it/s]


In [63]:
from tqdm import tqdm
import os
import numpy as np

# Export sliding windows from the first 12*30*24 + 4*30*24 rows of 'ETTh2',
# one headerless single-column CSV per (column, start, length).
dataset = 'ETTh2'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'

step_size = 48   # stride between consecutive window starts
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:12 * 30 * 24 + 4 * 30 * 24, :]
print(data.shape)

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Start indices depend on the window length: a window is only exported when
    # all `l` rows exist. Bounding the starts by `step_size` instead would let
    # slicing silently truncate the trailing windows while the file name still
    # claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


(11520, 7)


512: 100%|██████████| 7/7 [00:00<00:00,  9.77it/s]


In [64]:
from tqdm import tqdm
import os
import numpy as np

# Export sliding windows of several lengths from the leading (1 - test_size)
# fraction of 'national_illness', one headerless single-column CSV per
# (column, start, length).
dataset = 'national_illness'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'
test_size = 0.2  # fraction of trailing rows that is not exported
step_size = 12   # stride between consecutive window starts
seq_lengths = [24, 36, 48, 60]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:int(data.shape[0]*(1-test_size)), :]

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Start indices are computed per window length so that only full windows
    # are exported. With one shared set of starts bounded by `step_size`, the
    # trailing windows are truncated by slicing, and the same truncated tail is
    # written again under each `_{l}` name.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


24: 100%|██████████| 7/7 [00:00<00:00, 85.36it/s]
36: 100%|██████████| 7/7 [00:00<00:00, 85.98it/s]
48: 100%|██████████| 7/7 [00:00<00:00, 85.31it/s]
60: 100%|██████████| 7/7 [00:00<00:00, 83.13it/s]


### Prepare the UCR classification datasets


In [11]:
from sktime.datasets import load_from_ucr_tsv_to_dataframe
import os
from tqdm import tqdm
import numpy as np


def df_to_feature(df, labels):
    """
    Convert a nested DataFrame and its labels into dense NumPy arrays.

    Each DataFrame cell must expose ``.values`` (e.g. a pandas Series); the
    cells of one row are stacked into one sample, and the samples are stacked
    along a new leading axis. This requires all cells to have the same length.

    :param df: Nested pandas DataFrame, one row per sample.
    :param labels: Sequence of class labels, one per row of ``df``.
    :return: Tuple ``(features, labels)`` where ``features`` stacks the samples
        and ``labels`` is a column array of integer category codes.
    """
    features = np.stack(
        [
            np.stack([cell.values.flatten() for cell in row.tolist()], axis=0)
            for _, row in df.iterrows()
        ],
        axis=0,
    )

    # Keep the integer dtype pandas selects for the codes. Forcing int8 cannot
    # represent codes above 127, so label sets with more than 128 classes
    # would be corrupted or rejected.
    codes = pd.Series(labels, dtype="category").cat.codes
    labels = codes.to_numpy().reshape(-1, 1)

    return features, labels


# Export the TEST split of every UCR dataset found under `root`, one headerless
# single-column CSV per sample.
root = "../data/timeseries_lib/UCR_2018"
# Keep only real dataset directories: hidden entries are skipped as before, and
# stray regular files are ignored instead of failing later when their
# "{d}_TEST.tsv" cannot be found.
datasets = sorted(
    d for d in os.listdir(root)
    if not d.startswith(".") and os.path.isdir(os.path.join(root, d))
)

count = 0  # total number of series written across all datasets

for d in tqdm(datasets):
    save_dir = f'../data/VQShape/ucr/TEST/{d}'
    os.makedirs(save_dir, exist_ok=True)

    features, labels = load_from_ucr_tsv_to_dataframe(f"{root}/{d}/{d}_TEST.tsv")
    features, labels = df_to_feature(features, labels)

    for i in range(features.shape[0]):
        # Each sample is flattened into a single column before it is written.
        x = features[i].flatten()
        pd.DataFrame(x).to_csv(f"{save_dir}/{i}.csv", index=False, header=False)
        count += 1

count

100%|██████████| 128/128 [01:01<00:00,  2.09it/s]


130603