# convert to univariate distance config

- input dir: "/data/Blob_EastUS/v-zhenwzhang/tsfm_datasets/lotsa_weights"
- each subfolder corresponds to a sub-dataset group and contains the `.npy` distance files of its datasets (named `<dataset>_to_ETTh1_distances.npy`)

In [12]:
import numpy as np
import yaml

from uni2ts.data.builder.lotsa_v1 import (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

In [2]:
# Collect the dataset_list of every LOTSA v1 builder, concatenated in the
# order the builders are listed here.
_lotsa_builders = (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

# Start from the first builder and append the rest with `+`, so a new object
# is built and no builder's own dataset_list is mutated.
dataset_list = _lotsa_builders[0].dataset_list
for _builder in _lotsa_builders[1:]:
    dataset_list = dataset_list + _builder.dataset_list

In [6]:
import os

# Root directory that is searched recursively for distance files.
path = "/data/Blob_EastUS/v-zhenwzhang/tsfm_datasets/lotsa_weights"
# File-name suffix identifying a "<dataset> to ETTh1" distance file.
DISTANCE_SUFFIX = "_to_ETTh1_distances.npy"


def find_distance_files(root_dir, suffix=DISTANCE_SUFFIX):
    """Locate distance files below ``root_dir`` and key them by dataset name.

    The dataset name is the file name with the trailing ``suffix`` removed.
    Directories and files are visited in sorted order so the result (and
    everything printed from it) is reproducible across filesystems.

    Args:
        root_dir: Directory to search recursively. A missing directory
            simply yields an empty result (``os.walk`` behaviour).
        suffix: File-name ending that marks a distance file.

    Returns:
        dict mapping dataset name -> full path of its distance file, in
        traversal order.

    Raises:
        ValueError: If two files resolve to the same dataset name, since one
            would otherwise silently replace the other downstream.
    """
    found = {}
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place steers os.walk's descent order (top-down walk).
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(suffix):
                continue
            # Strip only the trailing suffix; splitting on the suffix text
            # would cut at its first occurrence inside the name.
            dataset_name = file_name[: len(file_name) - len(suffix)]
            file_path = os.path.join(root, file_name)
            if dataset_name in found:
                raise ValueError(
                    f"Duplicate distance file for dataset {dataset_name!r}: "
                    f"{found[dataset_name]} and {file_path}"
                )
            found[dataset_name] = file_path
    return found


distance_files = find_distance_files(path)
# Parallel lists kept for the cells below that rely on these names.
dataset_names = list(distance_files)
npy_files = list(distance_files.values())

print(len(npy_files))
print(dataset_names)

# Load each file and average over axis 1, leaving one value per row of the
# stored array (e.g. a 1-D array per dataset when the file holds a 2-D array).
npy_contents = {
    dataset_name: np.load(npy_file).mean(axis=1)
    for dataset_name, npy_file in distance_files.items()
}

169
['bdg-2_bear', 'bdg-2_fox', 'bdg-2_panther', 'bdg-2_rat', 'borealis', 'ideal', 'lcl', 'sceaux', 'smart', 'cmip6_1850', 'cmip6_1855', 'cmip6_1860', 'cmip6_1865', 'cmip6_1870', 'cmip6_1875', 'cmip6_1880', 'cmip6_1885', 'cmip6_1890', 'cmip6_1895', 'cmip6_1900', 'cmip6_1905', 'cmip6_1910', 'cmip6_1915', 'cmip6_1920', 'cmip6_1925', 'cmip6_1930', 'cmip6_1935', 'cmip6_1940', 'cmip6_1945', 'cmip6_1950', 'cmip6_1955', 'cmip6_1960', 'cmip6_1965', 'cmip6_1970', 'cmip6_1975', 'cmip6_1980', 'cmip6_1985', 'cmip6_1990', 'cmip6_1995', 'cmip6_2000', 'cmip6_2005', 'cmip6_2010', 'alibaba_cluster_trace_2018', 'azure_vm_traces_2017', 'borg_cluster_data_2011', 'era5_1989', 'era5_1990', 'era5_1991', 'era5_1992', 'era5_1993', 'era5_1994', 'era5_1995', 'era5_1996', 'era5_1997', 'era5_1998', 'era5_1999', 'era5_2000', 'era5_2001', 'era5_2002', 'era5_2003', 'era5_2004', 'era5_2005', 'era5_2006', 'era5_2007', 'era5_2008', 'era5_2009', 'era5_2010', 'era5_2011', 'era5_2012', 'era5_2013', 'era5_2014', 'era5_2015'

In [13]:
# Reshape npy_contents into a nested mapping ready for YAML export.
# (npy_contents['bdg-2_bear'] had shape (91,) when this was checked.)
#
# Target layout, with positions numbered from 1:
# bdg-2_bear:
#  1: distance_1
#  2: distance_2
#  3: .....

formatted_npy_contents = {}
for dataset, distances in npy_contents.items():
    # Cast each entry to a built-in float and number the entries from 1.
    numbered = enumerate(map(float, distances), start=1)
    formatted_npy_contents[dataset] = dict(numbered)

# Write the mapping out as YAML
with open('distances_to_etth1.yaml', 'w') as yaml_file:
    yaml.dump(formatted_npy_contents, yaml_file)

(91,)


In [14]:
# Pool the distances of every dataset into a single array and average it.
# Because the arrays are concatenated first, datasets with more entries
# contribute proportionally more to the mean.
all_distances = np.concatenate([values for values in npy_contents.values()])
mean_distance = all_distances.mean()

print("Mean of all distances:", mean_distance)

Mean of all distances: 0.41921082


In [13]:
# Names that appear both among the discovered files (dataset_names) and in
# the builders' combined dataset_list.
common_elements = set(dataset_names) & set(dataset_list)

# Report how many names overlap
print(len(common_elements))

169
