In [1]:
import json
import os
from pathlib import Path

import datasets
import numpy as np
import pyarrow.compute as pc

from uni2ts.common.env import env
from uni2ts.data.builder.lotsa_v1 import (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

In this notebook, we will see how to calculate the dataset weighting in the [pre-training dataset config file](../cli/conf/pretrain/data/lotsa_v1_weighted.yaml). We will see how to automatically generate this file to avoid excessive manual labor.

In [2]:
# Builders whose `dataset_list` attributes are concatenated, in this order, into
# the full list of dataset names used by the rest of the notebook.
_builder_classes = (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

# Start from the first builder's list and add the others left to right.
# Rebinding with `+` (not `+=`) mirrors a single chained `+` expression and
# leaves each builder's own `dataset_list` attribute untouched.
dataset_list = _builder_classes[0].dataset_list
for _builder_cls in _builder_classes[1:]:
    dataset_list = dataset_list + _builder_cls.dataset_list

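1. Load the per-dataset distance values for all available datasets (stored as JSON files in the sub-folders of `WEIGHTS_PATH`) and merge them into a single dictionary.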

In [3]:
# Root directory whose sub-folders hold the JSON files to merge. The default is
# the original machine-specific location; set the LOTSA_WEIGHTS_PATH environment
# variable to point at a different copy without editing the notebook.
WEIGHTS_PATH = os.environ.get(
    "LOTSA_WEIGHTS_PATH",
    "/data/Blob_EastUS/v-zhenwzhang/tsfm_datasets/lotsa_weights",
)
path = Path(WEIGHTS_PATH)

# Merged {dataset name: distance} mapping built from every JSON file found.
total_distances = {}

# iterdir() yields entries in arbitrary order, so both levels are sorted: if two
# files define the same key, the file that sorts last wins on every run.
for folder in sorted(path.iterdir()):
    # Only sub-folders are scanned; files directly under the root are ignored.
    if not folder.is_dir():
        continue
    for file in sorted(folder.iterdir()):
        # Anything that is not a .json file is skipped.
        if file.suffix != ".json":
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with file.open("r", encoding="utf-8") as f:
            distance = json.load(f)
        # Dict union-update: entries loaded later overwrite earlier duplicates.
        total_distances |= distance

print(total_distances)

{'sceaux': 0.13499212265014648, 'borealis': 0.20265986025333405, 'ideal': 0.2898980975151062, 'bdg-2_panther': 0.20341044664382935, 'bdg-2_fox': 0.20915815234184265, 'bdg-2_rat': 0.15185955166816711, 'bdg-2_bear': 0.2121816724538803, 'smart': 0.16377359628677368, 'lcl': 0.2011713683605194, 'cmip6_1850': 0.2656000256538391, 'cmip6_1855': 0.26432132720947266, 'cmip6_1860': 0.262317031621933, 'cmip6_1865': 0.2660820782184601, 'cmip6_1870': 0.26280030608177185, 'cmip6_1875': 0.2659946382045746, 'cmip6_1880': 0.26181861758232117, 'cmip6_1885': 0.26711806654930115, 'cmip6_1890': 0.2643161714076996, 'cmip6_1895': 0.26815423369407654, 'cmip6_1900': 0.2622115910053253, 'cmip6_1905': 0.2649420499801636, 'cmip6_1910': 0.25890663266181946, 'cmip6_1915': 0.2665177881717682, 'cmip6_1920': 0.25985220074653625, 'cmip6_1925': 0.264789879322052, 'cmip6_1930': 0.2657538950443268, 'cmip6_1935': 0.259886771440506, 'cmip6_1940': 0.26843950152397156, 'cmip6_1945': 0.26085537672042847, 'cmip6_1950': 0.2619962

In [4]:
# Distance assigned to any dataset that has no entry in the loaded JSON files.
FALLBACK_DISTANCE = 0.4

for dataset in dataset_list:
    # Datasets that already have a distance are left as they are.
    if dataset in total_distances:
        continue
    # Report the gap, then fill it so that every listed dataset has a distance
    # (and therefore a weight) in the cells that follow.
    print(f"Dataset {dataset} not found in total_distances")
    total_distances[dataset] = FALLBACK_DISTANCE

Dataset buildings_900k not found in total_distances


In [24]:
# NOTE: this whole cell is commented out and therefore has no effect when run.
# It sketches an optional grouping step: dataset names starting with "era5",
# "cmip6" or "largest" are mapped to a shared group name, and each group's
# distance is replaced by the mean of its members' distances.
# Because it is disabled, `grouped_distances` is never defined and the cells
# below work on `total_distances` directly.

# def get_dataset_group(name):
#     if name.startswith("era5"):
#         return "era5"
#     if name.startswith("cmip6"):
#         return "cmip6"
#     if name.startswith("largest"):
#         return "largest"
#     return name

# # total_distances dict group datasets, if group use mean
# grouped_distances = {}

# for dataset, distance in total_distances.items():
#     group = get_dataset_group(dataset)
#     if group not in grouped_distances:
#         grouped_distances[group] = []
#     grouped_distances[group].append(distance)

# for group, distances in grouped_distances.items():
#     grouped_distances[group] = np.mean(distances)


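2. Some datasets have been split into smaller chunks for efficiency. The commented-out cell above sketches how to group them back together (`era5*`, `cmip6*` and `largest*`, using the mean distance of each group); it is currently disabled, so every chunk keeps its own distance.

3. Convert the distances $d_i$ into sampling weights with $w_i = \exp(-T d_i) / \sum_j \exp(-T d_j)$, so that datasets with a smaller distance receive a larger weight and all weights sum to 1.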

In [5]:
# T scales the distances inside the exponential: a larger T makes the weights
# fall off more sharply as the distance grows.
T = 2.4

# Take keys and values from the same dict so names and distances stay aligned
# position by position.
dataset_names = list(total_distances)
distances = np.array(list(total_distances.values()))

# Exponential decay (not a literal reciprocal): a smaller distance gives a
# larger unnormalised weight.
inverse_distances = np.exp(-T * distances)

# Divide by the total so that the weights sum to 1.
sampling_weights = inverse_distances / np.sum(inverse_distances)

# Dataset name -> sampling weight, in the same order as `total_distances`.
sampling_weight_map = dict(zip(dataset_names, sampling_weights))

print("Final sampling weights:", sampling_weight_map, sep="\n")

Final sampling weights:
{'sceaux': 0.008956260397402526, 'borealis': 0.007613707211513147, 'ideal': 0.0061754294942996495, 'bdg-2_panther': 0.007600004169564883, 'bdg-2_fox': 0.007495885741649626, 'bdg-2_rat': 0.008600935220017768, 'bdg-2_bear': 0.00744168910987803, 'smart': 0.008358485386140752, 'lcl': 0.007640954911612925, 'cmip6_1850': 0.006546259424534958, 'cmip6_1855': 0.006566379942616081, 'cmip6_1860': 0.00659804235378829, 'cmip6_1865': 0.006538690265112224, 'cmip6_1870': 0.006590393993303604, 'cmip6_1875': 0.006540062592704978, 'cmip6_1880': 0.006605939612835756, 'cmip6_1885': 0.00652245284326, 'cmip6_1890': 0.0065664611950068275, 'cmip6_1895': 0.0065062529513486055, 'cmip6_1900': 0.006599712249039168, 'cmip6_1905': 0.006556605061689941, 'cmip6_1910': 0.00665226866736756, 'cmip6_1915': 0.006531856305052009, 'cmip6_1920': 0.00663718936898028, 'cmip6_1925': 0.006559000033974049, 'cmip6_1930': 0.006543842425370746, 'cmip6_1935': 0.0066366387064427535, 'cmip6_1940': 0.0065018000166

In [6]:
# Show the mapping as the cell's output value (rendered one entry per line).
sampling_weight_map

{'sceaux': 0.008956260397402526,
 'borealis': 0.007613707211513147,
 'ideal': 0.0061754294942996495,
 'bdg-2_panther': 0.007600004169564883,
 'bdg-2_fox': 0.007495885741649626,
 'bdg-2_rat': 0.008600935220017768,
 'bdg-2_bear': 0.00744168910987803,
 'smart': 0.008358485386140752,
 'lcl': 0.007640954911612925,
 'cmip6_1850': 0.006546259424534958,
 'cmip6_1855': 0.006566379942616081,
 'cmip6_1860': 0.00659804235378829,
 'cmip6_1865': 0.006538690265112224,
 'cmip6_1870': 0.006590393993303604,
 'cmip6_1875': 0.006540062592704978,
 'cmip6_1880': 0.006605939612835756,
 'cmip6_1885': 0.00652245284326,
 'cmip6_1890': 0.0065664611950068275,
 'cmip6_1895': 0.0065062529513486055,
 'cmip6_1900': 0.006599712249039168,
 'cmip6_1905': 0.006556605061689941,
 'cmip6_1910': 0.00665226866736756,
 'cmip6_1915': 0.006531856305052009,
 'cmip6_1920': 0.00663718936898028,
 'cmip6_1925': 0.006559000033974049,
 'cmip6_1930': 0.006543842425370746,
 'cmip6_1935': 0.0066366387064427535,
 'cmip6_1940': 0.0065018000

4. Finally, we can generate the YAML file required for the pre-training dataset with the appropriate `weight_map`.

In [7]:
# Print one YAML list item per builder class: its import target, the dataset
# list reference, a weight for each of its datasets, and the sampling type.
# The text is only written to stdout; copy it into the config file as needed.
_yaml_builders = (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

for builder_cls in _yaml_builders:
    # Fixed header lines of the entry.
    print(
        f"- _target_: uni2ts.data.builder.lotsa_v1.{builder_cls.__name__}",
        "  datasets: ${cls_getattr:${._target_},dataset_list}",
        "  weight_map:",
        sep="\n",
    )
    # One `name: weight` line per dataset owned by this builder.
    for dataset in builder_cls.dataset_list:
        print(f"    {dataset}: {sampling_weight_map[dataset]}")
    # Fixed trailer lines of the entry.
    print(
        "  sample_time_series:",
        "    _target_: uni2ts.data.dataset.SampleTimeSeriesType",
        '    _args_: ["proportional"]',
        sep="\n",
    )

- _target_: uni2ts.data.builder.lotsa_v1.Buildings900KDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    buildings_900k: 0.004741406719124674
  sample_time_series:
    _target_: uni2ts.data.dataset.SampleTimeSeriesType
    _args_: ["proportional"]
- _target_: uni2ts.data.builder.lotsa_v1.BuildingsBenchDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    sceaux: 0.008956260397402526
    borealis: 0.007613707211513147
    ideal: 0.0061754294942996495
    bdg-2_panther: 0.007600004169564883
    bdg-2_fox: 0.007495885741649626
    bdg-2_rat: 0.008600935220017768
    bdg-2_bear: 0.00744168910987803
    smart: 0.008358485386140752
    lcl: 0.007640954911612925
  sample_time_series:
    _target_: uni2ts.data.dataset.SampleTimeSeriesType
    _args_: ["proportional"]
- _target_: uni2ts.data.builder.lotsa_v1.CloudOpsTSFDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    azure_vm_traces_2017: 0.0