In [1]:
import datasets
import pyarrow.compute as pc
import numpy as np

from uni2ts.common.env import env
from uni2ts.data.builder.lotsa_v1 import (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

  from .autonotebook import tqdm as notebook_tqdm


In this notebook, we will see how to calculate the dataset weighting in the [pre-training dataset config file](../cli/conf/pretrain/data/lotsa_v1_weighted.yaml). We will see how to automatically generate this file to avoid excessive manual labor.

In [2]:
# Flatten the dataset names of every LOTSA builder into a single list,
# keeping the builders (and the names within each builder) in order.
dataset_list = [
    dataset_name
    for builder in (
        Buildings900KDatasetBuilder,
        BuildingsBenchDatasetBuilder,
        CloudOpsTSFDatasetBuilder,
        CMIP6DatasetBuilder,
        ERA5DatasetBuilder,
        GluonTSDatasetBuilder,
        LargeSTDatasetBuilder,
        LibCityDatasetBuilder,
        OthersLOTSADatasetBuilder,
        ProEnFoDatasetBuilder,
        SubseasonalDatasetBuilder,
    )
    for dataset_name in builder.dataset_list
]

1. Obtain the per-dataset distance values (used below to derive the weights) for all available datasets.

In [3]:
from pathlib import Path
import json

# Maps dataset name -> distance value, merged from every JSON file found below.
total_distances = {}

# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
WEIGHTS_PATH = "/data/Blob_EastUS/v-zhenwzhang/tsfm_datasets/lotsa_weights"
path = Path(WEIGHTS_PATH)

# Visit every sub-folder of WEIGHTS_PATH and merge the content of its .json files.
# Entries are sorted because Path.iterdir() yields them in arbitrary order and
# `|=` lets the last file win when two files share a key; sorting makes both the
# merge result and the key order reproducible across runs and machines.
for folder in sorted(path.iterdir()):
    if not folder.is_dir():
        continue
    for file in sorted(folder.iterdir()):
        if file.suffix != ".json":
            continue
        # JSON is read as UTF-8 explicitly instead of the locale default.
        with open(file, "r", encoding="utf-8") as f:
            distance = json.load(f)
        total_distances |= distance

print(total_distances)

{'sceaux': 0.13499212265014648, 'borealis': 0.20265986025333405, 'ideal': 0.2898980975151062, 'bdg-2_panther': 0.20341044664382935, 'bdg-2_fox': 0.20915815234184265, 'bdg-2_rat': 0.15185955166816711, 'bdg-2_bear': 0.2121816724538803, 'smart': 0.16377359628677368, 'lcl': 0.2011713683605194, 'cmip6_1850': 0.2656000256538391, 'cmip6_1855': 0.26432132720947266, 'cmip6_1860': 0.262317031621933, 'cmip6_1865': 0.2660820782184601, 'cmip6_1870': 0.26280030608177185, 'cmip6_1875': 0.2659946382045746, 'cmip6_1880': 0.26181861758232117, 'cmip6_1885': 0.26711806654930115, 'cmip6_1890': 0.2643161714076996, 'cmip6_1895': 0.26815423369407654, 'cmip6_1900': 0.2622115910053253, 'cmip6_1905': 0.2649420499801636, 'cmip6_1910': 0.25890663266181946, 'cmip6_1915': 0.2665177881717682, 'cmip6_1920': 0.25985220074653625, 'cmip6_1925': 0.264789879322052, 'cmip6_1930': 0.2657538950443268, 'cmip6_1935': 0.259886771440506, 'cmip6_1940': 0.26843950152397156, 'cmip6_1945': 0.26085537672042847, 'cmip6_1950': 0.2619962

In [4]:
# Every dataset listed by the builders needs a distance entry; report the ones
# that have none and give them a fixed placeholder distance.
# NOTE(review): 0.4 is above the distances visible in the printed output, so
# missing datasets presumably end up with a lower weight -- confirm the intent.
for dataset in dataset_list:
    if dataset in total_distances:
        continue
    print(f"Dataset {dataset} not found in total_distances")
    total_distances[dataset] = 0.4

Dataset buildings_900k not found in total_distances


In [5]:
def get_dataset_group(name):
    """Return the group a dataset name belongs to.

    Names starting with "era5", "cmip6" or "largest" map to that prefix;
    any other name is its own group and is returned unchanged.
    """
    for prefix in ("era5", "cmip6", "largest"):
        if name.startswith(prefix):
            return prefix
    return name

# Bucket the per-dataset distances by group, then collapse each bucket to its mean.
# NOTE(review): the later cells visible in this notebook read total_distances,
# not grouped_distances -- confirm whether the grouped means were meant to be used.
grouped_distances = {}

for dataset, distance in total_distances.items():
    grouped_distances.setdefault(get_dataset_group(dataset), []).append(distance)

grouped_distances = {
    group_name: np.mean(group_values)
    for group_name, group_values in grouped_distances.items()
}


2. Some datasets have been split into smaller chunks for efficiency -- group them back together

In [6]:
# T scales the distance inside the exponential: a larger T makes the weight
# fall off faster as the distance grows.
T = 1.5

# Fix a dataset order and gather the distances in that order.
dataset_names = list(total_distances)
distances = np.array([total_distances[name] for name in dataset_names])

# Map each distance d to exp(-T * d), so smaller distances give larger values.
inverse_distances = np.exp(-T * distances)

# Rescale by the mean, so the resulting weights average to 1
# (they do not sum to 1).
sampling_weights = inverse_distances / inverse_distances.mean()

# Pair every dataset name with its weight.
sampling_weight_map = dict(zip(dataset_names, sampling_weights))

print("Final sampling weights:")
print(sampling_weight_map)

Final sampling weights:
{'sceaux': 1.344134681738664, 'borealis': 1.2143984366350067, 'ideal': 1.0654438622586182, 'bdg-2_panther': 1.2130319396261324, 'bdg-2_fox': 1.2026186675592025, 'bdg-2_rat': 1.3105531541702484, 'bdg-2_bear': 1.1971768044810323, 'smart': 1.287340208888493, 'lcl': 1.2171128991903442, 'cmip6_1850': 1.1049925492665187, 'cmip6_1855': 1.1071140115296774, 'cmip6_1860': 1.110447495576657, 'cmip6_1865': 1.104193841327335, 'cmip6_1870': 1.1096428109041518, 'cmip6_1875': 1.1043386769126733, 'cmip6_1880': 1.1112779999223137, 'cmip6_1885': 1.1024792759727438, 'cmip6_1890': 1.1071225736533612, 'cmip6_1895': 1.1007670777047778, 'cmip6_1900': 1.1106231388692096, 'cmip6_1905': 1.1060836749540814, 'cmip6_1910': 1.116142653704061, 'cmip6_1915': 1.1034724147320039, 'cmip6_1920': 1.114560692551249, 'cmip6_1925': 1.10633617399131, 'cmip6_1930': 1.104737541900948, 'cmip6_1935': 1.114502897344858, 'cmip6_1940': 1.100296158312363, 'cmip6_1945': 1.1128848030142482, 'cmip6_1950': 1.110981

In [7]:
import yaml

# Parse the existing weighted pre-training config; its weight_map entries are
# looked up later as the "original" weights.
# NOTE(review): absolute, user-specific path. It is also the file this notebook
# says it generates, so writing the printed output back to it and re-running
# would compound the weights -- confirm the intended source file.
with open("/home/v-zhenwzhang/uni2ts/cli/conf/pretrain/data/lotsa_v1_weighted.yaml", "r") as config_file:
    data = yaml.safe_load(config_file)

def find_weight(data, target_name):
    """Recursively search a parsed config for the weight of ``target_name``.

    The structure is walked depth-first through nested dicts and lists. The
    first ``weight_map`` dict that contains ``target_name`` determines the
    result.

    Args:
        data: Parsed config (any nesting of dicts, lists and scalars).
        target_name: Dataset name to look up inside ``weight_map`` dicts.

    Returns:
        The value stored under ``target_name`` in the first matching
        ``weight_map``, or ``None`` when no ``weight_map`` contains it.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            # A weight_map that lists the target settles the search.
            if key == "weight_map" and isinstance(value, dict) and target_name in value:
                return value[target_name]
            # Otherwise keep searching inside the value.
            found = find_weight(value, target_name)
            # Compare against None explicitly: a weight of 0 / 0.0 is a valid
            # hit and must not be mistaken for "not found".
            if found is not None:
                return found
    elif isinstance(data, list):
        for item in data:
            found = find_weight(item, target_name)
            if found is not None:
                return found
    return None

In [8]:
# Scale each distance-based weight by the dataset's weight from the existing config.
# The result is rebuilt from the unscaled `sampling_weights` on every run, so
# re-executing this cell does not multiply the original weights in a second time.
scaled_weight_map = {}
for dataset, base_weight in zip(dataset_names, sampling_weights):
    ori_weight = find_weight(data, dataset)
    # Test against None explicitly: a configured weight of 0 is a real value
    # and must not be replaced by the default of 1.
    if ori_weight is not None:
        print(f"Dataset {dataset} original weight: {ori_weight}")
    else:
        print(f"Dataset {dataset} not found in original weight, set to 1")
        ori_weight = 1
    scaled_weight_map[dataset] = base_weight * ori_weight
sampling_weight_map = scaled_weight_map

Dataset sceaux original weight: 156.53911429030117
Dataset borealis original weight: 25.39199857374493
Dataset ideal original weight: 26.45917685053796
Dataset bdg-2_panther original weight: 40.069036647372776
Dataset bdg-2_fox original weight: 78.76137356782516
Dataset bdg-2_rat original weight: 77.24149753397461
Dataset bdg-2_bear original weight: 74.50802078734561
Dataset smart original weight: 87.55633398363929
Dataset lcl original weight: 61.224515184913
Dataset cmip6_1850 original weight: 0.014186655115280208
Dataset cmip6_1855 original weight: 0.014186655115280208
Dataset cmip6_1860 original weight: 0.014186655115280208
Dataset cmip6_1865 original weight: 0.014186655115280208
Dataset cmip6_1870 original weight: 0.014186655115280208
Dataset cmip6_1875 original weight: 0.014186655115280208
Dataset cmip6_1880 original weight: 0.014186655115280208
Dataset cmip6_1885 original weight: 0.014186655115280208
Dataset cmip6_1890 original weight: 0.014186655115280208
Dataset cmip6_1895 orig

In [9]:
# Display the per-dataset weights after scaling by the original config weights.
sampling_weight_map

{'sceaux': 179.2751784951192,
 'borealis': 27.922965603690855,
 'ideal': 27.61269109116093,
 'bdg-2_panther': 44.04310939822583,
 'bdg-2_fox': 86.27493372652975,
 'bdg-2_rat': 87.56946929135444,
 'bdg-2_bear': 81.46789989819497,
 'smart': 98.55645626883774,
 'lcl': 67.38727230613894,
 'cmip6_1850': 0.015022559311836416,
 'cmip6_1855': 0.015034089328228521,
 'cmip6_1860': 0.015052179859021584,
 'cmip6_1865': 0.015018214942184993,
 'cmip6_1870': 0.015047815891295944,
 'cmip6_1875': 0.015019002878607815,
 'cmip6_1880': 0.015056681862807697,
 'cmip6_1885': 0.01500888262567868,
 'cmip6_1890': 0.015034135835971105,
 'cmip6_1895': 0.014999554498994738,
 'cmip6_1900': 0.015053132155819868,
 'cmip6_1905': 0.01502849116981497,
 'cmip6_1910': 0.01508301175610769,
 'cmip6_1915': 0.015014289303900516,
 'cmip6_1920': 0.015074456974352947,
 'cmip6_1925': 0.015029863369691624,
 'cmip6_1930': 0.015021172468628288,
 'cmip6_1935': 0.015074144296932507,
 'cmip6_1940': 0.014996987384478766,
 'cmip6_1945': 

4. Finally, we can generate the YAML file required for the pre-training dataset with the appropriate `weight_map`.

In [9]:
# Print one YAML list entry per builder: target class, dataset list reference,
# the weight of every dataset of that builder, and the sampling type.
for builder_cls in (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
):
    # Entry header.
    print(
        f"- _target_: uni2ts.data.builder.lotsa_v1.{builder_cls.__name__}",
        "  datasets: ${cls_getattr:${._target_},dataset_list}",
        "  weight_map:",
        sep="\n",
    )
    # One "name: weight" line per dataset of this builder.
    for dataset in builder_cls.dataset_list:
        print(f"    {dataset}: {sampling_weight_map[dataset]}")
    # Entry footer.
    print(
        "  sample_time_series:",
        "    _target_: uni2ts.data.dataset.SampleTimeSeriesType",
        '    _args_: ["proportional"]',
        sep="\n",
    )

- _target_: uni2ts.data.builder.lotsa_v1.Buildings900KDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    buildings_900k: 0.06367618653303218
  sample_time_series:
    _target_: uni2ts.data.dataset.SampleTimeSeriesType
    _args_: ["proportional"]
- _target_: uni2ts.data.builder.lotsa_v1.BuildingsBenchDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    sceaux: 210.4096525662463
    borealis: 30.836003370994163
    ideal: 28.19076757582099
    bdg-2_panther: 48.60502124331318
    bdg-2_fox: 94.71989813527048
    bdg-2_rat: 101.22908822598389
    bdg-2_bear: 89.19927423440075
    smart: 112.71478928000887
    lcl: 74.51714717823272
  sample_time_series:
    _target_: uni2ts.data.dataset.SampleTimeSeriesType
    _args_: ["proportional"]
- _target_: uni2ts.data.builder.lotsa_v1.CloudOpsTSFDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:
    azure_vm_traces_2017: 0.6803494333788178
    borg_