In [3]:
import numpy as np
# Load the precomputed sensor-to-sensor distance matrix and show its shape
# followed by its contents (one item per output line).
dist = np.load(
    "/home/zgjgroup/wwd/Time-LLM/dataset/metr_la/metr_la_dist.npy"
)
print(dist.shape, dist, sep="\n")

(207, 207)
[[    0.      inf     inf ...     inf  8114.8 10009.7]
 [    inf     0.   2504.6 ...     inf     inf     inf]
 [    inf  1489.3     0.  ...     inf     inf  9837. ]
 ...
 [    inf     inf     inf ...     0.      inf     inf]
 [ 9599.8     inf     inf ...     inf     0.      inf]
 [10119.9  9374.8     inf ...     inf  9018.7     0. ]]


In [4]:
import os
import pandas as pd
# Directory of each dataset, keyed by the dataset's short name.
datasets_path = dict(la="/home/zgjgroup/wwd/Time-LLM/dataset/metr_la")
datasets_path

{'la': '/home/zgjgroup/wwd/Time-LLM/dataset/metr_la'}

In [None]:
class MetrLA(PandasDataset):
    """METR-LA dataset.

    Reads the sensor readings from ``metr_la.h5`` and the pairwise sensor
    distance matrix from the directory ``datasets_path['la']``.

    Original author's notes: 207 (sensors); the raw data contains zeros but
    no NaN.

    Attributes:
        dist: square array of pairwise sensor distances as returned by
            :meth:`load_distance_matrix`.
    """

    def __init__(self, impute_zeros=False, freq='5T'):
        """Load the data and hand it to ``PandasDataset``.

        Args:
            impute_zeros: forwarded to :meth:`load`. When true, zero readings
                are forward-filled and a validity mask is built; when false,
                no mask is passed to the base class (``mask=None``).
            freq: forwarded unchanged to ``PandasDataset`` as ``freq``.
        """
        df, dist, mask = self.load(impute_zeros=impute_zeros)
        self.dist = dist
        # PandasDataset can resample the time series in different ways; here
        # the 'nearest' aggregation is requested.
        super().__init__(dataframe=df, u=None, mask=mask, name='la', freq=freq, aggr='nearest')

    @staticmethod
    def _ffill_zeros(df):
        """Replace each zero with the nearest earlier non-zero entry of its column.

        Reproduces ``df.replace(to_replace=0., method='ffill')`` without the
        ``method`` keyword, which pandas has deprecated: leading zeros (no
        earlier non-zero entry) are left as they are, and NaN entries are
        neither filled nor skipped as fill sources.
        """
        values = df.to_numpy(copy=True)
        is_zero = values == 0.
        row_ids = np.arange(values.shape[0])[:, None]
        # For every cell: row index of the latest non-zero entry at or above
        # it in the same column, or -1 when there is none yet.
        last_nonzero = np.maximum.accumulate(np.where(is_zero, -1, row_ids), axis=0)
        fillable = is_zero & (last_nonzero >= 0)
        source = np.take_along_axis(values, np.maximum(last_nonzero, 0), axis=0)
        values[fillable] = source[fillable]
        return pd.DataFrame(values, index=df.index, columns=df.columns)

    def load(self, impute_zeros=True):
        """Read the readings and the distance matrix from disk.

        Args:
            impute_zeros: when true, zeros are treated as missing: they are
                excluded from the mask and forward-filled in the data.

        Returns:
            Tuple ``(df, dist, mask)``. ``df`` is the readings reindexed onto
            a regular 5-minute grid, ``dist`` is the distance matrix, and
            ``mask`` is a ``uint8`` array that is 1 where the reading is
            neither NaN nor zero, or ``None`` when ``impute_zeros`` is false.
        """
        path = os.path.join(datasets_path['la'], 'metr_la.h5')
        # Original note: df.shape = (34272, 207)
        df = pd.read_hdf(path)
        datetime_idx = sorted(df.index)
        # Regular 5-minute sampling grid spanning the first to the last
        # timestamp ('5min' is the non-deprecated spelling of '5T').
        date_range = pd.date_range(datetime_idx[0], datetime_idx[-1], freq='5min')
        df = df.reindex(index=date_range)
        # True where a value is present (not NaN) after reindexing.
        mask = ~np.isnan(df.values)
        if impute_zeros:
            mask = mask * (df.values != 0.).astype('uint8')
            # Forward fill: replace each zero with the previous non-zero value.
            df = self._ffill_zeros(df)
        else:
            mask = None
        dist = self.load_distance_matrix()
        return df, dist, mask

    def load_distance_matrix(self):
        """Return the pairwise sensor distance matrix.

        Loads the cached ``metr_la_dist.npy`` when it can be read. Otherwise
        the matrix is rebuilt from ``distances_la.csv`` and
        ``sensor_ids_la.txt`` (entries with no listed distance stay ``inf``)
        and saved to the cache path.
        """
        # Original note: dist.shape = (207, 207)
        path = os.path.join(datasets_path['la'], 'metr_la_dist.npy')
        try:
            dist = np.load(path)
        except (OSError, ValueError, EOFError):
            # Cache missing or unreadable: rebuild it from the raw files.
            # Only load failures are caught so that interrupts and genuine
            # programming errors are no longer silently swallowed.
            distances = pd.read_csv(os.path.join(datasets_path['la'], 'distances_la.csv'))
            with open(os.path.join(datasets_path['la'], 'sensor_ids_la.txt')) as f:
                ids = f.read().strip().split(',')
            num_sensors = len(ids)
            dist = np.ones((num_sensors, num_sensors), dtype=np.float32) * np.inf
            # Builds sensor id to index map.
            sensor_id_to_ind = {int(sensor_id): i for i, sensor_id in enumerate(ids)}

            # Fills cells in the matrix with distances.
            for row in distances.values:
                if row[0] not in sensor_id_to_ind or row[1] not in sensor_id_to_ind:
                    continue
                dist[sensor_id_to_ind[row[0]], sensor_id_to_ind[row[1]]] = row[2]
            np.save(path, dist)
        return dist

    def get_similarity(self, thr=0.1, force_symmetric=False, sparse=False):
        """Build a thresholded Gaussian-kernel similarity matrix from ``self.dist``.

        Args:
            thr: similarities strictly below this value are set to 0.
            force_symmetric: when true, each entry becomes the element-wise
                maximum of the matrix and its transpose.
            sparse: when true, return a ``scipy.sparse.coo_matrix`` instead
                of a dense array.

        Returns:
            The similarity (adjacency) matrix, dense or sparse.
        """
        finite_dist = self.dist.reshape(-1)
        finite_dist = finite_dist[~np.isinf(finite_dist)]  # drop infinite entries
        sigma = finite_dist.std()
        # Gaussian kernel; infinite distances map to a similarity of 0.
        adj = np.exp(-np.square(self.dist / sigma))
        adj[adj < thr] = 0.
        if force_symmetric:
            adj = np.maximum.reduce([adj, adj.T])
        if sparse:
            import scipy.sparse as sps
            # Sparse storage is more efficient for large graph structures.
            adj = sps.coo_matrix(adj)
        return adj

    @property
    def mask(self):
        """Return ``self._mask`` (presumably set by ``PandasDataset.__init__`` - confirm)."""
        return self._mask