In [1]:
### This script is to impute all traffic speed readings
### Due to 0 traffic volume, traffic speed is miscalculated as 0 in the official database
### This script impute all speed=0 (when volume=0) values into the closest non-zero traffic speed tradings in the history.

### The traffic dataset undergoes an initial preprocessing step to address instances where both volume and speed metrics are recorded as zero. Such occurrences, typically observed at midnight during 15-minute intervals when no vehicles are detected, are attributed to systematic issues in algorithmic processing, given that speed cannot be computed with a zero denominator. To rectify this, traffic speed values of zero are replaced with the nearest preceding non-zero measurement. 

### The h5 files for all sensor398 are original files and are not processed using this script yet.

from tqdm import tqdm
import tensorflow as tf
import h5py

with h5py.File('sensor498_2019-01-01_2019-12-31.h5', 'r') as f:  
    data = tf.convert_to_tensor(f['data'][:]) 
    sites_ = tf.convert_to_tensor(f['sites'][:]) 
    timestamps_ = tf.convert_to_tensor(f['timestamps'][:]) 


In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
data_shape = data.shape

# Convert the data tensor into a Variable for mutability
mutable_data = tf.Variable(data)

volume_zero_mask = tf.equal(mutable_data[:, :, 1], 0)

for i in tqdm(range(data_shape[0])):  # Loop through all sites
    prev_nonzero_speed = None
    for j in range(data_shape[1]):  # Loop through all timestamps
        if volume_zero_mask[i, j]:
            if prev_nonzero_speed is not None:
                # Impute speed with the last known non-zero speed
                mutable_data[i, j, 0].assign(prev_nonzero_speed)
        else:
            # Update the previous non-zero speed
            prev_nonzero_speed = mutable_data[i, j, 0].numpy()

# Convert back to a regular tensor if needed
imputed_data = tf.convert_to_tensor(mutable_data)


100%|██████████████████████████████████████████████████████████████████████████████| 498/498 [8:07:14<00:00, 58.70s/it]


In [4]:
with h5py.File('sensor180_2019-01-01_2019-12-31_imputed.h5', 'w') as f:  
    data = f.create_dataset('data', shape=imputed_data.shape)  
    data[:] = imputed_data
    timestamps = f.create_dataset('timestamps', shape=timestamps_.shape, dtype=h5py.special_dtype(vlen=str)) 
    timestamps[:] = timestamps_
    sites = f.create_dataset('sites', shape=sites_.shape) 
    sites[:] = sites_

In [5]:
imputed_data

<tf.Tensor: shape=(498, 35040, 2), dtype=float32, numpy=
array([[[56.,  5.],
        [59., 12.],
        [56.,  9.],
        ...,
        [55., 16.],
        [55., 10.],
        [58.,  9.]],

       [[56., 11.],
        [57., 15.],
        [58., 15.],
        ...,
        [57., 18.],
        [56., 18.],
        [56.,  9.]],

       [[62.,  5.],
        [62.,  6.],
        [58.,  4.],
        ...,
        [53., 12.],
        [56.,  4.],
        [55.,  1.]],

       ...,

       [[63., 51.],
        [67., 90.],
        [66., 96.],
        ...,
        [69., 85.],
        [69., 73.],
        [69., 71.]],

       [[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]],

       [[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]]], dtype=float32)>

In [None]:
data