In [None]:
import tensorflow as tf
import numpy as np
import glob
from tqdm import tqdm

# Names of the per-tile features read from every TFRecord example.
FEATURES = [
    "u_component_of_wind_10m_above_ground",
    "v_component_of_wind_10m_above_ground",
    "temperature_2m_above_ground",
    "precipitable_water_entire_atmosphere",
]

# Shape used for every feature in the parsing spec.
TILE_SIZE = (128, 128)

# One accumulator list per feature; the loading loop appends flattened tiles.
feature_values = {}
for name in FEATURES:
    feature_values[name] = []

# Training shards, ordered by path so every run visits them identically.
tfrecord_files = glob.glob("./data/data_full_train__*.tfrecord")
tfrecord_files.sort()

In [9]:
def get_feature_spec():
    """Build the parsing spec used to decode one serialized example.

    Returns:
        dict: maps every name in ``FEATURES`` to a
        ``tf.io.FixedLenFeature`` of shape ``TILE_SIZE`` and dtype
        ``tf.float32``.
    """
    spec = {}
    for name in FEATURES:
        spec[name] = tf.io.FixedLenFeature(shape=TILE_SIZE, dtype=tf.float32)
    return spec

def parse_fn(example_proto):
    """Decode one serialized example with the spec from ``get_feature_spec``.

    Args:
        example_proto: a single serialized example, as yielded by the
            ``TFRecordDataset`` this function is mapped over.

    Returns:
        The result of ``tf.io.parse_single_example``: a dict keyed by the
        names in ``FEATURES``.
    """
    return tf.io.parse_single_example(
        serialized=example_proto,
        features=get_feature_spec(),
    )

# Start from empty accumulators so that re-running this cell (for example
# after a failed or interrupted pass) never mixes in arrays left over from
# an earlier, possibly partial, run.
feature_values = {key: [] for key in FEATURES}

# Iterate over every TFRecord shard and collect each feature's tile as a
# flat 1-D array.
for tfrecord_path in tqdm(tfrecord_files):
    dataset = tf.data.TFRecordDataset(tfrecord_path)
    dataset = dataset.map(parse_fn)

    try:
        # Records are parsed while the dataset is iterated, so an example
        # that does not match the feature spec raises inside this loop.
        for sample in dataset:
            for key in FEATURES:
                arr = sample[key].numpy().flatten()
                feature_values[key].append(arr)
    except tf.errors.InvalidArgumentError:
        # TensorFlow's message names the offending feature key but not the
        # file; report the shard, then propagate the original error as-is.
        tqdm.write(f"Failed to parse a record in {tfrecord_path}")
        raise

  0%|          | 0/34 [00:00<?, ?it/s]2025-04-23 23:30:13.195214: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at example_parsing_ops.cc:98 : INVALID_ARGUMENT: Key: precipitable_water_entire_atmosphere.  Can't parse serialized Example.
2025-04-23 23:30:13.195575: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at example_parsing_ops.cc:98 : INVALID_ARGUMENT: Key: precipitable_water_entire_atmosphere.  Can't parse serialized Example.
2025-04-23 23:30:13.195785: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at example_parsing_ops.cc:98 : INVALID_ARGUMENT: Key: precipitable_water_entire_atmosphere.  Can't parse serialized Example.
2025-04-23 23:30:13.195998: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at example_parsing_ops.cc:98 : INVALID_ARGUMENT: Key: precipitable_water_entire_atmosphere.  Can't parse serialized Example.
2025-04-23 23:30:13.196323: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_4_device_/job:localhost/replica:0/task:0/device:CPU:0}} Key: precipitable_water_entire_atmosphere.  Can't parse serialized Example.
	 [[{{node ParseSingleExample/ParseExample/ParseExampleV2}}]] [Op:IteratorGetNext] name: 

In [None]:
# Combine all collected values per feature and derive clipping bounds
# (0.1th / 99.9th percentiles) plus mean and standard deviation.
final_stats = {}
for key in FEATURES:
    if not feature_values[key]:
        # np.concatenate rejects an empty list with a generic message;
        # fail with one that names the feature and the likely cause.
        raise ValueError(
            f"No values were collected for feature '{key}'; "
            "the TFRecord loading cell must finish successfully first."
        )

    all_vals = np.concatenate(feature_values[key])

    # Both bounds come from a single percentile call over the same data.
    # p001 is the 0.1th percentile and p999 the 99.9th percentile.
    p001, p999 = np.percentile(all_vals, [0.1, 99.9])
    mean = np.mean(all_vals)
    std = np.std(all_vals)

    # Store plain Python floats rather than NumPy scalars.
    final_stats[key] = {
        "min_clip": float(p001),
        "max_clip": float(p999),
        "mean": float(mean),
        "std": float(std),
    }

# Print final stats
for key, stats in final_stats.items():
    print(f"{key}: {stats}")