In [3]:
import numpy as np
from pathlib import Path

# Location of the Weather2K array on disk.
data_dir = Path("/home/wangxc1117/Weather2K")
path = data_dir.joinpath("weather2k.npy")

# Check the file is present and report its on-disk size before loading.
print("Exists:", path.exists(), "|", path)
print("File size (MB):", path.stat().st_size / (1024**2))

# allow_pickle=False makes the load fail instead of unpickling object arrays.
arr = np.load(path, allow_pickle=False)

# Basic metadata of the loaded array, one "label value" line each.
for label, value in (
    ("type:", type(arr)),
    ("dtype:", arr.dtype),
    ("shape:", arr.shape),
    ("ndim :", arr.ndim),
    ("nbytes (MB):", arr.nbytes / (1024**2)),
):
    print(label, value)

# Flatten to 1-D to peek at the first few raw values.
flat = arr.ravel()
print("head (first 10):", flat[:10])

# NaN-aware summary statistics are only computed for numeric dtypes.
if np.issubdtype(arr.dtype, np.number):
    for label, reducer in (
        ("min:", np.nanmin),
        ("max:", np.nanmax),
        ("mean:", np.nanmean),
        ("std:", np.nanstd),
    ):
        print(label, reducer(arr))
    print("nan_count:", np.isnan(arr).sum())


Exists: True | /home/wangxc1117/Weather2K/weather2k.npy
File size (MB): 2522.9268798828125
type: <class 'numpy.ndarray'>
dtype: float64
shape: (1866, 13, 13632)
ndim : 3
nbytes (MB): 2522.9267578125
head (first 10): [52.97 52.97 52.97 52.97 52.97 52.97 52.97 52.97 52.97 52.97]
min: -47.2
max: 4613.0
mean: 166.6639429223546
std: 360.45094694573464
nan_count: 0


In [4]:
import numpy as np
from pathlib import Path

# Full path to the Weather2K array.
path = Path("/home/wangxc1117/Weather2K/weather2k.npy")
print("Exists:", path.exists(), "|", path)
print("File size (GB):", path.stat().st_size / (1024**3))

# allow_pickle=False makes the load fail instead of unpickling object arrays.
arr = np.load(path, allow_pickle=False)

# Array metadata; the shape is laid out as (stations, vars, time).
for label, value in (
    ("type:", type(arr)),
    ("dtype:", arr.dtype),
    ("shape:", arr.shape),
    ("ndim :", arr.ndim),
    ("nbytes (GB):", arr.nbytes / (1024**3)),
):
    print(label, value)

# Quick confirmation that the array contains no NaN / Inf entries.
for label, predicate in (("nan_count:", np.isnan), ("inf_count:", np.isinf)):
    print(label, predicate(arr).sum())


Exists: True | /home/wangxc1117/Weather2K/weather2k.npy
File size (GB): 2.463795781135559
type: <class 'numpy.ndarray'>
dtype: float64
shape: (1866, 13, 13632)
ndim : 3
nbytes (GB): 2.4637956619262695
nan_count: 0
inf_count: 0


In [5]:
# Unpack the three axis lengths; this raises if the array is not 3-D.
(stations, nvars, T) = np.shape(arr)

# The rest of the notebook indexes exactly 13 variables along axis 1.
assert nvars == 13, f"Expected 13 vars, got {nvars}"

print(f"Stations={stations}, Vars={nvars}, Timesteps={T}")


Stations=1866, Vars=13, Timesteps=13632


In [6]:
# Variable catalogue: one (short name, long name, unit) tuple per entry,
# listed in the order used to index the array's variable axis.
var_info = [
    # --- station coordinates ---
    ("lat", "Latitude",  "deg"),
    ("lon", "Longitude", "deg"),
    ("alt", "Altitude",  "m"),
    # --- pressure, temperature, humidity ---
    ("ap",  "Air pressure",      "hPa"),
    ("t",   "Air temperature",   "C"),
    ("mxt", "Max temperature",   "C"),
    ("mnt", "Min temperature",   "C"),
    ("rh",  "Relative humidity", "%"),
    # --- precipitation ---
    ("p3",  "Precipitation in 3h", "mm"),
    # --- wind ---
    ("wd",  "Wind direction",     "deg"),
    ("ws",  "Wind speed",         "m/s"),
    ("mwd", "Max wind direction", "deg"),
    ("mws", "Max wind speed",     "m/s"),
]


In [7]:
def stats(x):
    """Return (min, max, mean, std) of ``x`` as a tuple of Python floats.

    Each statistic is computed over all elements of ``x`` (no axis argument)
    with the plain NumPy reducers, so NaNs are not skipped.
    """
    reducers = (np.min, np.max, np.mean, np.std)
    return tuple(float(reduce_fn(x)) for reduce_fn in reducers)

# One summary row per variable, aggregated over every station and time step.
print("Per-variable stats over ALL stations & time:")
for i, (short, longname, unit) in enumerate(var_info):
    # Select variable i along axis 1 -> 2-D slice (stations, time).
    v = arr[:, i, :]
    mn, mx, mu, sd = stats(v)
    # Build the row in two parts: identifying columns, then the numbers.
    row_label = f"[{i:02d}] {short:>3s} | {longname:<26s} | unit={unit:<4s} "
    row_values = f"| min={mn:9.2f} max={mx:9.2f} mean={mu:9.2f} std={sd:9.2f}"
    print(row_label + row_values)


Per-variable stats over ALL stations & time:
[00] lat | Latitude                   | unit=deg  | min=    18.65 max=    53.47 mean=    33.38 std=     6.63
[01] lon | Longitude                  | unit=deg  | min=    75.25 max=   133.99 mean=   112.43 std=     8.48
[02] alt | Altitude                   | unit=m    | min=   -47.20 max=  4613.00 mean=   605.86 std=   819.78
[03]  ap | Air pressure               | unit=hPa  | min=   564.90 max=  1050.00 mean=   948.94 std=    84.84
[04]   t | Air temperature            | unit=C    | min=   -17.50 max=    47.20 mean=    14.42 std=    11.54
[05] mxt | Max temperature            | unit=C    | min=   -16.85 max=    48.10 mean=    14.91 std=    11.52
[06] mnt | Min temperature            | unit=C    | min=   -17.75 max=    46.90 mean=    13.94 std=    11.54
[07]  rh | Relative humidity          | unit=%    | min=     0.00 max=   100.00 mean=    67.57 std=    23.82
[08]  p3 | Precipitation in 3h        | unit=mm   | min=    -1.00 max=  1049.70 mea

In [8]:
# The first three variable indices are checked for change over time.
static_names = ["lat", "lon", "alt"]
for i, name in enumerate(static_names):
    v = arr[:, i, :]  # (stations, time)
    # Per-station range (max minus min) along the time axis.
    drift = v.max(axis=1) - v.min(axis=1)
    drift_summary = (
        f"min={drift.min():.6f}, max={drift.max():.6f}, mean={drift.mean():.6f}"
    )
    print(f"{name}: max drift over time (per-station) -> " + drift_summary)

# If the drift is (almost) zero, the static features are stored correctly.


lat: max drift over time (per-station) -> min=0.000000, max=0.000000, mean=0.000000
lon: max drift over time (per-station) -> min=0.000000, max=0.000000, mean=0.000000
alt: max drift over time (per-station) -> min=0.000000, max=0.000000, mean=0.000000


In [9]:
# Take lat/lon/alt (variable indices 0..2) at the first time step, t=0.
static = arr[:, :3, 0]

# Expected layout: (stations, 3).
print("static shape:", static.shape)
print("first 5 stations static (lat,lon,alt):\n", static[:5])


static shape: (1866, 3)
first 5 stations static (lat,lon,alt):
 [[ 52.97 122.51 439.7 ]
 [ 53.47 122.38 297.3 ]
 [ 52.35 124.72 363.  ]
 [ 52.04 123.57 515.4 ]
 [ 51.67 124.39 502.2 ]]


In [10]:
import numpy as np

# arr layout: (stations, 13, time). Coordinates are read at the first time step.
lat = arr[:, 0, 0]  # (stations,)
lon = arr[:, 1, 0]  # (stations,)

# Approximate bounding box around Beijing, in degrees.
LAT_MIN, LAT_MAX = 39.4, 41.1
LON_MIN, LON_MAX = 115.4, 117.5

# A station is selected when both coordinates fall inside the inclusive box.
in_lat_band = (lat >= LAT_MIN) & (lat <= LAT_MAX)
in_lon_band = (lon >= LON_MIN) & (lon <= LON_MAX)
bj_mask = in_lat_band & in_lon_band
bj_idx = np.flatnonzero(bj_mask)  # integer indices of the selected stations

print("Total stations:", arr.shape[0])
print("Beijing stations:", len(bj_idx))
print("First 10 bj_idx:", bj_idx[:10])

# Check that the selected coordinates span a plausible range.
if bj_idx.size > 0:
    bj_lat = lat[bj_idx]
    bj_lon = lon[bj_idx]
    print("lat range:", float(bj_lat.min()), "to", float(bj_lat.max()))
    print("lon range:", float(bj_lon.min()), "to", float(bj_lon.max()))


Total stations: 1866
Beijing stations: 31
First 10 bj_idx: [545 546 548 549 550 552 553 554 555 557]
lat range: 39.42 to 40.93
lon range: 115.5 to 117.47
