In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm

In [2]:
def check_data_folder(folder):
    """
    Tell whether `folder` names an existing directory.

    Returns True only when the path exists and is a directory; a missing
    path or a regular file both give False.
    """
    # isdir() already reports False for paths that do not exist.
    return os.path.isdir(folder)

def generate_date_range(start_date, end_date):
    """
    Build the list of calendar days between two dates, both ends included.

    Each day is returned as a 'YYYYMMDD' string, in chronological order.
    """
    days = pd.date_range(start=start_date, end=end_date, freq='D')
    return [day.strftime('%Y%m%d') for day in days]

def load_data(file_path):
    """
    Open a NetCDF file with xarray and return the resulting dataset.

    Raises FileNotFoundError when `file_path` does not exist. The dataset
    is returned as opened by `xr.open_dataset`; closing it is left to the
    caller.
    """
    # Guard first so a missing path fails with a clear message before xarray is involved.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f'File not found: {file_path}')
    return xr.open_dataset(file_path)


# main program
## check data folder
data_folder = 'nc4'
if check_data_folder(data_folder):
    print(f'Data folder found: {data_folder}')
else:
    raise FileNotFoundError(f'Data folder not found: {data_folder}')

## load data
start_date = '2024-01-01'
end_date = '2024-01-14'
date_list = generate_date_range(start_date, end_date)
if not date_list:
    # Fail with a readable message instead of an IndexError on date_list[0] below.
    raise ValueError(f'No dates between {start_date} and {end_date}')

# All daily files share this name; only the YYYYMMDD stamp differs.
file_template = 'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date}.nc4.dap.nc4'

## get location and shape
# The first day's file is opened once and serves both as the source of the
# lat/lon coordinates and as the sample that defines the per-file shape.
path = os.path.join(data_folder, file_template.format(date=date_list[0]))
nc4_data = load_data(path)
lat = nc4_data['lat'].values
lon = nc4_data['lon'].values
shape = nc4_data['TLML'].shape
total_locations = shape[1] * shape[2]

## combine data
# Reuse the already-open first file for the shape information
# rather than opening the same file a second time.
sample_path = path
sample_data = nc4_data
shape_per_file = shape   # e.g. (24, 361, 576)
time_per_file = len(sample_data['time'])

# Pre-allocate the output arrays: every file contributes shape_per_file[0]
# records along the first axis.
total_samples = len(date_list)
combined = np.empty((total_samples * shape_per_file[0], *shape_per_file[1:]), dtype=np.float32)
time_list = np.empty(total_samples * time_per_file, dtype=sample_data['time'].dtype)

# Load the files one by one and copy each into its slot of the pre-allocated arrays.
for i, date in enumerate(tqdm(date_list, desc="Combining")):
    path = os.path.join(data_folder, file_template.format(date=date))
    nc4_data = load_data(path)

    # A file with a different shape would otherwise be broadcast silently or
    # fail with an opaque NumPy error; report which file is at fault.
    daily_shape = nc4_data['TLML'].shape
    if daily_shape != shape_per_file:
        nc4_data.close()
        raise ValueError(
            f'Unexpected TLML shape {daily_shape} in {path}; expected {shape_per_file}'
        )

    start = i * shape_per_file[0]
    end = (i + 1) * shape_per_file[0]

    combined[start:end] = nc4_data['TLML'].values
    time_list[start:end] = nc4_data['time'].values

    # Release the file handle once the values are copied. The last dataset is
    # left open so `nc4_data` is still usable after the loop.
    if i < total_samples - 1:
        nc4_data.close()

print(f'Combined data shape: {combined.shape}')

Data folder found: nc4


Combining: 100%|██████████| 14/14 [00:02<00:00,  5.46it/s]

Combined data shape: (336, 361, 576)





In [13]:
# For the R code: y1 is cell × time, obtained by flattening the last two
# axes of `combined` (row-major) and transposing.
ntot, nlat, nlon = combined.shape
ncell = nlat * nlon
y1 = combined.reshape((ntot, ncell)).transpose()

# For the R code: gg is cell × (lon, lat). The meshgrid is flattened in the
# same row-major order as y1, so row k of gg pairs with row k of y1
# (assumes axis 1 of `combined` follows `lat` and axis 2 follows `lon` — TODO confirm).
lon_grid, lat_grid = np.meshgrid(lon, lat)
gg = np.vstack((lon_grid.ravel(), lat_grid.ravel())).transpose()

# Draw m distinct cell indices; the fixed seed makes the draw repeatable.
m = 5000
all_idx = np.arange(ncell)
np.random.seed(123)
pickm = np.random.choice(all_idx, size=m, replace=False)


In [14]:
# Display the shape of the sampled index array as a quick sanity check.
pickm.shape

(5000,)