In [12]:
import boto3
import s3fs
import xarray as xr
import fsspec
import numpy as np
import time

In [2]:
%%time
bucket_url = 's3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr'

# create an instace of the S3FileSystem class from s3fs and list files
s3 = boto3.client('s3')
s3_filesystem = s3fs.S3FileSystem(anon=True)
files = s3_filesystem.ls(f'{bucket_url}') 
files

CPU times: user 154 ms, sys: 93.5 ms, total: 248 ms
Wall time: 758 ms


['noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/.zattrs',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/.zgroup',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/.zmetadata',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/RAINRATE',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/crs',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/time',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/x',
 'noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr/y']

In [3]:
%%time
conus_bucket_url = 's3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr'

ds = xr.open_zarr(
        fsspec.get_mapper(
            conus_bucket_url,
            anon=True
        ),
        consolidated=True
)
ds

CPU times: user 2.31 s, sys: 97.1 ms, total: 2.41 s
Wall time: 8.63 s


Unnamed: 0,Array,Chunk
Bytes,24.83 TiB,314.03 MiB
Shape,"(385704, 3840, 4608)","(672, 350, 350)"
Dask graph,88396 chunks in 2 graph layers,88396 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 24.83 TiB 314.03 MiB Shape (385704, 3840, 4608) (672, 350, 350) Dask graph 88396 chunks in 2 graph layers Data type float32 numpy.ndarray",4608  3840  385704,

Unnamed: 0,Array,Chunk
Bytes,24.83 TiB,314.03 MiB
Shape,"(385704, 3840, 4608)","(672, 350, 350)"
Dask graph,88396 chunks in 2 graph layers,88396 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [5]:
ds_input_coords = xr.open_dataset("my_dataset.nc")
ds_input_coords

In [13]:
%%time
data = ds_input_coords
all_datasets = []


for dataset in data:
        start_time = time.time()
        print(f'{dataset} started')
        temp_lat = data[dataset].sel(coordinate='lat')
        temp_lat = temp_lat.where(np.logical_not(np.isnan(temp_lat)), drop=True)
        temp_lon = data[dataset].sel(coordinate='lon')
        temp_lon = temp_lon.where(np.logical_not(np.isnan(temp_lon)), drop=True)
        precip = ds.isel(x=temp_lat.astype(int), y=temp_lon.astype(int)) #.squeeze().RAINRATE
        precip_1 = precip.RAINRATE.resample(time='D').sum() * 24 * 3600
        precip_2 = precip_1.mean(dim='point')
        precip_2['station'] = dataset
        all_datasets.append(precip_2)
        precip_2.to_zarr(f'{dataset}.zarr', mode='w')
        print(f'{dataset} ended')
        print(f'time {int((time.time() - start_time) // 60)} minutes, {int((time.time() - start_time) % 60)} seconds ')


combined_data = xr.concat(all_datasets, dim="station")


10011500 started
10011500 ended
time 3 minutes, 42 seconds 
10105900 started
10105900 ended
time 3 minutes, 5 seconds 
10109000 started
10109000 ended
time 3 minutes, 6 seconds 
10126000 started
10126000 ended
time 3 minutes, 5 seconds 
10129900 started
10129900 ended
time 3 minutes, 7 seconds 
10131000 started
10131000 ended
time 3 minutes, 5 seconds 
10133650 started
10133650 ended
time 3 minutes, 5 seconds 
10133800 started
10133800 ended
time 3 minutes, 5 seconds 
10133980 started
10133980 ended
time 3 minutes, 4 seconds 
10134500 started
10134500 ended
time 3 minutes, 4 seconds 
10136500 started
10136500 ended
time 3 minutes, 5 seconds 
10137500 started
10137500 ended
time 2 minutes, 59 seconds 
10140100 started
10140100 ended
time 2 minutes, 38 seconds 
10140700 started
10140700 ended
time 2 minutes, 59 seconds 
10141000 started
10141000 ended
time 3 minutes, 7 seconds 
10145400 started
10145400 ended
time 3 minutes, 8 seconds 
10146400 started
10146400 ended
time 3 minutes, 14 s

In [14]:
combined_data

Unnamed: 0,Array,Chunk
Bytes,1.59 MiB,4 B
Shape,"(26, 16071)","(1, 1)"
Dask graph,417846 chunks in 2089456 graph layers,417846 chunks in 2089456 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.59 MiB 4 B Shape (26, 16071) (1, 1) Dask graph 417846 chunks in 2089456 graph layers Data type float32 numpy.ndarray",16071  26,

Unnamed: 0,Array,Chunk
Bytes,1.59 MiB,4 B
Shape,"(26, 16071)","(1, 1)"
Dask graph,417846 chunks in 2089456 graph layers,417846 chunks in 2089456 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [23]:
%%time
combined_data.to_zarr(f'final_rain.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x79b0c5e389c0>

In [16]:
%%time
df_combined = combined_data.to_dataframe()

CPU times: user 27min 7s, sys: 5min 55s, total: 33min 3s
Wall time: 18min 10s


In [17]:
%%time
df_combined_modified = df_combined.reset_index()

CPU times: user 11.7 ms, sys: 210 µs, total: 11.9 ms
Wall time: 28 ms


In [22]:
df_combined_modified.dtypes
df_combined_modified.to_csv('final_rain.csv')

In [21]:
df_combined_modified

Unnamed: 0,station,time,RAINRATE
0,10011500,1979-02-01,109.788467
1,10011500,1979-02-02,6.566367
2,10011500,1979-02-03,0.965529
3,10011500,1979-02-04,0.000000
4,10011500,1979-02-05,0.000000
...,...,...,...
417841,10172952,2023-01-27,0.000000
417842,10172952,2023-01-28,0.000000
417843,10172952,2023-01-29,0.000000
417844,10172952,2023-01-30,12.361973
