# Compare time series extraction from COG data
Using Xarray, Rasterio and Dask

In [None]:
import xarray as xr
import hvplot.xarray
import dask
from dask.distributed import Client, progress
#from dask_kubernetes import KubeCluster
import os
import fsspec

Use `fsspec` to explore an S3 "requester pays" bucket like a filesystem. We don't actually pay anything here, since the USGS Pangeo deployment and the data being read both live in the same AWS region (us-west-2).

In [None]:
# Authenticated (non-anonymous) S3 filesystem handle with requester-pays
# access enabled, used below to list the bucket contents.
fs = fsspec.filesystem(
    's3',
    anon=False,
    requester_pays=True,
)

In [None]:
def get_tifs(year):
    """Return all the COGs (GeoTIFF object paths) listed for a given year.

    Parameters
    ----------
    year : int or str
        Year sub-directory under ``dev-et-data/test/compressed/NDVI_filled``.

    Returns
    -------
    list
        The entries returned by ``fs.ls`` whose name ends with ``.tif``,
        in the order ``fs.ls`` returned them.
    """
    listing = fs.ls(f'dev-et-data/test/compressed/NDVI_filled/{year}')
    # Match on the full '.tif' extension (including the dot) so that keys which
    # merely end in the letters "tif" are not mistaken for GeoTIFFs.
    return [path for path in listing if path.endswith('.tif')]

Create a Dask cluster.  We can run KubeCluster on pangeo, but for rasterio to be able to read from "requester pays" buckets we need to set an environment variable and pass it to the workers also. 

In [None]:
# rasterio needs this environment variable in order to read from
# "requester pays" buckets.
os.environ.update(AWS_REQUEST_PAYER="requester")
# Alternative for a Kubernetes-backed deployment: hand the same variable to the
# workers explicitly when creating the cluster.
#cluster = KubeCluster(n_workers=8, env={'AWS_REQUEST_PAYER': 'requester'})
#client = Client(cluster)

In [None]:
# (KubeCluster variant only) shut the cluster down when finished:
#cluster.close()
# local client: Client() with no arguments is used instead of the KubeCluster above
client = Client()

In [None]:
# Display the client as the cell output.
client

In [None]:
# Size of one 512 x 512 x 10 block in units of 1e6 bytes,
# presumably at 4 bytes per value (confirm against the data's dtype).
512 * 512 * 4 * 10 / 1e6

Create a delayed function that returns an xarray DataArray from a tif filename

In [None]:
@dask.delayed
def tif_to_da(tif):
    """Open one GeoTIFF on S3 as a chunked xarray DataArray.

    Because of ``@dask.delayed``, calling this function only builds a delayed
    task; the file is opened when that task is computed.

    Parameters
    ----------
    tif : str
        Object path without the URL scheme; ``'s3://'`` is prepended here.

    Returns
    -------
    The result of ``xr.open_rasterio`` for that URL, chunked as one band and
    512 x 512 pixels per chunk.

    NOTE(review): ``xr.open_rasterio`` is deprecated in recent xarray releases
    in favour of ``rioxarray.open_rasterio`` -- confirm against the installed
    xarray version before upgrading.
    """
    chunk_spec = {'band': 1, 'x': 512, 'y': 512}
    url = 's3://' + tif
    return xr.open_rasterio(url, chunks=chunk_spec)

Create a data array for the first 180 yeardays of a given year (just reading metadata)

In [None]:
%%time
tifs = get_tifs(2001)
lazy_da =[tif_to_da(tif) for tif in tifs[:180]]
dalist = dask.compute(*lazy_da)
da = xr.concat(dalist, dim='band')
da = da.rename({'band':'yearday'})

In [None]:
# Inspect the concatenated DataArray.
da

Assign values to the coordinates

In [None]:
# Label each step along 'yearday' with a 0-based index. The length is read from
# the array itself rather than hard-coded, so this keeps working if fewer files
# were found or the number of files read above is changed.
da = da.assign_coords(yearday=range(da.sizes['yearday']))

In [None]:
# Inspect the DataArray again, now that 'yearday' has coordinate values.
da

In [None]:
# Wrap the DataArray in a Dataset, storing it as the variable 'ndvi'.
ds = da.to_dataset(name='ndvi')

In [None]:
# Cut a 1024 x 1024 pixel window out of the raster by integer position.
# Note: this rebinds `ds`, so re-running the cell slices the already-sliced data.
ds = ds.isel({'x': slice(20480, 20480 + 1024), 'y': slice(1024, 2048)})

In [None]:
# Inspect the subset Dataset.
ds

In [None]:
%%time
# Time the extraction of a full time series at a single pixel (index 0 along
# each of the two remaining dimensions), read from the original COGs, then plot it.
ds.ndvi[:,0,0].load().hvplot(grid=True)

In [None]:
# Rechunk so each chunk holds 10 yeardays of a 512 x 512 pixel tile.
ds = ds.chunk(chunks=dict(yearday=10, x=512, y=512))

In [None]:
# Show the encoding currently attached to the ndvi variable.
ds.ndvi.encoding

In [None]:
# Replace the variable's encoding with one that records only the chunk shape.
# The tuple presumably follows the variable's dimension order with 'yearday'
# first -- confirm against ds.ndvi.dims.
ds.ndvi.encoding = dict(chunks=(10, 512, 512))

In [None]:
# Load and print a single value: index 0 along every dimension.
print(ds.ndvi[0,0,0].load().values)

In [None]:
# Write data to zarr using my "s3" AWS credentials profile

d = fsspec.get_mapper('s3://chs-pangeo-data-bucket/rsignell/zarr/zarr_test', 
                                    anon=False, profile='s3')
# load() pulls the whole dataset into memory before it is written.
# NOTE(review): the chunk layout of the store then presumably comes from
# ds.ndvi.encoding['chunks'] -- confirm. mode='w' is passed to overwrite any
# existing store, and consolidated=True requests consolidated metadata.
ds.load().to_zarr(store=d, mode='w', consolidated=True)

## Open and read the rechunked Zarr data

In [None]:
# Re-open the Zarr store from S3 using the "s3" credentials profile.
# The store was written with consolidated metadata, so ask for it explicitly
# instead of relying on the installed xarray version's default.
dszs3 = xr.open_zarr(
    fsspec.get_mapper('s3://chs-pangeo-data-bucket/rsignell/zarr/zarr_test',
                      anon=False, profile='s3'),
    consolidated=True,
)

In [None]:
%%time
# Time the same single-pixel time series extraction as for the COGs, this time
# reading from the rechunked Zarr store, then plot it.
dszs3.ndvi[:,0,0].load().hvplot(grid=True)

### Bottom Line: for this test and this toolset, it's 20 times faster reading the rechunked Zarr compared to the original COGs!