## Load Copernicus-Pretrain (GeoTiff / WebDataset format)

### GeoTiff

In [None]:
# Download and extract the 100-grid GeoTIFF subset.
# The file list (fnames_100_union.json.gz) and the zip archive are both saved under ../data/.
# -nc (no-clobber) skips files that already exist, so re-running this cell
# does not leave duplicate "*.1" copies next to the originals.
!mkdir -p ../data/
!wget -nc https://huggingface.co/datasets/wangyi111/Copernicus-Pretrain/resolve/main/example_100_grids/fnames_100_union.json.gz -P ../data/
!wget -nc https://huggingface.co/datasets/wangyi111/Copernicus-Pretrain/resolve/main/example_100_grids/example_100_geotiff.zip -P ../data/
# -o overwrites previously extracted files without prompting (a prompt would stall a notebook cell);
# -q keeps the per-file extraction log out of the notebook output.
!unzip -o -q ../data/example_100_geotiff.zip -d ../data/example_100_geotiff/
# -f: do not fail if the archive has already been removed.
!rm -f ../data/example_100_geotiff.zip

--2025-03-14 09:25:23--  https://huggingface.co/datasets/wangyi111/Copernicus-Pretrain/resolve/main/example_100_grids/example_100_geotiff.zip
Resolving huggingface.co (huggingface.co)... 3.160.150.119, 3.160.150.2, 3.160.150.7, ...
Connecting to huggingface.co (huggingface.co)|3.160.150.119|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/ca/4e/ca4e5f6ac6c00101a7ddd1a7c2a16f77584145cf72635774ff11203b11c4d1b9/b08c322daee5c479c7df4c4234dd1a2e77d1438015147eaaf95bafc24b499f2a?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27example_100_geotiff.zip%3B+filename%3D%22example_100_geotiff.zip%22%3B&response-content-type=application%2Fzip&Expires=1741944323&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MTk0NDMyM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2NhLzRlL2NhNGU1ZjZhYzZjMDAxMDFhN2RkZDFhN2MyYTE2Zjc3NTg0MTQ1Y2Y3MjYzNTc3NGZmMTEyMDNiMTFjNGQxYjkvYjA4YzMyM

In [None]:
# Install dependencies into the environment of the running kernel (%pip instead of !pip).
# gzip is not listed: it is part of the Python standard library and cannot be installed with pip.
# Add "torch torchvision" to the command if they are not already installed.
%pip install kornia rasterio

Collecting kornia
  Using cached kornia-0.8.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting kornia_rs>=0.1.0 (from kornia)
  Downloading kornia_rs-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Using cached kornia-0.8.0-py2.py3-none-any.whl (1.1 MB)
Downloading kornia_rs-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kornia_rs, kornia
Successfully installed kornia-0.8.0 kornia_rs-0.1.8


In [6]:
from torch.utils.data import DataLoader
from copernicuspretrain_dataset_geotiff import CopernicusPretrain
import logging

# Only show rasterio log records at ERROR level or above.
logging.getLogger("rasterio").setLevel(logging.ERROR)

# The download cell stores the file list directly under ../data/ (wget -P ../data/),
# while the extracted archive lives in ../data/example_100_geotiff/.
fnames_path = '../data/fnames_100_union.json.gz'
root_dir = '../data/example_100_geotiff/'

# Keep the instance under its own name so the imported class is not shadowed.
dataset = CopernicusPretrain(
    fnames_path, root_dir,
    transform_s1=None, transform_s2=None, transform_s3=None, transform_s5p=None, transform_dem=None
    )
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) # batch size can only be 1 because of varying number of images per grid

# (label, key) pairs for the four S5P products, in the order they are reported below.
s5p_products = [('CO', 's5p_co'), ('NO2', 's5p_no2'), ('O3', 's5p_o3'), ('SO2', 's5p_so2')]

# Inspect a single grid: each batch is a (sample, meta_data) pair of dicts keyed by modality.
for sample, meta_data in dataloader:
    print('Grid ID:', meta_data['dem'][0])
    print(sample.keys())
    print(meta_data.keys())

    # S1 / S2 are indexed as [local patch][time stamp].
    print('### S1 GRD ###')
    print('Number of s1 local patches:', len(meta_data['s1_grd']), '  ', 'Number of time stamps for first local patch:', len(meta_data['s1_grd'][0]))
    print('Example for one image:', sample['s1_grd'][0][0].shape, meta_data['s1_grd'][0][0])
    print('### S2 TOA ###')
    print('Number of s2 local patches:', len(meta_data['s2_toa']), '  ', 'Number of time stamps for first local patch:', len(meta_data['s2_toa'][0]))
    print('Example for one image:', sample['s2_toa'][0][0].shape, meta_data['s2_toa'][0][0])

    # S3 and S5P are indexed as [time stamp].
    print('### S3 OLCI ###')
    print('Number of s3 time stamps:', len(meta_data['s3_olci']))
    print('Example for one image:', sample['s3_olci'][0].shape, meta_data['s3_olci'][0])
    print('### S5P ###')
    print('Number of s5p time stamps for CO/NO2/O3/SO2:', *(len(meta_data[key]) for _, key in s5p_products))
    for gas, key in s5p_products:
        print(f'Example for one {gas} image:', sample[key][0].shape, meta_data[key][0])

    # The DEM entry is a single image for the grid.
    print('### DEM ###')
    print('One DEM image for the grid:', sample['dem'].shape, meta_data['dem'][0])

    # One grid is enough for the demo.
    break

Grid ID: ('0906994_128.50_67.25',)
dict_keys(['s1_grd', 's2_toa', 's3_olci', 's5p_co', 's5p_no2', 's5p_o3', 's5p_so2', 'dem'])
dict_keys(['s1_grd', 's2_toa', 's3_olci', 's5p_co', 's5p_no2', 's5p_o3', 's5p_so2', 'dem'])
### S1 GRD ###
Number of s1 local patches: 1    Number of time stamps for first local patch: 3
Example for one image: torch.Size([1, 2, 268, 267]) ('0906994_128.50_67.25/1087609_128.50_67.25/20210827',)
### S2 TOA ###
Number of s2 local patches: 1    Number of time stamps for first local patch: 4
Example for one image: torch.Size([1, 13, 268, 267]) ('0906994_128.50_67.25/1087609_128.50_67.25/20200215',)
### S3 OLCI ###
Number of s3 time stamps: 8
Example for one image: torch.Size([1, 21, 67, 174]) ('0906994_128.50_67.25/20210313',)
### S5P ###
Number of s5p time stamps for CO/NO2/O3/SO2: 7 5 11 4
Example for one CO image: torch.Size([1, 1, 26, 66]) ('0906994_128.50_67.25/20210401',)
Example for one NO2 image: torch.Size([1, 1, 26, 66]) ('0906994_128.50_67.25/20210501',)


### WebDataset

[WebDataset](https://github.com/webdataset/webdataset) is a data storage format designed for efficient large-scale deep learning workloads. It stores datasets as sharded tar archives, allowing direct streaming without extraction, which significantly reduces disk I/O overhead and improves training speed. It is particularly useful for pretraining foundation models on large datasets.

The webdataset library is an implementation of PyTorch `IterableDataset`, which we will use to build a dataloader. A nice property of WebDataset (and other streaming formats) is that you can stream the data from the cloud without downloading the whole dataset: simply pass the URLs of the data shards instead of local paths. The loading speed then depends on many factors, including the network. In this demo, we simply download the data beforehand and store it locally.

In [None]:
%%bash
# Download the 100-grid subset (10 tar shards).
# The {000000..000009} range in the URL is bash brace expansion, so the cell is run
# explicitly with bash; a plain POSIX sh would pass the braces to wget literally.
# -nc (no-clobber) skips shards that are already present when the cell is re-run.
mkdir -p ../data/example_100_webdataset/
wget -nc https://huggingface.co/datasets/wangyi111/Copernicus-Pretrain/resolve/main/example_100_grids/example_100_webdataset/example-{000000..000009}.tar -P ../data/example_100_webdataset/

In [9]:
# Install dependencies into the environment of the running kernel (%pip instead of !pip).
%pip install webdataset

Collecting webdataset
  Downloading webdataset-0.2.111-py3-none-any.whl.metadata (15 kB)
Collecting braceexpand (from webdataset)
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading webdataset-0.2.111-py3-none-any.whl (85 kB)
Downloading braceexpand-0.1.7-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: braceexpand, webdataset
Successfully installed braceexpand-0.1.7 webdataset-0.2.111


In [3]:
from copernicuspretrain_dataset_webdataset import CopernicusPretrain

# Ten local shards, addressed with brace notation.
shards_path = '../data/example_100_webdataset/example-{000000..000009}.tar'
data_size = 100
batch_size = 1

copernicus_pretrain = CopernicusPretrain(
    shards_path,
    batch_size=batch_size,
    num_workers=2,
    shuffle=10,
    shardshuffle=True,
    resampled=True,
)
dataloader = copernicus_pretrain.get_dataloader()

# # Unbatch, shuffle between workers, then rebatch. This may explode memory usage?!
# dataloader = dataloader.unbatched().shuffle(100).batched(batch_size)

# With resampling the stream never ends, so define an artificial epoch:
# data_size // batch_size batches, also reported as the loader's length.
batches_per_epoch = data_size // batch_size
dataloader = dataloader.with_epoch(batches_per_epoch)
dataloader = dataloader.with_length(batches_per_epoch)

for batch in dataloader:
    # a batch unpacks into one entry per modality, followed by the metadata
    s1, s2, s3, co, no2, o3, so2, dem, meta = batch
    print(meta)
    break




[{'s1_grd': ['0772176_-96.00_44.00/0865831_-95.92_43.96/20221121'], 's2_toa': ['0772176_-96.00_44.00/0865831_-95.92_43.96/20210303'], 's3_olci': ['0772176_-96.00_44.00/20210127'], 's5p_co': ['0772176_-96.00_44.00/20211201'], 's5p_no2': ['0772176_-96.00_44.00/20210901'], 's5p_o3': ['0772176_-96.00_44.00/20210201'], 's5p_so2': ['0772176_-96.00_44.00/20211001'], 'dem': ['0772176_-96.00_44.00']}]
