# Conversion to NetCDF


We have a bunch of `.grib` ERA files, one per month.

Can we process these into NetCDF files, one per hour?

---

In [11]:
#Load the data
%%time 
import xarray as xr
f = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_skin/sfc_skin_unstructured_2018_01.grib'
ds = xr.open_dataset(f,engine='cfgrib',filter_by_keys={'typeOfLevel': 'surface'})

Ignoring index file '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_skin/sfc_skin_unstructured_2018_01.grib.923a8.idx' incompatible with GRIB file


CPU times: user 17.9 s, sys: 3.7 s, total: 21.6 s
Wall time: 21.6 s


In [18]:
#Relabel longitude coordinate to be consistent with MODIS
ds_long = ds.assign_coords({"longitude": (((ds.longitude + 180) % 360) - 180)})

In [19]:
#Group it in time
ds_grouped = ds_long.groupby("time")

In [26]:
%%time
#Output path
path = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/'

counter = 0
for label,group in ds_grouped:    
    outname = path+str(label)+'.nc'
    print(outname)
    group.to_netcdf(outname)

    
    #Exit condition
    counter += 1
    if counter > 5: break


/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T00:00:00.000000000.nc
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T01:00:00.000000000.nc
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T02:00:00.000000000.nc
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T03:00:00.000000000.nc
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T04:00:00.000000000.nc
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/sandbox/2018-01-01T05:00:00.000000000.nc


---

We can then open one of these files much faster:

In [27]:
%%time
ds_nc = xr.open_dataset(path+'2018-01-01T00:00:00.000000000.nc')

CPU times: user 6.04 ms, sys: 12.8 ms, total: 18.8 ms
Wall time: 22.1 ms


---


# Skin temperature checks


In [8]:
%%time

import pandas as pd
import xarray as xr
def process_grib_file_skt(f,output_path):
    
    #Open file
    ds = xr.open_dataset(f,engine='cfgrib',filter_by_keys={'typeOfLevel': 'surface'},backend_kwargs={'indexpath': ''})
    
    display(ds)
    
    #Relabel longitude coordinate to be consistent with MODIS
    ds = ds.assign_coords({"longitude": (((ds.longitude + 180) % 360) - 180)})
    
    display(ds.time)
    
    #Group it by time 
    ds_grouped = ds.groupby("time")
    
    
    
    #Output path
    
    counter=0
    
    
    for label,group in ds_grouped:    
        outname = output_path+str(label)+'.nc'
        print(outname)
        sys.exit()
        group.to_netcdf('test.nc')
        counter += 1
        if counter > 10: break
    #Explictly close everything
    ds.close()
    del ds_grouped
    
    
    

        
    
#Paths
root = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw' 

#Parameters
dates = pd.date_range('2018-01-01','2020-12-01', 
              freq='MS').strftime("%Y-%m").tolist()


source = 'ERA_skt' #'ERA_skin'


counter = 0
for dt in dates:
    d=dt.replace('-','_')
    
    
    
    fname = f'{root}/{source}/skt_unstructured_{d}.grib'
    out = f'{root}/ERA_skt_netcdf/'
    
    print(fname)

    print('Processing month:', out)
    process_grib_file_skt(fname,out)
    
 
    
    sys.exit()

/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_skt/skt_unstructured_2018_01.grib
Processing month: /network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_skt_netcdf/


/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_skt_netcdf/2018-01-01T00:00:00.000000000.nc


NameError: name 'sys' is not defined

---

# Appendix

Test script used in `scripts/convert_grib_to_netcdf.py`

In [75]:
%%time

def process_grib_file(f,output_path):
    
    #Open file
    ds = xr.open_dataset(f,engine='cfgrib',filter_by_keys={'typeOfLevel': 'surface'})
    
    #Relabel longitude coordinate to be consistent with MODIS
    ds = ds.assign_coords({"longitude": (((ds.longitude + 180) % 360) - 180)})
    
    
    #Group it by time 
    ds_grouped = ds.groupby("time")
    
    
    
    #Output path
    
    counter=0
    
    
    for label,group in ds_grouped:    
        outname = output_path+str(label)+'.nc'
        print(outname)
        sys.exit()
        group.to_netcdf('test.nc')
        counter += 1
        if counter > 10: break
    #Explictly close everything
    ds.close()
    del ds_grouped
    
    
    

        
    
#Paths
root = '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw' 

#Parameters
dates = pd.date_range('2018-01-01','2020-12-01', 
              freq='MS').strftime("%Y-%m").tolist()


source = 'ERA_sfc' #'ERA_skin'

if source == 'ERA_skin':
    name = '_skin_'
if source == 'ERA_sfc':
    name = '_'

counter = 0
for dt in dates:
    d=dt.replace('-','_')
    
    
    
    fname = f'{root}/{source}/sfc{name}unstructured_{d}.grib'
    out = f'{root}/{source}/NetCDF/'

    print('Processing month:', out)
    process_grib_file(fname,out)
    
 
    
    sys.exit()

Ignoring index file '/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_sfc/sfc_unstructured_2018_01.grib.923a8.idx' incompatible with GRIB file


Processing month: /network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_sfc/NetCDF/
/network/group/aopp/predict/TIP016_PAXTON_RPSPEEDY/ML4L/ECMWF_files/raw/ERA_sfc/NetCDF/2018-01-01T00:00:00.000000000.nc


SystemExit: 

---