# Convert ERA5 hourly data to daily data

In [None]:
# Root folder of the project. Keep the trailing '/': the input/output paths
# below are built from this value by plain string concatenation.
proj_dir='/path/to/main_project_folder/' # edit this line

import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nc4
import re
import sys
import os
import time
import multiprocessing as mp

# Years to process (the end of the range is exclusive, so 1979..2023).
years = range(1979, 2024)

# One (year, month) tuple per calendar month, in chronological order; used as
# the argument list for the month-by-month worker calls.
yr_mon_args = [(year, month) for year in years for month in range(1, 13)]

# Number of worker processes handed to each multiprocessing pool.
pool_num = 10

# Region identifiers; each one is used both as a folder name and as a
# file-name prefix.
region_str_tpl = [
    'southcentral_north_america',
    'southern_europe',
    'western_russia',
    'western_india',
    'pacific_northwest',
]

## Define functions to extract daily max, daily mean, and 00UTC values.

In [None]:
def parallel_by_yr_daily_max(yr, mon=99):
    """Write the daily maximum of one hourly file to a daily NetCDF file.

    The function takes its configuration from module-level globals:
    ``proj_dir``, ``region_str``, ``var_in``, ``var_rename`` and ``var_out``.

    Parameters
    ----------
    yr : int
        Year of the hourly file to convert.
    mon : int, optional
        Month of the hourly file. The default ``99`` is a sentinel meaning
        "the hourly file covers the whole year" (no month in the file name).

    Returns
    -------
    None
        Nothing is returned. The call does nothing when the input file is
        missing or the output file already exists.
    """
    path_in = proj_dir+'input_data_ERA5/'+region_str+'/hourly/'+var_in+"/"
    path_out = proj_dir+'input_data_ERA5/'+region_str+"/daily/"+var_out+"/"

    # File names are "<region>_<var>_<hourly|daily>_<year>[_<MM>].nc".
    stamp = str(yr) if mon == 99 else str(yr)+'_'+str(mon).zfill(2)
    f_in = '_'.join([region_str, var_in, 'hourly', stamp])+'.nc'
    f_out = '_'.join([region_str, var_out, 'daily', stamp])+'.nc'

    # Nothing to do without an input file; never overwrite an existing output.
    if not os.path.exists(path_in + f_in):
        return
    if os.path.exists(path_out + f_out):
        return

    # Create the output folder on demand; exist_ok avoids an error when
    # several parallel workers try to create it at the same time.
    os.makedirs(path_out, exist_ok=True)

    # The context manager closes the input file once the output is written,
    # so long-lived pool workers do not accumulate open file handles.
    with xr.open_dataset(path_in + f_in) as ds:
        # Maximum over each 24-hour resampling bin along "time".
        daily_var = ds.resample(time='24h').max(dim="time", skipna=True, keep_attrs=True)
        daily_var = daily_var.rename({var_rename: var_out})
        daily_var.to_netcdf(path_out + f_out)

def parallel_by_yr_daily_mean(yr, mon=99):
    """Write the daily mean of one hourly file to a daily NetCDF file.

    The function takes its configuration from module-level globals:
    ``proj_dir``, ``region_str``, ``var_in``, ``var_rename`` and ``var_out``.

    Parameters
    ----------
    yr : int
        Year of the hourly file to convert.
    mon : int, optional
        Month of the hourly file. The default ``99`` is a sentinel meaning
        "the hourly file covers the whole year" (no month in the file name).

    Returns
    -------
    None
        Nothing is returned. The call does nothing when the input file is
        missing or the output file already exists.
    """
    path_in = proj_dir+'input_data_ERA5/'+region_str+"/hourly/"+var_in+"/"
    path_out = proj_dir+'input_data_ERA5/'+region_str+"/daily/"+var_out+"/"

    # File names are "<region>_<var>_<hourly|daily>_<year>[_<MM>].nc".
    stamp = str(yr) if mon == 99 else str(yr)+'_'+str(mon).zfill(2)
    f_in = '_'.join([region_str, var_in, 'hourly', stamp])+'.nc'
    f_out = '_'.join([region_str, var_out, 'daily', stamp])+'.nc'

    # Nothing to do without an input file; never overwrite an existing output.
    if not os.path.exists(path_in + f_in):
        return
    if os.path.exists(path_out + f_out):
        return

    # Create the output folder on demand; exist_ok avoids an error when
    # several parallel workers try to create it at the same time.
    os.makedirs(path_out, exist_ok=True)

    # The context manager closes the input file once the output is written,
    # so long-lived pool workers do not accumulate open file handles.
    with xr.open_dataset(path_in + f_in) as ds:
        # Mean over each 24-hour resampling bin along "time".
        daily_var = ds.resample(time='24h').mean(dim="time", skipna=True, keep_attrs=True)
        daily_var = daily_var.rename({var_rename: var_out})
        daily_var.to_netcdf(path_out + f_out)
            
def parallel_by_yr_daily_at_00UTC(yr, mon=99):
    """Write the 00 UTC values of one hourly file as daily values.

    Only time steps whose hour is 0 are kept, and each is re-labelled with
    the previous calendar day before being written to a daily NetCDF file.

    The function takes its configuration from module-level globals:
    ``proj_dir``, ``region_str``, ``var_in``, ``var_rename`` and ``var_out``.

    Parameters
    ----------
    yr : int
        Year of the hourly file to convert.
    mon : int, optional
        Month of the hourly file. The default ``99`` is a sentinel meaning
        "the hourly file covers the whole year" (no month in the file name).

    Returns
    -------
    None
        Nothing is returned. The call does nothing when the input file is
        missing or the output file already exists.
    """
    path_in = proj_dir+'input_data_ERA5/'+region_str+"/hourly/"+var_in+"/"
    path_out = proj_dir+'input_data_ERA5/'+region_str+"/daily/"+var_out+"/"

    # File names are "<region>_<var>_<hourly|daily>_<year>[_<MM>].nc".
    stamp = str(yr) if mon == 99 else str(yr)+'_'+str(mon).zfill(2)
    f_in = '_'.join([region_str, var_in, 'hourly', stamp])+'.nc'
    f_out = '_'.join([region_str, var_out, 'daily', stamp])+'.nc'

    # Nothing to do without an input file; never overwrite an existing output.
    if not os.path.exists(path_in + f_in):
        return
    if os.path.exists(path_out + f_out):
        return

    # Create the output folder on demand; exist_ok avoids an error when
    # several parallel workers try to create it at the same time.
    os.makedirs(path_out, exist_ok=True)

    # The context manager closes the input file once the output is written,
    # so long-lived pool workers do not accumulate open file handles.
    with xr.open_dataset(path_in + f_in) as ds:
        midnight_UTC_timestamps = ds.time[ds.time.dt.hour == 0]
        daily_var = ds.sel(time=midnight_UTC_timestamps)
        # Shift each 00 UTC stamp back one day so the value is labelled with
        # the day it is attributed to.
        # NOTE(review): assumes the 00 UTC value holds the accumulated total
        # of the preceding day -- confirm for the dataset in use. With monthly
        # files, the first 00 UTC step is therefore labelled with the last day
        # of the previous month.
        daily_var['time'] = midnight_UTC_timestamps.time - np.timedelta64(1,'D')
        daily_var = daily_var.rename({var_rename: var_out})
        daily_var.to_netcdf(path_out + f_out)

### Daily maximum 2-meter air temperature

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="t2m"
var_rename="t2m"
var_out = "tmax"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per (year, month) pair, i.e. monthly hourly files.
    with mp.Pool(pool_num) as p:
        p.starmap(parallel_by_yr_daily_max, yr_mon_args)

### Daily accumulated precipitation
Take the value at 00 UTC as the total accumulated precipitation (pr, in m) for the prior day.

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="pr"
var_rename="tp"
var_out = "pr"

# Only the pacific_northwest region is processed for precipitation.
for region_str_loc in ['pacific_northwest']:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per (year, month) pair, i.e. monthly hourly files.
    with mp.Pool(pool_num) as p:
        p.starmap(parallel_by_yr_daily_at_00UTC, yr_mon_args)

### Daily mean soil moisture

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="swvl1"
var_rename="swvl1"
var_out = "swvl1"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per (year, month) pair, i.e. monthly hourly files.
    with mp.Pool(pool_num) as p:
        p.starmap(parallel_by_yr_daily_mean, yr_mon_args)

### Daily mean sea-level pressure

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="psl"
var_rename="msl"
var_out = "psl"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per year: `mon` keeps its default (99), so the worker looks
    # for yearly hourly files rather than monthly ones.
    with mp.Pool(pool_num) as p:
        p.map(parallel_by_yr_daily_mean, years)

### Daily mean 700mb geopotential height

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="z700"
var_rename="z"
var_out = "z700"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per year: `mon` keeps its default (99), so the worker looks
    # for yearly hourly files rather than monthly ones.
    with mp.Pool(pool_num) as p:
        p.map(parallel_by_yr_daily_mean, years)

### Daily mean 500mb geopotential height

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="z500"
var_rename="z"
var_out = "z500"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per year: `mon` keeps its default (99), so the worker looks
    # for yearly hourly files rather than monthly ones.
    with mp.Pool(pool_num) as p:
        p.map(parallel_by_yr_daily_mean, years)

### Daily mean 250mb geopotential height

In [None]:
# Configuration read as module-level globals by the worker function:
#   var_in     - variable name used in the hourly folder and file names
#   var_rename - name of the variable inside the hourly files
#   var_out    - variable name used for the daily folder, files and variable
# These are plain module-level assignments; a `global` statement is not needed
# at module level (and is a SyntaxError once the name has already been
# assigned earlier in the same script).
var_in="z250"
var_rename="z"
var_out = "z250"

for region_str_loc in region_str_tpl:
    # region_str is read as a module-level global by the worker function.
    region_str = region_str_loc
    print(region_str)
    # A fresh pool is created per region, after region_str has been set.
    # One task per year: `mon` keeps its default (99), so the worker looks
    # for yearly hourly files rather than monthly ones.
    with mp.Pool(pool_num) as p:
        p.map(parallel_by_yr_daily_mean, years)