# GEFS operational ensemble mean wget cmd generator

This notebook generates qsub scripts that download the operational GEFS ensemble members (control `gec00` plus perturbed members `gep01`–`gep30`) used for the ensemble mean.
* Source: https://noaa-gefs-pds.s3.amazonaws.com/

In [1]:
import os
import numpy as np
from datetime import datetime, timedelta

In [None]:
import sys
sys.path.insert(0, '/glade/u/home/ksha/GAN_proj/')
from namelist import *

In [2]:
# GEFS member identifiers: the control run followed by perturbed members 01..30
keys = ['gec00'] + ['gep{:02d}'.format(member_id) for member_id in range(1, 31)]

In [3]:
# Forecast lead times as zero-padded hour strings: '006', '012', ..., '144'
key_leads = ['{:03d}'.format(lead_hour) for lead_hour in range(6, 144 + 6, 6)]

In [4]:
# Download periods as (first day, number of days).
# Previously both periods were assigned to `base`/`date_list` back to back, so
# the 2021 range was built and immediately discarded. The choice is now explicit.
periods = {
    '2021': (datetime(2021, 1, 1), 365),
    '2020-12': (datetime(2020, 12, 1), 31),
}

# Active period: December 2020 (the values the original cell ended up with)
base, n_days = periods['2020-12']
date_list = [base + timedelta(days=day) for day in range(n_days)]

In [5]:
#key_leads

## Fill-in-the-gap version

In [6]:
# Collect the dates whose idx completion marker has not been written yet
date_list_fill = []

for dt in date_list:
    dt_str = dt.strftime('%Y%m%d')
    filename = camp_dir + 'wget_GEFSv12_idx/{}_done.txt'.format(dt_str)

    # an existing '<date>_done.txt' marker means this date can be skipped
    if not os.path.isfile(filename):
        date_list_fill.append(dt)

In [7]:
# Number of dates (not individual files) that still lack a '<date>_done.txt' marker
L_fill = len(date_list_fill)
print('need to download {} files'.format(L_fill))

need to download 0 files


## Download idx files

In [8]:
N_scripts = 3
# Dates per script; the "+ 1" keeps the number of chunks at or below N_scripts
N = int(L_fill/N_scripts) + 1
# First date index handled by each script
index = np.arange(0, L_fill, N)

# PBS header shared by every script.
# It is assembled line by line so that the shebang is the very first line of the
# file and the directives are flush-left; an indented triple-quoted string put a
# blank line and four spaces in front of '#!/bin/bash -l', leaving it inert.
heads = '\n'.join([
    '#!/bin/bash -l',
    '',
    '#PBS -N wget_gefs_idx',
    '#PBS -A NAML0001',
    '#PBS -l walltime=23:59:59',
    '#PBS -l select=1:ncpus=4:mem=12GB',
    '#PBS -q casper',
    '#PBS -o wget_gefs.log',
    '#PBS -e wget_gefs.err',
    '',
    'cd {}wget_GEFSv12_idx/'.format(camp_dir),
    '',
])

for i_, i_start in enumerate(index):

    # `with` closes the script even if an error interrupts the loops below
    with open(work_dir+'qsub/wget_gefs_{:03d}.sh'.format(i_), 'w') as f:

        print(heads, file=f)

        # dates handled by this script: [i_start, i_start+N), clipped to L_fill
        for i in range(i_start, min(i_start+N, L_fill), 1):
            dt = date_list_fill[i]
            dt_str = datetime.strftime(dt, '%Y%m%d')

            # one wget command per (member, lead time) idx file
            for member in keys:
                for ff in key_leads:
                    download_link = 'https://noaa-gefs-pds.s3.amazonaws.com/gefs.{}/00/atmos/pgrb2sp25/{}.t00z.pgrb2s.0p25.f{}.idx'.format(
                        dt_str, member, ff)

                    save_name = '{}_00_{}.t00z.pgrb2s.0p25.f{}.idx'.format(dt_str, member, ff)
                    print('wget -L -O {} {}'.format(save_name, download_link), file=f)

            # Completion marker for this date; its presence is what the
            # "check if files exist" cell tests for.
            # NOTE: the marker is written after the wget commands regardless of
            # whether each of them succeeded.
            print('touch {}_done.txt'.format(dt_str), file=f)

# Launcher that submits every generated script
with open(work_dir+'qsub/wget_gefs_all.sh', 'w') as f:
    for i_, i_start in enumerate(index):
        print('qsub wget_gefs_{:03d}.sh'.format(i_), file=f)

In [9]:
# dt_str = ['20210120', '20210122', '20210227', '20210404', '20210511', '20210617', '20210724', '20210912', '20211006', '20211114', '20211220']
# member = ['gep08', 'gep01', 'gep07', 'gep12', 'gep29', 'gep24', 'gep28', 'gep10', 'gep15', 'gep30', 'gep06']
# ff = ['084', '012', '084', '060', '090', '036', '156', '114', '108', '114', '132']

# for i in range(len(dt_str)):
#     download_link = 'https://noaa-gefs-pds.s3.amazonaws.com/gefs.{}/00/atmos/pgrb2sp25/{}.t00z.pgrb2s.0p25.f{}.idx'.format(dt_str[i], member[i], ff[i])
#     save_name = '{}_00_{}.t00z.pgrb2s.0p25.f{}.idx'.format(dt_str[i], member[i], ff[i])
#     print('wget -L -O {} {}'.format(save_name, download_link))

## Download APCP based on the idx byte ranges

```
curl -H "range: bytes=6598193-6836482" -o test.grib https://noaa-gefs-pds.s3.amazonaws.com/gefs.20210629/00/atmos/pgrb2sp25/gec00.t00z.pgrb2s.0p25.f045
```

```python
import pygrib

with pygrib.open('test.grib') as grbio:
    var_name = grbio[1]

print(var_name)
```

In [10]:
import re

In [11]:
# Collect the dates whose APCP completion marker has not been written yet
date_list_fill = []

for dt in date_list:
    dt_str = dt.strftime('%Y%m%d')
    filename = camp_dir + 'wget_GEFSv12_members/{}_apcp_done.txt'.format(dt_str)

    # an existing '<date>_apcp_done.txt' marker means this date can be skipped
    if not os.path.isfile(filename):
        date_list_fill.append(dt)

In [12]:
# Number of dates (not individual files) that still lack a '<date>_apcp_done.txt' marker
L_fill = len(date_list_fill)
print('need to download {} files'.format(L_fill))

need to download 31 files


In [1]:
def _apcp_byte_range(idx_lines):
    '''
    Find the byte range of the APCP record from the lines of an idx file.

    The second integer on an idx line is taken as the byte offset at which
    that record starts (the same field the original code read).

    Input
    ----------
        idx_lines: list of str, the lines of one idx file.

    Output
    ----------
        None if no line contains 'APCP'; otherwise (byte_start, byte_end) as
        strings. byte_end is the last byte of the record, i.e. one less than
        the start of the following record, because HTTP byte ranges include
        their end position. byte_end is '' (open-ended range) when APCP is
        the last record of the file.

    Raises IndexError / ValueError if the relevant lines cannot be parsed.
    '''
    apcp_pos = None
    for pos, line in enumerate(idx_lines):
        if 'APCP' in line:
            # keep scanning: the last matching line wins, as before
            apcp_pos = pos

    if apcp_pos is None:
        return None

    byte_start = re.findall(r'\d+', idx_lines[apcp_pos])[1]

    if apcp_pos + 1 < len(idx_lines):
        next_start = re.findall(r'\d+', idx_lines[apcp_pos+1])[1]
        byte_end = str(int(next_start) - 1)
    else:
        byte_end = ''

    return byte_start, byte_end


N_scripts = 3
# Dates per script; the "+ 1" keeps the number of chunks at or below N_scripts
N = int(L_fill/N_scripts) + 1
# First date index handled by each script
index = np.arange(0, L_fill, N)

# PBS header shared by every script.
# It is assembled line by line so that the shebang is the very first line of the
# file and the directives are flush-left; an indented triple-quoted string put a
# blank line and four spaces in front of '#!/bin/bash -l', leaving it inert.
# NOTE(review): job name and log names are the same as in the idx download
# scripts -- confirm whether the APCP jobs should have their own.
heads = '\n'.join([
    '#!/bin/bash -l',
    '',
    '#PBS -N wget_gefs_idx',
    '#PBS -A NAML0001',
    '#PBS -l walltime=23:59:59',
    '#PBS -l select=1:ncpus=4:mem=12GB',
    '#PBS -q casper',
    '#PBS -o wget_gefs.log',
    '#PBS -e wget_gefs.err',
    '',
    'cd {}wget_GEFSv12_members/'.format(camp_dir),
    '',
])

for i_, i_start in enumerate(index):

    # `with` closes the script even if an error interrupts the loops below
    with open(work_dir+'qsub/wget_gefs_apcp_{:03d}.sh'.format(i_), 'w') as f:

        print(heads, file=f)

        # dates handled by this script: [i_start, i_start+N), clipped to L_fill
        for i in range(i_start, min(i_start+N, L_fill), 1):
            dt = date_list_fill[i]
            dt_str = datetime.strftime(dt, '%Y%m%d')

            for member in keys:
                for ff in key_leads:

                    # get the location of the GEFS idx file
                    idx_name = '{}_00_{}.t00z.pgrb2s.0p25.f{}.idx'.format(dt_str, member, ff)
                    filename_idx = camp_dir+'wget_GEFSv12_idx/{}'.format(idx_name)

                    # Read the byte range from the idx file. A file that is
                    # absent or unusable is reported and skipped, so script
                    # generation carries on with the next member / lead time.
                    try:
                        with open(filename_idx) as f_idx:
                            byte_range = _apcp_byte_range(f_idx.readlines())
                    except OSError:
                        print('Missing {}'.format(filename_idx))
                        continue
                    except (IndexError, ValueError):
                        # idx lines without the expected numeric fields
                        byte_range = None

                    if byte_range is None:
                        print('No usable APCP record in {}'.format(filename_idx))
                        continue

                    byte_start, byte_end = byte_range

                    # download only the APCP bytes of the GRIB2 file
                    download_link = 'https://noaa-gefs-pds.s3.amazonaws.com/gefs.{}/00/atmos/pgrb2sp25/{}.t00z.pgrb2s.0p25.f{}'.format(
                        dt_str, member, ff)

                    save_name = '{}_00_{}.t00z.pgrb2s.0p25.f{}'.format(dt_str, member, ff)

                    print('curl -H "range: bytes={}-{}" -o {} {}'.format(byte_start, byte_end, save_name, download_link), file=f)

            # Completion marker for this date; its presence is what the
            # "check if files exist" cell tests for.
            # NOTE: the marker is written even when some idx files were skipped
            # above or some curl commands fail at run time.
            print('touch {}_apcp_done.txt'.format(dt_str), file=f)

# Launcher that submits every generated script
with open(work_dir+'qsub/wget_gefs_apcp_all.sh', 'w') as f:
    for i_, i_start in enumerate(index):
        print('qsub wget_gefs_apcp_{:03d}.sh'.format(i_), file=f)

In [14]:
import pygrib

In [15]:
# Sanity check of one downloaded byte-range file. The name follows the
# save_name pattern of the generated curl commands; it is a relative path, so
# it is resolved against the kernel's current working directory.
filename = '20210101_00_gec00.t00z.pgrb2s.0p25.f006'

with pygrib.open(filename) as grbio:
    # message number 1 of the file (see the inventory line printed below)
    var_name = grbio[1]

print(var_name)

1:Total Precipitation:kg m**-2 (accum):regular_ll:surface:level 0:fcst time 0-6 hrs (accum):from 202101010000:hi res cntl fcst
