# Program - Read VOCALS-REx GFS SCM forcing data (binary) and save it into a new netCDF file

**Purpose**

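Read the VOCALS-REx NCEP GFS Single Column Model (SCM) forcing data, which is distributed as big-endian binary files (one file per date), and save it into a new netCDF file.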

**Data**

- NCAR/UCAR EOL - VOCALS: NCEP GFS Single Column Model Forcing Data
  - https://data.eol.ucar.edu/dataset/89.105
- Data access
  - ORDER data for delivery by FTP
- Documentation
  - https://data.eol.ucar.edu/file/download/41B38ABB023/NCEP_GFS_Single_Column_Model_Forcing_Data_for_VOCALS_Rex.pdf

**Author:** Yi-Hsuan Chen (yihsuan@umich.edu)

**Date:** May 2024



# Functions

## read_data

In [113]:
import struct
import numpy as np

####################
####################
####################
def create_return_arrays(num_time_steps, npoint=25, levs=64):
    """Allocate the zero-filled output arrays used by ``read_data``.

    Parameters
    ----------
    num_time_steps : int
        Length of the leading (time) axis.
    npoint : int, optional
        Number of station points.
    levs : int, optional
        Number of vertical (mid) levels; interface arrays get ``levs + 1``.

    Returns
    -------
    dict
        Maps each variable name to a ``np.zeros`` array of the matching shape.
    """
    shape_time = (num_time_steps,)
    shape_station = (npoint,)
    shape_time_station = (num_time_steps, npoint)
    shape_time_interface = (num_time_steps, levs + 1)
    shape_time_station_level = (num_time_steps, npoint, levs)

    # (shape, variable names) groups; the insertion order below defines the
    # key order of the returned dictionary.
    layout = (
        # 1d, (time)
        (shape_time, ('date',)),
        # 1d, (station)
        (shape_station, ('station',)),
        # 2d, (time, station)
        (shape_time_station, ('latitude', 'longitude', 'zsfc', 'psfc', 'tsfc',
                              'u10', 'v10', 't2', 'q2', 'hpbl')),
        # 2d, (time, levels+1)
        (shape_time_interface, ('sigi', 'ak5', 'bk5')),
        # 3d, (time, station, levels)
        (shape_time_station_level, ('u', 'v', 't', 'q', 'p',
                                    'omega', 'dtdt', 'dqdt', 'sigl')),
    )

    data_arrays = {}
    for shape, names in layout:
        for name in names:
            data_arrays[name] = np.zeros(shape)

    return data_arrays

####################
####################
####################
def _read_floats(file, count, skip=8):
    """Skip ``skip`` bytes, then read ``count`` big-endian float32 values.

    Raises ``ValueError`` when the file ends early; ``np.fromfile`` on its own
    would silently return a shorter array.
    """
    if skip:
        file.seek(skip, 1)
    values = np.fromfile(file, dtype='>f4', count=count)
    if values.size != count:
        raise ValueError(
            f'Unexpected end of file: expected {count} float32 values, '
            f'got {values.size}')
    return values


def read_data(filename, 
              do_print=False):
    """Read one VOCALS-REx GFS SCM forcing file into a dictionary of arrays.

    The file is parsed as big-endian binary: a header of 13 int32 values,
    one record holding sigi/sigl/ak5/bk5, and then, for every station point,
    a surface record, a flux record and ``nvar`` profile records of ``levs``
    float32 values each. Eight bytes are skipped between records.
    NOTE(review): the skipped bytes are presumably Fortran sequential-record
    markers (4-byte trailer + 4-byte header) -- confirm against the dataset
    documentation.

    Parameters
    ----------
    filename : str
        Path of the binary forcing file.
    do_print : bool, optional
        If True, print the header, every surface/flux value and the level
        arrays plus the profiles of the last station (debugging aid).

    Returns
    -------
    dict
        Arrays allocated by ``create_return_arrays`` with a single time step
        (index 0). Only that one time step is stored; the header's forecast
        start/end/step hours are read but not otherwise used. Flux-derived
        fields (u10, v10, t2, q2, hpbl) and the optional profiles (omega,
        dtdt, dqdt) are set to NaN when the file does not provide them.

    Raises
    ------
    struct.error
        If the header, a surface record or a flux record is truncated.
    ValueError
        If a float32 array record is truncated.
    """
    # (output name, index into the flux record)
    flux_fields = (('u10', 17), ('v10', 18), ('t2', 19), ('q2', 20), ('hpbl', 26))

    with open(filename, 'rb') as file:
        # Read the header
        header_format = '>13i'
        header_size = struct.calcsize(header_format)
        header_data = struct.unpack(header_format, file.read(header_size))

        # Unpack the header data (the first value is not used)
        (_, hour, month, day, year, nsfc, nflx, nvar, levs, npoint,
         start_hour, end_hour, step_hour) = header_data

        if do_print:
            print(f'Hour: {hour}, Month: {month}, Day: {day}, Year: {year}')
            print(f'Number of surface variables: {nsfc}')
            print(f'Number of flux variables: {nflx}')
            print(f'Number of variables for each sounding: {nvar}')
            print(f'Number of vertical levels: {levs}')
            print(f'Number of station points: {npoint}')
            print(f'Starting forecast hour: {start_hour}')
            print(f'Ending forecast hour: {end_hour}')
            print(f'Forecast output step: {step_hour}')

        #--- create return data array (a single time step per file)
        data = create_return_arrays(num_time_steps=1, npoint=npoint, levs=levs)
        time_step = 0

        # Second record: vertical level definitions, stored back to back
        sigi = _read_floats(file, levs + 1)
        sigl = _read_floats(file, levs, skip=0)
        ak5 = _read_floats(file, levs + 1, skip=0)
        bk5 = _read_floats(file, levs + 1, skip=0)

        #--- per-file values do not depend on the station, store them once
        # Date stamp as the number YYYYMMDDHH; the month is zero-padded so
        # that months 1-9 do not produce an ambiguous, shorter number.
        data['date'][time_step] = int(f"{year}{month:02}{day:02}{hour:02}")
        data['sigi'][time_step, :] = sigi
        data['sigl'][time_step, :] = sigl   # broadcast over the station axis
        data['ak5'][time_step, :] = ak5
        data['bk5'][time_step, :] = bk5

        # Placeholders so that profiles absent from the file (nvar <= 5 or
        # nvar <= 8, or npoint == 0) are reported as NaN instead of raising
        # NameError further down.
        missing = np.full(levs, np.nan)
        u = v = t = q = p = missing
        omega = dtdt = dqdt = missing
        cloud_water = cloud_water_tend = cloud_fraction = missing

        # Loop over station points
        for i in range(npoint):

            # Surface variables
            file.seek(8, 1)
            surface_vars_format = f'>{nsfc}f'
            surface_vars_size = struct.calcsize(surface_vars_format)
            surface_data = struct.unpack(surface_vars_format, file.read(surface_vars_size))

            if do_print:
                for j, value in enumerate(surface_data):
                    print(f'surface_data {j}: {value}')

            # Flux type variables (only present in the record if nflx > 0)
            file.seek(8, 1)
            flux_data = ()
            if nflx > 0:
                flux_vars_format = f'>{nflx}f'
                flux_vars_size = struct.calcsize(flux_vars_format)
                flux_data = struct.unpack(flux_vars_format, file.read(flux_vars_size))

                if do_print:
                    for j, value in enumerate(flux_data):
                        print(f'flux_data {j}: {value}')

            # Vertical profiles, one record each
            u = _read_floats(file, levs)
            v = _read_floats(file, levs)
            t = _read_floats(file, levs)
            q = _read_floats(file, levs)
            p = _read_floats(file, levs)

            if nvar > 5:
                omega = _read_floats(file, levs)
                dtdt = _read_floats(file, levs)
                dqdt = _read_floats(file, levs)

            if nvar > 8:
                # Read to keep the file position correct; not stored in data.
                cloud_water = _read_floats(file, levs)
                cloud_water_tend = _read_floats(file, levs)
                cloud_fraction = _read_floats(file, levs)

            #--- save variables into data array
            data['station'][i] = i + 1

            #--- 2d variable, (time, station)
            data['latitude'][time_step, i] = surface_data[0]
            data['longitude'][time_step, i] = surface_data[1]
            data['zsfc'][time_step, i] = surface_data[2]
            data['psfc'][time_step, i] = surface_data[3]
            data['tsfc'][time_step, i] = surface_data[4]

            for name, idx in flux_fields:
                # NaN when the flux record is absent or too short
                data[name][time_step, i] = (
                    flux_data[idx] if idx < len(flux_data) else np.nan)

            #--- 3d variable, (time, station, levs)
            data['u'][time_step, i, :] = u
            data['v'][time_step, i, :] = v
            data['t'][time_step, i, :] = t
            data['q'][time_step, i, :] = q
            data['p'][time_step, i, :] = p
            data['omega'][time_step, i, :] = omega
            data['dtdt'][time_step, i, :] = dtdt
            data['dqdt'][time_step, i, :] = dqdt

        if do_print:
            # Level definitions plus the profiles of the last station read
            print('Sigi:', sigi)
            print('Sigl:', sigl)
            print('Ak5:', ak5)
            print('Bk5:', bk5)
            print('u:', u)
            print('v:', v)
            print('t:', t)
            print('q:', q)
            print('omega:', omega*1e+5/86400)
            print('dtdt', dtdt)
            print('dqdt', dqdt)
            print('cloud_water', cloud_water)
            print('cloud_water_tend', cloud_water_tend)
            print('cloud_fraction', cloud_fraction)

    return data

# Example usage: read a single forcing file quietly.
# Pass do_print=True to read_data to dump the header and every record.
filename = '../original/vocalsgfs.2008101100'
data = read_data(filename)

#data['q2']
#data['p'][0,11,:]
#data['latitude']
#print(data['date'].astype(int))
#print(data['station'])

#data

#data_all = create_return_arrays(num_time_steps=2, npoint=25, levs=64)

#filename = '../original/vocalsgfs.2008101100'
#read_data(filename, data_all)

## create_xarray_dataset

In [166]:
import xarray as xr
import numpy as np

def create_xarray_dataset(num_time_steps=122, num_station=25, num_levels=64):
    """Create an xarray Dataset with one placeholder variable per forcing field.

    Parameters
    ----------
    num_time_steps : int, optional
        Length of the ``time`` dimension.
    num_station : int, optional
        Length of the ``station`` dimension.
    num_levels : int, optional
        Length of the ``lev_mid`` dimension; ``lev_int`` has ``num_levels + 1``.

    Returns
    -------
    xarray.Dataset
        Dataset with integer index coordinates (0..n-1) for ``time``,
        ``station``, ``lev_mid`` and ``lev_int``. Every data variable is
        filled with NaN, so any slot that is never overwritten is
        recognisable as missing rather than holding random numbers.
    """
    # Coordinates are plain 0-based index arrays
    ds = xr.Dataset(
        coords={
            'time': ('time', np.arange(num_time_steps)),
            'station': ('station', np.arange(num_station)),
            'lev_mid': ('lev_mid', np.arange(num_levels)),
            'lev_int': ('lev_int', np.arange(num_levels + 1)),
        }
    )

    # Dimension tuples shared by the variables below
    time_only = ('time',)
    time_station = ('time', 'station')
    time_station_levmid = ('time', 'station', 'lev_mid')
    time_station_levint = ('time', 'station', 'lev_int')

    # Data variables and their dimensions
    variable_specs = {
        # date
        'date': time_only,

        # time_station
        'latitude': time_station,
        'longitude': time_station,
        'zsfc': time_station,
        'psfc': time_station,
        'tsfc': time_station,
        'u10': time_station,
        'v10': time_station,
        't2': time_station,
        'q2': time_station,
        'hpbl': time_station,

        # time_station_levmid
        'u': time_station_levmid,
        'v': time_station_levmid,
        't': time_station_levmid,
        'q': time_station_levmid,
        'p': time_station_levmid,
        'omega': time_station_levmid,
        'dtdt': time_station_levmid,
        'dqdt': time_station_levmid,
        'sigl': time_station_levmid,

        # time_station_levint
        'sigi': time_station_levint,
        'ak5': time_station_levint,
        'bk5': time_station_levint,
    }

    # Add the data variables, NaN-filled until real data is written into them
    for var_name, dims in variable_specs.items():
        shape = tuple(ds.coords[dim].size for dim in dims)
        ds[var_name] = xr.DataArray(np.full(shape, np.nan), dims=dims)

    return ds

## process_files_in_directory

In [167]:
import os
def process_files_in_directory(directory='/lfs/home/yihsuanc/data/data.TaiESM1_scm/iop/VOCALS-REx/original/'):
    """Return the sorted paths of all entries in ``directory`` whose name
    starts with ``'vocalsgfs'``.

    Each returned path is ``directory`` joined with the entry name.
    """
    return sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.startswith('vocalsgfs')
    )

#file_names = process_files_in_directory()
#for ff in file_names:
#    print(ff)

# Examples 

## Read a single date, all 25 stations

In [174]:
# Read one date (all stations) and report the shape of every returned array
filename = '../original/vocalsgfs.2008100100'
data1 = read_data(filename)

variable_dimensions = {name: data1[name].shape for name in data1}
print(variable_dimensions)

{'date': (1,), 'station': (25,), 'latitude': (1, 25), 'longitude': (1, 25), 'zsfc': (1, 25), 'psfc': (1, 25), 'tsfc': (1, 25), 'u10': (1, 25), 'v10': (1, 25), 't2': (1, 25), 'q2': (1, 25), 'hpbl': (1, 25), 'sigi': (1, 65), 'ak5': (1, 65), 'bk5': (1, 65), 'u': (1, 25, 64), 'v': (1, 25, 64), 't': (1, 25, 64), 'q': (1, 25, 64), 'p': (1, 25, 64), 'omega': (1, 25, 64), 'dtdt': (1, 25, 64), 'dqdt': (1, 25, 64), 'sigl': (1, 25, 64)}


## Read all dates, all 25 stations

In [176]:
#--- get all vocalsgfs file paths (sorted, one file per date)
file_names = process_files_in_directory()

#--- Create an xarray dataset with exactly one time slot per file, so that
#--- extra files cannot overflow the time axis and no slot is left unfilled
ds_all = create_xarray_dataset(num_time_steps=len(file_names))

#--- read each file and copy its single time step into slot i of ds_all
for i, ff in enumerate(file_names):
    data = read_data(ff)

    for var_name, values in data.items():
        # 'station' is a coordinate of ds_all without a time axis: skip it
        if var_name == 'station' or var_name not in ds_all.variables:
            continue

        # values[0] is the file's only time step; assigning along the leading
        # axis covers the 1d, 2d and 3d variables alike
        ds_all[var_name][i] = values[0]

ds_all