# Combining forecast files along the lead time dimension

Note:

The following files are corrupted and need to be rerun:

```bash
/glade/campaign/cisl/aiml/gathered/2018-04-14T00Z.nc
/glade/campaign/cisl/aiml/gathered/2018-04-15T12Z.nc
/glade/campaign/cisl/aiml/gathered/2018-04-16T12Z.nc
/glade/campaign/cisl/aiml/gathered/2018-04-18T12Z.nc
/glade/campaign/cisl/aiml/gathered/2018-06-30T00Z.nc
/glade/campaign/cisl/aiml/gathered/2018-07-15T00Z.nc
/glade/campaign/cisl/aiml/gathered/2018-07-20T00Z.nc
```

In [1]:
import os
import sys
import yaml
from glob import glob
from datetime import datetime

import numpy as np
import xarray as xr

from concurrent.futures import ThreadPoolExecutor

In [2]:
# Put the sibling `../libs/` directory (resolved against the current working
# directory) at the front of the import path so the local helper module below
# can be imported. Presumably `verif_utils` lives in that directory -- confirm.
sys.path.insert(0, os.path.realpath('../libs/'))
import verif_utils as vu

In [3]:
# Absolute path of the verification config; a relative name is resolved against
# the current working directory, so run the notebook from its own folder.
config_name = os.path.realpath('verif_config.yml')

# Parse the YAML config with the safe loader (no arbitrary object construction).
with open(config_name, 'r') as stream:
    conf = yaml.safe_load(stream)

In [4]:
# Top-level key of `conf` whose settings (variables, rollout/gather paths) are used below.
model_name = 'wxformer'

In [5]:
def process_files_concurrently(base_dir, all_files_list, output_dir, variables_levels, time_intervals=None, max_workers=10):
    """
    Process file groups concurrently using ThreadPoolExecutor.

    Each element of ``all_files_list`` is submitted as one task that calls
    ``vu.process_file_group(file_list, output_dir, variables_levels, time_intervals)``.
    Every task is allowed to run to completion. If one or more tasks raised,
    each failure is reported on stderr together with the file group that
    caused it, and the first failure (in submission order) is then re-raised.

    Parameters
    ----------
    base_dir
        Not used by this function; kept so existing calls keep working.
    all_files_list
        Iterable of file groups; each group is passed unchanged as the first
        argument of ``vu.process_file_group``.
    output_dir
        Output location; passed to ``vu.create_dir`` and forwarded to every task.
    variables_levels
        Forwarded unchanged to every task.
    time_intervals
        Forwarded unchanged to every task (default ``None``).
    max_workers
        Maximum number of worker threads (default 10).

    Raises
    ------
    Exception
        The exception raised by the first failing task, in submission order.
    """
    # create dir if it does not exist
    vu.create_dir(output_dir)

    # Leaving the `with` block waits until every submitted task has finished.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (
                file_list,
                executor.submit(vu.process_file_group, file_list, output_dir, variables_levels, time_intervals),
            )
            for file_list in all_files_list
        ]

    # Collect every failure rather than stopping at the first, so a batch run
    # shows all groups that need to be rerun.
    failures = []
    for file_list, future in submitted:
        error = future.exception()
        if error is not None:
            failures.append((file_list, error))

    if failures:
        for file_list, error in failures:
            label = repr(file_list)
            if len(label) > 200:
                # keep the report readable when a group holds many files
                label = label[:200] + '...'
            print(f"Failed to process file group {label}: {error!r}", file=sys.stderr)
        # Same exception object the original `future.result()` loop would raise.
        raise failures[0][1]

## Selected variables and levels

In [7]:
# Verification variables for the selected model, read from the
# `verif_variables` entry of the config.
variables_levels = conf[model_name]['verif_variables']

In [8]:
# Input and output locations come from the model's config entries:
# `save_loc_rollout` is scanned for NetCDF files, `save_loc_gather` receives the output.
base_dir = conf[model_name]['save_loc_rollout']
output_dir = conf[model_name]['save_loc_gather']
# No time-interval selection is requested; None is forwarded to the processing helper as-is.
time_intervals = None

# Get list of NetCDF files
# NOTE(review): later cells index this result and pass each element as one file
# group -- presumably one group per initialization subdirectory; confirm in verif_utils.
all_files_list = vu.get_nc_files(base_dir)

## Scenario: combine nc files for a single initialization time

The netCDF time-coordinate encoding warning is not resolved, but it does not affect the verification results.

In [9]:
# Process only the first file group; it is wrapped in a list because the
# function expects a list of groups.
process_files_concurrently(base_dir, [all_files_list[0]], output_dir, variables_levels, time_intervals)

Processing subdirectory: 2018-01-01T00Z
Output name: /glade/campaign/cisl/aiml/ksha/CREDIT/gathered/2018-01-01T00Z.nc


  ds.to_netcdf(output_file)


In [10]:
# process_files_concurrently(base_dir, [all_files_list[1]], output_dir, variables_levels, time_intervals)

In [11]:
# process_files_concurrently(base_dir, [all_files_list[206]], output_dir, variables_levels, time_intervals)

In [12]:
# process_files_concurrently(base_dir, [all_files_list[209]], output_dir, variables_levels, time_intervals)

In [13]:
# process_files_concurrently(base_dir, [all_files_list[211]], output_dir, variables_levels, time_intervals)

In [14]:
# process_files_concurrently(base_dir, [all_files_list[215]], output_dir, variables_levels, time_intervals)

In [15]:
# process_files_concurrently(base_dir, [all_files_list[360]], output_dir, variables_levels, time_intervals)

In [16]:
# process_files_concurrently(base_dir, [all_files_list[390]], output_dir, variables_levels, time_intervals)

In [17]:
# process_files_concurrently(base_dir, [all_files_list[400]], output_dir, variables_levels, time_intervals)

## Scenario: combine on a range of initializations

In [18]:
# ind_start = 0
# ind_end = 3

# process_files_concurrently(base_dir, all_files_list[ind_start:ind_end], output_dir, variables_levels, time_intervals)