In [1]:
import xarray as xr

In [2]:
from luts import *

In [3]:
fp = "/beegfs/CMIP6/jdpaul3/cmip6_regrid_for_rasdaman/cmip6_regrid_mon_ensemble.nc"

ds = xr.open_dataset(fp)

ds

In [4]:
# for the first timestep and scenario, check that the ensemble mean (model 14) is actually the mean of models 1-13
ds["pr"].isel(lat=0, lon=0, time=0, scenario=0, model=14).values

array(9.77851533)

In [6]:
ds["pr"].isel(
    lat=0, lon=0, time=0, scenario=0, model=slice(0, 14)
).values.mean()

9.778515332462641

In [1]:
import xarray as xr
from pathlib import Path

In [2]:
cmip6_dir = '/beegfs/CMIP6/jdpaul3/CMIP6_common_regrid/regrid'

# directory structure is: <model>/<scenario>/<frequency>/<variable ID>/<filename>
# create a list with one file from each model + scenario + frequency + variable combination

def get_files(cmip6_dir):
    files = []
    for model in Path(cmip6_dir).iterdir():
        for scenario in model.iterdir():
            for frequency in scenario.iterdir():
                for variable in frequency.iterdir():
                    for file in variable.iterdir():
                        files.append(file)
    return files
files = get_files(cmip6_dir)

# create a list of tuples with (model, scenario, frequency, variable, file)
def get_file_tuples(files):
    file_tuples = []
    for file in files:
        model = file.parts[-4]
        scenario = file.parts[-3]
        frequency = file.parts[-2]
        variable = file.parts[-1]
        file_tuples.append((model, scenario, frequency, variable, file))
    return file_tuples
file_tuples = get_file_tuples(files)

# create a list of tuples with (model, scenario, frequency, variable, file) for the first file in each combination
def get_first_file_tuples(file_tuples):
    first_file_tuples = []
    seen = set()
    for file_tuple in file_tuples:
        model = file_tuple[0]
        scenario = file_tuple[1]
        frequency = file_tuple[2]
        variable = file_tuple[3]
        if (model, scenario, frequency, variable) not in seen:
            seen.add((model, scenario, frequency, variable))
            first_file_tuples.append(file_tuple)
    return first_file_tuples

first_file_tuples = get_first_file_tuples(file_tuples)


In [5]:
# extract the paths only into a new list
fps = []
for file_tuple in first_file_tuples:
    fps.append(file_tuple[4])

In [7]:
len(fps)

152861

In [None]:
import xarray as xr
from collections import Counter

# Check the time dimension for each file and collect problematic files
problem_files = []
time_shapes = []
for fp in fps:
    try:
        with xr.open_dataset(fp) as ds:
            time_shape = ds['time'].shape
            time_shapes.append((fp, time_shape))
    except Exception as e:
        problem_files.append((fp, str(e)))

# Find unique time shapes
shape_counts = Counter([shape for _, shape in time_shapes])
print("Unique time dimension shapes and their counts:")
for shape, count in shape_counts.items():
    print(f"{shape}: {count}")

# List files with uncommon time shapes
common_shape = shape_counts.most_common(1)[0][0]
uncommon_files = [fp for fp, shape in time_shapes if shape != common_shape]

print("\nFiles with uncommon time dimension shapes:")
for fp in uncommon_files:
    print(fp)

print("\nFiles that could not be opened:")
for fp, err in problem_files:
    print(f"{fp}: {err}")