In [12]:
import xarray as xr
from pathlib import Path
import pandas as pd


In [2]:
timefix_dir = Path("/beegfs/CMIP6/jdpaul3/cmip6_regrid_timefix")

In [10]:
variables = ["pr", "tasmax", "tasmin"]

models_to_inspect = [
    'CESM2',
    'CNRM-CM6-1-HR',
    'E3SM-2-0',
    'EC-Earth3-Veg',
    'GFDL-ESM4',
    'HadGEM3-GC31-LL',
    'HadGEM3-GC31-MM',
    'KACE-1-0-G',
    'MIROC6',
    'MPI-ESM1-2-HR',
    'MRI-ESM2-0',
    'NorESM2-MM',
    'TaiESM1',
]

In [9]:
files = list(Path(timefix_dir).rglob("*day_*.nc"))

In [20]:
# using just the filenames, audit the files and create a table of the models, variables, scenarios, time range, and count of files
# filename format: <variable>_day_<model>_<scenario>_regrid_<start_date>-<end_date>.nc
# date format: YYYYMMDD
data = []

for file in files:
    filename = file.name
    parts = filename.split('_')
    
    if len(parts) < 6:
        continue  # skip files that do not match the expected format
    
    variable = parts[0]
    model = parts[2]
    scenario = parts[3]
    start_date = parts[5].split('-')[0]  # YYYYMMDD
    end_date = parts[5].split('-')[1].replace('.nc', '')  # YYYYMMDD
    
    if variable not in variables or model not in models_to_inspect:
        continue  # skip files that are not of interest
    
    data.append({
        'variable': variable,
        'model': model,
        'scenario': scenario,
        'start_date': start_date,
        'end_date': end_date,
        'file_path': str(file)
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)
# Convert date strings to datetime objects for better handling
df['start_date'] = pd.to_datetime(df['start_date'], format='%Y%m%d')
df['end_date'] = pd.to_datetime(df['end_date'], format='%Y%m%d')


In [54]:
# check each row if any dates not start with jan 1 or end with december 31
def check_date_range(row):
    return row['start_date'].month == 1 and row['start_date'].day == 1 and \
           row['end_date'].month == 12 and row['end_date'].day == 31    

# Apply the check to the DataFrame
df['valid_date_range'] = df.apply(check_date_range, axis=1)
# Filter the DataFrame to show only rows with invalid date ranges
invalid_date_ranges = df[~df['valid_date_range']]
# Print the DataFrame with invalid date ranges
if not invalid_date_ranges.empty:
    print("Files with invalid date ranges:")
    print(invalid_date_ranges[['variable', 'model', 'scenario', 'start_date', 'end_date', 'file_path']])


In [25]:
# check each row if scenario = 'historical' and start_date != 19500101 or end_date != 20141231
def check_historical_dates(row):
    if row['scenario'] == 'historical':
        return not (row['start_date'] == pd.Timestamp('1950-01-01') and 
                    row['end_date'] == pd.Timestamp('2014-12-31'))
    return True  # For non-historical scenarios, we consider the dates valid
# Apply the check to the DataFrame
df['valid_historical_dates'] = df.apply(check_historical_dates, axis=1)
# Filter the DataFrame to show only rows with invalid historical dates
invalid_historical_dates = df[~df['valid_historical_dates']]
# Print the DataFrame with invalid historical dates
if not invalid_historical_dates.empty:
    print("Files with invalid historical dates:")
    print(invalid_historical_dates[['variable', 'model', 'scenario', 'start_date', 'end_date', 'file_path']])

In [58]:
# check each row if scenario != 'historical' and start_date != 20150101 or end_date != 21001231
def check_future_dates(row):
    if row['scenario'] != 'historical':
        return not (row['start_date'] == pd.Timestamp('2015-01-01') and 
                    row['end_date'] == pd.Timestamp('2100-12-31'))
    return True  # For historical scenarios, we consider the dates valid
# Apply the check to the DataFrame
df['valid_future_dates'] = df.apply(check_future_dates, axis=1)
# Filter the DataFrame to show only rows with invalid future dates
invalid_future_dates = df[~df['valid_future_dates']]
# Print the DataFrame with invalid future dates
if not invalid_future_dates.empty:
    print("Files with invalid future dates:")
    print(invalid_future_dates[['variable', 'model', 'scenario', 'start_date', 'end_date', 'file_path']])

In [28]:
# group by model and scenario, and list the variables and their counts
grouped = df.groupby(['model', 'scenario', 'variable']).size().reset_index(name='count')
# Print the grouped DataFrame
print("\nGrouped DataFrame by model, scenario, and variable:")
print(grouped)


Grouped DataFrame by model, scenario, and variable:
       model    scenario variable  count
0      CESM2  historical       pr     65
1      CESM2      ssp126       pr     86
2      CESM2      ssp126   tasmax     86
3      CESM2      ssp126   tasmin     86
4      CESM2      ssp245       pr     86
..       ...         ...      ...    ...
164  TaiESM1      ssp370   tasmax     86
165  TaiESM1      ssp370   tasmin     86
166  TaiESM1      ssp585       pr     86
167  TaiESM1      ssp585   tasmax     86
168  TaiESM1      ssp585   tasmin     86

[169 rows x 4 columns]


In [61]:
# sort the grouped dataframe by count low to high
grouped_sorted = grouped.sort_values(by='count', ascending=True)
grouped_sorted

Unnamed: 0,model,scenario,variable,count
32,EC-Earth3-Veg,ssp126,tasmax,1
35,EC-Earth3-Veg,ssp245,tasmax,14
34,EC-Earth3-Veg,ssp245,pr,14
36,EC-Earth3-Veg,ssp245,tasmin,46
0,CESM2,historical,pr,65
...,...,...,...,...
9,CESM2,ssp370,tasmin,86
10,CESM2,ssp585,pr,86
11,CESM2,ssp585,tasmax,86
12,CESM2,ssp585,tasmin,86


In [95]:
for model in models_to_inspect:
    df[df["model"] == model].sort_values(by=['scenario', 'variable', 'start_date']).to_csv(f"{model}_timefix_audit.csv", index=False)


In [88]:
# for each model, load the first file encountered in the list 
datasets = []
for model in models_to_inspect:
    model_files = [file for file in files if model in file.name]
    if model_files:
        ds = xr.open_dataset(model_files[0], decode_cf=True, drop_variables=['height', 'spatial_ref', 'type'])
        datasets.append((model, ds))

In [89]:
for dataset in datasets:
    print(dataset[0])
    print(dataset[1].coords)
    print("\n")

CESM2
Coordinates:
  * lat      (lat) float64 344B 90.0 89.06 88.12 87.17 ... 52.3 51.36 50.42
  * lon      (lon) float64 2kB -180.0 -178.8 -177.5 -176.2 ... 176.2 177.5 178.8
  * time     (time) object 3kB 2083-01-01 12:00:00 ... 2083-12-31 12:00:00


CNRM-CM6-1-HR
Coordinates:
  * time     (time) object 3kB 1966-01-01 12:00:00 ... 1966-12-31 12:00:00
  * lat      (lat) float64 344B 90.0 89.06 88.12 87.17 ... 52.3 51.36 50.42
  * lon      (lon) float64 2kB -180.0 -178.8 -177.5 -176.2 ... 176.2 177.5 178.8


E3SM-2-0
Coordinates:
  * time     (time) object 3kB 1959-01-01 12:00:00 ... 1959-12-31 12:00:00
  * lat      (lat) float64 344B 90.0 89.06 88.12 87.17 ... 52.3 51.36 50.42
  * lon      (lon) float64 2kB -180.0 -178.8 -177.5 -176.2 ... 176.2 177.5 178.8


EC-Earth3-Veg
Coordinates:
  * time     (time) object 3kB 2054-01-01 12:00:00 ... 2054-12-31 12:00:00
  * lat      (lat) float64 344B 90.0 89.06 88.12 87.17 ... 52.3 51.36 50.42
  * lon      (lon) float64 2kB -180.0 -178.8 -177.5 