In [6]:
import os
import glob

# Collect the path of every .nc file directly inside data/weather/cdfs.
nc_files = glob.glob('data/weather/cdfs/*.nc')

# Report how many paths matched the pattern.
print(f"Found {len(nc_files)} .nc files in data/weather/cdfs")

# Reduce each path to its bare stem (directory and extension removed) and
# order the stems ascending. This is a plain string sort, so it agrees with
# numeric order only while all stems have the same number of digits.
nc_files = sorted(
    os.path.splitext(os.path.basename(path))[0]
    for path in nc_files
)

# Preview at most the first five stems.
print("First few files:")
preview = nc_files[:5]
for stem in preview:
    print(f"  - {stem}")



Found 2649 .nc files in data/weather/cdfs
First few files:
  - 1672790400
  - 1672801200
  - 1672812000
  - 1672822800
  - 1672833600


In [10]:
def check_weather_data(file_list, expected_interval=10800):
    """
    Check that consecutive timestamps in file_list are evenly spaced.

    Every entry is converted with int(); an entry that cannot be converted
    is reported with a warning and left out of the comparison. The remaining
    entries are compared pairwise in the order given, so file_list is
    expected to be sorted ascending already (this function does not sort).

    Parameters
    ----------
    file_list : list of str
        Names that consist solely of an integer timestamp,
        e.g. '1672790400'.
    expected_interval : int, optional
        Required difference between neighbouring timestamps, in seconds.
        Defaults to 10800 (3 hours).

    Returns
    -------
    list of dict
        One dict per pair of neighbouring parsed entries whose difference
        is not expected_interval, with the keys 'position' (index in
        file_list of the later entry), 'file1', 'file2', 'timestamp1',
        'timestamp2', 'difference' and 'expected'. Empty when every pair
        matches, or when fewer than two entries could be parsed.
    """
    # Keep each parsed timestamp together with its index and name in
    # file_list, so that a skipped entry cannot shift which files a
    # discrepancy is attributed to.
    parsed = []
    for index, file in enumerate(file_list):
        try:
            parsed.append((index, file, int(file)))
        except ValueError:
            print(f"Warning: Could not extract timestamp from {file}")

    # Compare every parsed entry with the parsed entry that precedes it.
    discrepancies = []
    for (_, prev_file, prev_ts), (index, cur_file, cur_ts) in zip(parsed, parsed[1:]):
        diff = cur_ts - prev_ts
        if diff != expected_interval:
            discrepancies.append({
                'position': index,
                'file1': prev_file,
                'file2': cur_file,
                'timestamp1': prev_ts,
                'timestamp2': cur_ts,
                'difference': diff,
                'expected': expected_interval
            })

    # Print summary of discrepancies
    if discrepancies:
        print(f"Found {len(discrepancies)} discrepancies in the data:")
        for d in discrepancies:
            print(f"  Position {d['position']}: {d['file1']} → {d['file2']}, diff = {d['difference']} (expected {d['expected']})")
    else:
        print(f"No discrepancies found. All files are {expected_interval} seconds apart.")

    return discrepancies

# Run the spacing check on the file stems. check_weather_data prints its own
# summary (the count followed by one line per gap), so no additional header
# line is printed here.
discrepancies = check_weather_data(nc_files)


Found 5 discrepancies in the data:
  Position 2282: 1697425200 → 1697446800, diff = 21600 (expected 10800)
  Position 2385: 1698548400 → 1698570000, diff = 21600 (expected 10800)
  Position 2414: 1698872400 → 1698894000, diff = 21600 (expected 10800)
  Position 2447: 1699239600 → 1699272000, diff = 32400 (expected 10800)
  Position 2451: 1699304400 → 1699336800, diff = 32400 (expected 10800)
Found 5 discrepancies in the data:
