In [2]:
# import necessary packages 

import xarray as xr
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

In [8]:
# Base directory that holds the raw ERA5 GRIB files and receives the
# daily-mean NetCDF outputs. Every path in this notebook is built from it.
# !! To point the notebook somewhere else, either set the ERA5_DATA_DIR
# !! environment variable before starting the kernel or edit the default below.
# An unset or empty ERA5_DATA_DIR falls back to the original scratch location.
cd = os.environ.get("ERA5_DATA_DIR") or "/global/scratch/users/yougsanghvi"

In [17]:
# Gather the raw yearly ERA5 GRIB files (era5_data_<year>.grib) that sit in
# the base directory and print them ordered by year.

files = [
    name
    for name in os.listdir(cd)
    if name.startswith("era5_data_") and name.endswith(".grib")
]

# The year is the third underscore-separated field with the extension removed;
# sort numerically on it rather than alphabetically on the whole name.
files.sort(key=lambda name: int(name.split('_')[2].partition('.')[0]))

print(files)


['era5_data_1979.grib', 'era5_data_1980.grib', 'era5_data_1981.grib', 'era5_data_1982.grib', 'era5_data_1983.grib', 'era5_data_1984.grib', 'era5_data_1985.grib', 'era5_data_1986.grib', 'era5_data_1987.grib', 'era5_data_1988.grib', 'era5_data_1989.grib', 'era5_data_1990.grib', 'era5_data_1991.grib', 'era5_data_1992.grib', 'era5_data_1993.grib', 'era5_data_1994.grib', 'era5_data_1995.grib', 'era5_data_1996.grib', 'era5_data_1997.grib', 'era5_data_1998.grib', 'era5_data_1999.grib', 'era5_data_2000.grib', 'era5_data_2001.grib', 'era5_data_2002.grib', 'era5_data_2003.grib', 'era5_data_2004.grib', 'era5_data_2005.grib', 'era5_data_2006.grib', 'era5_data_2007.grib', 'era5_data_2008.grib', 'era5_data_2009.grib', 'era5_data_2010.grib', 'era5_data_2011.grib', 'era5_data_2012.grib', 'era5_data_2013.grib', 'era5_data_2014.grib', 'era5_data_2015.grib', 'era5_data_2016.grib', 'era5_data_2017.grib', 'era5_data_2018.grib', 'era5_data_2019.grib']


In [5]:
# Summarize hourly data into daily data through daily averages.
#
# For each year, era5_data_<year>.grib is read from the base directory,
# resampled along `time` into 1-day bins using the mean, and written to
# era5_daily_mean_<year>.nc. Years whose output already exists are skipped,
# so the cell can be re-run safely after an interruption.

years = range(2015, 2021)  # !! adjust range if needed

for year in years:
    input_path = f"{cd}/era5_data_{year}.grib"
    output_path = f"{cd}/era5_daily_mean_{year}.nc"
    # The result is first written under a temporary name and only renamed to
    # output_path once the write has finished. Otherwise a crash or a killed
    # job would leave a truncated .nc file that the existence check below
    # would treat as "already summarized" on every later run.
    partial_path = f"{output_path}.tmp"

    # Skip if output already exists
    if os.path.exists(output_path):
        print(f"Skipping {year} (already summarized)")
        continue

    print(f"Processing year {year}...")

    # The context manager closes the GRIB file when the block exits, so file
    # handles do not accumulate across iterations of the loop.
    with xr.open_dataset(
        input_path,
        engine="cfgrib",
        backend_kwargs={"indexpath": ""},
        chunks={"time": 24},
    ) as ds:
        daily_ds = ds.resample(time="1D").mean()
        try:
            # The source must still be open here: with `chunks` set, the data
            # are read lazily while the output is being written.
            daily_ds.to_netcdf(partial_path)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the incomplete file, then
            # let the original error propagate unchanged.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Same-directory rename: the finished file appears under its final name
    # in a single step.
    os.replace(partial_path, output_path)

    print(f"Saved daily mean to {output_path}")

Skipping 2015 (already summarized)
Processing year 2016...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Saved daily mean to /global/scratch/users/yougsanghvi/era5_daily_mean_2016.nc
Processing year 2017...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Saved daily mean to /global/scratch/users/yougsanghvi/era5_daily_mean_2017.nc
Skipping 2018 (already summarized)
Skipping 2019 (already summarized)
Processing year 2020...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Saved daily mean to /global/scratch/users/yougsanghvi/era5_daily_mean_2020.nc


In [18]:
# Verify the summarization step: collect the daily-mean NetCDF outputs
# (era5_daily_mean_<year>.nc) present in the base directory and print them
# ordered by year.

files = [
    name
    for name in os.listdir(cd)
    if name.startswith("era5_daily_mean") and name.endswith(".nc")
]

# The year is the fourth underscore-separated field with the extension removed.
files.sort(key=lambda name: int(name.split('_')[3].partition('.')[0]))

print(files)


['era5_daily_mean_1979.nc', 'era5_daily_mean_1980.nc', 'era5_daily_mean_1981.nc', 'era5_daily_mean_1982.nc', 'era5_daily_mean_1983.nc', 'era5_daily_mean_1984.nc', 'era5_daily_mean_1985.nc', 'era5_daily_mean_1986.nc', 'era5_daily_mean_1987.nc', 'era5_daily_mean_1988.nc', 'era5_daily_mean_1989.nc', 'era5_daily_mean_1990.nc', 'era5_daily_mean_1991.nc', 'era5_daily_mean_1992.nc', 'era5_daily_mean_1993.nc', 'era5_daily_mean_1994.nc', 'era5_daily_mean_1995.nc', 'era5_daily_mean_1996.nc', 'era5_daily_mean_1997.nc', 'era5_daily_mean_1998.nc', 'era5_daily_mean_1999.nc', 'era5_daily_mean_2000.nc', 'era5_daily_mean_2001.nc', 'era5_daily_mean_2002.nc', 'era5_daily_mean_2003.nc', 'era5_daily_mean_2004.nc', 'era5_daily_mean_2005.nc', 'era5_daily_mean_2006.nc', 'era5_daily_mean_2007.nc', 'era5_daily_mean_2008.nc', 'era5_daily_mean_2009.nc', 'era5_daily_mean_2010.nc', 'era5_daily_mean_2011.nc', 'era5_daily_mean_2012.nc', 'era5_daily_mean_2013.nc', 'era5_daily_mean_2014.nc', 'era5_daily_mean_2015.nc']


In [None]:
# view temperature data as a table to verify
# NOTE: this cell is currently disabled. Everything between the two """ lines
# below is a single string literal, so none of it runs. To run the check,
# delete the opening and closing """ lines.
# Once enabled, the code opens the 1979 daily-mean file, selects the time
# slice "1979-09-02" to "1979-09-03", converts the 't2m' variable to a
# DataFrame and prints its first 10 rows.

"""
# Load dataset
file_1979 = f"{cd}/era5_daily_mean_1979.nc"
ds = xr.open_dataset(file_1979)

# print all dates on dataset to confirm
# !! remove commentif needed
# print(ds.time.values)

# Select September 2, 1979
selected_time = ds.sel(time=slice("1979-09-02", "1979-09-03"))

# Extract the temperature variable (adjust name if needed)
temp = selected_time['t2m']

# Convert to DataFrame for tabular view
temp_df = temp.to_dataframe().reset_index()

# Show first 10 rows of the table
print(temp_df.head(10))
"""



  ds = xr.open_dataset(file_1979)


['1979-01-01T00:00:00.000000000' '1979-01-02T00:00:00.000000000'
 '1979-01-03T00:00:00.000000000' '1979-01-04T00:00:00.000000000'
 '1979-01-05T00:00:00.000000000' '1979-01-06T00:00:00.000000000'
 '1979-01-07T00:00:00.000000000' '1979-01-08T00:00:00.000000000'
 '1979-01-09T00:00:00.000000000' '1979-01-10T00:00:00.000000000'
 '1979-01-11T00:00:00.000000000' '1979-01-12T00:00:00.000000000'
 '1979-01-13T00:00:00.000000000' '1979-01-14T00:00:00.000000000'
 '1979-01-15T00:00:00.000000000' '1979-01-16T00:00:00.000000000'
 '1979-01-17T00:00:00.000000000' '1979-01-18T00:00:00.000000000'
 '1979-01-19T00:00:00.000000000' '1979-01-20T00:00:00.000000000'
 '1979-01-21T00:00:00.000000000' '1979-01-22T00:00:00.000000000'
 '1979-01-23T00:00:00.000000000' '1979-01-24T00:00:00.000000000'
 '1979-01-25T00:00:00.000000000' '1979-01-26T00:00:00.000000000'
 '1979-01-27T00:00:00.000000000' '1979-01-28T00:00:00.000000000'
 '1979-01-29T00:00:00.000000000' '1979-01-30T00:00:00.000000000'
 '1979-01-31T00:00:00.000

In [None]:

# Inspect one summarized year (2019): open its daily-mean file and print the
# dataset summary, its data variables, its coordinates and the 't2m' variable.
file_path = os.path.join(cd, "era5_daily_mean_2019.nc")
ds = xr.open_dataset(file_path)

# Each entry pairs a heading with a function that picks the part of the
# dataset to show. The lookups run one at a time inside the loop, so each
# heading is printed before its own lookup is attempted.
views = [
    ("basic info:", lambda data: data),
    ("variables:", lambda data: data.data_vars),
    ("coordinates:", lambda data: data.coords),
    ("inspect t2m", lambda data: data['t2m']),
]

for heading, pick in views:
    print(heading)
    print(pick(ds))

basic info:
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 365, latitude: 721, longitude: 1440)
Coordinates:
    number     int64 8B ...
    step       timedelta64[ns] 8B ...
    surface    float64 8B ...
  * latitude   (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
  * time       (time) datetime64[ns] 3kB 2019-01-01 2019-01-02 ... 2019-12-31
Data variables:
    t2m        (time, latitude, longitude) float32 2GB ...
Attributes:
    GRIB_edition:            1
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-06-06T11:24 GRIB to CDM+CF via cfgrib-0.9.1...
variables:
Data variables:
    t2m      (time, latitude, longitude) float32 2GB ...
coordinates:
C

  ds = xr.open_dataset(file_path)
