In [1]:
import xarray as xr
import numpy as np
import time as t
import pandas as pd
import glob, os
from collections import Counter
import traceback

#### Function to initialize a NaN-filled Dataset with all the relevant fields, dimensioned by occurrence (length given as input) and pressure_level

In [10]:
def create_dataset( n_occur, plev_file='/groups/sylvia/JAS-MCS-rain/ERA5/colloc5_2000.nc' ):
    """Return a Dataset of NaN-filled collocation fields for `n_occur` occurrences.

    Parameters
    ----------
    n_occur : int
        Length of the "occurrence" dimension.
    plev_file : str, optional
        NetCDF file from which the 'pressure_level' coordinate is read.
        Defaults to the path that was previously hard-coded here.

    Returns
    -------
    xr.Dataset
        Variables dimensioned either (occurrence,) or (occurrence, pressure_level),
        every value initialised to NaN. The "occurrence" coordinate is
        np.arange(n_occur); "pressure_level" is copied from `plev_file`.
    """
    # Load the coordinate into memory and close the file immediately: this function
    # is called repeatedly, and an unclosed handle per call adds up.
    with xr.open_dataset( plev_file ) as template:
        plev = template['pressure_level'].load()
    npl = len( plev.values )

    # Fields with one value per occurrence (order kept from the original definition)
    per_occurrence = [ "year", "month", "day", "hour", "core_lat", "core_lon",
                       "land_water_flag", "maxrad", "minctt", "lifetime", "rad",
                       "center_lat", "center_lon", "conv_fraction", "num_cores", "ctt",
                       "min latitude", "max latitude", "min longitude", "max longitude",
                       "cape_mean", "cape_99", "cape_core" ]
    # Fields with one profile (over pressure_level) per occurrence
    profile_prefixes = [ "qc", "qi", "qv", "temperature", "w" ]
    profile_suffixes = [ "mean", "99", "core" ]
    # Precipitation fields, one value per occurrence
    precip = [ "pmax", "pacc" ]

    # Every variable gets its own freshly allocated array so that later in-place
    # writes to one field cannot leak into another.
    data_vars = {}
    for name in per_occurrence:
        data_vars[name] = (["occurrence"], np.full(n_occur, np.nan))
    for prefix in profile_prefixes:
        for stat in profile_suffixes:
            data_vars[f"{prefix}_{stat}"] = (["occurrence","pressure_level"],
                                             np.full((n_occur, npl), np.nan))
    for name in precip:
        data_vars[name] = (["occurrence"], np.full(n_occur, np.nan))

    nfile_updated = xr.Dataset( data_vars=data_vars,
        coords={ "occurrence": np.arange(n_occur),
            "pressure_level": plev } )

    return nfile_updated

#### Local time versus UTC

In [10]:
# Load the tropical subsets of the two ISCCP arrays and report their shapes.
filtered_edges = np.load( 'CT-allDataFilteredEdges_tropical.npy' )
local_time = np.load( 'CT-allDataLocalTimes_tropical.npy' )

for loaded in ( filtered_edges, local_time ):
    print( loaded.shape )
print( '~~~~~~~~~~~~~~' )

# Column 10 is treated as the time column (presumably the hour -- TODO confirm).
# List every entry that is not a multiple of three.
times = filtered_edges[:,10]
off_grid = np.mod( times, 3 ) != 0
not_multiples_of_three = times[ off_grid ]
print( not_multiples_of_three )

(803075, 49)
(803075, 49)
~~~~~~~~~~~~~~
[]


#### This cell copies fields from the ERA-Interim collocation files into the ERA-5 collocation datasets (in memory only; nothing is written back to disk).

In [None]:
# Fields to take over from the ERA-Interim collocation
var = [ 'z', 'lifetime' ]
for i in np.arange( 1983, 1984):#2009 ):
    print( i )
    file1 = xr.open_dataset( f"/groups/sylvia/JAS-MCS-rain/ERAI/colloc_{i}.nc" )
    file2 = xr.open_dataset( f"/groups/sylvia/JAS-MCS-rain/ERA5/colloc5_{i}.nc" )
    # Attach the ERA-Interim fields to the in-memory ERA5 dataset (not saved to disk)
    file2 = file2.assign( { v: file1[v] for v in var } )
    print(file2)

#### This cell extracts only the ISCCP data for a tropical domain.

In [3]:
# Latitude bounds of the tropical band that is kept
lowlat, hilat = -10, 10

#ISCCP_data = np.load( "CT-allDataLocalTimes.npy" )
ISCCP_data = np.load( "CT-allDataFilteredEdges.npy" )

# Column 12 is used as the latitude of each record
CT_lat = ISCCP_data[:,12]
print( CT_lat.min(), CT_lat.max() )

# Keep only the rows inside [lowlat, hilat] (bounds inclusive) and write them out
in_band = np.logical_and( CT_lat >= lowlat, CT_lat <= hilat )
filtered_CT_data = ISCCP_data[in_band]
#np.save( "CT-allDataLocalTimes_tropical.npy", filtered_CT_data )
np.save( "CT-allDataFilteredEdges_tropical.npy", filtered_CT_data )

kept_lat = filtered_CT_data[:,12]
print( kept_lat.min(), kept_lat.max() )

-51.0 51.9
-10.0 10.0


#### Testing / debugging capeCollocate_ERA5.py

In [2]:
# Select the rows whose column 7 (used as the year) equals 1983 and print column 8
filtered_data = np.load( "CT-allDataLocalTimes_tropical.npy" )
years = filtered_data[:,7]
is_1983 = np.equal( years, 1983. )
data_1983 = filtered_data[is_1983]
print( data_1983[:,8] )

[ 8.  8.  8. ... 12. 12. 12.]


In [None]:
era_data = xr.open_dataset( '/xdisk/sylvia/ERA5_output/ERA5_temperature_tropical.nc' )
#era_data = xr.open_dataset( '/xdisk/sylvia/ERA5_output/ERA5_cape_tropical.nc' )
#era_data = era_data.assign_coords( longitude=((era_data['longitude'] + 360) % 360) )

# Round the time stamps to whole seconds. The lowercase "s" alias is used because
# pandas deprecated the uppercase "S" spelling of the second frequency.
time = era_data['valid_time'].dt.round("s")
longitude = era_data['longitude']
display( time )

In [None]:
# Print every longitude next to its value wrapped into [0, 360)
for l in longitude:
    raw_lon = l.values
    wrapped_lon = raw_lon%360
    print( raw_lon )
    print( wrapped_lon )
    print( '~~~~~~~~~~~~~~~~~~`' )

#### Combine nc files across months and variables to create a file for a single year

In [6]:
years = [ 2003 ]
months = np.arange( 1, 13, 1 )
# NOTE(review): `vars` is not used below and shadows the Python builtin of the same
# name; kept only in case another cell reads it.
vars = [ 'cape', 'qv', 'temperature', 'qc', 'qi', 'w' ] #'pmax', 'pacc'
input_directory = '/groups/sylvia/JAS-MCS-rain/ISCCP/output-2003/'
output_directory = '/groups/sylvia/JAS-MCS-rain/ERA5/'

# Rename fields according to what the other ipynbs expect
field_renames = { "min_lat": "min latitude", "max_lat": "max latitude",
                  "min_lon": "min longitude", "max_lon": "max longitude",
                  "max_radius": "maxrad", "min_temp": "minctt", "cs_radius": "rad",
                  "cs_temp": "ctt" }

for y in years:
    monthly_datasets = []
    opened_files = []   # every dataset opened for this year, closed once the year is written
    try:
        for m in months:
            print( m )
            formatted_month = f"{m:02d}"

            # Get all files matching the pattern for NZ files
            nz_files = sorted(glob.glob(os.path.join(input_directory, f"colloc_{y}{formatted_month}*_NZ*.nc")))

            if not nz_files:
                print(f"Warning: No NZ files found for year {y}, month {formatted_month}. Skipping.")
                continue

            # Open all files without concatenation yet (keeps all variables)
            datasets = [xr.open_dataset(f) for f in nz_files]
            opened_files.extend( datasets )
            merged_ds = xr.merge( datasets, compat='override' )

            monthly_datasets.append( merged_ds )

        # Without any monthly data there is nothing to rename or save. Skipping here
        # avoids a NameError (or silently re-saving a `final_ds` left over from an
        # earlier year or cell run).
        if not monthly_datasets:
            print(f"Warning: No NZ files found for any month of year {y}. Nothing saved.")
            continue

        final_ds = xr.concat( monthly_datasets, dim="occurrence" )
        final_ds = final_ds.rename( field_renames )

        # Save the merged dataset for the year
        output_file = os.path.join(output_directory, f"colloc5_{y}.nc")
        final_ds.to_netcdf(output_file)
        print(f"Saved: {output_file}")
    finally:
        # Release the input file handles whether or not the year was written
        for opened in opened_files:
            opened.close()

1
2
3
4
5
6
7
8
9
10
11
12
Saved: /groups/sylvia/JAS-MCS-rain/ERA5/colloc5_2003_NZ.nc


#### Match pmax / pacc data from ERAI collocation to the yearly ERA5 collocation files

In [40]:
# Directories, year and file suffix for the ERAI -> ERA5 precipitation matching
basedir1 = '/groups/sylvia/JAS-MCS-rain/ERA5/'
basedir2 = '/groups/sylvia/JAS-MCS-rain/ERAI/'
year = 2004
suffix = '_NZ' # ''

# efile: ERA-Interim collocation; nfile: ERA5 collocation without precipitation
efile = xr.open_dataset( f"{basedir2}colloc_{year}{suffix}.nc" )
nfile = xr.open_dataset( f"{basedir1}colloc5_{year}_noprecip.nc" )

In [41]:
# Restrict efile to the latitude band from -10 deg to 10 deg (bounds inclusive)
ll = efile['latitude']
within_band = (ll >= -10) & (ll <= 10)
efile = efile.where( within_band, drop=True )

In [42]:
# MCS properties whose combined values identify the same system in efile and nfile
match_fields = [ 'rad', 'ctt', 'maxrad', 'minctt', 'lifetime' ]

# Iterate over months in a given year to combine these in an nc file / yr
for month in np.arange( 1, 13 ):
    print( year )
    print( 'Month: ' + str(month) )
    formatted_month = f"{month:02d}"

    # Start every month with an empty list. Initialising it only once before the
    # month loop made each monthly file also contain all earlier months.
    monthly_data = []

    for day in np.arange( 1, 32 ):
        try:
            # Boolean masks selecting the month and day of interest in each file
            ma1 = (efile['month'] == month) & (efile['day'] == day)
            ma2 = (nfile['month'] == month) & (nfile['day'] == day)

            if not ma1.any().item() or not ma2.any().item():
               print( f"No data for month {month}, day {day}. Skipping." )
               continue

            # Mask the datasets using the generated masks
            filtered_efile = efile.where(ma1, drop=True)
            filtered_nfile = nfile.where(ma2, drop=True)

            # One tuple of MCS property values per occurrence, for efile and nfile
            vals_efile = list( zip( *(filtered_efile[f].values for f in match_fields) ) )
            vals_nfile = list( zip( *(filtered_nfile[f].values for f in match_fields) ) )

            # Map each property tuple that is unique within nfile to its occurrence label
            counts = Counter( vals_nfile )
            vals_nfile_dict = {mcs_properties: j
                               for j, mcs_properties in zip(filtered_nfile['occurrence'].values, vals_nfile)
                               if counts[mcs_properties] == 1}

            # Pairs of (position in filtered_efile, occurrence label in filtered_nfile)
            matches = [ (i, vals_nfile_dict[p]) for i, p in enumerate(vals_efile)
                        if p in vals_nfile_dict ]

            # Use the matched indices to fill a new dataset with the nfile fields plus
            # the matching 'pmax' and 'pacc' values from efile
            nfile_updated = create_dataset( len(matches) )
            for new_index, (efile_index, nfile_index) in enumerate(matches):
                # efile_index comes from enumerate(), i.e. it is a position, so it must
                # be used with isel(); nfile_index is an occurrence label, so sel().
                efile_row = filtered_efile.isel(occurrence=efile_index)
                nfile_row = filtered_nfile.sel(occurrence=nfile_index)

                for v in nfile_updated.data_vars:
                    if v not in ('pmax', 'pacc'):
                        nfile_updated[v].loc[dict(occurrence=new_index)] = nfile_row[v].values
                nfile_updated['pmax'].loc[dict(occurrence=new_index)] = efile_row['pmax'].values
                nfile_updated['pacc'].loc[dict(occurrence=new_index)] = efile_row['pacc'].values

        except Exception as e:
            # Report and move on to the next day. Nothing is appended for a failed day
            # (previously the preceding day's dataset was appended a second time).
            print( f"Error processing month {month}, day {day}: {e}" )
            traceback.print_exc()
        else:
            # Reached only when the day was processed completely (not skipped, no error)
            if day%10 == 0:
                print( 'Day: ' + str(day) + ' ' + str(len(matches)) )

            # Append daily dataset to the list
            monthly_data.append(nfile_updated)

    # After looping through all days, concatenate all daily datasets into one monthly dataset
    if monthly_data:
        final_nfile = xr.concat( monthly_data, dim="occurrence" )

        # Save the final dataset for the month
        # NOTE(review): written to the current working directory -- confirm this is
        # where the yearly-combine step looks for the monthly files.
        final_nfile.to_netcdf( f"colloc_{year}{formatted_month}{suffix}.nc" )
        print(f"Saved monthly dataset: colloc_{year}{formatted_month}{suffix}.nc")

2004
Month: 1
Day: 10 49
Day: 20 52
Day: 30 39
Saved monthly dataset: colloc_200401_NZ.nc
2004
Month: 2
Day: 10 63
Day: 20 48
No data for month 2, day 30. Skipping.
No data for month 2, day 31. Skipping.
Saved monthly dataset: colloc_200402_NZ.nc
2004
Month: 3
Day: 10 62
Day: 20 32
Day: 30 77
Saved monthly dataset: colloc_200403_NZ.nc
2004
Month: 4
Day: 10 56
Day: 20 81
Day: 30 79
No data for month 4, day 31. Skipping.
Saved monthly dataset: colloc_200404_NZ.nc
2004
Month: 5
Day: 10 35
Day: 20 25
Day: 30 47
Saved monthly dataset: colloc_200405_NZ.nc
2004
Month: 6
Day: 10 34
Day: 20 27
Day: 30 37
No data for month 6, day 31. Skipping.
Saved monthly dataset: colloc_200406_NZ.nc
2004
Month: 7
Day: 10 48
Day: 20 44
Day: 30 42
Saved monthly dataset: colloc_200407_NZ.nc
2004
Month: 8
Day: 10 29
Day: 20 26
Day: 30 18
Saved monthly dataset: colloc_200408_NZ.nc
2004
Month: 9
Day: 10 56
Day: 20 52
Day: 30 63
No data for month 9, day 31. Skipping.
Saved monthly dataset: colloc_200409_NZ.nc
2004
M

In [43]:
years = [ 2004 ]
months = np.arange( 1, 13, 1 )
input_directory = '/groups/sylvia/JAS-MCS-rain/ISCCP/'
output_directory = '/groups/sylvia/JAS-MCS-rain/ERA5/'
suffix = '_NZ' #''

for y in years:
    monthly_datasets = []
    try:
        for m in months:
            # Get the file for this month
            formatted_month = f"{m:02d}"
            dataset = xr.open_dataset(input_directory + f"colloc_{y}{formatted_month}{suffix}.nc")
            monthly_datasets.append( dataset )

        # Nothing to concatenate or save if no month was loaded. Skipping avoids a
        # NameError or re-saving a `final_ds` left over from an earlier run.
        if not monthly_datasets:
            print(f"Warning: no monthly files loaded for year {y}. Nothing saved.")
            continue

        final_ds = xr.concat( monthly_datasets, dim="occurrence" )

        # Save the merged dataset for the year
        output_file = os.path.join(output_directory, f"colloc5_{y}{suffix}.nc")
        final_ds.to_netcdf(output_file)
        print(f"Saved: {output_file}")
    finally:
        # Release the monthly file handles whether or not the year was written
        for opened in monthly_datasets:
            opened.close()

Saved: /groups/sylvia/JAS-MCS-rain/ERA5/colloc5_2004_NZ.nc


#### Count the total MCSs in the ERA5 collocation

In [4]:
# Sum the length of the 'occurrence' dimension over the twelve monthly files of 2000
count = 0
#basedir = '/groups/sylvia/JAS-MCS-rain/ERA5/'
basedir = '/groups/sylvia/JAS-MCS-rain/ISCCP/'
for i in np.arange( 1, 13 ):
    formatted_month = f"{i:02d}"
    filename = basedir + 'colloc_2000' + formatted_month + '.nc'
    # Only the dimension size is needed, so close each file as soon as it is read
    with xr.open_dataset( filename ) as ds:
        count = count + ds.sizes['occurrence']

print( count )

190303


#### Look at nc file values

In [7]:
# Show the copy under /xdisk next to the copy in the working directory
xdisk_copy = "/xdisk/sylvia/ERA5_output/colloc_2000_qv_NZ_core.nc"
local_copy = "colloc_2000_qv_NZ_core.nc"

file = xr.open_dataset( xdisk_copy )
display( file )
file2 = xr.open_dataset( local_copy )
display( file2 )

#### Look at csv file values

In [43]:
# Print the qv_mean column of the January statistics file for the chosen year
year = 2000
df = pd.read_csv( f"output/CT_qv_statistics_{year}_01.csv" )
qv_mean_column = df['qv_mean']
print( qv_mean_column )

0       0.007076
1            NaN
2       0.007436
3       0.007021
4       0.006983
          ...   
2868    0.006947
2869    0.006802
2870    0.007184
2871    0.006706
2872    0.006745
Name: qv_mean, Length: 2873, dtype: float64


#### Combine csv files one with another AND THEN with MCS precip values

In [34]:
# Directory holding the per-variable monthly statistics csv files
basedir1 = '/groups/sylvia/JAS-MCS-rain/ISCCP/output/'
#var_list = [ 'qv', 'temperature', 'qc', 'qi', 'w' ]
var_list = [ 'temperature', 'qc', 'qi' ]
year = 2000

# Variables whose *_mean and *_99 columns are copied into the qv table, in the
# order the columns are added ('w' is currently left out).
merged_vars = [ 'temperature', 'qc', 'qi', 'cape' ]
copied_stats = [ 'mean', '99' ]

for month in np.arange( 1, 13 ):
    formatted_month = f"{month:02d}"

    # The qv table is the base; the other variables contribute columns to it
    df1 = pd.read_csv( basedir1 + f"CT_qv_statistics_{year}_{formatted_month}.csv" )
    per_var = { name: pd.read_csv( basedir1 + f"CT_{name}_statistics_{year}_{formatted_month}.csv" )
                for name in merged_vars }

    for name in merged_vars:
        for stat in copied_stats:
            column = f"{name}_{stat}"
            df1[column] = per_var[name][column]

    # Written to the current working directory
    df1.to_csv( f"CT_statistics_{year}_{formatted_month}.csv" )

    # Row counts of the qv, temperature, qc and qi tables for a visual consistency check
    print( len(df1), len(per_var['temperature']), len(per_var['qc']), len(per_var['qi']) )
    

2873 2873 2873 2873
2694 2694 2694 2694
3523 3523 3523 3523
4196 4196 4196 4196
3849 3849 3849 3849
2591 2591 2591 2591
2837 2837 2837 2837
2544 2544 2544 2544
2846 2846 2846 2846
3492 3492 3492 3492
3272 3272 3272 3272
2531 2531 2531 2531


In [40]:
# Some initial setup and specification of values here
# NOTE: `year` is taken from an earlier cell.
basedir1 = '/groups/sylvia/JAS-MCS-rain/ISCCP/output/'
basedir2 = '/groups/sylvia/JAS-MCS-rain/ERAI/'
efile = xr.open_dataset( basedir2 + 'colloc_' + str(year) + '_NZ.nc' )

# MCS properties used for matching: names in efile and the corresponding csv columns
vars_to_collocate = [ 'day', 'rad', 'ctt', 'lifetime', 'minctt', 'maxrad' ]
csv_columns = [ 'day', 'cs_radius', 'cs_temp', 'lifetime', 'min_temp', 'max_radius' ]
# Two rows match when all properties agree to within this absolute tolerance
tolerance = 1e-6

# Read everything needed from efile once, outside the month loop
efile_month = efile['month'].values
efile_props = np.column_stack( [ efile[v].values for v in vars_to_collocate ] ).astype(np.float64)
efile_pacc = efile['pacc'].values
efile_pmax = efile['pmax'].values

# Collect the per-month dataframes and concatenate once at the end
monthly_frames = []

# Iterate over months in a given year to combine these in an nc file / yr
m = 0   # running total of matched rows
for month in np.arange( 1, 13 ):
    formatted_month = f"{month:02d}"

    # Positions (in the full efile arrays) of the rows belonging to this month
    efile_rows = np.flatnonzero( efile_month == month )
    ecolloc_mcs_vals = efile_props[efile_rows]

    filename = "CT_statistics_" + str(year) + "_" + formatted_month + ".csv"
    df = pd.read_csv(basedir1 + filename)

    # Combine MCS properties from the csv file
    ncolloc_mcs_vals = df[csv_columns].to_numpy().astype(np.float64)

    # Perform row-wise matching: entry [i, j] is True when month-row i of efile and
    # csv row j agree in every property
    match_matrix = np.all(np.abs(ecolloc_mcs_vals[:, None, :] - ncolloc_mcs_vals[None, :, :]) < tolerance, axis=2)

    # Two-column array of (index into ecolloc_mcs_vals, index into ncolloc_mcs_vals)
    matching_indices = np.array(np.where(match_matrix)).T
    m = m + matching_indices.shape[0]

    # Retain only the csv rows that found a match
    reduced_df = df.iloc[matching_indices[:, 1]].copy()

    # matching_indices[:, 0] counts within this month's subset, so translate it back
    # to positions in the full efile arrays before reading pacc / pmax. Indexing the
    # full arrays with the subset index directly picks values from the wrong rows.
    matched_rows = efile_rows[matching_indices[:, 0]]
    reduced_df['pacc'] = efile_pacc[matched_rows]
    reduced_df['pmax'] = efile_pmax[matched_rows]

    monthly_frames.append( reduced_df )

# Build the yearly dataframe in one step
yearly_reduced_df = pd.concat(monthly_frames, ignore_index=True)

# Print yearly dataframe shape
print("Shape of yearly reduced dataframe:", yearly_reduced_df.shape)

# Convert yearly dataframe to xarray Dataset
yearly_ds = yearly_reduced_df.to_xarray()

# Save the yearly dataset to a NetCDF file
output_filename = f"colloc5_{year}_NZ.nc"
yearly_ds.to_netcdf(basedir1 + output_filename)
print( m )

Shape of yearly reduced dataframe: (17818, 33)
17818
