In [None]:
# Import required packages
import iris
import numpy as np
import pandas as pd
import os
import tobac
import matplotlib.pyplot as plt
from glob import glob
import netCDF4 as nc 
import warnings
import xarray as xr
import seaborn as sns
# Ignore some warnings and append them to the existing filter list
warnings.filterwarnings('ignore', category=UserWarning, append=True)
warnings.filterwarnings('ignore', category=RuntimeWarning, append=True)
warnings.filterwarnings('ignore', category=FutureWarning, append=True)
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)

In [None]:
# Import the third-party 'caffeine' module, used to prevent the system from going to sleep
# or the screen from turning off while the long-running cells of this notebook execute.
# NOTE(review): the import lives in this cell rather than with the other imports —
# presumably because the package is only available on some machines; confirm.
import caffeine
# Turn on the caffeine mode with the display option set to True
caffeine.on(display=True)

In [None]:
# Creating a function for initial processing of an ISCCP HXG file
def transforming_nc(datafile):
    '''
    Convert the raw counts of one ISCCP HXG NetCDF file into physical values.

    The 'irad' and 'vtauic' count arrays are used as indices into the file's
    'tmptab' and 'tautab' conversion tables. The converted arrays are stored in
    the dataset as 'Tb' and 'Tau' on the ("lat", "lon") dimensions, every other
    data variable except 'time' is dropped, and 'time' becomes a coordinate.

    Input: path to the NetCDF file
    Output: xarray dataset holding 'Tb' and 'Tau' with 'time' as a coordinate.
            The dataset is loaded into memory and both file handles are closed
            before it is returned.
    '''
    # Read counts and tables through netCDF4; the context manager releases the
    # file handle even if one of the variables is missing.
    with nc.Dataset(datafile) as file:
        irad = np.array(file['irad'])  # calibrated IR brightness temperature in standard counts
        tmptab = np.array(file['tmptab'])  # count to temperature conversion table
        vtauic = np.array(file['vtauic'])  # all-cloud optical thickness counts (liquid and ice phase)
        tautab = np.array(file['tautab'])  # count to optical thickness conversion table
    TB = tmptab[irad]  # Converting brightness temperature counts to Kelvin
    TAU = tautab[vtauic]  # Converting optical thickness counts
    with xr.open_dataset(datafile) as raw:
        raw['Tb'] = xr.DataArray(TB, dims=["lat", "lon"])  # Adding TB values as a variable
        raw['Tau'] = xr.DataArray(TAU, dims=["lat", "lon"])  # Adding TAU values as a variable
        # Everything except Tb, Tau and time is not needed downstream
        unneeded = [name for name in raw.data_vars if name not in ('Tb', 'Tau', 'time')]
        # load() pulls any still-lazy variable into memory so the result
        # remains usable after the file is closed at the end of this block.
        ds = raw.drop_vars(unneeded).set_coords('time').load()
    return ds

In [None]:
# Directory holding the input files and the glob pattern selecting the NetCDF ones
#path = "DATA/"
path = "/Volumes/Pegasus32 R8/NASA/RAW_DATA/2010"
file_format = "*.nc"
# Build the full search pattern first, then let glob expand it;
# file_list then holds only the matching files from that directory
search_pattern = os.path.join(path, file_format)
file_list = glob(search_pattern)

In [None]:
# Check the length of the list, i.e. how many files matched the glob pattern
print(len(file_list))

In [None]:
# Sort the values of the list in place (plain string ordering).
# NOTE(review): presumably this yields chronological order because the file
# names embed the timestamp — confirm.
file_list.sort()

In [None]:
# Keep only the first 100 entries of file_list.
# NOTE(review): presumably a limit for trial runs — confirm, and remove it to process every file.
file_list = file_list[:100]

# The cell below is used only when processing the year 2009: one of that year's files is corrupted and needs to be processed in a slightly different way than the rest of the files.

In [None]:
# The damaged 2009 file, and the intact file that supplies the information it lacks
CORRUPTED_FILE = '/Volumes/Pegasus32 R8/NASA/RAW_DATA/2009/ISCCPHXG.v01r00.GLOBAL.2009.08.05.1800.GPC.10KM.CS0952909559.EQ0.10.nc'
REFERENCE_FILE = '/Volumes/Pegasus32 R8/NASA/RAW_DATA/2009/ISCCPHXG.v01r00.GLOBAL.2009.08.05.2100.GPC.10KM.CS0952909559.EQ0.10.nc'
# Timestamp assigned to the damaged file
CORRUPTED_FILE_TIME = '2009-08-05T18:00:00.000000000'
# Attributes attached to the rebuilt longitude / latitude coordinates
attributes_dict_lon = {
    "long_name": "Center longitude of square grid cell",
    "units": "degrees_east",
    "valid_min": "0.0",
    "valid_max":"360.0",
    "bounds":"lon_bounds"
}
attributes_dict_lat = {
    "long_name": "Center latitude of square grid cell",
    "units": "degrees_north",
    "valid_min": "-90.0",
    "valid_max":"90.0",
    "bounds":"lat_bounds"
}

processed_datasets = []
for f in file_list:
    print(f)
    if f != CORRUPTED_FILE:
        # Every other file goes through the regular processing
        processed_datasets.append(transforming_nc(f))
        continue
    # This particular file is missing lon, lat, time and tmptab information.
    # Since this information is the same for all the files, the missing values
    # are replaced with the data from another file.
    with xr.open_dataset(REFERENCE_FILE) as non_corrupted:
        # .values reads the arrays into memory before the file is closed
        lons = non_corrupted.lon.values
        lats = non_corrupted.lat.values
        correct_tmptab = non_corrupted.tmptab.values  # count to temperature conversion table
        correct_tautab = non_corrupted.tautab.values  # count to optical thickness conversion table
    # The raw counts still come from the damaged file itself
    with nc.Dataset(CORRUPTED_FILE) as file:
        irad = np.array(file['irad'])  # calibrated IR brightness temperature in standard counts
        vtauic = np.array(file['vtauic'])  # optical thickness counts
    # Each count array indexes its own table. The optical-thickness counts
    # were previously looked up in the temperature table, which produced
    # temperatures in 'Tau' for this file.
    TB = correct_tmptab[irad]  # Converting brightness temperature to Kelvin
    TAU = correct_tautab[vtauic]
    with xr.open_dataset(CORRUPTED_FILE) as raw_corrupted:
        # Replace latitude and longitude coordinates and restore their attributes
        corrupted = raw_corrupted.assign_coords(lat=lats, lon=lons)
        corrupted['lon'].attrs.update(attributes_dict_lon)
        corrupted['lat'].attrs.update(attributes_dict_lat)
        # Assigning correct time
        corrupted['time'].values = CORRUPTED_FILE_TIME
        corrupted["time"] = corrupted["time"].astype('datetime64[ns]')
        # Adding the converted values as variables
        corrupted['Tb'] = xr.DataArray(TB, dims=["lat", "lon"])
        corrupted['Tau'] = xr.DataArray(TAU, dims=["lat", "lon"])
        # Removing all the variables that are not needed; load() keeps the
        # result usable once the file is closed at the end of this block.
        unneeded = [name for name in corrupted.data_vars if name not in ('Tb', 'Tau', 'time')]
        corrupted = corrupted.drop_vars(unneeded).set_coords('time').load()
    processed_datasets.append(corrupted)

# Regular processing

In [None]:
# Run transforming_nc over every path in file_list, keeping the results in list order
processed_datasets = list(map(transforming_nc, file_list))

In [None]:
# Assuming 'processed_datasets' is a list containing xarray datasets with dimensions 'time', 'lat', and 'lon'
# NOTE(review): 'concatenated_dataset' is not referenced again in the visible cells; a later
# cell concatenates 'processed_datasets' directly into 'merged_dataset'. Confirm whether
# this cell is still needed, since it builds a second copy of the same concatenation.

# Chunk each dataset along the 'time' dimension (-1 requests one chunk spanning the whole dimension)
chunked_datasets = [dataset.chunk({'time': -1}) for dataset in processed_datasets]

# Concatenate the chunked datasets along 'time'
concatenated_dataset = xr.concat(chunked_datasets, dim='time')

In [None]:
# Remove the list of input paths from the namespace.
# NOTE(review): re-running any earlier cell that reads file_list requires rebuilding it first.
del file_list

In [None]:
# Concatenate the list of processed datasets along the "time" dimension
# into a single dataset used by the following cells
merged_dataset = xr.concat(processed_datasets, dim="time")

In [None]:
# Display the merged dataset as the cell output for a quick visual check
merged_dataset

In [None]:
# Drop the list of per-file datasets now that merged_dataset has been built from it
del processed_datasets

In [None]:
# Display merged_dataset again.
# NOTE(review): the same display appears two cells earlier with no change in between — confirm whether both are needed.
merged_dataset

In [None]:
def transforming_file(file):
    '''
    Restrict the merged dataset to latitudes between -60 and 60 (inclusive) and
    tidy its metadata so that the brightness temperature can be handed to tobac.

    Axis / unit / standard_name / spacing attributes are set on time, lon and
    lat, and long_name / units on Tb. The 'long_name' attribute of time is
    removed if it is present.

    Input: dataset with 'lat', 'lon' and 'time' plus the 'Tb' and 'Tau' variables
    Output: tuple (TB, TAU_TB) where TB is the Iris cube built from 'Tb'
            (required as an input in tobac processing) and TAU_TB is the
            latitude-subset dataset still containing both 'Tb' and 'Tau'
    '''
    # Define latitude range for data subset
    min_lat = -60
    max_lat = 60
    # Create a mask to extract data within the specified latitude range
    subset_mask = (file.lat >= min_lat) & (file.lat <= max_lat)
    # Apply the latitude subset mask and create a subset dataset
    subset_ds = file.where(subset_mask, drop=True)
    # Update time attributes
    subset_ds.time.attrs['axis'] = 'T'
    subset_ds.time.attrs['standard_name'] = 'time'
    # pop() with a default instead of del: a time variable without a
    # 'long_name' attribute no longer aborts the function with a KeyError.
    subset_ds.time.attrs.pop('long_name', None)
    # Update longitude attributes
    subset_ds.lon.attrs['axis'] = 'X'
    subset_ds.lon.attrs['units'] = 'degrees_east'
    subset_ds.lon.attrs['standard_name'] = 'longitude'
    subset_ds.lon.attrs['spacing'] = '0.1'
    # Update latitude attributes
    subset_ds.lat.attrs['axis'] = 'Y'
    subset_ds.lat.attrs['units'] = 'degrees_north'
    subset_ds.lat.attrs['standard_name'] = 'latitude'
    subset_ds.lat.attrs['spacing'] = '0.1'
    # Update Tb (brightness temperature) attributes
    subset_ds.Tb.attrs['long_name'] = 'Tb'
    subset_ds.Tb.attrs['units'] = 'K'
    # Keep a reference to the dataset that still holds both TB and TAU
    TAU_TB = subset_ds
    # Remove TAU as it is not needed in tobac processing
    subset_ds = subset_ds.drop_vars('Tau')
    # Convert subset_ds.Tb to an Iris cube
    TB = subset_ds.Tb.to_iris()
    # Return the processed Iris cube and xarray dataset
    return TB, TAU_TB

In [None]:
# Set up directories to save output and plots.
# exist_ok=True makes the creation idempotent and removes the gap between
# checking for the directory and creating it.
savedir = 'Save'
os.makedirs(savedir, exist_ok=True)
plot_dir = "Plot"
os.makedirs(plot_dir, exist_ok=True)

In [None]:
# Entries equal to the -1000 sentinel become NaN; every other value is kept as is
is_valid = merged_dataset != -1000
merged_dataset = merged_dataset.where(is_valid, np.nan)

In [None]:
# Transform the merged dataset using the transforming_file function:
# TB is the Iris cube of brightness temperature, TAU_TB the dataset returned alongside it
TB, TAU_TB = transforming_file(merged_dataset)

In [None]:
# Remove merged_dataset from the namespace; the following cells work with TB and TAU_TB
del merged_dataset

In [None]:
# Save both TB and TAU information into the output directory
iris.save([TB], os.path.join(savedir, 'TB.nc'), zlib=True, complevel=4)
# Build the path from savedir (instead of repeating the literal 'Save/') so that
# both files always end up in the same directory when savedir is changed.
TAU_TB.to_netcdf(os.path.join(savedir, 'TAU_TB.nc'))

In [None]:
# Remove TAU_TB from the namespace once it has been written to disk
del TAU_TB

In [None]:
# Calculate spatial and temporal spacings using the 'get_spacings' function from the 'tobac' library.
# NOTE(review): grid_spacing=10000 is presumably in metres (10 km, matching the "10KM" in the
# input file names) — confirm against the tobac documentation.
dxy, dt = tobac.get_spacings(TB, grid_spacing=10000)

In [None]:
# Keyword arguments for the feature detection step, written as a single literal
parameters_features = {
    'target': 'minimum',
    'threshold': [245, 220],
    'n_min_threshold': 2,  # The higher the number, the larger features will get detected
    'position_threshold': 'weighted_diff',
    'sigma_threshold': 1.5,  # The larger the values, the fewer features detected
    'n_erosion_threshold': 2,  # The larger the values, the fewer features detected
}

In [None]:
# Feature detection and save results to file:
print('starting feature detection')
Features = tobac.feature_detection_multithreshold(TB, dxy, **parameters_features)
# 'key' is passed by keyword: pandas has deprecated positional arguments
# after the path in DataFrame.to_hdf.
Features.to_hdf(os.path.join(savedir, 'Features.h5'), key='table')
print('feature detection performed and saved')

In [None]:
# Set Seaborn style
sns.set(style="whitegrid")
# Draw the count plot on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust the figure size as needed
sns.countplot(x='threshold_value', data=Features, ax=ax)
# Axis labels and title
ax.set_xlabel("Threshold Value")
ax.set_ylabel("Count")
ax.set_title("Distribution of Threshold Values")
# Rotate x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45)
# Show the plot
plt.show()

In [None]:
# Keyword arguments for the segmentation step, built in one call
parameters_segmentation = dict(target='minimum', threshold=245)

In [None]:
# Perform segmentation and save results to files:
Mask_TB, Features_TB = tobac.segmentation_2D(Features, TB, dxy, **parameters_segmentation)
print('segmentation TB performed, start saving results to files')
iris.save([Mask_TB], os.path.join(savedir, 'Mask_Segmentation_TB.nc'), zlib=True, complevel=4)
# 'key' is passed by keyword: pandas has deprecated positional arguments
# after the path in DataFrame.to_hdf.
Features_TB.to_hdf(os.path.join(savedir, 'Features_TB.h5'), key='table')
print('segmentation TB performed and saved')

In [None]:
# Remove Features_TB from the namespace once it has been written to disk
del Features_TB

In [None]:
# Keyword arguments for linking step, written as a single literal
parameters_linking = {
    'method_linking': 'predict',
    'v_max': 30,  # (m/s) Assumed maximum speed of tracked objects
    'adaptive_stop': 2,  # Tells trackpy when to give up
    'adaptive_step': 0.95,  # Can only be in range 0-1
    # Minimum number of timesteps for which objects have to be
    # detected to not be filtered out as spurious
    'stubs': 2,
    'subnetwork_size': 20,
    'time_cell_min': 5*60,
}

In [None]:
# Perform linking and save results to file:
Track = tobac.linking_trackpy(Features, TB, dt=dt, dxy=dxy, **parameters_linking)
# 'key' is passed by keyword: pandas has deprecated positional arguments
# after the path in DataFrame.to_hdf.
Track.to_hdf(os.path.join(savedir, 'Track.h5'), key='table')

In [None]:
# Display the linking result as the cell output
Track