In [1]:
import os
import sys
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from datetime import datetime
from pathlib import Path

from scipy.stats import pearsonr

import warnings

In [2]:
# Suppress every warning for the rest of the session (no category, message or
# module filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides numerical RuntimeWarnings and deprecation
# notices raised by the analysis cells; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Font configuration for matplotlib figures.
# NOTE(review): the original comment said "Arial", but font.family is set to
# "Open Sans". As far as I can tell matplotlib only consults the
# font.sans-serif list when font.family is the generic 'sans-serif' family,
# so the Arial entry below probably has no effect as written -- confirm which
# font is actually intended.
matplotlib.rcParams['font.family'] = "Open Sans"
matplotlib.rcParams['font.sans-serif'] = "Arial"

In [4]:
def data_tracks_phase(data_tracks, var_name):
    """
    Sample one track variable at the five MCS life-cycle phase times.

    For every entry of ``data_tracks.tracks`` the values of ``var_name`` are
    selected along ``times`` at the positions stored in ``idt_mcs_init``,
    ``idt_mcs_grow``, ``idt_mcs_mature``, ``idt_mcs_decay`` and
    ``idt_mcs_end`` of that track.

    Parameters
    ----------
    data_tracks : xarray.Dataset
        Track dataset with a ``tracks`` dimension, a ``times`` dimension and
        the five ``idt_mcs_*`` variables listed above.
    var_name : str
        Name of the variable to sample (the caller passes area variables).

    Returns
    -------
    xarray.Dataset
        Dataset holding ``var_name`` with dimensions ``(tracks, mcs_phase)``;
        ``mcs_phase`` is labelled 'Init', 'Grow', 'Mature', 'Decay', 'End'.
    """
    phase_labels = ['Init', 'Grow', 'Mature', 'Decay', 'End']
    # variables holding the ``times`` value of each phase, in label order
    phase_time_vars = ('idt_mcs_init', 'idt_mcs_grow', 'idt_mcs_mature',
                       'idt_mcs_decay', 'idt_mcs_end')

    track_ids = list(data_tracks.tracks.values)
    per_track = []

    for track_id in track_ids:
        single_track = data_tracks.sel(tracks=track_id)
        phase_times = [single_track[name].values for name in phase_time_vars]
        sampled = single_track[var_name].sel(times=phase_times).values

        # a neutral placeholder name is used here and swapped for var_name
        # once all tracks have been stacked
        per_track.append(
            xr.Dataset(
                data_vars={'var_null': (['mcs_phase'], sampled)},
                coords={'mcs_phase': (['mcs_phase'], phase_labels)},
            )
        )

    stacked = xr.concat(per_track, pd.Index(track_ids, name='tracks'))

    return stacked.rename_vars({'var_null': var_name})

In [7]:
def data_tracks_BL_features(files):
    """
    Compute per-track buoyancy (BL) summary features from track files.

    Each file is opened with xarray, reduced over ``('x', 'y')`` and closed
    again; the per-file results are stacked along a new ``tracks`` dimension.

    Parameters
    ----------
    files : iterable of pathlib.Path
        NetCDF files, one per track. The track number is parsed from
        ``file.name[-11:-6]`` (five digits followed by a six-character
        suffix such as '.LD.nc'). Each file must provide ``Buoy_TOT``,
        ``Buoy_CAPE``, ``Buoy_SUBSAT`` and ``cloudtracknumber_nomergesplit``
        on ``x``/``y`` dimensions.

    Returns
    -------
    xarray.Dataset
        Indexed by ``tracks`` and containing, in this order:

        * ``BL_positive_area``     - number of MCS cells with ``Buoy_TOT > 0``
          (NaN instead of 0)
        * ``BL_positive_fraction`` - that count divided by the number of MCS
          cells (NaN instead of 0)
        * ``BL_TOT_bp``            - mean ``Buoy_TOT`` over MCS cells where it
          is positive
        * ``BL_{TOT,CAPE,SUBSAT}_mcs``     - mean over cells where the mask > 0
        * ``BL_{TOT,CAPE,SUBSAT}_outside`` - mean over cells where the mask == 0
        * ``BL_{TOT,CAPE,SUBSAT}_amean``   - mean over ``x``/``y`` in
          ``slice(15, 25)`` (called the 5-deg. mean in the original comments)

    Raises
    ------
    ValueError
        From ``int(...)`` if a file name does not carry a numeric track id,
        or from ``xr.concat`` if ``files`` is empty.
    """
    # (variable name in the file, prefix of the output variable)
    buoy_vars = (('Buoy_TOT', 'BL_TOT'),
                 ('Buoy_CAPE', 'BL_CAPE'),
                 ('Buoy_SUBSAT', 'BL_SUBSAT'))
    horiz = ('x', 'y')

    track_list = []
    BL_features_list = []

    for file in files:

        track_list.append(int(file.name[-11:-6]))  # save track number

        # the context manager closes the file handle; without it one handle
        # per track file stays open for the lifetime of the kernel
        with xr.open_dataset(file) as tmp:
            inside_mcs = tmp.cloudtracknumber_nomergesplit > 0
            outside_mcs = tmp.cloudtracknumber_nomergesplit == 0

            # BL+ area, fraction and amplitude inside the MCS
            BL_mcs = tmp.Buoy_TOT.where(inside_mcs)
            BL_is_pos = BL_mcs > 0  # NaN (non-MCS) cells compare as False
            BL_pos_area = BL_is_pos.sum(horiz).rename('BL_positive_area')
            # count MCS cells instead of summing the mask values, so the
            # fraction stays correct even if the mask stores track numbers
            # rather than 0/1 (identical result for a 0/1 mask)
            mcs_area = inside_mcs.sum(horiz)
            BL_pos_frac = (BL_pos_area / mcs_area).rename('BL_positive_fraction')
            BL_tot_bp = BL_mcs.where(BL_is_pos).mean(horiz).rename('BL_TOT_bp')

            # BL estimates, MCS grids
            mcs_means = [tmp[src].where(inside_mcs).mean(horiz)
                         .rename('{}_mcs'.format(prefix))
                         for src, prefix in buoy_vars]

            # BL estimates, non-MCS grids
            outside_means = [tmp[src].where(outside_mcs).mean(horiz)
                             .rename('{}_outside'.format(prefix))
                             for src, prefix in buoy_vars]

            # BL estimates, mean over the x/y box 15..25
            box_means = [tmp[src].sel(x=slice(15, 25), y=slice(15, 25)).mean(horiz)
                         .rename('{}_amean'.format(prefix))
                         for src, prefix in buoy_vars]

            # replace 0 by nan to avoid incorrectness when doing averaging
            BL_pos_area = BL_pos_area.where(BL_pos_area > 0, np.nan)
            BL_pos_frac = BL_pos_frac.where(BL_pos_frac > 0, np.nan)

            # .load() pulls any still-lazy coordinate into memory before the
            # file is closed at the end of the with-block
            BL_features_list.append(
                xr.merge([BL_pos_area, BL_pos_frac, BL_tot_bp]
                         + mcs_means + outside_means + box_means).load())

    BL_features_xr = xr.concat(BL_features_list, pd.Index(track_list, name='tracks'))

    return BL_features_xr

In [8]:
def data_tracks_precip_features(files):
    """
    Compute per-track precipitation features and correlations from track files.

    For every file the two precipitation fields ``precipitationCal`` and
    ``mtpr`` are restricted to MCS cells
    (``cloudtracknumber_nomergesplit > 0``) and summarised.

    Parameters
    ----------
    files : iterable of pathlib.Path
        NetCDF files, one per track. The track number is parsed from
        ``file.name[-11:-6]``. Each file must provide ``precipitationCal``,
        ``mtpr`` and ``cloudtracknumber_nomergesplit`` on ``x``/``y`` and an
        ``mcs_phase`` dimension.

    Returns
    -------
    xarray.Dataset
        Indexed by ``tracks`` and containing

        * ``precipitationCal_mcs`` - x/y mean of ``precipitationCal`` in the MCS
        * ``mtpr_mcs``             - x/y mean of ``mtpr`` in the MCS
        * ``corr_coeff_space``     - per-phase Pearson correlation between the
          two fields over cells where both are non-NaN
        * ``corr_coeff_temp``      - per-track Pearson correlation between the
          two MCS-mean series

    Notes
    -----
    If any correlation for a file raises (for instance too few valid cells),
    every correlation of that track is set to NaN and the file is reported
    with ``print``; the mean variables are still stored.
    """
    track_list = []
    precip_features_list = []
    corr_coeff_temp = []

    for file in files:

        track_list.append(int(file.name[-11:-6]))  # save track number

        # the context manager closes the file handle once the track is done
        with xr.open_dataset(file) as tmp:
            inside_mcs = tmp.cloudtracknumber_nomergesplit > 0

            # precipitation fields and their mean amplitude within the MCS
            prec_mcs = tmp.precipitationCal.where(inside_mcs)
            mtpr_mcs = tmp.mtpr.where(inside_mcs)
            prec_amp_mcs = prec_mcs.mean(('x', 'y')).rename('precipitationCal_mcs')
            mtpr_amp_mcs = mtpr_mcs.mean(('x', 'y')).rename('mtpr_mcs')
            phases = tmp.mcs_phase.values

            try:
                # spatial correlation, one value per phase
                corr_coeff = []
                for phase in tmp.mcs_phase:
                    prec_flat = prec_mcs.sel(mcs_phase=phase).values.ravel()
                    mtpr_flat = mtpr_mcs.sel(mcs_phase=phase).values.ravel()

                    # keep cells that are valid in BOTH fields (the original
                    # tested the first field twice, so NaNs present only in
                    # the second field reached pearsonr)
                    valid = ~(np.isnan(prec_flat) | np.isnan(mtpr_flat))

                    corr_coeff.append(pearsonr(prec_flat[valid], mtpr_flat[valid])[0])

                # temporal correlation of the two MCS-mean series
                corr_temp = pearsonr(prec_amp_mcs.values.ravel(),
                                     mtpr_amp_mcs.values.ravel())[0]
            except Exception:
                # best-effort fallback kept from the original (its comment
                # mentions a NaN-only file in 2009): mark every correlation
                # of this track as missing and carry on
                corr_coeff = [np.nan] * len(phases)
                corr_temp = np.nan
                print('error file: {}'.format(file))

            corr_coeff_space = xr.Dataset(
                data_vars=dict(corr_coeff_space=(['mcs_phase'], np.asarray(corr_coeff))),
                coords=dict(mcs_phase=(['mcs_phase'], phases)))

            # one append per list per file keeps corr_coeff_temp aligned with
            # track_list and precip_features_list
            corr_coeff_temp.append(corr_temp)
            # .load() pulls any still-lazy coordinate into memory before the
            # file is closed at the end of the with-block
            precip_features_list.append(
                xr.merge([prec_amp_mcs, mtpr_amp_mcs, corr_coeff_space]).load())

    corr_coeff_temp_xr = xr.Dataset(data_vars=dict(corr_coeff_temp=(['tracks'], np.asarray(corr_coeff_temp))),
                                    coords=dict(tracks=(['tracks'], track_list)))
    precip_features_xr = xr.concat(precip_features_list, pd.Index(track_list, name='tracks'))

    # merge two datasets
    precip_features_xr = xr.merge([precip_features_xr, corr_coeff_temp_xr])

    return precip_features_xr

In [16]:
%%time

# Per-year driver: builds the BL, precipitation and area feature datasets for
# every track of a year and writes them to one NetCDF file per year.
out_dir = Path('/scratch/wmtsai/temp_mcs/mcs_stats/mcs_tracks_non2mcs/tracks_area_mean')
# Create the output directory up front: the loop below takes several minutes
# per year, and a missing directory would otherwise only surface when
# to_netcdf is reached.
out_dir.mkdir(parents=True, exist_ok=True)

data_tracks_list = []

year_list = np.arange(2001,2007)
for year in year_list:

    print('processing year: {}'.format(year))
    # directory of the mcs_3dvars files
    dir_envs_track = Path('/scratch/wmtsai/temp_mcs/mcs_stats/envs_track/{}/tropics_extend'.format(year))
    # sorted so that the track order is deterministic across runs
    files = sorted(list(dir_envs_track.glob('*.LD.nc')))

    # load data_tracks 
    data_tracks = xr.open_dataset('/scratch/wmtsai/temp_mcs/mcs_stats/mcs_tracks_non2mcs/mcs_tracks_non2mcs_{}.tropics30NS.extend.nc'.format(year))
    
    # differences between phase time indices: (0) CCS init to MCS init,
    # (1) MCS init to mature, (2) mature to end
    # NOTE(review): hours_ccs2init is computed but not written to the output
    # file below -- confirm whether it should be exported as well.
    data_tracks['hours_ccs2init'] = data_tracks.idt_mcs_init - data_tracks.idt_ccs_init
    data_tracks['hours_init2mat']  = data_tracks.idt_mcs_mature - data_tracks.idt_mcs_init
    data_tracks['hours_mat2end']  = data_tracks.idt_mcs_end - data_tracks.idt_mcs_mature
    
    # BL and precipitation features derived from the per-track files
    data_BL_features = data_tracks_BL_features(files)
    data_precip_features = data_tracks_precip_features(files)
    
    # extract existing area-related variables
    data_ccs_area = data_tracks_phase(data_tracks, var_name='ccs_area')
    data_core_area = data_tracks_phase(data_tracks, var_name='core_area')
    data_cold_area = data_tracks_phase(data_tracks, var_name='cold_area')
    data_area_features = xr.merge([data_ccs_area, data_core_area, data_cold_area])
    
    # NOTE(review): these two are not used again in this cell; they are kept
    # because a later cell may read them (they hold the last processed year).
    corr_temp = data_precip_features.corr_coeff_temp
    corr_coeff_space = data_precip_features.corr_coeff_space.mean('mcs_phase')

    # combine track-level durations with all derived features
    data_tracks_out = xr.merge([data_tracks['mcs_duration'],
                                data_tracks['hours_init2mat'],
                                data_tracks['hours_mat2end'],
                                data_BL_features,
                                data_precip_features,
                                data_area_features]
                                )
    
    data_tracks_out.to_netcdf(out_dir / 'featstats_tracks_non2mcs_{}.tropics30NS.extend.nc'.format(year))
    
    del data_tracks_out

processing year: 2001
processing year: 2002
processing year: 2003
processing year: 2004
processing year: 2005
processing year: 2006
CPU times: user 38min 11s, sys: 4min 45s, total: 42min 56s
Wall time: 43min 49s
