# Generation of cloud-class-stratified CRE

In [2]:
#%reset  # Do this to make sure we have the max memory possible
#%whos
import numpy as np
import xarray as xr
import pandas as pd
import dask.array as da

### List of directories

In [5]:
# Input directories (absolute paths on the cluster filesystem).
# georgedir: the cloud-class netCDF files (cloud_class_array_thres10p_*km_alltimesteps_v3.nc) are read from here.
georgedir = '/pf/b/b380796/scratch/hackathon/george/'
# nicoledir: the CRE zarr stores (nawdexnwp-*km-mis-0001_cre_alltimesteps_oceanmask_applied.zarr) are read from here.
nicoledir = '/scratch/b/b380490/hackathon/'

### Set up the dask cluster

In [24]:
# Trying out this dask thing from Aiko's GitHub repository :p
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary files/dirs
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources

In [25]:
# Set some user specific variables. These are passed to SLURMCluster in the next cell.
account_name = 'bb1018' # Slurm account to charge (passed as `project`)
partition = 'compute' # Slurm partition (passed as `queue`)
job_name = 'sylviaCRE' # Job name that is submitted via sbatch
memory = '64GiB' # Max memory per node that is going to be used - this depends on the partition
cores = 48 # Max number of cores per node that are reserved - also partition dependent
walltime = '01:00:00' #'12:00:00' # Walltime - also partition dependent

In [26]:
scratch_dir = '/scratch/b/b380873/' # Define the users scratch dir
# Create a temp directory where the output of the distributed cluster will be written to. The directory
# is removed again when the TemporaryDirectory object is cleaned up (e.g. once the notebook kernel exits).
dask_scratch_dir = TemporaryDirectory(dir=scratch_dir, prefix=job_name)
# Describe the Slurm job that hosts the dask workers; nothing is submitted until cluster.scale() is called.
cluster = SLURMCluster(memory=memory,
                       cores=cores,
                       project=account_name,
                       walltime=walltime,
                       queue=partition,
                       name=job_name,
                       processes=8,  # the cores and memory above are split between this many worker processes
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_scratch_dir.name,
                       # Extra #SBATCH directives. Slurm writes stderr to the --output file by default,
                       # so a single --output entry is enough (it used to be listed twice).
                       job_extra=[f'-J {job_name}',
                                  f'-D {dask_scratch_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_scratch_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45039 instead


In [27]:
# Print the batch script generated for the worker job, to check the #SBATCH directives before scaling up.
print(cluster.job_script())

#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -p compute
#SBATCH -A bb1018
#SBATCH -n 1
#SBATCH --cpus-per-task=48
#SBATCH --mem=64G
#SBATCH -t 01:00:00
#SBATCH -J sylviaCRE
#SBATCH -D /scratch/b/b380873/sylviaCREdbjiy75r
#SBATCH --begin=now
#SBATCH --output=/scratch/b/b380873/sylviaCREdbjiy75r/LOG_cluster.%j.o
#SBATCH --output=/scratch/b/b380873/sylviaCREdbjiy75r/LOG_cluster.%j.o

JOB_ID=${SLURM_JOB_ID%;*}

/pf/b/b380459/conda-envs/Nawdex-Hackathon/bin/python3 -m distributed.cli.dask_worker tcp://10.50.40.19:37142 --nthreads 6 --nprocs 8 --memory-limit 8.59GB --name name --nanny --death-timeout 60 --local-directory /scratch/b/b380873/sylviaCREdbjiy75r --interface ib0



In [28]:
# Request one worker job from Slurm, then display the cluster widget.
cluster.scale(jobs=1)
cluster

VBox(children=(HTML(value='<h2>sylviaCRE</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .da…

In [29]:
# Connect a distributed Client to the cluster and display its summary.
# NOTE(review): the summary may report 0 workers if the Slurm job has not started yet.
dask_client = Client(cluster)
dask_client

0,1
Client  Scheduler: tcp://10.50.40.19:37142  Dashboard: http://10.50.40.19:45039/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


### Method 1: Pile all cloud classes and all their values, not only statistics, into the same dataframe
This approach generated memory errors.

### Method 2: Pile a single cloud class and all its values, not only statistics, into the same dataframe
From the memory errors above, we need to find some way to split the problem better. Here the strategy was to save each class in a separate dataframe.

In [6]:
# Method 2: Filter the cloud radiative effects by class. Store all values for a single class in a single dataframe.
nclasses = 8
lbls = ['Cloud class','Resolution','LWCRE_toa','LWCRE_atm','LWCRE_sfc','SWCRE_toa','SWCRE_atm','SWCRE_sfc']
resolutions = ['80','40','20','10','5','2']
class_names = ['HI','MED','LO','H-M','M-L','H-L','H-M-L','Clear-sky']
# CRE variables read from the zarr stores, in the same order as the value columns of lbls.
cre_vars = ['crelw_toa','crelw_atm','crelw_sfc','cresw_toa','cresw_atm','cresw_sfc']

# For each cloud class, iterate over the resolutions and extract its CRE values.
# Save these in one dataframe (and one pickle) per class.
# NOTE(review): the loop starts at 1 and the class value that is selected is cloud_class+1, so class
# values 2..nclasses are processed and class value 1 is skipped -- confirm that this is intended.
for cloud_class in np.arange(1,nclasses):
    print(cloud_class)
    class_value = int(cloud_class+1)

    # Collect one dataframe per resolution and concatenate them once per class. DataFrame.append copied
    # the whole dataframe on every call and was removed in pandas 2.0.
    frames = []

    for res in resolutions:
        print(res)

        # Read in the cloud classifications and radiative variables for this resolution.
        # The context managers close both datasets again once their values have been extracted.
        with xr.open_dataset(georgedir + 'cloud_class_array_thres10p_' + res + 'km_alltimesteps_v3.nc') as fi, \
             xr.open_zarr(nicoledir + 'nawdexnwp-' + res + 'km-mis-0001_cre_alltimesteps_oceanmask_applied.zarr') as cc:
            # Extract the cloud class of interest; every other point becomes NaN.
            B2 = cc.where(fi['clch'] == class_value)

            # Generate a mask of the non-nan values from the first CRE variable. Its values are
            # materialised only once and reused below.
            arr = B2[cre_vars[0]].values
            mask = ~np.isnan(arr)
            npts = int(np.sum(mask))

            # Two constant identifier columns followed by the masked values of each CRE variable.
            columns = [np.full(npts, class_value, dtype=float), np.full(npts, int(res), dtype=float), arr[mask]]
            columns += [B2[name].values[mask] for name in cre_vars[1:]]

        frames.append(pd.DataFrame(np.column_stack(columns), columns=lbls))

    B_df = pd.concat(frames, ignore_index=True)

    # Save the CRE dataframe in a pickle.
    B_df.to_pickle('cre_class' + str(class_value) + '.pkl')

1
80
40
20
10
5
2
2
80
40
20
10
5
2
3
80
40
20
10
5
2
4
80
40
20
10
5
2
5
80
40
20
10
5
2
6
80
40
20
10
5
2
7
80
40
20
10
5
2


### Method 3: Pile a single cloud class and only its statistics into the same dataframe
From the memory errors above, we need to find some way to split the problem better. Here the strategy was to save only the relevant statistics per cloud class in a single dataframe.

In [11]:
# Method 3: Filter the cloud radiative effects by class. Store only the percentiles / stats of all these values (B_df).
nclasses = 8
resolutions = ['80','40','20','10','5','2']
class_names = ['HI','MED','LO','H-M','M-L','H-L','H-M-L','Clear-sky']
# Column labels: the two identifier columns, then five statistics (p2, p25, med, p75, p98) for every
# combination of band (LW, SW) and level (toa, atm, sfc), with the band varying slowest.
lbls = ['Cloud class','Resolution'] + [
    f'{band}CRE_{level}_{stat}'
    for band in ('LW','SW')
    for level in ('toa','atm','sfc')
    for stat in ('p2','p25','med','p75','p98')
]
# Start from an empty dataframe that only carries the column labels.
B_df = pd.DataFrame(columns=lbls)

# Define a function that returns a list of statistics for an xarray field.
def stats(field):
    """Return summary percentiles of ``field``, ignoring NaNs.

    Parameters
    ----------
    field : array-like
        Values to summarise (e.g. a numpy array or an xarray DataArray). The statistics are
        taken over all elements, regardless of the number of dimensions.

    Returns
    -------
    list
        ``[p2, p25, median, p75, p98]``: the 2nd, 25th, 50th, 75th and 98th percentiles.
    """
    # Materialise the field once. Calling the numpy reductions on a lazily evaluated field five
    # separate times would convert (and hence load / compute) it five times.
    data = np.asarray(field)
    # One call computes all percentiles together; the 50th percentile is the median.
    return list(np.nanpercentile(data, [2, 25, 50, 75, 98]))

# Iterate over the cloud classes.
# The rows are collected in a list and the dataframe is built from that list, because DataFrame.append
# was removed in pandas 2.0. It also means re-running this cell does not add duplicate rows to B_df.
stat_rows = []
# CRE variables read from the zarr stores, in the same order as the statistics columns of lbls.
cre_vars = ['crelw_toa','crelw_atm','crelw_sfc','cresw_toa','cresw_atm','cresw_sfc']
for cloud_class in np.arange(7,8): #np.arange(nclasses):
    print(cloud_class)

    for res in resolutions:
        print(res)
        # Read in the cloud classifications and radiative variables for this resolution.
        # The context managers close both datasets again once the statistics have been computed.
        with xr.open_dataset(georgedir + 'cloud_class_array_thres10p_' + res + 'km_alltimesteps_v3.nc') as fi, \
             xr.open_zarr(nicoledir + 'nawdexnwp-' + res + 'km-mis-0001_cre_alltimesteps_oceanmask_applied.zarr') as cc:
            # Extract the cloud class of interest; every other point becomes NaN and is ignored by stats().
            # cloud_class is the 0-based index into class_names, the class value in the file is cloud_class+1.
            B2 = cc.where(fi['clch'] == cloud_class+1)

            # One row per (class, resolution): two identifiers followed by five statistics per CRE variable.
            row = [class_names[cloud_class], int(res)]
            for name in cre_vars:
                row += stats(B2[name])
        stat_rows.append(row)

    # Save the CRE statistics dataframe in a pickle.
    # NOTE(review): rows accumulate across cloud classes, so if the loop is extended to several classes each
    # pickle also contains the rows of the classes processed before it -- confirm that this is intended.
    B_df = pd.DataFrame(data=stat_rows, columns=lbls)
    B_df.to_pickle('crestats_class' + str(cloud_class+1) + '.pkl')

7
80
40
20
10
5
2
