# 'Create_mask.ipynb' is created by Yue on Feb 7, 2024 for creating quality control masks.

Workflow:
1. Load statistical data [n_hours,sonum].
2. QC for nan, spikes, stability, wind angle, Taylor hypothesis, ustar magnitude, H magnitude.
3. Save results.

Notes:
1. input directory: /save_statistical_data.
2. output directory: /save_mask_data.
3. Add flags when the number of nan exceeds 10%.
4. Add flags when the number of spikes exceeds 1%.
5. Neutral case is defined as |z/L| < 0.1.
6. Wind angle should be <120 degrees or >240 degrees.
7. Taylor hypothesis check uses u_std/u_avg < 0.5.
8. magnitude control: u_star > 0.05 m/s. (all hours passed this test)
9. |H| > 10 W/m^2

=========== Disable de-spike on Sep 14, 2024 and save data to /save_mask_data_091424 =====

=========== Disable de-spike and do planar fit on Sep 19, 2024 and save data to /save_mask_data_planarfit =====

=========== Disable de-spike and do planar fit on Sep 19, 2024 and save data to /save_mask_data_092024 =====

# Set up environment

In [5]:
# import packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os.path
import pickle
import time
from matplotlib.pyplot import figure
import scipy.io as sio
from datetime import date, timedelta
from math import *

# Define parameters

In [7]:
# --- directories -------------------------------------------------------
IN_DIR = "/projectnb/urbanclimate/yueqin/idaho_ec_jupyter/save_statistical_data_092024/"
OUT_DIR = "/projectnb/urbanclimate/yueqin/idaho_ec_jupyter/save_mask_data_092024/"

# Report whether each directory is present; nothing is created here.
for dir_label, dir_path in (("input", IN_DIR), ("output", OUT_DIR)):
    dir_state = "exists" if os.path.exists(dir_path) else "does not exist"
    print(f"The {dir_label} directory {dir_path} {dir_state}.")

# --- sonic level indices (0-based) -------------------------------------
list_bot = np.arange(5)         # bottom five levels: 0..4
list_sel_m2 = np.arange(5, 11)  # level 6 to level 11: indices 5..10

# --- global constants --------------------------------------------------
sonum = 12        # number of sonics
# height of each sonic above ground (one entry per sonic)
z = np.array([1.2, 2, 3.5, 6, 9, 12.5, 16.5, 23, 30, 40, 50, 60])
frequency = 10    # sampling rate, Hz
time_avg = 3600   # averaging time, s
rpat = time_avg * frequency  # number of samples in one averaging window

# --- input variables (each is the stem of a .npy file in IN_DIR) --------
in_qf = [
    'qc_ux_nan_all', 'qc_uy_nan_all', 'qc_uz_nan_all', 'qc_T_nan_all',
    'qc_ux_dspk_all', 'qc_uy_dspk_all', 'qc_uz_dspk_all', 'qc_T_dspk_all',
    'qc_wdir_dspk_all',
]
in_avg = ['wind_ang_all', 'u_avg_ldtr', 'u_avg_filt']
in_std = ['u_std_ldtr', 'u_std_filt']
in_flux = ['u_star_ldtr', 'H_ldtr', 'u_star_filt', 'H_filt']
in_stability = ['stability_ldtr', 'stability_filt']

# --- output variables (one mask array per name) -------------------------
out_mask = [
    'mask_rnan', 'mask_dspk', 'mask_neutral', 'mask_wdir', 'mask_taylor',
    'mask_ustar_gt005', 'mask_H_gt10',
]

The input directory /projectnb/urbanclimate/yueqin/idaho_ec_jupyter/save_statistical_data_092024/ exists.
The output directory /projectnb/urbanclimate/yueqin/idaho_ec_jupyter/save_mask_data_092024/ exists.


# Define functions

In [8]:
def date_list(sdate, edate):
    """Return every day from ``sdate`` through ``edate``, both ends included.

    Parameters
    ----------
    sdate, edate : date-like
        Start and end; ``edate - sdate`` must yield an object with a
        ``days`` attribute (e.g. ``datetime.date`` values).

    Returns
    -------
    list
        ``sdate`` advanced in one-day steps. Empty when ``edate`` is
        earlier than ``sdate``.
    """
    n_days = (edate - sdate).days + 1  # +1 so the end date is included
    days = []
    for offset in range(n_days):
        days.append(sdate + timedelta(days=offset))
    return days

def CheckForLess(list1, val):
    """Return True when no element of ``list1`` satisfies ``val <= element``.

    In other words, every element must compare strictly below ``val``.
    An empty ``list1`` returns True.

    Note: an element for which ``val <= element`` evaluates False does not
    cause rejection; this includes NaN, so NaN entries never fail the check.

    Parameters
    ----------
    list1 : iterable of numbers
        Values to test.
    val : number
        Exclusive upper bound.

    Returns
    -------
    bool
    """
    # Stops at the first element that reaches or exceeds the bound.
    return not any(val <= item for item in list1)

# Create masks

In [9]:
# Hour count saved next to the statistical inputs; sizes every mask array.
n_hours = np.load(f"{IN_DIR}n_hours.npy")
# Set to False to skip saving the masks to OUT_DIR.
write_results = True
# Display the output directory as a sanity check before writing.
OUT_DIR

'/projectnb/urbanclimate/yueqin/idaho_ec_jupyter/save_mask_data_092024/'

In [10]:
# Read each input array from IN_DIR and expose it in the notebook namespace
# under its own variable name (e.g. qc_ux_nan_all, u_std_ldtr).
for name in in_qf + in_avg + in_std + in_flux + in_stability:
    globals()[name] = np.load(f"{IN_DIR}{name}.npy")

# One boolean per hour for every criterion; True means the hour passes.
(mask_rnan, mask_dspk, mask_neutral, mask_wdir,
 mask_taylor, mask_ustar_gt005, mask_H_gt10) = (
    np.zeros(n_hours, dtype=bool) for _ in range(7)
)

# QC flag arrays grouped by criterion. A flag of 0 means "ok", so an hour
# passes a criterion when none of the selected levels is flagged.
nan_flags = (qc_ux_nan_all, qc_uy_nan_all, qc_uz_nan_all, qc_T_nan_all)
spike_flags = (qc_ux_dspk_all, qc_uy_dspk_all, qc_uz_dspk_all, qc_T_dspk_all)

for hr in range(n_hours):
    # Most tests look only at the six levels in list_sel_m2 (relaxed from
    # all levels because data are missing on some days).

    # a. remove hours where NaNs exceed 10%
    mask_rnan[hr] = all([not np.any(flag[hr, list_sel_m2]) for flag in nan_flags])

    # b. remove hours where spikes exceed 1%
    mask_dspk[hr] = all([not np.any(flag[hr, list_sel_m2]) for flag in spike_flags])

    # c. neutral case: |z/L| < 0.1 (checked over every level, not just list_sel_m2)
    neutral_ldtr = CheckForLess(np.abs(stability_ldtr[hr, :]), 0.1)
    neutral_filt = CheckForLess(np.abs(stability_filt[hr, :]), 0.1)
    mask_neutral[hr] = neutral_ldtr and neutral_filt

    # d. wind angle smaller than 120 or larger than 240 degrees.
    #    NOTE(review): the test reads the precomputed qc_wdir_dspk_all flag;
    #    presumably that flag encodes the angle criterion - confirm upstream.
    mask_wdir[hr] = not np.any(qc_wdir_dspk_all[hr, list_sel_m2])

    # e. Taylor hypothesis control: sigma_u / U < 0.5
    taylor_ldtr = CheckForLess((u_std_ldtr[hr, :] / u_avg_ldtr[hr, :])[list_sel_m2], 0.5)
    taylor_filt = CheckForLess((u_std_filt[hr, :] / u_avg_filt[hr, :])[list_sel_m2], 0.5)
    mask_taylor[hr] = taylor_ldtr and taylor_filt

    # f. u_star > 0.05 m/s. Negating both sides turns the "all less than"
    #    helper into an "all greater than" test.
    ustar_ldtr_ok = CheckForLess(-u_star_ldtr[hr, list_sel_m2], -0.05)
    ustar_filt_ok = CheckForLess(-u_star_filt[hr, list_sel_m2], -0.05)
    mask_ustar_gt005[hr] = ustar_ldtr_ok and ustar_filt_ok

    # g. |H| > 10 W/m2 (same negation trick as in f)
    H_ldtr_ok = CheckForLess(-np.abs(H_ldtr[hr, list_sel_m2]), -10)
    H_filt_ok = CheckForLess(-np.abs(H_filt[hr, list_sel_m2]), -10)
    mask_H_gt10[hr] = H_ldtr_ok and H_filt_ok

In [11]:
# Save every mask listed in out_mask to OUT_DIR as "<mask name>.npy".
if write_results:
    for mask_name in out_mask:
        np.save(f"{OUT_DIR}{mask_name}.npy", globals()[mask_name])

# Check results

In [12]:
# Number of hours that pass the NaN criterion.
mask_rnan.sum()

4788

In [13]:
# Hours that pass all seven criteria at once (element-wise AND of the masks).
mask = (mask_rnan & mask_dspk & mask_neutral & mask_wdir
        & mask_taylor & mask_ustar_gt005 & mask_H_gt10)
np.sum(mask)

45

In [14]:
# Hours passing every criterion except the |H| > 10 W/m2 test
# (mask_H_gt10 is left out of this combination).
np.sum(mask_rnan & mask_dspk & mask_neutral & mask_wdir & mask_taylor & mask_ustar_gt005)

107