In [1]:
import xarray as xr
import glob
import numpy as np

In [2]:
import os

In [3]:
# Collect the 2019 archive files, sorted by path: first the files with the
# flagged sensors, then the processed files that hold all sensors.
fn_list_flagged = sorted(
    glob.glob(
        "/pd/data/CML/data/reference/anomaly_flags/combined_4_experts/flags_20_cml_2019_*.nc"
    )
)
fn_list = sorted(
    glob.glob(
        "/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_*.nc"
    )
)

In [4]:
# show the sorted paths of the processed files that contain all sensors
fn_list

['/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_01.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_02.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_03.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_04.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_05.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_06.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_07.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_08.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_09.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_10.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_11.nc',

In [5]:
# show the sorted paths of the files that contain the flagged sensors
fn_list_flagged

['/pd/data/CML/data/reference/anomaly_flags/combined_4_experts/flags_20_cml_2019_03.nc',
 '/pd/data/CML/data/reference/anomaly_flags/combined_4_experts/flags_20_cml_2019_05.nc',
 '/pd/data/CML/data/reference/anomaly_flags/combined_4_experts/flags_20_cml_2019_07.nc']

# Load data from archive

In [6]:
# load non-flagged data (all sensors) for July 2019.
# The file is selected by its month suffix rather than by its position in
# fn_list, so a missing or additional monthly file cannot silently shift the
# selection to a different month.
fn_all_sensors = [fn for fn in fn_list if fn.endswith("_2019_07.nc")]
if len(fn_all_sensors) != 1:
    raise FileNotFoundError(
        f"expected exactly one processed file for 2019-07, found {len(fn_all_sensors)}"
    )
ds = xr.open_dataset(fn_all_sensors[0])

In [7]:
# load flagged data (20 sensors) for July 2019.
# The file is selected by its month suffix rather than by its position in
# fn_list_flagged, so a missing or additional monthly file cannot silently
# shift the selection to a different month.
fn_flagged_sensors = [fn for fn in fn_list_flagged if fn.endswith("_2019_07.nc")]
if len(fn_flagged_sensors) != 1:
    raise FileNotFoundError(
        f"expected exactly one flag file for 2019-07, found {len(fn_flagged_sensors)}"
    )
ds_f = xr.open_dataset(fn_flagged_sensors[0])


# Processing steps

In [8]:
# Remove the flagged sensor ids from the all-sensor dataset, then keep only
# the first occurrence of every duplicated time step.
flagged_ids = ds_f.cml_id.values
ds_without_flagged = ds.drop_sel(cml_id=flagged_ids)
ds_nf = ds_without_flagged.drop_duplicates(dim="time", keep="first")

# Stack flagged and non-flagged sensors along cml_id; the flag variables are
# filled with missing values for the non-flagged sensors.
ds_m = xr.concat([ds_f, ds_nf], dim="cml_id")

In [9]:


# create variable that tracks which cmls were flagged by experts:
# a cml counts as flagged when "Jump" holds at least one non-missing value
# for any time step and any expert
ds_m["flagged"] = (
    "cml_id",
    ds_m["Jump"].notnull().any(dim=["time", "expert"]).values,
)
# split txrx into two variables along channel_id and rename to TL according to naming convention
ds_m["TL_1"] = ["cml_id", "time"], ds_m.txrx.isel(channel_id=0).values
ds_m["TL_2"] = ["cml_id", "time"], ds_m.txrx.isel(channel_id=1).values
# split frequency and polarization along channel_id and attach them as
# per-cml coordinates (assign_coords creates them directly, so no separate
# data-variable assignment is needed beforehand)
ds_m = ds_m.assign_coords(
    {
        "frequency_1": ("cml_id", ds_m.frequency.isel(channel_id=0).values),
        "frequency_2": ("cml_id", ds_m.frequency.isel(channel_id=1).values),
        "polarization_1": ("cml_id", ds_m.polarization.isel(channel_id=0).values),
        "polarization_2": ("cml_id", ds_m.polarization.isel(channel_id=1).values),
    }
)
# drop irrelevant variables based on which ones we want to keep
# (drop_vars replaces the deprecated Dataset.drop)
current_vars = list(ds_m.keys())
keep_vars = ["TL_1", "TL_2", "Jump", "Dew", "Fluctuation", "Unknown anomaly", "flagged"]
ds_m = ds_m.drop_vars([name for name in current_vars if name not in keep_vars])
# drop channel_id dimension (removes every remaining variable that uses it)
ds_m = ds_m.drop_dims(["channel_id"])
# rename the cml_id dimension and its coordinate variable in one step
ds_m = ds_m.rename({"cml_id": "sensor_id"})


  ds_m = ds_m.drop(list(set(current_vars) - set(keep_vars)))


# Inspect result

In [10]:
# display the merged and renamed dataset
ds_m

# Select example subset

In [11]:
# ids of the non-flagged sensors that go into the example subset
non_flagged_example_cmls = [
    "DO0202_2_DO6017_4",
    "DO0206_2_DO6017_3",
    "DO0341_2_DO1665_4",
    "DO0455_2_DO6017_4",
    "DO0787_2_DO2382_2",
    "DO0951_2_DO1665_2",
    "DO1689_2_DO0255_4",
    "DO1744_2_DO0255_2",
    "DO1893_2_DO6017_3",
    "DO1932_2_HY1317_4",
    "DO2118_2_DO6038_3",
    "DO2796_2_DO6017_2",
    "DO2867_2_DO6017_6",
    "DO2974_2_DO0214_2",
    "DO3109_2_DO1893_3",
    "DO3623_2_DO6017_2",
    "DO5416_2_DO6075_2",
    "DO5533_2_DO0951_3",
    "DO6001_2_DO6017_4",
    "DO6066_2_DO6017_5",
    "DO6067_2_DO6017_3",
    "HY0277_2_HY1317_3",
]

In [97]:
# The example subset is the non-flagged ids followed by one flagged id,
# restricted to the window 2019-07-02 00:00 to 2019-07-29 23:59.
flagged_example_cml = "DO1695_2_DO6017_2"
example_time = slice(
    "2019-07-02T00:00:00.000000000",
    "2019-07-29T23:59:00.000000000",
)
example_cmls = [*non_flagged_example_cmls, flagged_example_cml]
print(example_cmls)

['DO0202_2_DO6017_4', 'DO0206_2_DO6017_3', 'DO0341_2_DO1665_4', 'DO0455_2_DO6017_4', 'DO0787_2_DO2382_2', 'DO0951_2_DO1665_2', 'DO1689_2_DO0255_4', 'DO1744_2_DO0255_2', 'DO1893_2_DO6017_3', 'DO1932_2_HY1317_4', 'DO2118_2_DO6038_3', 'DO2796_2_DO6017_2', 'DO2867_2_DO6017_6', 'DO2974_2_DO0214_2', 'DO3109_2_DO1893_3', 'DO3623_2_DO6017_2', 'DO5416_2_DO6075_2', 'DO5533_2_DO0951_3', 'DO6001_2_DO6017_4', 'DO6066_2_DO6017_5', 'DO6067_2_DO6017_3', 'HY0277_2_HY1317_3', 'DO1695_2_DO6017_2']


In [98]:
# cut the merged dataset down to the example sensors and the example time window
ds_e = ds_m.sel(sensor_id=example_cmls, time=example_time)

In [99]:
# size of the example subset in megabytes (nbytes divided by 1e6)
ds_e.nbytes/1e6

74.514419

In [100]:
# display the example subset
ds_e

# Save

In [104]:
# Write the example subset to <parent of cwd>/example_data/cml_raw_example.nc.
# The path is assembled with os.path.join only (no hard-coded "/" separator),
# and the target directory is created first so the write does not fail when
# it is missing.
path = os.getcwd()
example_dir = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), "example_data")
os.makedirs(example_dir, exist_ok=True)
ds_e.to_netcdf(os.path.join(example_dir, "cml_raw_example.nc"))

  ds_e.to_netcdf(os.path.abspath(os.path.join(path, os.pardir))+'/example_data/cml_raw_example.nc')
  data = data.astype(dtype=dtype)
  ds_e.to_netcdf(os.path.abspath(os.path.join(path, os.pardir))+'/example_data/cml_raw_example.nc')
  ds_e.to_netcdf(os.path.abspath(os.path.join(path, os.pardir))+'/example_data/cml_raw_example.nc')
  ds_e.to_netcdf(os.path.abspath(os.path.join(path, os.pardir))+'/example_data/cml_raw_example.nc')
