In [1]:
import xarray as xr
import glob
import numpy as np

# load data from archive

In [2]:
# get sorted list of available files of flagged and non-flagged sensors
fn_list_flagged = glob.glob(
    "/pd/data/CML/data/reference/anomaly_flags/combined_4_experts/flags_20_cml_2019_*.nc"
)
fn_list = glob.glob(
    "/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_0[3, 5, 7]*.nc"
)
#fn_list_flagged.sort()
#fn_list.sort()
# load non-flagged data (all sensors)
#ds = xr.open_dataset(fn_list[6]).load()
# load flagged data (20 sensors)
#ds_f = xr.open_dataset(fn_list_flagged[2]).load()


In [3]:
fn_list

['/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_05.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_07.nc',
 '/pd/data/CML/data/processed/proc2021.001/proc_hess_amt/proc_cnn_gapstandard_2019_03.nc']

In [4]:
extract_nf = True
if extract_nf:
    for i, file_nf in enumerate(fn_list):
        # Open processed data to extract neighbouring CML data
        cml_data = xr.open_dataset(file_nf)
        cml_data = cml_data.drop_isel(time=-1)
        if i == 0:
            extracted_data = cml_data
        else:
            extracted_data = xr.concat([extracted_data, cml_data], dim='time')
    extracted_data.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/merged.nc')
    
ds = xr.open_dataset('/bg/home/lasota-e/experiment_pipeline/data/merged.nc')

In [5]:
extract_f = False
if extract_f:
    for i, file_f in enumerate(fn_list_flagged):
        # Open processed data to extract neighbouring CML data
        cml_data = xr.open_dataset(file_f)
        cml_data = cml_data.drop_isel(time=-1)
        if i == 0:
            extracted_data = cml_data
        else:
            extracted_data = xr.concat([extracted_data, cml_data], dim='time')
    extracted_data.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/merged_flagged.nc')

ds_f = xr.open_dataset('/bg/home/lasota-e/experiment_pipeline/data/merged_flagged.nc')

# Processing steps

In [6]:
# drop flagged sensor ids from all sensors and duplicate time steps
ds_nf = ds.drop_sel({"cml_id": ds_f.cml_id.values}).drop_duplicates(
    dim="time", keep="first"
)
# concatenate flagged and non-flagged sensors. adds missing values in flags for non-flagged sensors
ds_m = xr.concat([ds_f, ds_nf], dim="cml_id")
# create variable that tracks which cmls were flagged by experts
ds_m["flagged"] = (
    "cml_id",
    ((~np.isnan(ds_m.Jump)).sum(dim="time").sum(dim="expert") > 0).values,
)
# split txrx into two variables along channel_id and rename to TL according to naming convention
ds_m["TL_1"] = ["cml_id", "time"], ds_m.txrx.isel(channel_id=0).values
ds_m["TL_2"] = ["cml_id", "time"], ds_m.txrx.isel(channel_id=1).values
# split frequency and polarization coords along channel_id dim
ds_m["frequency_1"] = "cml_id", ds_m.frequency.isel(channel_id=0).values
ds_m["frequency_2"] = "cml_id", ds_m.frequency.isel(channel_id=1).values
ds_m["polarization_1"] = "cml_id", ds_m.polarization.isel(channel_id=0).values
ds_m["polarization_2"] = "cml_id", ds_m.polarization.isel(channel_id=1).values
ds_m = ds_m.assign_coords(
    {
        "frequency_1": ("cml_id", ds_m.frequency.isel(channel_id=0).values),
        "frequency_2": ("cml_id", ds_m.frequency.isel(channel_id=1).values),
        "polarization_1": ("cml_id", ds_m.polarization.isel(channel_id=0).values),
        "polarization_2": ("cml_id", ds_m.polarization.isel(channel_id=1).values),
    }
)
# drop irrelevant variables based on which ones we want to keep
current_vars = list(ds_m.keys())
keep_vars = ["TL_1", "TL_2", "Jump", "Dew", "Fluctuation", "Unknown anomaly", "flagged"]
ds_m = ds_m.drop(list(set(current_vars) - set(keep_vars)))
# drop channel_id dimension
ds_m = ds_m.drop_dims(["channel_id"])
# rename dimension
ds_m = ds_m.rename_dims({"cml_id": "sensor_id"})
# rename coordinate var
ds_m = ds_m.rename_vars({"cml_id": "sensor_id"})


# Inspect result

In [7]:
ds_m

# Save

In [8]:
ds_m.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/cml_raw.nc')

  ds_m.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/dataset.nc')
  data = data.astype(dtype=dtype)
  ds_m.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/dataset.nc')
  ds_m.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/dataset.nc')
  ds_m.to_netcdf('/bg/home/lasota-e/experiment_pipeline/data/dataset.nc')
