In [None]:
# This notebook supports input parameters for automatic report generation. The parameters must be variables in this
# cell, which has a special 'parameters' tag.
# Directory holding the input dataset; later cells read 'occupancy durations.parquet' from it.
DATA_ROOT = r'G:\Shared drives\Covid-19 Spectrum Monitoring\Data'
# DATA_ROOT = r'D:\dkuester\covidscratch'

# Destination directory for the per-site CSV export.
# NOTE: while this is non-empty, the data loading cell runs the export and then raises
# Exception('export finished'), so `durations`, `DURATION_BINS` and `fc_map` are never defined and the
# plotting cells cannot run. Set it to an empty value to load the data for plotting instead.
EXPORT_DATA_ROOT = r'G:\Shared drives\Covid-19 Spectrum Monitoring Data Export'

# Not referenced by any cell visible in this part of the notebook.
HISTOGRAM_RESOLUTION_SWEEPS = 100
# Collected into the `bounds` tuple by the setup cell (presumably power levels in dBm -- confirm).
HISTOGRAM_POWER_LOW = -120
HISTOGRAM_POWER_HIGH = -20

# Not referenced by any cell visible in this part of the notebook.
dat_path_pattern = 'dUsVcuPP/*2020-11-*.dat'

# Passed to set_matplotlib_formats() by the setup cell.
figure_format = 'svg'

In [None]:
# Shared notebook environment.
# NOTE(review): nothing in this cell visibly hides warnings; presumably the star import below configures
# that, and also supplies the names used throughout this notebook without an explicit import (pd, np, Path,
# display, widgets, time, time_format, set_matplotlib_formats, reload, subplots, mpl, ...) -- confirm in
# environment.py.
from environment import *
import figures

# (low, high) pair built from the parameters cell; not used by any cell visible here.
bounds = (HISTOGRAM_POWER_LOW, HISTOGRAM_POWER_HIGH)

# Apply the figure format chosen in the parameters cell.
set_matplotlib_formats(figure_format)

# Stamp the report with the time at which it was generated.
display(widgets.HTML(f'This report was produced {time.strftime(time_format)}'));

In [None]:
import histogram_analysis as ha
import pyarrow.parquet
# Re-import the module object so edits to histogram_analysis are picked up without restarting the kernel
# (`reload` is presumably importlib.reload, supplied by the environment star import -- confirm).
ha = reload(ha)

def ccdf_from_hists(hists, include_N=True):
    """Turn per-column histograms of counts into empirical CCDFs.

    Arguments:

        hists: DataFrame with one histogram per column and one row per bin.
        include_N: if True, relabel each column as '<label> ($N=<count>$)' and
            append ' (# events $N$)' to the name of the column index.

    Returns:

        (ccdf, N): `ccdf` is the float DataFrame produced by `ha.rcumsum` with
        every column divided by its first row; `N` is that first row before the
        division (presumably the total event count per column, assuming
        `ha.rcumsum` is a reverse cumulative sum -- confirm).
    """
    rcum = ha.rcumsum(hists.T, power_fix=False).T

    # Taken before the float cast so that the labels below show the values as
    # they came out of rcumsum rather than as floats.
    N = rcum.iloc[0].copy()

    # Normalize each column by its first row. This is done out of place:
    # writing through `rcum.values[:]` only works while `.values` happens to be
    # a writable view of the frame's data, which fails (read-only array) when
    # pandas Copy-on-Write is enabled and is silently lost if `.values` is a copy.
    rcum = rcum.astype('float')
    rcum = rcum / rcum.iloc[0].to_numpy()

    if include_N:
        name = f'{rcum.columns.name} (# events $N$)'
        rcum.columns = [f'{c} ($N={n}$)' for c,n in zip(rcum.columns,N)]
        rcum.columns.name = name

    return rcum, N

def ccdf_by_threshold(durations, include_N=True):
    """Empirical CCDFs of the durations, one curve per column of `durations`.

    Each column (an occupancy threshold) is binned on DURATION_BINS, the
    histograms are indexed by the left bin edges, and the result is handed to
    ccdf_from_hists.

    Returns:

        the (ccdf, N) pair returned by ccdf_from_hists
    """
    counts_by_threshold = {}
    for level in durations.columns:
        counts, _edges = np.histogram(durations.loc[:, level], bins=DURATION_BINS)
        counts_by_threshold[level] = counts

    left_edges = DURATION_BINS[:-1]
    hists = pd.DataFrame(counts_by_threshold, index=left_edges)
    hists.columns.name = 'Occupancy threshold (dBm)'

    return ccdf_from_hists(hists, include_N=include_N)

def ccdf_by_site(durations, threshold):
    """Empirical CCDFs of the durations at one threshold, one curve per site.

    Arguments:

        durations: DataFrame with a MultiIndex that has a 'Site' level. Rows are
            selected with `.loc(axis=0)[:, site]`, i.e. on the second index
            level, so 'Site' is assumed to be that level.
        threshold: label of the column of `durations` to analyze.

    Returns:

        the (ccdf, N) pair returned by ccdf_from_hists
    """
    events = durations.loc[:, threshold].dropna()
    # after dropna, prune level values that no longer occur so they get no curve
    events.index = events.index.remove_unused_levels()

    site_level = events.index.names.index('Site')

    counts_by_site = {}
    for site in sorted(events.index.levels[site_level]):
        selected = events.loc(axis=0)[:, site]
        counts_by_site[site] = np.histogram(selected, bins=DURATION_BINS)[0]

    hists = pd.DataFrame(counts_by_site, index=DURATION_BINS[:-1])
    hists.columns.name = 'Site'

    return ccdf_from_hists(hists)

def ccdf_by_frequency(durations, threshold):
    """Empirical CCDFs of the durations at one threshold, one curve per frequency.

    Arguments:

        durations: DataFrame with a MultiIndex that has a 'Frequency' level. Rows
            are selected with `.loc(axis=0)[fc]`, i.e. on the first index level,
            so 'Frequency' is assumed to be that level.
        threshold: label of the column of `durations` to analyze.

    Returns:

        the (ccdf, N) pair returned by ccdf_from_hists; the curves are labeled
        by the frequency rounded to one decimal
    """
    events = durations.loc[:, threshold].dropna()
    events.index = events.index.remove_unused_levels()

    present = np.unique(events.index.get_level_values('Frequency'))
    freq_list = np.sort(present)

    counts_by_freq = {}
    for fc in freq_list:
        label = np.round(fc, 1)
        counts_by_freq[label] = np.histogram(events.loc(axis=0)[fc], bins=DURATION_BINS)[0]

    hists = pd.DataFrame(counts_by_freq, index=DURATION_BINS[:-1])
    hists.columns.name = 'Frequency (MHz)'

    return ccdf_from_hists(hists)

def plot_ccdfs_from_durations(durations: pd.DataFrame, threshold: float, band_type: str, delay_max_ms: float):
    """Plot three stacked panels of empirical occupancy-duration CCDFs.

    The panels are, top to bottom: one curve per occupancy threshold, one curve
    per site, and one curve per frequency. The site and frequency panels use
    only the column of `durations` selected by `threshold`.

    Arguments:

        durations: occupancy durations to analyze; passed unchanged to
            ccdf_by_threshold, ccdf_by_site and ccdf_by_frequency
        threshold: occupancy threshold (column label of `durations`) used for
            the site and frequency panels, also quoted in the caption
        band_type: description of the band, used in the hidden figure title and
            in the caption
        delay_max_ms: largest duration shown on the horizontal axis, in ms

    Returns:

        the figure
    """
    fig, (ax1, ax2, ax3) = subplots(3,1,figsize=(figsize_fullwidth[0], 2.5*figsize_fullwidth[1]))

    def plot_panel(ccdf, ax):
        # shared by all three panels: drop the first bin when the bins start at
        # zero, rescale the index by 1e3 (the axis is labeled in ms), and plot
        # the range up to delay_max_ms
        if DURATION_BINS[0] == 0:
            ccdf = ccdf.iloc[1:]
        ccdf.index = ccdf.index * 1e3
        ccdf.loc[:delay_max_ms].plot(logy=False, ax=ax, marker='.')

    # Use the `durations` argument, not a notebook global, so the figure always
    # reflects the data the caller passed in.
    ccdf, _ = ccdf_by_threshold(durations)
    plot_panel(ccdf, ax1)

    ccdf, _ = ccdf_by_site(durations, threshold=threshold)
    plot_panel(ccdf, ax2)
    ax2.legend(loc='upper right', ncol=2, title='Site')

    ccdf, _ = ccdf_by_frequency(durations, threshold=threshold)
    plot_panel(ccdf, ax3)

    fig.suptitle(f'Empirical CCDFs {band_type}', visible=False)
    fig.supxlabel('Time duration (ms)')
    # only the middle panel carries the y label, so it reads as a shared label
    ax2.set_ylabel(r'Fraction of occupancy events exceeding duration')

    # each axis needs its own locator instances
    for ax in (ax1, ax2, ax3):
        ax.yaxis.set_minor_locator(mpl.ticker.MultipleLocator(0.1))
        ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(0.2))
        ax.xaxis.set_minor_locator(mpl.ticker.MultipleLocator(0.5))

    # the panels are stacked vertically, so the caption refers to them by row
    set_caption(
        fig,
        f"""Empirical CCDFs of occupancy duration in {band_type} across all dates. Plots compare """
        f"""first the choice of occupancy threshold power level (top), then the test sites (middle) """
        f"""and the frequencies (bottom), both based on occupancy threshold {threshold} dBm."""
    )

    return fig

def read_occupancy(path, site=None, year=None, month=None, day=None, duration:dict=None, end=None):
    """Read the occupancy table from parquet, optionally filtered by site and time.

    Arguments:

        path: location passed to pd.read_parquet
        site: if given, keep only rows with this 'Site' value and drop the
            'Site' level from the returned index
        year, month, day: if `year` is given, keep rows with 'Time' on or after
            this date (month and day default to 1); month and day are ignored
            when `year` is None
        duration: keyword arguments for pd.DateOffset, e.g. dict(months=1);
            keeps rows with 'Time' before start + offset and requires `year`
        end: keeps rows with 'Time' before pd.Timestamp(end); ignored when
            `duration` is given

    Returns:

        the DataFrame that was read, with its column labels cast to float32

    Raises:

        ValueError: if `duration` is passed without `year`
    """
    predicates = []

    if site is not None:
        predicates.append(['Site','=',site])

    has_start = year is not None
    if has_start:
        start = pd.Timestamp(year=year, month=month or 1, day=day or 1)
        predicates.append(['Time', '>=', start])

    if duration is not None:
        if not has_start:
            raise ValueError("duration argument not supported unless a start date is passed")
        stop = start + pd.DateOffset(**dict(duration))
        predicates.append(['Time', '<', stop])
    elif end is not None:
        predicates.append(['Time', '<', pd.Timestamp(end)])

    table = pd.read_parquet(
        path,
        filters=predicates if predicates else None,
        use_threads=False,
        buffer_size=1024*1024*256
    )

    if site is not None:
        # every remaining row belongs to the one requested site
        table = table.reset_index('Site', drop=True)

    table.columns = table.columns.astype('float32')

    return table


In [None]:
# Two modes, selected by the EXPORT_DATA_ROOT parameter:
#   - non-empty: write the occupancy durations to one gzip-compressed CSV per site, then stop the notebook
#   - empty: load the whole table and define the globals (durations, DURATION_BINS, fc_map) that the
#     plotting cells below rely on
if EXPORT_DATA_ROOT:
    # [month, year] pairs; only referenced by the commented-out per-month loop below
    dates = [
        [6,2020],[7,2020],[8,2020],[9,2020],[10,2020],[11,2020],[12,2020],
        [1,2021],[2,2021],[3,2021],[4,2021],[5,2021],[6,2021],[7,2021],
        [8,2021],[9,2021]
    ]

    export_dir = Path(EXPORT_DATA_ROOT)/'occupancy durations'#/f'{site:02d}'
    export_dir.mkdir(exist_ok=True, parents=True)
    # candidate site labels: '00' through '12', plus 'hospital'
    possible_sites = [f'{i:02g}' for i in range(13)] + ['hospital']
    for site in possible_sites:
        # for month, year in dates:
        export_file = export_dir/f'{site}.csv.gz'#f'{year}-{month:02d}.csv.gz'

        # if export_file.exists():
        #     continue

        try:
            print(f'site {site}')
            df = read_occupancy(
                Path(DATA_ROOT)/'occupancy durations.parquet',
                site=site,
                # year=year,
                # month=month,
                # duration=dict(months=1)
            )

            # nothing was read for this site label, so no file is written for it
            if df.size == 0:
                continue

            # a chunksize equal to the row count writes the whole table as a single chunk
            df.to_csv(
                export_file,
                chunksize=df.shape[0],
                float_format='%.6g',
                compression=dict(method='gzip', compresslevel=5)
            )

        except:
            # bare on purpose: on any failure or interruption (KeyboardInterrupt included) the possibly
            # incomplete output file is removed before the exception is re-raised
            print(f'removing {str(export_file)}')
            try:
                export_file.unlink()
            except OSError:
                # the file may not have been created yet
                pass
            raise

    # unconditional raise once every site has been exported; it stops a "Run All" here, so none of the
    # cells below execute in export mode (presumably intentional)
    raise Exception('export finished')
else:    
    durations = read_occupancy(Path(DATA_ROOT)/'occupancy durations.parquet')

    # Histogram bin edges starting at 0 in steps of 0.5e-3, ending just short of 0.105. The plotting code
    # scales the bin index by 1e3 and labels the axis in ms, so these are presumably seconds -- confirm.
    DURATION_BINS = np.arange(0, .105, 0.5e-3)

    # Lookup from the values of the first index level rounded to one decimal to the exact stored values,
    # so cells can select rows using rounded numbers (level 0 is presumably 'Frequency' -- confirm).
    fc_map = dict(zip(np.round(durations.index.levels[0],1),durations.index.levels[0]))

In [None]:
# Separate read of the same parquet file restricted to the 'hospital' site (read_occupancy drops the
# 'Site' index level in that case). Not referenced by any cell visible in this part of the notebook.
hospital_durations = read_occupancy(Path(DATA_ROOT)/'occupancy durations.parquet', site='hospital')

In [None]:
# CCDF figure for the LTE uplink channel group.
# `fc_lte_ul` is not defined in the visible cells (presumably supplied by `from environment import *`).
band_type = 'uplink channels of LTE bands'
delay_max = 8  # passed as delay_max_ms: largest duration shown, in ms
threshold = -60
fc_group = fc_lte_ul
# fc_map translates the group's frequencies into the exact values stored in the first index level
group_durations = durations.loc(axis=0)[[fc_map[fc] for fc in fc_group], :]
plot_ccdfs_from_durations(group_durations, threshold=threshold, band_type=band_type, delay_max_ms=delay_max);

In [None]:
# # fun.stack().unstack('Occupancy threshold (dBm)')#groupby(['Site','Month','Occupancy threshold (dBm)']).u)

# # np.abs(fun-0.1).groupby(['Site','Month']).idxmin()
# quantile_low = (
#     np.abs(fun-0.5)
#     .stack()
#     .unstack(level='Duration (ms)')
#     .groupby(level=['Site','Month','Occupancy threshold (dBm)'], group_keys=False)
#     .idxmin(axis=1)
# )
# quantile_low.loc(axis=0)[:,:,-60].unstack(level='Site').droplevel('Occupancy threshold (dBm)').plot()

In [None]:
# CCDF figure for the 2.4 GHz ISM frequency group.
# `fc_ism` is not defined in the visible cells (presumably supplied by `from environment import *`).
band_type = 'the 2.4 GHz ISM band'
delay_max = 8  # passed as delay_max_ms: largest duration shown, in ms
threshold = -60
fc_group = fc_ism

# fc_map translates the group's frequencies into the exact values stored in the first index level
group_durations = durations.loc(axis=0)[[fc_map[fc] for fc in fc_group], :]

plot_ccdfs_from_durations(group_durations, threshold=threshold, band_type=band_type, delay_max_ms=delay_max);

In [None]:
# CCDF figure for the single-frequency 2695 MHz group.
# `fc_quiet` is not defined in the visible cells (presumably supplied by `from environment import *`).
band_type = 'the 2695 MHz passive service band'
threshold = -75
delay_max = 8  # passed as delay_max_ms: largest duration shown, in ms
fc_group = [fc_quiet]

# NOTE(review): unlike the sibling cells, this selection has no trailing ', :' after the list of
# first-level labels -- presumably it selects the same kind of rows; confirm.
group_durations = durations.loc(axis=0)[[fc_map[fc] for fc in fc_group]]

plot_ccdfs_from_durations(group_durations, threshold=threshold, band_type=band_type, delay_max_ms=delay_max);

In [None]:
# CCDF figure for the 5 GHz U-NII frequency group.
# `fc_unii` is not defined in the visible cells (presumably supplied by `from environment import *`).
band_type = '5 GHz U-NII bands'
threshold = -65
delay_max = 8  # passed as delay_max_ms: largest duration shown, in ms
fc_group = fc_unii

# fc_map translates the group's frequencies into the exact values stored in the first index level
group_durations = durations.loc(axis=0)[[fc_map[fc] for fc in fc_group], :]

plot_ccdfs_from_durations(group_durations, threshold=threshold, band_type=band_type, delay_max_ms=delay_max);


In [None]:
def ccdf_by_threshold2(durations):
    """Empirical CCDFs per occupancy threshold, indexed by the right bin edges.

    Bins every column of `durations` on DURATION_BINS, labels the rows with
    DURATION_BINS[1:], and returns only the CCDF frame from ccdf_from_hists
    (the per-column N is discarded).
    """
    right_edges = DURATION_BINS[1:]

    counts_by_threshold = {}
    for level in durations.columns:
        counts_by_threshold[level] = np.histogram(durations.loc[:, level], bins=DURATION_BINS)[0]

    hists = pd.DataFrame(counts_by_threshold, index=right_edges)
    hists.columns.name = 'Occupancy threshold (dBm)'

    ccdf, _N = ccdf_from_hists(hists)
    return ccdf


# gb = group_durations.loc[:,-75.].groupby(
#     [
#         group_durations.index.get_level_values('Time').year,
#         group_durations.index.get_level_values('Time').month
#     ]
# )

# Per-month CCDFs by occupancy threshold, computed on whatever `group_durations` the most recently run
# plotting cell assigned.
df = group_durations.copy()
# label every row with the calendar month of its 'Time' index value
df['Month'] = df.index.get_level_values('Time').to_period('M')
# The result is only displayed as the cell output; it is not assigned to a name.
# NOTE(review): the lambda calls ccdf_by_threshold, not the ccdf_by_threshold2 defined just above in this
# cell -- confirm which one was intended.
df.set_index('Month', append=True).groupby('Month').apply(lambda df: ccdf_by_threshold(df)[0])

#.groupby(pd.Grouper(freq=['Y','M'], level='Time'))