In [2]:
import pandas as pd
import numpy as np
import os
import itertools
from glob import glob

import seaborn as sns
import matplotlib.pyplot as plt

## Mounting Google Drive and setting resources path (path to captures)

In [20]:
# Colab-only setup: mount Google Drive so the capture files are reachable
# from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Root directory of the captures on the mounted drive.
# NOTE(review): the "Setting configs" cell also assigns `resources_dir`;
# whichever of the two cells runs last decides the path used downstream.
resources_dir = '/content/drive/MyDrive/honours/captures'

# Sanity check: prints True when the captures directory exists.
print(os.path.exists(resources_dir))

Mounted at /content/drive
True


## Setting configs

In [3]:
# Directory storing the CSI and WLAN captures
# NOTE(review): absolute, machine-specific path; it replaces any
# `resources_dir` set by the Google Drive cell when this cell runs after it.
resources_dir = "/Volumes/tim_details/tim_honours/CAPTURES"

# Directory to save plots to (absolute, machine-specific path)
plt_dir = "/Users/timothylee/Desktop/Uni/Yr5/Honours/honours_thesis/figures/plt_figs/"

# Suppress pandas scientific notation: floats are shown with 6 decimal places
pd.set_option('display.float_format', '{:.6f}'.format)

# Resolution of plots (dots per inch)
plt.rcParams["figure.dpi"] = 100 # 300
# plt.rcParams["figure.dpi"] = 500 # 300

# Backend to generate plots (alternatives kept for reference)
# mpl.use("agg")
# %matplotlib ipympl
%matplotlib inline

# plt figure style (name of a matplotlib style sheet)
fig_style = "seaborn-v0_8-whitegrid"

# Colormap / palette names: qualitative, sequential and cyclic
cmap_qual = "pastel"
cmap_seq = "viridis"
cmap_cycl = "twilight"

# Hide all warnings
# NOTE(review): this also hides deprecation warnings raised by pandas/numpy.
import warnings
warnings.filterwarnings("ignore")


## Combining Dataset Functions

* Binning and Aggregating
* Combining Different Filters
* Handling CSI

In [69]:
def ts_bin_df(df, interval, end_time, agg_dict):
    """
    Bins rows of `df` into fixed-width time intervals and aggregates each bin.

    Rows are assigned to bins by their "frame.time_relative" value. The bin
    edges are 0, 1*interval, 2*interval, ... and each bin is labelled with its
    left edge. Bins that contain no rows are still present in the result, and
    only bins whose left edge is strictly below `end_time` are returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "frame.time_relative" and every column named in
        `agg_dict`. The input frame is not modified.
    interval : float
        Width of each bin, in the same unit as "frame.time_relative".
    end_time : float
        Exclusive upper bound on the left edge of the returned bins.
    agg_dict : dict
        Mapping of column name -> aggregation, passed to `GroupBy.agg`.

    Returns
    -------
    pd.DataFrame
        One row per bin, sorted by index, holding the aggregated columns and a
        "ts_bins" column with the left edge of the bin. For an empty `df` the
        result is an empty frame that has only the `agg_dict` columns (no
        "ts_bins" column).
    """
    df = df.copy()
    time_series = df["frame.time_relative"]
    # If the df is empty (i.e. empty capture) there is nothing to bin
    if time_series.shape[0] == 0:
        return pd.DataFrame(columns=list(agg_dict.keys()))

    # Bin edges from 0 up to whichever is larger: the end_time, or one
    # interval past the (rounded-up) last timestamp so every row has a bin
    bins = np.arange(
        0,
        np.max([np.ceil(time_series.max()) + interval, end_time]),
        interval,
    )
    # Labelling each row with the left edge of the bin it falls into
    df["ts_bins"] = pd.cut(
        time_series,
        bins=bins,
        include_lowest=True,
        labels=bins[:-1],
    )

    # Grouping and aggregating data on time bins.
    # observed=False is spelled out so that empty bins are aggregated too,
    # regardless of the default of the installed pandas version.
    df_binned = df.groupby("ts_bins", observed=False).agg(agg_dict)
    # Ensuring that there is a row for every bin edge (even if it is empty)
    df_binned = (
        df_binned
        .merge(
            pd.Series(bins, name="ts_bins"),
            how="right",
            left_index=True,
            right_on="ts_bins",
        )
        .sort_index()
    )
    # Keeping only the bins that start before end_time (previously this bound
    # was hard-coded to 180, silently ignoring the end_time argument)
    df_binned = df_binned[df_binned["ts_bins"] < end_time]
    return df_binned


def bin_filt_df(df, interval, end_time, agg_dict):
    """
    Builds one wide binned frame covering every direction / frame-type filter.

    For each combination of direction ("up", "dn", "all") and 802.11 frame
    type ("ndat", "dat", "all"), the matching rows of `df` are binned with
    ts_bin_df. The columns of each result get the suffix
    "_<direction>_<frame type>" and are outer-joined on the index into a
    single frame, which is returned.
    """
    # Accepted "is_upstream" values per direction label (None = keep all rows)
    direction_filters = {"up": (True,), "dn": (False,), "all": None}
    # Accepted "wlan.fc.type" values per frame-type label (None = keep all rows)
    # Frame Type: https://en.wikipedia.org/wiki/802.11_Frame_Types
    frame_type_filters = {"ndat": (0, 1), "dat": (2,), "all": None}

    combined = pd.DataFrame()
    for dir_name, dir_values in direction_filters.items():
        for type_name, type_values in frame_type_filters.items():
            # Narrowing the rows down, one filter at a time
            subset = df
            if dir_values:
                keep_direction = subset["is_upstream"].isin(dir_values)
                subset = subset[keep_direction]
            if type_values:
                keep_type = subset["wlan.fc.type"].isin(type_values)
                subset = subset[keep_type]
            # Binning the subset and tagging its columns with the filter names
            binned = ts_bin_df(subset, interval, end_time, agg_dict)
            binned = binned.add_suffix(f"_{dir_name}_{type_name}")
            # Joining onto the columns collected so far
            combined = combined.merge(
                right=binned,
                how="outer",
                left_index=True,
                right_index=True,
            )
    return combined


def df_to_csi_matrix(df):
    """
    Builds a complex CSI matrix from the per-subcarrier real/imaginary columns.

    The number of subcarriers is half the number of columns whose name
    contains "csi_". Subcarrier k is read from the columns "csi_<k>_r" (real
    part) and "csi_<k>_i" (imaginary part).

    Returns a np.complex64 array of shape (number of rows, number of
    subcarriers).
    """
    n_frames = df.shape[0]
    # Each subcarrier contributes two "csi_" columns (real and imaginary)
    n_subcarriers = sum("csi_" in col for col in df.columns) // 2

    csi_matrix = np.zeros((n_frames, n_subcarriers), dtype=np.complex64)
    for sub in range(n_subcarriers):
        real_part = df[f"csi_{sub}_r"]
        imag_part = df[f"csi_{sub}_i"]
        csi_matrix[:, sub] = (real_part + 1j * imag_part).astype(np.complex64)
    return csi_matrix

### Setting Labels to Combine On

In [None]:
# Capture folder name of each device setup -> label stored in the combined data
devices = {
    "client_pc_50": "PC 50 cm",
    "client_pc_200": "PC 200 cm",
    "client_pc_200_wall": "PC 200 cm (wall)",
    "client_pi_50":  "RPi4 50 cm",
    "client_pi_200": "RPi4 200 cm",
    "client_pi_200_wall": "RPi4 200 cm (wall)",
}
# Capture folder name of each video -> label stored in the combined data
videos = {
    "v=3InbMow9IYo": "Vid 1",
    "v=A3gUpodXMv0": "Vid 2",
    "v=NSW5u1RTxEA": "Vid 3",
    "v=gxxqdrrpgZc": "Vid 4",
    "v=mkWKZWMokdI": "Vid 5",
    "v=t634q_Voeto": "Vid 6",
    "v=t6jlhqNxRYk": "Vid 7",
    "v=w_oGIbFjiCo": "Vid 8",
    "v=yve6qo6eowU": "Vid 9",
}
# One row per (device, video) combination, holding the dictionary keys.
# Built in a single call rather than appended row by row.
labels = pd.DataFrame(
    list(itertools.product(devices, videos)),
    columns=["devices", "videos"],
)

## Combining WLAN binned data into a single H5

In [74]:
# Time intervals and duration to keep
interval = 0.36 # Binning interval in seconds
end_time = 180

# Grouping and aggregating data on time bins
agg_dict = {
    "frames": "count", # #frames
    "bytes": "sum", # #bytes
}

# Binned frames of every capture; concatenated once after the loop, because
# growing X with pd.concat on every iteration re-copies all previous rows
binned_parts = []
# For each label combination
for i in labels.index:
    # Getting the particular label details
    row = labels.loc[i]
    print(row.values)
    # For each capture in the label combination folder
    # (sorted, because glob returns files in arbitrary order)
    capture_glob = os.path.join(
        resources_dir, row["devices"], row["videos"], "wlan_h5", "*.h5"
    )
    for fp in sorted(glob(capture_glob)):
        # Reading the wlan h5 file
        df = pd.read_hdf(
            fp,
            key="wlan",
            mode="r",
        )

        # Not including captures that are too short (i.e. invalid capture).
        # Written as "not >=" so that an empty capture, whose max is NaN,
        # is skipped as well instead of slipping through the comparison.
        if not df["frame.time_relative"].max() >= end_time*0.9:
            continue

        # Adding a column to track the frame count
        df["frames"] = np.arange(df.shape[0])
        # renaming columns
        df = df.rename(columns={"frame.len": "bytes"})

        # Binning df with different filters
        df_binned = bin_filt_df(df, interval, end_time, agg_dict)

        # Imputing missing values
        df_binned = df_binned.fillna(0) # all (set to 0)
        # Setting the ts_bins index as a column
        df_binned["ts_bins"] = df_binned.index

        # Adding label combo to group instances
        df_binned["devices"] = devices[row['devices']]
        df_binned["videos"] = videos[row['videos']]
        df_binned["instances"] = os.path.split(fp)[1]

        # Keeping the instance's binned df for the final concatenation
        binned_parts.append(df_binned)

# Making combined binned DF of all labels and instances
# (an empty frame when no capture qualified, as pd.concat rejects an empty list)
X = (
    pd.concat(binned_parts, axis=0, ignore_index=True)
    if binned_parts
    else pd.DataFrame()
)

# Saving to h5 file
# NOTE(review): the "Checking correct timebins" cell reads total_wlan.h5, so
# this block has to be run once before that cell can work.
# X.to_hdf(
#     os.path.join(resources_dir, "total_wlan.h5"),
#     key="wlan",
#     mode="w"
# )

['client_pc_50' 'v=3InbMow9IYo']
['client_pc_50' 'v=A3gUpodXMv0']
['client_pc_50' 'v=NSW5u1RTxEA']
['client_pc_50' 'v=gxxqdrrpgZc']
['client_pc_50' 'v=mkWKZWMokdI']


Unnamed: 0,frames_up_ndat,bytes_up_ndat,ts_bins_up_ndat,frames_up_dat,bytes_up_dat,ts_bins_up_dat,frames_up_all,bytes_up_all,ts_bins_up_all,frames_dn_ndat,...,frames_all_dat,bytes_all_dat,ts_bins_all_dat,frames_all_all,bytes_all_all,ts_bins_all_all,ts_bins,devices,videos,instances
0,0.000000,0.000000,0.000000,1.000000,43.000000,0.000000,1.000000,43.000000,0.000000,0.000000,...,1.000000,43.000000,0.000000,1.000000,43.000000,0.000000,0,PC 50 cm,Vid 1,cap_21.h5
1,0.000000,0.000000,0.360000,0.000000,0.000000,0.360000,0.000000,0.000000,0.360000,0.000000,...,0.000000,0.000000,0.360000,0.000000,0.000000,0.360000,1,PC 50 cm,Vid 1,cap_21.h5
2,1.000000,45.000000,0.720000,3.000000,357.000000,0.720000,4.000000,402.000000,0.720000,0.000000,...,3.000000,357.000000,0.720000,4.000000,402.000000,0.720000,2,PC 50 cm,Vid 1,cap_21.h5
3,5.000000,350.000000,1.080000,0.000000,0.000000,1.080000,5.000000,350.000000,1.080000,8.000000,...,0.000000,0.000000,1.080000,13.000000,2630.000000,1.080000,3,PC 50 cm,Vid 1,cap_21.h5
4,6.000000,420.000000,1.440000,0.000000,0.000000,1.440000,6.000000,420.000000,1.440000,1.000000,...,0.000000,0.000000,1.440000,7.000000,688.000000,1.440000,4,PC 50 cm,Vid 1,cap_21.h5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,0.000000,0.000000,178.200000,1.000000,234.000000,178.200000,1.000000,234.000000,178.200000,0.000000,...,1.000000,234.000000,178.200000,1.000000,234.000000,178.200000,495,PC 50 cm,Vid 5,cap_26.h5
11496,0.000000,0.000000,178.560000,3.000000,220.000000,178.560000,3.000000,220.000000,178.560000,0.000000,...,4.000000,454.000000,178.560000,4.000000,454.000000,178.560000,496,PC 50 cm,Vid 5,cap_26.h5
11497,0.000000,0.000000,178.920000,3.000000,232.000000,178.920000,3.000000,232.000000,178.920000,0.000000,...,3.000000,232.000000,178.920000,3.000000,232.000000,178.920000,497,PC 50 cm,Vid 5,cap_26.h5
11498,0.000000,0.000000,179.280000,1.000000,136.000000,179.280000,1.000000,136.000000,179.280000,0.000000,...,1.000000,136.000000,179.280000,1.000000,136.000000,179.280000,498,PC 50 cm,Vid 5,cap_26.h5


## Combining CSI binned data into a single H5

In [84]:
# Time intervals and duration to keep
interval = 0.36 # Binning interval in seconds
end_time = 180

# Grouping and aggregating data on time bins:
# frame count, plus mean amplitude ("_a") and mean phase ("_p") for each of
# the 64 subcarriers
agg_dict = {
    "frames": "count" # #frames
}
for sub in range(64):
    agg_dict[f"csi_{sub}_a"] = "mean"
    agg_dict[f"csi_{sub}_p"] = "mean"

# Binned frames of every capture; concatenated once after the loop, because
# growing X with pd.concat on every iteration re-copies all previous rows
binned_parts = []
# For each label combination
for i in labels.index:
    # Getting the particular label details
    row = labels.loc[i]
    print(row.values)
    # For each capture in the label combination folder
    # (sorted, because glob returns files in arbitrary order)
    capture_glob = os.path.join(
        resources_dir, row["devices"], row["videos"], "csi_h5", "*.h5"
    )
    for fp in sorted(glob(capture_glob)):
        # Reading the csi h5 file
        df = pd.read_hdf(
            fp,
            key="csi",
            mode="r",
        )

        # Not including captures that are too short (i.e. invalid capture).
        # Written as "not >=" so that an empty capture, whose max is NaN,
        # is skipped as well instead of slipping through the comparison.
        if not df["frame.time_relative"].max() >= end_time*0.9:
            continue

        # Converting df CSI columns from "r" and "i" columns to amplitude/phase
        # Getting complex CSI matrix
        csi = df_to_csi_matrix(df)
        # Dropping old CSI columns from DF
        df = df.loc[:, ["csi_" not in col for col in df.columns]]
        # Building all amplitude and phase columns first and attaching them in
        # one step. The loop variable is not `i`, so the label loop's index is
        # no longer overwritten inside its own body.
        csi_cols = {}
        for sub in range(csi.shape[1]):
            csi_cols[f"csi_{sub}_a"] = np.abs(csi[:, sub])
            csi_cols[f"csi_{sub}_p"] = np.angle(csi[:, sub])
        df = pd.concat([df, pd.DataFrame(csi_cols, index=df.index)], axis=1)

        # Adding a column to track the frame count
        df["frames"] = np.arange(df.shape[0])

        # Binning df (no direction / frame-type filters for CSI)
        df_binned = ts_bin_df(df, interval, end_time, agg_dict)

        # Imputing missing values
        df_binned["frames"] = df_binned["frames"].fillna(0) # Frames (set to 0)
        # NOTE(review): with the default arguments, empty bins before the
        # first non-empty bin are presumably left as NaN - confirm.
        df_binned = df_binned.interpolate(method="linear", axis=0) # CSI (interpolation)
        # Setting the ts_bins index as a column
        df_binned["ts_bins"] = df_binned.index

        # Adding label combo to group instances
        df_binned["devices"] = devices[row['devices']]
        df_binned["videos"] = videos[row['videos']]
        df_binned["instances"] = os.path.split(fp)[1]

        # Keeping the instance's binned df for the final concatenation
        binned_parts.append(df_binned)

# Making combined binned DF of all labels and instances
# (an empty frame when no capture qualified, as pd.concat rejects an empty list)
X = (
    pd.concat(binned_parts, axis=0, ignore_index=True)
    if binned_parts
    else pd.DataFrame()
)

# Saving to h5 file
# X.to_hdf(
#     os.path.join(resources_dir, "total_csi.h5"),
#     key="csi",
#     mode="w"
# )

['client_pc_200_wall' 'v=3InbMow9IYo']
['client_pc_200_wall' 'v=A3gUpodXMv0']
['client_pc_200_wall' 'v=NSW5u1RTxEA']
['client_pc_200_wall' 'v=gxxqdrrpgZc']
['client_pc_200_wall' 'v=mkWKZWMokdI']


## Checking correct timebins

In [None]:
# Loading the combined WLAN dataset written by the WLAN combining step
wlan_h5_path = os.path.join(resources_dir, "total_wlan.h5")
X = pd.read_hdf(wlan_h5_path, key="wlan", mode="r")

# Columns that together identify a single capture instance
instance_cols = ["devices", "videos", "instances"]

# Number of distinct capture instances, shown as a shape tuple
unique_instances = pd.MultiIndex.from_frame(X[instance_cols]).unique()
display(unique_instances.shape)

# Per instance: how many time bins there are and the largest bin value
bins_per_instance = X.groupby(instance_cols).agg({"ts_bins": ["count", "max"]})
display(bins_per_instance)

