In [None]:
import itertools
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# import tensorflow

## Setting configs

In [None]:
# Directory storing the CSI and WLAN captures
resources_dir = "/Volumes/tim_details/tim_honours/CAPTURES"

# Directory to save plots to
plt_dir = "/Users/timothylee/Desktop/Uni/Yr5/Honours/honours_thesis/figures/plt_figs/"

# Supress pd scientific notation
pd.set_option('display.float_format', '{:.6f}'.format)

# Resolution of plots
plt.rcParams["figure.dpi"] = 100 # 300
# plt.rcParams["figure.dpi"] = 500 # 300

# Backend to generate plots
# mpl.use("agg")
# %matplotlib ipympl
%matplotlib inline

# plt figure style
fig_style = "seaborn-v0_8-whitegrid"

# colormaps
cmap_qual = "pastel"
cmap_seq = "viridis"
cmap_cycl = "twilight"

# Hide warnings
import warnings
warnings.filterwarnings("ignore")


## ML Preprocessing

### Reading in total binned df

In [None]:
# Load the total binned WLAN table (key "wlan") from the HDF5 store, read-only
wlan_store_fp = os.path.join(resources_dir, "total_wlan.h5")
X = pd.read_hdf(wlan_store_fp, key="wlan", mode="r")

### Formatting total binned df as features matrix

In [None]:
# Collapse each (devices, videos, instances) group into one row whose cells
# hold that group's whole series, ordered by ts_bins, for every measure.


def _as_list(series):
    """Return one group's values for a column as a plain Python list."""
    return series.values.tolist()


# Column names are <measure>_<direction>_<frame kind>, generated in this order:
#   direction : up (uplink), dn (downlink), all
#   frame kind: ndat (non-data), dat (data), all
#   measure   : frames, bytes
series_cols = [
    f"{measure}_{direction}_{kind}"
    for direction, kind, measure in itertools.product(
        ("up", "dn", "all"),
        ("ndat", "dat", "all"),
        ("frames", "bytes"),
    )
]

X_sorted = X.sort_values("ts_bins")
X_features = (
    X_sorted
    .groupby(["devices", "videos", "instances"])
    .agg({col: _as_list for col in series_cols})
)

X_features


### Setting Y labels

In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

# One row of label fields per sample, taken from the feature table's index
Y = X_features.index.to_frame().reset_index(drop=True)

# The "devices" field is split on spaces: the first token becomes the device,
# the remaining tokens (re-joined with spaces) become the location.
device_tokens = Y["devices"].str.split(" ")
Y = Y.assign(
    locations=device_tokens.str[1:].str.join(" "),
    devices=device_tokens.str[0],
)

# Target label = the chosen label fields joined with "|"
# TODO: can try out different combinations
labels_to_classify = ["videos", "devices"]
y = np.array(
    ["|".join(fields) for fields in Y[labels_to_classify].to_numpy()]
)

# Encode the labels with a LabelBinarizer
# (LabelEncoder would give an integer encoding instead)
lb = LabelBinarizer()
y_lb = lb.fit_transform(y)

y

### Processing X features matrix

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer

# Columns of X_features stacked as the channels of the model input
feature_channels = ['frames_all_ndat', 'frames_all_dat', 'bytes_all_all']

# The nested lists become an (instance, feature_channel, time) array; the last
# two axes are swapped so the layout is (instance, time, feature_channel).
X_features_matr = np.swapaxes(
    np.array(X_features[feature_channels].values.tolist()),
    1,
    2,
)

# Min-max scale every feature channel using that channel's min and max taken
# over all instances and all time steps.
# NOTE(review): these statistics cover every instance, including the ones that
# are later held out as the test set -- confirm this is intended.
chan_min = X_features_matr.min(axis=(0, 1), keepdims=True)
chan_max = X_features_matr.max(axis=(0, 1), keepdims=True)
X_features_matr_scaled = (
    (X_features_matr - chan_min) / (chan_max - chan_min)
).astype(np.float64, copy=False)

# A constant channel gives 0/0 = nan above; such entries are set to 0
X_features_matr_scaled[np.isnan(X_features_matr_scaled)] = 0

### Visualising features to sense check

In [None]:
# Sense check of the features for the first few labels.
# At most 4 labels are shown (one subplot row each); slicing instead of a
# hard-coded row count avoids an IndexError when there are fewer than 4 labels.
labels_shown = np.unique(y)[:4]
n_channels = X_features_matr_scaled.shape[2]

# Line plots through time: up to 5 scaled instances per (label, channel)
fig = plt.figure(figsize=(8, 4))
# squeeze=False always returns a 2D array of axes, even for a single row/column
axes = fig.subplots(nrows=len(labels_shown), ncols=n_channels, squeeze=False)
for i, lab in enumerate(labels_shown):
    lab_instances = X_features_matr_scaled[y == lab]
    for j, feature in enumerate(feature_channels):
        axes[i, j].plot(lab_instances[:5, :, j].T)
        axes[i, j].set_title(f"{lab}, {feature}")

# Histograms of the (unscaled) values of one instance per label.
# The first instance *of that label* is used so the data matches the subplot
# title; previously instance 0 of the whole data set was drawn in every row.
fig = plt.figure(figsize=(8, 4))
axes = fig.subplots(nrows=len(labels_shown), ncols=n_channels, squeeze=False)
for i, lab in enumerate(labels_shown):
    first_of_lab = X_features_matr[y == lab][0]
    for j, feature in enumerate(feature_channels):
        sns.histplot(
            first_of_lab[:, j],
            element="step",
            bins=50,
            ax=axes[i, j],
        )
        # Clip the count axis so rare values stay visible
        axes[i, j].set_ylim(0, 10)
        axes[i, j].set_title(f"{lab}, {feature}")

### Making Training and Test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set, stratified on the binarised
# labels, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_matr_scaled, y_lb,
    test_size=0.2, random_state=42, stratify=y_lb,
)

# Report the shape of each split: X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Not enabled: flattening each sample to one row for other ML algorithms
# X_train_flat = X_train.reshape(X_train.shape[0], -1)
# X_test_flat = X_test.reshape(X_test.shape[0], -1)

### Conducting PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# PERFORMING PCA ANALYSIS (IN-CASE)
# PCA only accepts a 2D (n_samples, n_features) matrix, so every sample's
# (time, feature_channel) block is flattened into a single row first.
X_train_2d = np.abs(X_train).reshape(X_train.shape[0], -1)
X_test_2d = np.abs(X_test).reshape(X_test.shape[0], -1)

# Fit a 10-component PCA on the training samples only
pca = PCA(n_components=10)
pca.fit(X_train_2d)

# Project both splits and keep only the first n_components components
n_components = 2
X_train_pca = pca.transform(X_train_2d)[:, :n_components]
X_test_pca = pca.transform(X_test_2d)[:, :n_components]

# Explained-variance ratio per fitted component, with a vertical marker drawn
# at x = n_components
plt.plot(pca.explained_variance_ratio_)
plt.vlines(x=n_components, ymin=0, ymax=pca.explained_variance_ratio_.max(), colors=(0.6,0,0.2))
plt.show()

## Evaluating ML Models

In [None]:
def plot_confusion_matrix(
    cm,
    classes,
    title='Confusion matrix',
    cmap=None,
):
    """
    Plot a confusion matrix as a heatmap.

    Parameters
    ----------
    cm : 2D array-like
        Matrix handed to ``sns.heatmap``. Its rows are drawn on the axis
        labelled "True" and its columns on the axis labelled "Predicted".
    classes : sequence
        Tick labels used for both axes.
    title : str
        Title of the axes.
    cmap : colormap name or object, optional
        Colormap for the heatmap. Defaults to the notebook's sequential
        colormap ``cmap_seq`` (the previous default referred to a name that is
        not defined in this notebook, which made the ``def`` itself fail).

    Returns
    -------
    (fig, ax) : the created matplotlib figure and axes.
    """
    # Resolve the default lazily so the function can be defined without
    # touching any global at definition time
    if cmap is None:
        cmap = cmap_seq
    # Initialising figure and axes under the notebook's figure style
    with plt.style.context(fig_style):
        fig = plt.figure(
            figsize=(8, 8),
            layout="constrained"
        )
        ax = fig.subplots()
    # Making confusion matrix heatmap; tick labels come from the `classes`
    # argument rather than a global encoder
    sns.heatmap(
        cm,
        # annot=True,
        ax=ax,
        cmap=cmap,
        fmt='.2f',
        # cbar=False,
        xticklabels=classes,
        yticklabels=classes,
    )
    # Set titles
    ax.tick_params(labelsize="small")
    ax.set_title(title, fontsize="xx-large")
    ax.set_xlabel("Predicted", fontsize="large")
    ax.set_ylabel("True", fontsize="large")
    # Return figure and axis
    return fig, ax

import pickle

def save_model(
    model,
    y_true,
    y_pred,
    name,
):
    """
    Saves the model and its results to a folder (given by name).

    Parameters
    ----------
    model : object
        Fitted model; written with ``pickle`` to ``<name>/<name>.pkl``.
    y_true, y_pred : array-like
        Binarised true and predicted labels. Both are decoded with the
        notebook's global ``lb.inverse_transform`` before being stored.
    name : str
        Name of the sub-folder (created if missing) and stem of both files.

    The results are written as a two-column DataFrame ("y_true", "y_pred") to
    ``<name>/<name>.h5`` under the key "results".
    """
    # NOTE(review): `models_dir` is read as a global but is not defined in the
    # visible part of this notebook -- define it in the config cell before use.
    # Making directory to store model
    my_model_dir = os.path.join(models_dir, name)
    os.makedirs(my_model_dir, exist_ok=True)
    # Storing model as pickle
    model_fp = os.path.join(my_model_dir, f"{name}.pkl")
    with open(model_fp, "wb") as f:
        pickle.dump(model, f)
    # Generating and storing results as HDF5.
    # The `y_true` argument is used here (previously the global `y_test` was
    # read, silently ignoring what the caller passed in).
    res_fp = os.path.join(my_model_dir, f"{name}.h5")
    pd.DataFrame(
        {
            "y_true": lb.inverse_transform(y_true),
            "y_pred": lb.inverse_transform(y_pred),
        }
    ).to_hdf(res_fp, key="results", mode="w")


In [None]:
# SVM classifier with RBF. Not as fast or accurate as KNN
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Making RBF SVM
svc = SVC(
    C=2.0, # Regularisation parameter. Reg strength is inversely proportional to C
    kernel='rbf', # {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable
    # degree=3, # Degree for poly kernels
    gamma='scale', # {‘scale’, ‘auto’} or float
    coef0=0.0, # Independent term in kernel function. It is only significant in ‘poly’ and ‘sigmoid’.
    shrinking=True, # Whether to use the shrinking heuristic
    probability=False, # Allows predict_proba but slows down process
    tol=0.001, # Tolerance for stopping criterion.
    cache_size=200, # Specify the size of the kernel cache in MB
    class_weight=None, # Set the parameter C of class i to class_weight[i]*C. Keep as none for equal weights across classes
    verbose=False, # Enable verbose output
    max_iter=-1, # Hard limit on iterations within solver, or -1 for no limit
    decision_function_shape='ovo', # {‘ovo’, ‘ovr’}
    break_ties=False,
    random_state=None
)
# One binary SVM per column of the binarised labels
svc_mc = OneVsRestClassifier(svc)

# scikit-learn estimators need a 2D (n_samples, n_features) matrix, so each
# sample's (time, feature_channel) block is flattened into one row.
X_train_svm = X_train.reshape(X_train.shape[0], -1)
X_test_svm = X_test.reshape(X_test.shape[0], -1)

# Training the SVM
svc_mc.fit(X_train_svm, y_train)
# svc_mc.fit(X_train_pca, y_train)

# Predicting on the test set
y_pred = svc_mc.predict(X_test_svm)
# y_pred = svc_mc.predict(X_test_pca)

# Showing evaluation confusion matrix.
# confusion_matrix takes (y_true, y_pred): rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(
    lb.inverse_transform(y_test),
    lb.inverse_transform(y_pred),
)
print(cm)
print(classification_report(y_test, y_pred, target_names=lb.classes_))
print(accuracy_score(y_test, y_pred))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# 1D CNN over the full scaled feature matrix.
# The input shape is taken from the data itself, (time, feature_channel), so it
# always matches the number of feature channels in X_train (a fixed channel
# count of 1 does not match the multi-channel matrix built earlier).
model = tf.keras.Sequential([
    layers.Input(shape=X_train.shape[1:]),
    layers.Conv1D(8, 5, padding="valid", activation='relu'),
    layers.MaxPooling1D(2),
    layers.Conv1D(16, (3,), padding="valid", activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # One softmax output per class known to the label binarizer
    layers.Dense(len(lb.classes_), activation='softmax')
])

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(),
    ],
)

# Show model architecture
model.summary()

# Training the model (20% of the training samples are used for validation)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=16,
    validation_split=0.2,
    verbose=True,
)

# Evaluating the model on the test set:
# class probabilities -> index of the most likely class -> one-hot rows
y_pred = model.predict(X_test)
y_pred = tf.keras.utils.to_categorical(y_pred.argmax(axis=1), len(lb.classes_)).astype(int)

# confusion_matrix takes (y_true, y_pred): rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(
    lb.inverse_transform(y_test),
    lb.inverse_transform(y_pred)
)
print(cm)
print(classification_report(y_test, y_pred, target_names=lb.classes_))
print(accuracy_score(y_test, y_pred))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Making MLP model with PCA.
# The network is trained on the PCA-reduced features (X_train_pca), which is
# also what the test-set evaluation of this model feeds it (X_test_pca); the
# input size is therefore the number of kept PCA components.
model = tf.keras.Sequential([
    layers.Input(shape=(X_train_pca.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    # One softmax output per class known to the label binarizer
    layers.Dense(len(lb.classes_), activation='softmax')
])

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    # loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        # tf.keras.metrics.Accuracy(),
        tf.keras.metrics.CategoricalAccuracy(),
    ],
)

# Show model architecture
# model.summary()

# Training the model (20% of the training samples are used for validation)
history = model.fit(
    X_train_pca,
    y_train,
    epochs=10,
    batch_size=16,
    validation_split=0.2,
    verbose=True,
)

# Evaluating the model on the TRAINING set (a fit check, not a test score):
# class probabilities -> index of the most likely class -> one-hot rows
y_pred = model.predict(X_train_pca)
y_pred = tf.keras.utils.to_categorical(y_pred.argmax(axis=1), len(lb.classes_)).astype(int)

# confusion_matrix takes (y_true, y_pred): rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(
    lb.inverse_transform(y_train),
    lb.inverse_transform(y_pred)
)
print(cm)
print(classification_report(y_train, y_pred, target_names=lb.classes_))
print(accuracy_score(y_train, y_pred))

In [None]:
# Evaluating the current `model` on the PCA-reduced test set:
# class probabilities -> index of the most likely class -> one-hot rows
y_pred = model.predict(X_test_pca)
y_pred = tf.keras.utils.to_categorical(y_pred.argmax(axis=1), len(lb.classes_)).astype(int)

# confusion_matrix takes (y_true, y_pred): rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(
    lb.inverse_transform(y_test),
    lb.inverse_transform(y_pred)
)
print(cm)
# Class names come from the fitted label binarizer
print(classification_report(y_test, y_pred, target_names=lb.classes_))
print(accuracy_score(y_test, y_pred))