# Compressed sensing using GMMs on GNFUV dataset 

Dataset found at: https://archive.ics.uci.edu/ml/datasets/GNFUV+Unmanned+Surface+Vehicles+Sensor+Data+Set+2#
The dataset comprises four (4) sets of mobile sensor readings (humidity, temperature), one for each vehicle in a swarm of four (4) Unmanned Surface Vehicles (USVs).
The swarm of USVs moves along a pre-defined GPS trajectory, whose relative way-points are shown in the figure accompanying the dataset. The USVs float on the sea surface in a coastal area of Athens (Greece).


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import GaussianMixture
import joblib
from tqdm import tqdm
import pandas as pd
import argparse
import os
import pandas 

Helper functions reused from Nimay's BTP code.

In [None]:
def weighted_l2(v, mat):
    """Return the quadratic form ``v.T @ mat^-1 @ v``.

    When ``mat`` is rank-deficient its Moore-Penrose pseudo-inverse is used
    in place of the (non-existent) inverse.
    """
    rank_deficient = np.linalg.matrix_rank(mat) < mat.shape[0]
    invert = np.linalg.pinv if rank_deficient else np.linalg.inv
    return v.T @ invert(mat) @ v

def decode(model, A, y, sigma = 1e-7):
    """Reconstruct a signal from measurements ``y`` using a GMM prior.

    For every mixture component ``j`` the closed-form linear estimate
    ``var_j A^T (A var_j A^T + sigma I)^-1 (y - A mu_j) + mu_j`` is computed
    and scored with the cost

        weighted_l2(y - A x_j, sigma I) + weighted_l2(x_j - mu_j, var_j)
        + log det(var_j)

    The estimate of the component with the lowest cost is returned.

    Parameters
    ----------
    model : object exposing ``means_`` and ``covariances_`` indexed by
        component (e.g. a fitted ``GaussianMixture``).
    A : sensing matrix; ``A.shape[0]`` is the number of measurements.
    y : measurement vector.
    sigma : variance of the measurement noise (noise covariance is
        ``sigma * I``).

    Returns
    -------
    The row of the per-component estimates that minimises the cost.
    """
    n_components = model.means_.shape[0]
    x_hat = np.empty(model.means_.shape)
    # A component whose cost cannot be evaluated keeps +inf and is therefore
    # never preferred over a component with a finite cost.
    cost = np.full(n_components, np.inf)
    var_noise = sigma * np.eye(A.shape[0])

    for j in range(n_components):
        var_j, mu_j = model.covariances_[j], model.means_[j]
        gain = var_j @ A.T @ np.linalg.inv(A @ var_j @ A.T + var_noise)
        x_hat_j = gain @ (y - A @ mu_j) + mu_j
        x_hat[j] = x_hat_j

        # slogdet avoids the under/overflow of det() on large covariances.
        # The log-det term is skipped (0) when the determinant is not
        # positive; the previous np.log(..., where=...) call left the value
        # uninitialised in that case.
        sign, log_det = np.linalg.slogdet(var_j)
        log_det_term = log_det if sign > 0 else 0.0

        try:
            cost[j] = (
                weighted_l2(y - A @ x_hat_j, var_noise)
                + weighted_l2(x_hat_j - mu_j, var_j)
                + log_det_term
            )
        except np.linalg.LinAlgError:
            # Keep the diagnostic output; cost[j] stays at +inf.
            print("y - A @ x_hat_j",y - A @ x_hat_j , "\n", var_noise,"\n",y,A)

    return x_hat[np.argmin(cost)]

def PSNR(x_, x_t):
    """Return (mean, min, max, std) of the row-wise PSNR between two arrays.

    The error ``x_ - x_t`` is normalised by the dynamic range of ``x_``
    (its global max minus its global min) before the mean squared error is
    taken along axis 1. The per-row MSE is also printed.
    """
    assert x_.shape == x_t.shape
    dynamic_range = np.max(x_) - np.min(x_)
    normalised_err = (x_ - x_t) / dynamic_range
    mse = np.mean(normalised_err**2, axis=1)
    print(type(mse), mse)
    psnr = -10*np.log10(mse)
    summaries = (np.mean, np.min, np.max, np.std)
    return tuple(summarise(psnr) for summarise in summaries)

def plot_data(xs, ys, path = 'PSNR.png', ylabel='Avg PSNR'):
    """Plot ``ys`` against ``xs``, annotate every point and save the figure.

    Each marker is labelled with its ``(x, y)`` pair (y rounded to two
    decimals). The figure is written to ``path`` and then closed.
    """
    plt.plot(xs, ys, marker="o")
    plt.grid(True)
    plt.xlabel('# of measurements')
    plt.ylabel(ylabel)
    for n_meas, value in zip(xs, ys):
        plt.annotate(
            f"({n_meas},{round(value,2)})",
            (n_meas, value),
            textcoords="offset points",
            xytext=(-5,10),
            ha='center',
        )
    plt.savefig(path)
    plt.close()
    

In [None]:
def mmse(model, A, sigma = 1e-5):
    """Return the weight-averaged error trace of ``model`` under matrix ``A``.

    For each component covariance ``C`` the quantity
    ``trace(C - C A^T (A C A^T + sigma I)^-1 A C)`` is computed, and the
    results are summed using ``model.weights_`` as coefficients.
    """
    noise_cov = sigma * np.eye(A.shape[0])

    def component_error(cov):
        # Trace of the covariance left after subtracting the part explained
        # by the measurements.
        gain = cov @ A.T @ np.linalg.inv(A @ cov @ A.T + noise_cov)
        return np.trace(cov - gain @ A @ cov)

    return sum(
        model.weights_[k] * component_error(model.covariances_[k])
        for k in range(model.means_.shape[0])
    )

def optimize_mat(model, A_, max_iter = 20):
    """Greedily flip entries of a binary sensing matrix to lower ``mmse``.

    Every entry is visited in row-major order; an entry is flipped
    (``a -> 1 - a``) and the flip is kept only if it strictly lowers
    ``mmse(model, A)``. Sweeps are repeated until a full sweep keeps no
    flip, or ``max_iter`` sweeps have run.

    Parameters
    ----------
    model : mixture model passed through to ``mmse``.
    A_ : initial matrix; it is copied, not modified.
    max_iter : maximum number of sweeps over the matrix (default 20).

    Returns
    -------
    The optimised copy of ``A_``.
    """
    A = A_.copy()
    # The cost of the current matrix is carried between entries instead of
    # being recomputed before every trial flip.
    current_cost = mmse(model, A)
    for _ in tqdm(range(max_iter)):
        kept_flips = 0
        for i in range(A.shape[0]):
            for j in range(A.shape[1]):
                A[i, j] = 1 - A[i, j]
                trial_cost = mmse(model, A)
                if current_cost <= trial_cost:
                    # No improvement: undo the flip.
                    A[i, j] = 1 - A[i, j]
                else:
                    current_cost = trial_cost
                    kept_flips += 1
        # A sweep without any kept flip means no single flip can improve.
        if kept_flips == 0:
            break
    return A

The time-series data of the GNFUV dataset is arranged into patches. For patch_size = 40, we take 20 consecutive time instances and build the array \[ humidity_1, temperature_1, ... , humidity_20, temperature_20 \]

In [None]:
############## Patch size
# Two readings per time instance, 20 time instances per patch.
patch_size = 2 * 20


### Data preprocessing
The dataset has some empty values, which we remove. We also sort the dataset by time, since the data we transmit together would be temporally close.

In [None]:
# Gather fixed-length patches from every USV log of both experiments.
full_data = []

for experiment_number in (1, 2):
    for USV_number in (2, 3, 4, 5):
        path = "CNFUV_Datasets/Datasets/Data_Experiment_%d/pi%d.xlsx"% (experiment_number, USV_number)
        dataframe = pandas.read_excel(path)

        # Cells holding the text "None" (with or without a leading space)
        # are blanked out, then every row with a missing value is dropped.
        missing = dataframe.eq(" None") | dataframe.eq("None")
        dataframe = dataframe.mask(missing).dropna()

        # Chronological order, keeping only the two sensor columns.
        dataframe = dataframe.sort_values(by=['time'])[['Humidity','Temperature']]

        # Keep a whole number of patch_size-row groups, then lay the
        # readings out row by row and cut them into patches.
        n_rows = (dataframe.shape[0] // patch_size) * patch_size
        data = dataframe[:n_rows].to_numpy().flatten().reshape(-1, patch_size)
        print(data.shape)
        full_data.extend(data)

full_data = np.array(full_data)


We split the dataset into train and test sets in a 90:10 ratio and set the other parameters.

In [None]:
# 90:10 split; n_train is floor(N / 10) * 9 and the remainder is the test set.
n_train = full_data.shape[0]//10 *9
print(n_train,full_data.shape[0])
n_test = full_data.shape[0] - n_train
n_component_overall = 5  # n_components of the baseline (unoptimised) GMM
cnt = 5
n_init = 5  # n_init passed to every GaussianMixture fit

to_train = 1
use_mat = 0

# Shuffle the patches of the whole dataset (rows of full_data) in place
# before splitting. Shuffling only the last file's patch array would leave
# full_data in file order, so the test set would come entirely from the
# files read last.
np.random.shuffle(full_data)
train_data = full_data[:n_train]
test_data = full_data[n_train:]

In [None]:
# Report the size of the full dataset and of both splits.
for label, value in (
    ("full_data.shape: ", full_data.shape),
    ("len(train_data): ", len(train_data)),
    ("len(test_data): ", len(test_data)),
):
    print(label, value)

Now, we consider the unoptimised case and the optimised case. In the optimised case, we iterate over a range of possible n_components and keep the best fit according to the BIC criterion. We also optimise the sensing matrix.

In [None]:
# Random binary sensing matrix with patch_size//5 measurements per patch.
A  = np.random.binomial(1, 0.5, size=(patch_size//5,patch_size))

# Accumulators over all runs; they are divided by `runs` afterwards.
unoptimised = 0
optimised = 0
val_signal_mean = 0
runs = 20
folder_name = 'GNFUV_results/latest/'
for run in range(runs):
    if not to_train:
        continue

    # Baseline: fixed number of components, random initialisation,
    # random (unoptimised) sensing matrix.
    model = GaussianMixture(n_components=n_component_overall, n_init=n_init,  max_iter=200, init_params='random')
    model.fit(train_data)

    # Optimised prior: choose n_components by the lowest BIC. The winner is
    # tracked in its own variable so that the baseline configuration
    # (n_component_overall) is not changed for the following runs.
    lowest_bic = np.inf
    best_gmm = None
    best_n_components = None
    bic = []
    n_components_range = range(1, 10)

    for n_component in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(
            n_components=n_component, n_init=n_init,  max_iter=200,
        )
        gmm.fit(train_data)
        bic.append(gmm.bic(train_data))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
            best_n_components = n_component
    model_opt = best_gmm
    print("n_component_overall: ",best_n_components)

    # The sensing matrix is optimised for the model that decodes with it.
    A_opt = optimize_mat(model_opt, A)

    patch_err = []
    patch_err_opt = []
    for x in test_data:
        y = A @ x
        y_opt = A_opt @ x
        x_hat = decode(model, A, y)
        x_hat_opt = decode(model_opt, A_opt, y_opt)
        # Mean Absolute Percentage Error (MAPE)
        patch_err.append(np.mean(np.abs( np.divide(x - np.round(x_hat),x) )))
        patch_err_opt.append(np.mean(np.abs( np.divide(x - np.round(x_hat_opt),x) )))

    val_err = np.mean(patch_err)
    unoptimised +=val_err
    val_err_opt = np.mean(patch_err_opt)
    optimised += val_err_opt
    val_signal_mean += np.mean(test_data)
print("Done")

Print the percentage errors for the unoptimised and optimised case

In [None]:
# Average the per-run accumulators over the number of runs.
val_err, val_err_opt = unoptimised/runs, optimised/runs
val_signal_mean = val_signal_mean / runs
print(val_err, val_err_opt, val_signal_mean)
# The same errors expressed as percentages.
print("Percentage error: ", 100*val_err, 100*val_err_opt)
    

Play a sound after a long simulation ends to alert the user that it is completed

In [None]:
duration = 3000  # milliseconds
freq = 440  # Hz
try:
    # winsound is only available on Windows builds of Python.
    import winsound
except ImportError:
    # Best-effort fallback on other platforms: emit the terminal bell
    # character instead of aborting the cell. It may be silent depending
    # on the front end.
    print("\a", end="")
else:
    winsound.Beep(freq, duration)

In [None]:
# Overlay the last test patch and its reconstruction from the optimised case.
for series, name, colour in (
    (x, 'Original', 'C0'),
    (x_hat_opt, 'Reconstructed', 'C1'),
):
    plt.plot(series, label=name, color=colour)
# Dump the last measurements, patch and unoptimised estimate for inspection.
print(y, x, "\nx_hat", x_hat)

Code dump (partially modified code and older versions of the code; users other than the author can ignore it)