## Scikit-learn

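For each non-overlapping window of 10 epochs, run KMeans (K = 3) on the motes, using each mote's temperature, humidity, light and voltage readings within that window as its feature vector. Motes that end up in a cluster of at most 2 members are flagged as outlier (noisy) motes for that window.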

In [3]:
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict

# Load the four tab-separated sensor tables. Each is laid out as
# [epoch, mote], i.e. one row per epoch and one column per mote.
all_temperatures = np.genfromtxt(
    "mote_data/temp.csv", delimiter='\t', skip_header=0
)[:487, :]  # keep only the first 487 epochs -> shape (487, 48)

all_humidities = np.loadtxt("mote_data/humidity.txt", delimiter='\t')  # (487, 48)
all_lights = np.loadtxt("mote_data/light.txt", delimiter='\t')         # (487, 48)
all_voltages = np.loadtxt("mote_data/voltage.txt", delimiter='\t')     # (487, 48)

# Sanity check: every table is expected to report shape (487, 48).
for label, table in (
    ("Temperatures shape:", all_temperatures),
    ("Humidities shape:  ", all_humidities),
    ("Lights shape:      ", all_lights),
    ("Voltages shape:    ", all_voltages),
):
    print(label, table.shape)

Temperatures shape: (487, 48)
Humidities shape:   (487, 48)
Lights shape:       (487, 48)
Voltages shape:     (487, 48)


In [4]:
WINDOW_SIZE = 10          # epochs per non-overlapping window
K = 3                     # number of KMeans clusters fitted in each window
OUTLIER_CLUSTER_SIZE = 2  # clusters with at most this many motes are treated as outliers


# key   = starting epoch of the window
# value = list of mote indices flagged as outliers in that window
outliers_by_window = defaultdict(list)

num_epochs = all_temperatures.shape[0]  # 487
num_motes = all_temperatures.shape[1]   # 48

# Running count of full windows processed (it is NOT the dictionary key).
window_index = 0

for start in range(0, num_epochs, WINDOW_SIZE):
    end = start + WINDOW_SIZE
    if end > num_epochs:
        break  # only full windows - skip the trailing partial one

    # One row per mote: the window's readings from all four sensors placed
    # side by side, giving shape (num_motes, 4 * WINDOW_SIZE).
    window_features = np.hstack([
        readings[start:end, :].T
        for readings in (all_temperatures, all_humidities, all_lights, all_voltages)
    ])

    # NOTE(review): the features are passed to KMeans unscaled, so sensors
    # with larger numeric ranges weigh more in the Euclidean distance.
    # Consider standardising each sensor before clustering.
    clustering = KMeans(n_clusters=K, random_state=13)
    clustering.fit(window_features)

    labels = clustering.labels_

    # cluster_counts[c] = number of motes assigned to cluster c.
    # minlength=K guarantees an entry for every cluster id, so the lookup
    # below cannot raise IndexError if the highest-numbered cluster is empty.
    cluster_counts = np.bincount(labels, minlength=K)

    # Motes in a small (but non-empty) cluster are considered outliers.
    # The "0 <" guard stops an empty cluster from creating an empty
    # entry in the defaultdict.
    for cluster_id in range(K):
        if 0 < cluster_counts[cluster_id] <= OUTLIER_CLUSTER_SIZE:
            outlier_motes = np.where(labels == cluster_id)[0]
            outliers_by_window[start].extend(outlier_motes.tolist())

    window_index += 1

print("Detected outliers in each 10-epoch window:")
for wstart, outlier_list in outliers_by_window.items():
    print(f"Window {wstart:03d}-{wstart+WINDOW_SIZE-1:03d}: motes outliers = {outlier_list}")

Detected outliers in each 10-epoch window:
Window 010-019: motes outliers = [15]
Window 020-029: motes outliers = [15]
Window 030-039: motes outliers = [14]
Window 080-089: motes outliers = [30]
Window 100-109: motes outliers = [30]
Window 120-129: motes outliers = [14]
Window 130-139: motes outliers = [14]
Window 180-189: motes outliers = [30]
Window 190-199: motes outliers = [14]
Window 210-219: motes outliers = [14]
Window 240-249: motes outliers = [14]
Window 250-259: motes outliers = [14]
Window 260-269: motes outliers = [14]
Window 290-299: motes outliers = [14]
