In [101]:
import numpy as np
import pandas as pd

# Load the sensor dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('WsnData.csv')
# Plain-text dump of the frame (pandas truncates long frames) for a first look at the columns.
print(df)

      SensorID         Timestamp   SensorType          X          Y  \
0            1  18/04/2024 22:41  Temperature  32.300556  21.575965   
1            2  18/04/2024 22:41     Humidity  61.588405  96.030429   
2            3  18/04/2024 22:41     Pressure  17.543522  75.573323   
3            4  18/04/2024 22:41  Temperature  75.932405  92.519359   
4            5  18/04/2024 22:41     Humidity  13.461775  14.773697   
...        ...               ...          ...        ...        ...   
9995        96  18/04/2024 22:43     Pressure  34.745910  39.720684   
9996        97  18/04/2024 22:43  Temperature  90.937513  15.553641   
9997        98  18/04/2024 22:43     Humidity  18.121821  18.919793   
9998        99  18/04/2024 22:43     Pressure  32.229611  31.770562   
9999       100  18/04/2024 22:43  Temperature  73.680801   1.365563   

      SensorData  BatteryLife  Temperature  IsFaulty  
0      51.396271    73.533551    53.793287         0  
1      15.024628    73.435270    50.8

In [102]:
# Check for missing values
null_values = df.isnull().sum()
# Number of null entries in each column
print(null_values)

SensorID       0
Timestamp      0
SensorType     0
X              0
Y              0
SensorData     0
BatteryLife    0
Temperature    0
IsFaulty       0
dtype: int64


In [243]:
# Build the feature matrix and the label vector.
# Columns are selected by name rather than by position so the selection is
# self-documenting and does not silently change if the CSV column order does.
feature_columns = ['X', 'Y', 'SensorData', 'BatteryLife', 'Temperature']
X = df[feature_columns].values  # the five numeric feature columns
y = df['IsFaulty'].values       # fault label (the last column of the frame)


# Displaying the shapes of X and y to verify
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (10000, 5)
Shape of y: (10000,)


In [274]:
# Sequential hold-out split (no shuffling): rows 0-7499 are used for fitting,
# rows 7500-9999 for cross validation.
x_train, y_train = X[:7500], y[:7500]
x_cv, y_cv = X[7500:10000], y[7500:10000]

# Report the shape of each split as a sanity check.
print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")

the shape of the training set (input) is: (7500, 5)
the shape of the training set (target) is: (7500,)

the shape of the cross validation set (input) is: (2500, 5)
the shape of the cross validation set (target) is: (2500,)



In [275]:
def estimate_gaussian(X):
    """
    Calculates mean and variance of all features
    in the dataset

    Args:
        X (array-like): (m, n) Data matrix, one example per row and one
            feature per column. Anything np.asarray can convert to a 2-D
            numeric array is accepted.

    Returns:
        mu (ndarray): (n,) Mean of all features
        var (ndarray): (n,) Variance of all features (population variance,
            i.e. NumPy's default ddof=0, dividing by m)

    Raises:
        ValueError: if X is not two-dimensional
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"X must be a 2-D (m, n) array, got shape {X.shape}")

    # Reduce over the example axis so every feature is handled by a single
    # vectorized call instead of a Python-level loop over the columns.
    mu = X.mean(axis=0)
    var = X.var(axis=0)

    return mu, var

In [276]:
# Estimate the per-feature mean and variance from the training split only;
# these parameters define the Gaussian used to score the cross-validation rows.
mu, var = estimate_gaussian(x_train)              

print("Mean of each feature:", mu)
print("Variance of each feature:", var)

Mean of each feature: [50.30887561 47.10071416 50.4059588  49.26649485 43.25606869]
Variance of each feature: [ 722.64664894 1015.40556094  886.04822104  806.84730427  278.41424682]


In [277]:
from scipy.stats import multivariate_normal
def multivariate_gaussian(X, mu, var):
    """
    Evaluate a multivariate normal density at the given points.

    Args:
        X (ndarray): Points at which the density is evaluated
        mu (ndarray): Mean of the distribution
        var (ndarray): Passed to SciPy as ``cov``; per the SciPy docs a 1-D
            array is interpreted as the diagonal of the covariance matrix

    Returns:
        The density value(s) SciPy computes for X
    """
    distribution = multivariate_normal(mean=mu, cov=var)
    return distribution.pdf(X)
 

# Density of every cross-validation example under the Gaussian whose
# parameters (mu, var) were estimated from x_train.
p = multivariate_gaussian(x_cv, mu, var)
print(p)

[1.50500544e-10 4.04824845e-11 4.81921481e-11 ... 1.63878288e-10
 1.21366760e-10 1.35490051e-10]


In [278]:
#select_threshold

def select_threshold(y_val, p_val):
    """
    Finds the best threshold to use for selecting outliers
    based on the results from a validation set (p_val)
    and the ground truth (y_val)

    Candidate thresholds are taken in 1000 equal steps from min(p_val) up to
    (but not including) max(p_val). An example is predicted anomalous when
    its p_val is strictly below the candidate threshold. The first candidate
    reaching the highest F1 score is returned.

    Args:
        y_val (ndarray): Ground truth on validation set (1 = anomaly, 0 = normal)
        p_val (ndarray): Results on validation set

    Returns:
        epsilon (float): Threshold chosen (0 when no candidate yields a
                         true positive)
        F1 (float):      F1 score by choosing epsilon as threshold

    Raises:
        ValueError: if p_val is empty
    """
    y_val = np.asarray(y_val)
    p_val = np.asarray(p_val)
    if p_val.size == 0:
        raise ValueError("p_val must contain at least one value")

    best_epsilon = 0
    best_F1 = 0

    # Compute the range once with NumPy reductions; the builtin min()/max()
    # iterate the array element by element in Python.
    p_min = p_val.min()
    p_max = p_val.max()
    step_size = (p_max - p_min) / 1000

    if step_size == 0:
        # Every p_val is identical, so no threshold can separate the examples
        # (and np.arange cannot be called with a zero step).
        return best_epsilon, best_F1

    # These masks do not depend on epsilon, so build them once outside the loop.
    is_anomaly = (y_val == 1)
    is_normal = (y_val == 0)

    for epsilon in np.arange(p_min, p_max, step_size):
        # Flag an example as anomalous when its density falls below epsilon.
        predictions = p_val < epsilon

        tp = np.sum(predictions & is_anomaly)
        fp = np.sum(predictions & is_normal)
        fn = np.sum(~predictions & is_anomaly)

        # With no true positives precision and recall are both zero (or
        # undefined), so this candidate cannot improve on the best score.
        # tp > 0 also guarantees both denominators below are non-zero.
        if tp == 0:
            continue

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = (2 * precision * recall) / (precision + recall)
        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon

    return best_epsilon, best_F1

In [279]:
# Pick the density threshold that maximises F1 on the cross-validation split,
# then show the threshold and its score on separate lines.
epsilon, F1 = select_threshold(y_cv, p)
print(epsilon, F1, sep="\n")

6.419287204360805e-10
0.9137144099108889


In [281]:
# Count with NumPy reductions; the builtin sum() walks the arrays element by
# element in Python.
# NOTE(review): the recorded run flags 2499 of the 2500 cross-validation rows
# while 2102 are labelled faulty, i.e. the chosen threshold marks almost every
# row as anomalous -- worth confirming the labels and split before relying on it.
print('# Anomalies found: %d'% np.sum(p < epsilon))
print('# Actual Anomalies: %d'% np.sum(y_cv))

# Anomalies found: 2499
# Actual Anomalies: 2102


In [None]:
# NOTE: estimating the parameters from too little data (i.e. fitting the Gaussian on a relatively small training set) makes the
# model poor at identifying normal behaviour, so it flags more normal points as anomalies, which causes extremely low precision.