In [39]:
from kafka import KafkaConsumer
from json import loads
import json
from time import sleep
import pandas as pd
from pyspark.sql import SparkSession
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS

In [40]:
token = "En1iX5zqnyR_AT71S6Ahz8_Hs78nrJHwEkZDDksf4J6reHJNqXzbaMEXbmBjy7I-bdzp2k8fy7E1FjU1f2ZWsA=="
org = "mema_org"
bucket = "mema_bucket"
url = "http://localhost:8086"

Evaluation functions

In [41]:
# Definitions of functions
def calculate_delay_of_detection(true_change_indexes, detected_indexes):
    
    delays = [index - detected_index for detected_index, index in zip(detected_indexes, true_change_indexes)]

    if len(delays) > 0:
        average_delay = sum(delays) / len(delays)
        return average_delay
    else:
        return 0  

In [42]:

def calculate_false_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    false_detections = total_detected - total_drifts
    fdr = false_detections / total_drifts
    return fdr

In [43]:

def calculate_miss_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    mdr = (total_drifts - total_detected) / total_drifts
    return mdr


In [44]:

def calculate_rate_of_drift(detected_indexes, total_time):
    total_detected = len(detected_indexes)
    rod = total_detected / total_time
    return rod

In [45]:

from sklearn.preprocessing import StandardScaler

In [46]:
# Page-Hinkley Test Implementation
class PageHinkley:
    def __init__(self, min_instances=30, delta=0.005, threshold=50, alpha=1-0.0001):
        self.min_instances = min_instances
        self.delta = delta
        self.threshold = threshold
        self.alpha = alpha
        self.cum_sum = 0
        self.mean = 0
        self.n = 0

    def add_element(self, value):
        if self.n < self.min_instances:
            self.n += 1
            self.mean = self.mean + (value - self.mean) / self.n
            return False

        self.cum_sum = max(0, self.alpha * self.cum_sum + (value - self.mean - self.delta))

        self.mean = self.mean + (value - self.mean) / self.n
        self.n += 1

        if self.cum_sum > self.threshold:
            self.cum_sum = 0
            return True

        return False




In [47]:
# Detect change using Page-Hinkley
def detect_change(data, columns_to_monitor):
    results = {}
    for column in columns_to_monitor:
        ph = PageHinkley()
        results[column] = []
        for i, value in enumerate(data[column]):
            if ph.add_element(value):
                results[column].append(i)
    return results

In [48]:
import joblib

# Load the model from the file
model = joblib.load('../albert/model/random_forest_model.joblib')

In [51]:
class HaiConsumer:
    def __init__(self, topic, bootstrap_servers):
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        
        columns_to_scale_and_monitor = ['P1_FCV01D', 'P1_PIT01', 'P1_FT01', 'P2_VIBTR01', 'x1001_05_SETPOINT_OUT']
        counter = 0
        # write to file
        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            for i, message in enumerate(self.consumer):
                message = message.value
                # selected_columns = {key: value for key, value in message.items() if key in columns_to_scale_and_monitor}
                df = pd.DataFrame([message])
                df = df.iloc[:,1:]
                random_forest_detection = model.predict(df)
                stream_size = random_forest_detection.shape[0]
                change_points = detect_change(df, columns_to_scale_and_monitor)

                # Evaluate change detection performance using EDDM
                # average_delay_eddm = calculate_delay_of_detection(true_attack_indexes, change_points)
                fdr_ph = calculate_false_detection_rate(random_forest_detection, change_points)
                mdr_ph = calculate_miss_detection_rate(random_forest_detection, change_points)
                rod_ph = calculate_rate_of_drift(change_points, total_time=stream_size - 500)

                p = influxdb_client.Point("Change_Detection_PageHinkley").field('fdr_ph', fdr_ph)
                write_api.write(bucket, org, p)

                p = influxdb_client.Point("Change_Detection_PageHinkley").field('mdr_ph', mdr_ph)
                write_api.write(bucket, org, p)
                
                p = influxdb_client.Point("Change_Detection_PageHinkley").field('rod_ph', rod_ph)
                write_api.write(bucket, org, p)
                
                sleep(3)
                if i > 100:
                    break

            client.close()

In [52]:
topic = 'hai-input'
bootstrap_servers = ['localhost:9092']
consumer = HaiConsumer(topic, bootstrap_servers)
consumer.consume()

In [18]:

from sklearn.model_selection import train_test_split

# Main function
def main():
    file_path = '../../data_loading/hai-23_05/hai-test1.csv'
    test1 = pd.read_csv(file_path)
    label_file_path = '../../data_loading/hai-23_05/label-test1.csv'
    label1 = pd.read_csv(label_file_path)
    
    # Split the data into training and test sets
    train_data, test_data, train_labels, test_labels = train_test_split(test1.iloc[:,1:], label1['label'], test_size=0.2, random_state=42)
    print(test_data.dtypes)
    print(type(test_data))
    print(test_data.shape)
    stream_size = test_data.shape[0]
    print(test1.dtypes)
    print(label1.dtypes)
    # Accessing the indexes of true attack instances
    true_attack_indexes = test_labels[test_labels == 1].index
    print(true_attack_indexes)

    # Select columns to preprocess and monitor
    columns_to_scale_and_monitor = ['P1_FCV01D', 'P1_PIT01', 'P1_FT01', 'P2_VIBTR01', 'x1001_05_SETPOINT_OUT']

    # preprocessed_data = preprocess_data(data, columns_to_scale_and_monitor)
    change_points = detect_change(test_data, columns_to_scale_and_monitor)

    # Evaluate change detection performance using EDDM
    # average_delay_eddm = calculate_delay_of_detection(true_attack_indexes, change_points)
    fdr_ph = calculate_false_detection_rate(true_attack_indexes, change_points)
    mdr_ph = calculate_miss_detection_rate(true_attack_indexes, change_points)
    rod_ph = calculate_rate_of_drift(change_points, total_time=stream_size - 500)

    print(fdr_ph)
    print(mdr_ph)
    print(rod_ph)


In [19]:
if __name__ == "__main__":
    main()

P1_FCV01D                float64
P1_FCV01Z                float64
P1_FCV02D                float64
P1_FCV02Z                float64
P1_FCV03D                float64
                          ...   
x1002_07_SETPOINT_OUT    float64
x1002_08_SETPOINT_OUT    float64
x1003_10_SETPOINT_OUT    float64
x1003_18_SETPOINT_OUT    float64
x1003_24_SUM_OUT         float64
Length: 86, dtype: object
<class 'pandas.core.frame.DataFrame'>
(10800, 86)
timestamp                 object
P1_FCV01D                float64
P1_FCV01Z                float64
P1_FCV02D                float64
P1_FCV02Z                float64
                          ...   
x1002_07_SETPOINT_OUT    float64
x1002_08_SETPOINT_OUT    float64
x1003_10_SETPOINT_OUT    float64
x1003_18_SETPOINT_OUT    float64
x1003_24_SUM_OUT         float64
Length: 87, dtype: object
timestamp    object
label         int64
dtype: object
Index([49597, 20286, 27368, 24985,  1583, 12146, 20237, 24516, 49546,  1632,
       ...
       30421, 24758, 24923, 49

NameError: name 'fdr_eddm' is not defined