In [1]:
import json
import math
import os
from json import loads
from time import sleep

import influxdb_client
import pandas as pd
from influxdb_client.client.write_api import SYNCHRONOUS
from kafka import KafkaConsumer

In [2]:
# InfluxDB connection settings, read by HaiConsumer.consume().
# The API token is a credential and must not live in the notebook: export
# INFLUXDB_V2_TOKEN before starting the kernel (this is the variable name the
# influxdb-client `from_env_properties` helper also reads).
# NOTE(review): a token was previously committed in this cell; it should be
# treated as leaked and rotated on the InfluxDB server.
token = os.environ.get("INFLUXDB_V2_TOKEN", "")
org = "mema_org"
bucket = "mema_bucket"
url = "http://localhost:8086"

In [3]:
class ADWIN:
    """Adaptive-windowing (ADWIN-style) change detector for a numeric stream.

    Every observed value is appended to a window. After each insertion all
    splits of the window into an older part (W0) and a newer part (W1) are
    examined; if the two sub-window means differ by more than a
    confidence-dependent threshold, the older part is dropped and a change is
    reported.

    The threshold is the Hoeffding-based bound from the ADWIN paper:

        m        = 1 / (1/|W0| + 1/|W1|)          (harmonic mean of the sizes)
        delta'   = delta / |W|
        eps_cut  = sqrt( ln(4 / delta') / (2 * m) )

    That bound is derived for values in [0, 1] (e.g. a 0/1 correctness
    stream). The window is stored as a plain list, so each insertion costs
    O(len(window)); this is a simple reference implementation, not the
    bucket-compressed variant.

    Attributes:
        delta: Confidence parameter; smaller values make detection stricter.
        window: Values currently retained, oldest first.
        total: Sum of the values in ``window``.
        variance: Sum of squared deviations from the window mean
            (not divided by the window size).
        width: Number of values in ``window``.
    """

    def __init__(self, delta=0.002):
        """Create an empty detector.

        Args:
            delta: Confidence parameter, strictly between 0 and 1.

        Raises:
            ValueError: If ``delta`` is outside the open interval (0, 1).
        """
        if not 0 < delta < 1:
            raise ValueError("delta must be in the open interval (0, 1)")
        self.delta = delta
        self.window = []
        self.total = 0
        self.variance = 0
        self.width = 0

    def add_element(self, value):
        """Insert ``value`` and test the window for a change.

        Args:
            value: The new observation.

        Returns:
            True if a change was detected (the window was shrunk to its most
            recent part), otherwise False.
        """
        self.window.append(value)
        self.width += 1
        self.total += value
        self.update_variance(value)

        cutpoint = self.find_cut()
        if cutpoint is None:
            return False

        # Keep only the newer sub-window and rebuild the cached statistics.
        self.window = self.window[cutpoint:]
        self.width = len(self.window)
        self.total = sum(self.window)
        self.recalculate_variance()
        return True

    def update_variance(self, value):
        """Fold ``value`` into the running sum of squared deviations.

        Must be called after ``value`` has already been added to ``total``
        and counted in ``width`` (as ``add_element`` does). Uses Welford's
        update, which needs the mean both before and after the insertion.
        """
        if self.width < 2:
            # A single observation has no spread and there is no previous mean.
            return
        mean = self.total / self.width
        previous_mean = (self.total - value) / (self.width - 1)
        self.variance += (value - mean) * (value - previous_mean)

    def find_cut(self):
        """Return the first split index whose sub-window means differ significantly.

        Returns:
            An index ``i`` with ``1 <= i < len(window)`` such that
            ``window[:i]`` and ``window[i:]`` have means further apart than
            the threshold, or None if no such split exists.
        """
        n = len(self.window)
        if n < 2:
            return None

        window_sum = sum(self.window)
        # ln(4 / delta') with delta' = delta / n; identical for every split.
        log_term = math.log(4.0 * n / self.delta)

        left_sum = 0
        for i in range(1, n):
            # Running prefix sum avoids re-summing both halves for each split.
            left_sum += self.window[i - 1]
            w0 = i
            w1 = n - i
            mean0 = left_sum / w0
            mean1 = (window_sum - left_sum) / w1
            inverse_m = 1.0 / w0 + 1.0 / w1
            epsilon = math.sqrt(inverse_m * log_term / 2.0)
            if abs(mean0 - mean1) > epsilon:
                return i
        return None

    def recalculate_variance(self):
        """Recompute ``variance`` from scratch for the current window."""
        if self.width == 0:
            self.variance = 0
            return
        mean = self.total / self.width
        self.variance = sum((x - mean) ** 2 for x in self.window)


In [4]:
import joblib

# Load the pre-trained model that HaiConsumer.consume() calls predict() on
# (presumably a scikit-learn random forest, judging by the file name -- TODO confirm).
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code; only load model files that come from a trusted source.
model = joblib.load('../albert/model/random_forest_model.joblib')

In [5]:
def split_df(df):
    """Separate the 'attack' column from the remaining columns.

    The column is removed from ``df`` itself, so the caller's frame is
    modified in place.

    Args:
        df: DataFrame containing an 'attack' column.

    Returns:
        A ``(features, label)`` tuple: the same ``df`` object without the
        'attack' column, and the removed column as a Series.

    Raises:
        KeyError: If ``df`` has no 'attack' column.
    """
    target = df['attack']
    del df['attack']
    return df, target

In [6]:
class HaiConsumer:
    """Consume records from Kafka, score them with ``model`` and report to InfluxDB.

    Each message value is a JSON object holding one record, including an
    'attack' label. For every record the consumer predicts the label, feeds
    the prediction's correctness (1/0) into an ADWIN detector, and writes two
    fields to the "HAI_ChangeDetection_ADWIN" measurement.

    Relies on these module-level names: ``model``, ``ADWIN``, ``split_df`` and
    the InfluxDB settings ``url``, ``token``, ``org``, ``bucket``.
    """

    def __init__(self, topic, bootstrap_servers):
        """Create the underlying KafkaConsumer.

        Args:
            topic: Name of the Kafka topic to subscribe to.
            bootstrap_servers: Broker address list passed to KafkaConsumer.
        """
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self, delta=0.002):
        """Process messages until the consumer iterator stops or is interrupted.

        Results are published as module-level globals so later notebook cells
        can inspect them:

        * ``change_points``: message indices at which ADWIN reported a change.
        * ``accuracies``: running accuracy after each message.
        * ``whole_df`` / ``whole_labels``: all feature rows / labels seen;
          these two are (re)built when the loop exits, including on interrupt.

        For every message two points are written to InfluxDB:

        * ``current_state``: 1 if the running accuracy is >= 0.5, else 0.
        * ``change_detected``: 1 if ADWIN reported a change on this message,
          else 0.

        Args:
            delta: Confidence parameter forwarded to ``ADWIN``.
        """
        global change_points
        global accuracies
        global whole_df
        global whole_labels
        change_points = []
        accuracies = []
        whole_df = pd.DataFrame()
        whole_labels = pd.Series(dtype=object)

        adwin = ADWIN(delta=delta)

        # Collected per message and concatenated once at the end; growing a
        # DataFrame with concat on every message is quadratic.
        feature_parts = []
        label_parts = []
        correct_total = 0

        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            try:
                for i, message in enumerate(self.consumer):
                    df = pd.DataFrame([message.value])
                    x_i, y_i = split_df(df)
                    label = int(y_i.iloc[0])

                    feature_parts.append(x_i)
                    label_parts.append(y_i)

                    # Predict using the RandomForest model
                    pred = model.predict(x_i)
                    correct = int(pred[0] == label)

                    # Check for a change point using ADWIN on the 0/1
                    # correctness stream.
                    change_detected = adwin.add_element(correct)
                    if change_detected:
                        change_points.append(i)

                    # Running accuracy over everything seen so far. This
                    # replaces re-scoring the full history on every message
                    # and assumes model.score() is classification accuracy.
                    correct_total += correct
                    accuracy = correct_total / (i + 1)
                    accuracies.append(accuracy)

                    p = influxdb_client.Point("HAI_ChangeDetection_ADWIN").field(
                        'current_state', int(accuracy >= 0.5))
                    write_api.write(bucket, org, p)

                    # Report the detector's verdict, not the ground-truth label.
                    p = influxdb_client.Point("HAI_ChangeDetection_ADWIN").field(
                        'change_detected', int(change_detected))
                    write_api.write(bucket, org, p)
            finally:
                # Runs on normal exit and on KeyboardInterrupt, so the history
                # is available to later cells either way. The client itself is
                # closed by the surrounding ``with`` block.
                if feature_parts:
                    whole_df = pd.concat(feature_parts, ignore_index=True)
                    whole_labels = pd.concat(label_parts, ignore_index=True)

In [None]:
# Entry point of the notebook: subscribe to the Kafka topic on the local
# broker and start processing messages.
# NOTE(review): consume() presumably blocks until the kernel is interrupted,
# since the KafkaConsumer is created without a consumer timeout -- confirm.
topic = 'hai-preprocessed-mao-3'
bootstrap_servers = ['localhost:9092']
consumer = HaiConsumer(topic, bootstrap_servers)
consumer.consume()