In [18]:
import os
import pandas as pd
from kafka import KafkaConsumer
#from skmultiflow.drift_detection import DDM
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.model_selection import train_test_split
import json

In [19]:
# implement basic ddm change detector
class DDM:
    """ Drift Detection Method for evolving data streams."""
    
    def __init__(self, min_num_instances=30, warning_level=2.0, out_control_level=3.0):
        """ Init DDM."""
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.out_control_level = out_control_level
        self.reset()

    def reset(self):
        """ Reset the change detector."""
        self.n = 1
        self.m_n = 0
        self.s_n = 0
        self.m_n1 = 0
        self.s_n1 = 0
        self.min_num_instances = 30
        self.warning_level = 2.0
        self.out_control_level = 3.0
        self.in_concept_change = False
        self.in_warning_zone = False

    def add_element(self, value):
        """ Add a new element to the statistics. """
        if self.n == 1:
            self.m_n = value
            self.s_n = 0
        else:
            self.m_n1 = self.m_n
            self.s_n1 = self.s_n
            self.m_n = self.m_n1 + (value - self.m_n1) / self.n
            self.s_n = self.s_n1 + (value - self.m_n1) * (value - self.m_n)

        self.n += 1

        if self.n < self.min_num_instances:
            return 1

        if self.s_n < 0:
            self.s_n = 0

        if self.s_n > 0:
            z_n = (value - self.m_n) / (self.s_n ** 0.5)
        else:
            z_n = 0

        if z_n < self.warning_level:
            self.in_warning_zone = True
        else:
            self.in_warning_zone = False

        if z_n < self.out_control_level:
            self.in_concept_change = True
        else:
            self.in_concept_change = False

        return z_n

    def get_info(self):
        """Return information about the change."""
        description = "DDM: Drift Detection Method for evolving data streams.\n"
        description += "min_num_instances: " + str(self.min_num_instances) + "\n"
        description += "warning_level: " + str(self.warning_level) + "\n"
        description += "out_control_level: " + str(self.out_control_level)
        return description
    

In [20]:
# import csv training data
data_folder = '../../hai_dataset/hai/hai-21.03'
data_filename = 'train1.csv'
data_path = os.path.join(data_folder, data_filename).replace(os.sep, '/')
df = pd.read_csv(data_path, index_col=0)
# data columns

# timestamp is the time of the event
# there are multiple features
# the label columns are 

In [21]:
# split into train and test
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']
X_train, X_test, y_train, y_test = train_test_split(df.drop(label_columns, axis=1), df[label_columns], test_size=0.33, random_state=42)

In [22]:
X_train.head()

Unnamed: 0_level_0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-11 09:03:09,0.09913,1.41187,396.71945,894.71869,32.0,50.0648,1555.60352,35.78186,17.42988,16.32843,...,6.69128,0,345.75738,-0.0013,18264.0,347.05951,339.35547,0,10086.0,27612.0
2020-07-11 17:51:08,0.09933,1.42565,394.01361,1055.10217,32.38132,100.0,2825.40674,36.19713,100.0,99.49343,...,69.93268,0,380.80512,0.00217,18081.0,312.86169,336.51617,0,10053.0,27582.0
2020-07-12 17:38:52,0.09841,1.33974,406.22626,961.54413,33.52099,100.0,2832.2854,37.12432,100.0,99.30044,...,0.30743,0,360.3335,0.00101,17610.0,360.96643,324.68896,0,10026.0,27578.0
2020-07-13 07:19:37,0.09904,1.35274,399.73972,1085.01001,31.64864,33.09878,1126.19263,35.28287,14.49461,16.06903,...,0.47021,0,305.73645,-0.00015,17084.0,304.16303,317.05731,0,10052.0,27596.0
2020-07-12 10:03:01,0.10135,1.31842,406.68262,902.40576,32.0,34.67972,1052.52466,35.54268,15.99286,14.68811,...,0.05423,0,336.58856,0.00015,17172.0,338.59595,315.97223,0,9999.0,27613.0


In [23]:
y_train.head()

Unnamed: 0_level_0,attack,attack_P1,attack_P2,attack_P3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-11 09:03:09,0,0,0,0
2020-07-11 17:51:08,0,0,0,0
2020-07-12 17:38:52,0,0,0,0
2020-07-13 07:19:37,0,0,0,0
2020-07-12 10:03:01,0,0,0,0


In [24]:
# initialize classifier
clf = RandomForestClassifier()

In [25]:
# do the initial training
clf.fit(X_train, y_train)

In [26]:
# get the initial accuracy
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [27]:
# connect to kafka
consumer = KafkaConsumer('hai-input', bootstrap_servers=['localhost:9092'])

In [28]:
# initialize drift detector
ddm = DDM()

In [29]:
def split_dict(data_dict, label_keys):
    """
    Split a dictionary into two dictionaries based on the specified keys.

    Parameters:
    - input_dict (dict): The input dictionary.
    - keys (list): List of keys to include in the first dictionary.

    Returns:
    - dict, dict: Two dictionaries - one with specified keys, and one with the rest.
    """
    selected_dict = {key: data_dict[key] for key in label_keys if key in data_dict}
    remaining_dict = {key: data_dict[key] for key in data_dict if key not in label_keys}
    return remaining_dict, selected_dict

In [30]:
# consume the streamed data from kafka and detect drift
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']
for msg in consumer:
    # get the data
    data = json.loads(msg.value)
    print(data)
    # get the timestamp
    timestamp = datetime.fromtimestamp(data.drop(['time']))
    # get the features
    features, label = split_dict(data, label_columns)
    print(features, label)
    # get the label
    # add the data to the classifier
    #clf.partial_fit(features, label)
    # get the prediction
    y_pred = clf.predict(features)
    # get the accuracy
    accuracy = accuracy_score(label, y_pred)
    # add the data to the drift detector
    ddm.add_element(accuracy)
    # get the drift status
    drift_status = ddm.get_info()
    # print the results
    print(f'{timestamp} - {accuracy} - {drift_status}')

KeyboardInterrupt: 