In [81]:
import os
import pandas as pd
from kafka import KafkaConsumer
#from skmultiflow.drift_detection import DDM
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.model_selection import train_test_split
import json

In [128]:
# locate the CSV training data and load it into a DataFrame
data_folder = '../../hai_dataset/hai/hai-21.03'
data_filename = 'train1.csv'
# join folder and file name, then swap the platform separator for '/'
data_path = '/'.join(os.path.join(data_folder, data_filename).split(os.sep))
# the first CSV column is used as the DataFrame index
df = pd.read_csv(filepath_or_buffer=data_path, index_col=0)
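# data columns

# the index (first CSV column, `time`) is the timestamp of the event
# the remaining non-label columns are the features
# the label columns are 'attack', 'attack_P1', 'attack_P2' and 'attack_P3'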

In [99]:
# hold out 33% of the rows for testing; the four attack flags are the targets
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=label_columns),  # every non-label column is a feature
    df[label_columns],
    test_size=0.33,
    random_state=42,  # fixed seed so the split is repeatable
)

In [100]:
# keep the feature column names (in training order) as a plain list
column_names = list(X_train.columns)
print(column_names)

['P1_B2004', 'P1_B2016', 'P1_B3004', 'P1_B3005', 'P1_B4002', 'P1_B4005', 'P1_B400B', 'P1_B4022', 'P1_FCV01D', 'P1_FCV01Z', 'P1_FCV02D', 'P1_FCV02Z', 'P1_FCV03D', 'P1_FCV03Z', 'P1_FT01', 'P1_FT01Z', 'P1_FT02', 'P1_FT02Z', 'P1_FT03', 'P1_FT03Z', 'P1_LCV01D', 'P1_LCV01Z', 'P1_LIT01', 'P1_PCV01D', 'P1_PCV01Z', 'P1_PCV02D', 'P1_PCV02Z', 'P1_PIT01', 'P1_PIT02', 'P1_PP01AD', 'P1_PP01AR', 'P1_PP01BD', 'P1_PP01BR', 'P1_PP02D', 'P1_PP02R', 'P1_STSP', 'P1_TIT01', 'P1_TIT02', 'P2_24Vdc', 'P2_ASD', 'P2_AutoGO', 'P2_CO_rpm', 'P2_Emerg', 'P2_HILout', 'P2_MSD', 'P2_ManualGO', 'P2_OnOff', 'P2_RTR', 'P2_SIT01', 'P2_SIT02', 'P2_TripEx', 'P2_VT01', 'P2_VTR01', 'P2_VTR02', 'P2_VTR03', 'P2_VTR04', 'P2_VXT02', 'P2_VXT03', 'P2_VYT02', 'P2_VYT03', 'P3_FIT01', 'P3_LCP01D', 'P3_LCV01D', 'P3_LH', 'P3_LIT01', 'P3_LL', 'P3_PIT01', 'P4_HT_FD', 'P4_HT_LD', 'P4_HT_PO', 'P4_HT_PS', 'P4_LD', 'P4_ST_FD', 'P4_ST_GOV', 'P4_ST_LD', 'P4_ST_PO', 'P4_ST_PS', 'P4_ST_PT01', 'P4_ST_TT01']


In [101]:
# preview the first five rows of the training labels
y_train.head(n=5)

Unnamed: 0_level_0,attack,attack_P1,attack_P2,attack_P3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-11 09:03:09,0,0,0,0
2020-07-11 17:51:08,0,0,0,0
2020-07-12 17:38:52,0,0,0,0
2020-07-13 07:19:37,0,0,0,0
2020-07-12 10:03:01,0,0,0,0


In [102]:
# initialize classifier
# random_state is fixed (same seed as the train/test split) so that the
# fitted forest and its scores are reproducible across kernel restarts
clf = RandomForestClassifier(random_state=42)

In [103]:
# initial training on the training split (features -> four attack labels)
clf.fit(X=X_train, y=y_train)

In [104]:
# score the freshly trained classifier on the held-out split
y_pred = clf.predict(X=X_test)
# last expression of the cell, so the score is displayed as the cell output
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [129]:
# basic sliding-window change detector (simplified stand-in for DDM)
class DDM:
    """Sliding-window mean tracker used as a simple drift indicator.

    Despite the name, this class does not implement the statistical test of
    the Drift Detection Method: it only keeps the most recent
    ``min_num_instances`` values and reports their mean. ``warning_level``
    and ``out_control_level`` are stored and reported by ``get_info`` but are
    not used in any computation.

    Attributes:
        min_num_instances: maximum number of samples kept in the window.
        warning_level: stored threshold (currently unused).
        out_control_level: stored threshold (currently unused).
        n: number of samples currently in the window.
        sum: running sum of the samples currently in the window.
        mean: mean of the samples currently in the window (0.0 when empty).
        buffer: list of the samples currently in the window, oldest first.
    """

    def __init__(self, min_num_instances=30, warning_level=2.0, out_control_level=3.0):
        """Store the configuration and start with an empty window.

        Args:
            min_num_instances: size of the sliding window.
            warning_level: stored as-is; not used by the computation.
            out_control_level: stored as-is; not used by the computation.
        """
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.out_control_level = out_control_level
        # reset() initialises every piece of window state
        self.reset()

    def reset(self):
        """Empty the window and zero the statistics (prints 'reset')."""
        self.n = 0  # number of samples in the window
        self.sum = 0.0  # sum of samples in the window
        self.mean = 0.0  # mean of samples in the window
        self.buffer = []  # samples in the window, oldest first

        print('reset')

    def add_element(self, value):
        """Add a sample to the window and return the updated window mean.

        When the window already holds ``min_num_instances`` samples the
        oldest one is evicted first, so the window never grows beyond that
        size.

        Args:
            value: numeric sample to add.

        Returns:
            The mean of the samples currently in the window.
        """
        # evict the oldest sample once the window is full
        if self.buffer and len(self.buffer) >= self.min_num_instances:
            self.sum -= self.buffer.pop(0)

        # add the new sample
        self.buffer.append(value)
        self.sum += value

        # divide by the number of samples actually held, so the mean is not
        # biased towards zero
        self.n = len(self.buffer)
        self.mean = self.sum / self.n

        return self.mean

    def get_info(self):
        """Return a multi-line description of the configured parameters."""
        description = "DDM: Drift Detection Method for evolving data streams.\n"
        description += "min_num_instances: " + str(self.min_num_instances) + "\n"
        description += "warning_level: " + str(self.warning_level) + "\n"
        description += "out_control_level: " + str(self.out_control_level)
        return description
    

In [130]:
class StreamConsumer:
    """Thin wrapper around ``KafkaConsumer`` that yields one message at a time.

    Messages are read from the earliest available offset, offsets are
    auto-committed, and each message value is decoded from UTF-8 JSON.
    """

    def __init__(self, topic, bootstrap_servers, consumer_timeout_ms=None):
        """Create the underlying Kafka consumer.

        Args:
            topic: name of the topic to subscribe to.
            bootstrap_servers: broker address(es) passed to ``KafkaConsumer``.
            consumer_timeout_ms: optional number of milliseconds to wait for
                a message before iteration stops. ``None`` (the default)
                leaves the KafkaConsumer default in place.
        """
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        # only forward the timeout when the caller asked for one, so the
        # default behaviour is unchanged
        extra_config = {}
        if consumer_timeout_ms is not None:
            extra_config['consumer_timeout_ms'] = consumer_timeout_ms
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: json.loads(x.decode('utf-8')),
            **extra_config)

    def consume_next(self):
        """Return the next message, or ``None`` when iteration is exhausted.

        Returning ``None`` instead of letting ``StopIteration`` escape lets
        callers use a plain ``while msg is not None`` loop.
        """
        return next(self.consumer, None)

    def close(self):
        """Close the underlying Kafka consumer."""
        self.consumer.close()

In [131]:
# connection settings for the preprocessed HAI stream
topic = 'hai-preprocessed'
bootstrap_servers = ['localhost:9092']
# build the consumer wrapper for that topic / broker list
consumer = StreamConsumer(topic=topic, bootstrap_servers=bootstrap_servers)

In [132]:
# create the drift detector, spelling out its default settings
ddm = DDM(min_num_instances=30, warning_level=2.0, out_control_level=3.0)

reset


In [144]:
# consume the streamed data from kafka and detect drift
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']
# fetch the first message up front; every later message is fetched at the
# END of the loop body so that no message is skipped and a None result
# ends the loop instead of being dereferenced
msg = consumer.consume_next()
i = 0
while msg is not None:
    # decoded message payload
    data = msg.value
    # single-row frames for the features and the labels of this record
    # NOTE(review): assumes 'features' / 'labels' arrive in the same column
    # order used for training (column_names / label_columns) - confirm
    # against the producer
    features = pd.DataFrame([data['features']])
    labels = pd.DataFrame([data['labels']])
    # get the prediction
    y_pred = clf.predict(features)
    # get the accuracy of this single record
    accuracy = accuracy_score(labels, y_pred)
    # feed the error (1 - accuracy) to the drift detector; the returned
    # value is the detector's current windowed mean
    mean_difference = ddm.add_element(1 - accuracy)
    i += 1

    if accuracy < 0.5:
        print('accuracy: ', accuracy)
        print('------------------')

    # every 1000 records: report the windowed mean and restart the window
    if i % 1000 == 0:
        print('mean difference: ', mean_difference)
        print('------------------')
        ddm.reset()

    # get the next message
    msg = consumer.consume_next()


KeyboardInterrupt: 