In [1]:
import os
import pandas as pd
from kafka import KafkaConsumer, KafkaProducer
#from skmultiflow.drift_detection import DDM
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.model_selection import train_test_split
import json
from drift_detectors.ddm import DDM

In [2]:
def get_data(data_folder, data_filenames):
    """
    Get the training data from the csv files in the data folder
    Join the dataframes from the different files
    """
    data_list = list()
    for data_filename in data_filenames: 
        data_path = os.path.join(data_folder, data_filename).replace(os.sep, '/')
        data_list.append(pd.read_csv(data_path, index_col=0))  
        
    data_frame = pd.concat(data_list, axis=0, ignore_index=True)
        
    return data_frame

In [3]:
# import csv training data
data_folder = '../../hai_dataset/hai/hai-21.03'
data_filenames = ['train1.csv', 'train2.csv', 'train3.csv']
df = get_data(data_folder, data_filenames)

In [4]:
# split into train and test
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']
X_train, X_test, y_train, y_test = train_test_split(df.drop(label_columns, axis=1), df[label_columns], test_size=0.33, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
640671,0.03005,1.23308,393.59396,1086.62048,32.0,19.02375,519.58618,35.50758,16.89631,14.29901,...,1.41061,0,309.60651,-0.00492,16728.0,307.8161,308.53949,0,10026.0,27625.0
251080,0.10087,1.4098,405.06735,993.95294,33.4603,100.0,2825.54712,37.23263,100.0,99.91608,...,29.80322,0,376.82654,0.00072,17934.0,324.05597,332.12164,0,10016.0,27627.0
94824,0.10099,1.33851,404.36404,1093.57519,32.5836,28.80751,919.18939,36.17739,16.07068,13.61999,...,0.61487,0,354.9624,0.00058,17509.0,354.90808,324.0741,0,10053.0,27627.0
690119,0.09835,1.4407,400.49872,1005.13696,32.47603,100.0,2826.3894,36.33189,100.0,99.3445,...,0.07236,0,353.02734,-0.00159,18671.0,352.71991,344.88934,0,10026.0,27627.0
232306,0.10087,1.52539,405.06735,993.95294,33.4603,100.0,2826.3894,37.52159,100.0,99.91608,...,68.90192,0,441.38818,0.00347,19607.0,369.80975,363.26318,0,10029.0,27584.0


In [9]:
# save the column names to a list
column_names = X_train.columns.tolist()
print(column_names)

['P1_B2004', 'P1_B2016', 'P1_B3004', 'P1_B3005', 'P1_B4002', 'P1_B4005', 'P1_B400B', 'P1_B4022', 'P1_FCV01D', 'P1_FCV01Z', 'P1_FCV02D', 'P1_FCV02Z', 'P1_FCV03D', 'P1_FCV03Z', 'P1_FT01', 'P1_FT01Z', 'P1_FT02', 'P1_FT02Z', 'P1_FT03', 'P1_FT03Z', 'P1_LCV01D', 'P1_LCV01Z', 'P1_LIT01', 'P1_PCV01D', 'P1_PCV01Z', 'P1_PCV02D', 'P1_PCV02Z', 'P1_PIT01', 'P1_PIT02', 'P1_PP01AD', 'P1_PP01AR', 'P1_PP01BD', 'P1_PP01BR', 'P1_PP02D', 'P1_PP02R', 'P1_STSP', 'P1_TIT01', 'P1_TIT02', 'P2_24Vdc', 'P2_ASD', 'P2_AutoGO', 'P2_CO_rpm', 'P2_Emerg', 'P2_HILout', 'P2_MSD', 'P2_ManualGO', 'P2_OnOff', 'P2_RTR', 'P2_SIT01', 'P2_SIT02', 'P2_TripEx', 'P2_VT01', 'P2_VTR01', 'P2_VTR02', 'P2_VTR03', 'P2_VTR04', 'P2_VXT02', 'P2_VXT03', 'P2_VYT02', 'P2_VYT03', 'P3_FIT01', 'P3_LCP01D', 'P3_LCV01D', 'P3_LH', 'P3_LIT01', 'P3_LL', 'P3_PIT01', 'P4_HT_FD', 'P4_HT_LD', 'P4_HT_PO', 'P4_HT_PS', 'P4_LD', 'P4_ST_FD', 'P4_ST_GOV', 'P4_ST_LD', 'P4_ST_PO', 'P4_ST_PS', 'P4_ST_PT01', 'P4_ST_TT01']


In [10]:
y_train.head()

Unnamed: 0,attack,attack_P1,attack_P2,attack_P3
640671,0,0,0,0
251080,0,0,0,0
94824,0,0,0,0
690119,0,0,0,0
232306,0,0,0,0


In [11]:
# initialize classifier
clf = RandomForestClassifier()

In [12]:
# do the initial training
clf.fit(X_train, y_train)

In [13]:
# get the initial accuracy
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [14]:
class StreamConsumer:
    def __init__(self, topic, bootstrap_servers):
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: json.loads(x.decode('utf-8')))
    def consume_next(self):
        # consume the next message
        return next(self.consumer)

In [15]:
class ResultsProducer:
    def __init__(self, topic, bootstrap_servers):
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.producer = KafkaProducer(
            bootstrap_servers=self.bootstrap_servers,
            value_serializer=lambda x: json.dumps(x).encode('utf-8'))
    def send(self, data):
        # send the data
        self.producer.send(self.topic, value=data)

In [16]:
input_topic = 'hai-preprocessed'
input_bootstrap_servers = ['localhost:9092']
consumer = StreamConsumer(input_topic, input_bootstrap_servers)

In [17]:
output_topic = 'hai-results'
output_bootstrap_servers = ['localhost:9092']
producer = ResultsProducer(output_topic, output_bootstrap_servers)

In [18]:
# initialize drift detector
ddm = DDM()

# consume the streamed data from kafka and detect drift
msg = consumer.consume_next()
i = 0
while msg:
    # get the data from the message
    data = msg.value
    #print(data['features'])
    # convert the dictionary to a dataframe
    X = pd.DataFrame(data['features'], index=[0])
    # get the labels
    y = pd.DataFrame(data['labels'], index=[0])
    # predict the labels
    y_pred = clf.predict(X)
    # get the accuracy
    accuracy = accuracy_score(y, y_pred)
    # detect drift
    ddm.add_element(accuracy)
    warning_detected = ddm.detected_warning_zone()
    drift_detected = ddm.detected_change()
    
    # send the results
    producer.send({'accuracy': accuracy, 'warning_detected': warning_detected, 'drift_detected': drift_detected})
    
    # print the results
    '''if accuracy < 0.5:
        print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))'''
    if i % 1000 == 0:
        print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))
    if drift_detected:
        print(f'{datetime.now()} - accuracy: {accuracy} - drift detected: {drift_detected}')
    # get the next message
    msg = consumer.consume_next()
    i += 1

Iteration 0 Accuracy 1.0
Iteration 1000 Accuracy 1.0
Iteration 2000 Accuracy 1.0
2023-12-12 14:47:57.032501 - accuracy: 1.0 - drift detected: True
Iteration 3000 Accuracy 1.0
Iteration 4000 Accuracy 1.0
Iteration 5000 Accuracy 1.0
Iteration 6000 Accuracy 1.0
Iteration 7000 Accuracy 1.0
Iteration 8000 Accuracy 1.0
Iteration 9000 Accuracy 1.0
Iteration 10000 Accuracy 1.0
2023-12-12 14:48:48.964099 - accuracy: 1.0 - drift detected: True
Iteration 11000 Accuracy 1.0
Iteration 12000 Accuracy 1.0
Iteration 13000 Accuracy 1.0
Iteration 14000 Accuracy 1.0
Iteration 15000 Accuracy 1.0
2023-12-12 14:49:17.080489 - accuracy: 1.0 - drift detected: True
Iteration 16000 Accuracy 1.0
Iteration 17000 Accuracy 1.0
Iteration 18000 Accuracy 1.0
Iteration 19000 Accuracy 1.0
Iteration 20000 Accuracy 1.0
Iteration 21000 Accuracy 1.0
2023-12-12 14:49:51.864497 - accuracy: 1.0 - drift detected: True
Iteration 22000 Accuracy 1.0
2023-12-12 14:49:58.281225 - accuracy: 1.0 - drift detected: True
Iteration 23000 

KeyboardInterrupt: 

In [None]:
# TODO: implement error counting drift detector
# TODO: implement complex drift detector
# TODO: implement 2 models, for the no attack and attack case, use them accordingly
# TODO: implement model retraining
# TODO: implement flink job