In [8]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from drift_detectors.ddm import DDM
from model_trainer import DataLoaderProcessor, ModelTrainerTester
from sklearn.ensemble import RandomForestClassifier
from kafka import KafkaConsumer, KafkaProducer
import json
from datetime import datetime

In [9]:
# import csv training data
data_folder = '../../hai_dataset/hai/hai-21.03'
data_filenames = ['train1.csv', 'train2.csv', 'train3.csv']
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']

In [10]:
data_loader = DataLoaderProcessor(data_folder=data_folder, data_filenames=data_filenames, label_columns=label_columns)
df = data_loader.get_data()
X_train, X_test, y_train, y_test = data_loader.split_data(data_frame=df)

In [4]:
clf = RandomForestClassifier()
model_handler = ModelTrainerTester(classifier=clf, X_train=X_train, y_train=y_train)
clf = model_handler.train_model()
_, tr_accuracy = model_handler.test_model(X_test=X_test, y_test=y_test)

In [5]:
consumer = KafkaConsumer(
    'hai-preprocessed',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))
)

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8'))

In [6]:
ddm = DDM()

In [7]:
# consume the streamed data from kafka and detect drift
for i, msg in enumerate(consumer):
    # get the data from the message
    data = msg.value
    #print(data['features'])
    # convert the dictionary to a dataframe
    X = pd.DataFrame(data['features'], index=[0])
    # get the labels
    y = pd.DataFrame(data['labels'], index=[0])
    # predict the labels
    y_pred = clf.predict(X)
    # get the accuracy
    accuracy = accuracy_score(y, y_pred)
    # detect drift
    ddm.add_element(accuracy)
    warning_detected = ddm.detected_warning_zone()
    drift_detected = ddm.detected_change()
    
    # send the results
    producer.send('hai-results', value={'accuracy': accuracy, 'warning_detected': warning_detected, 'drift_detected': drift_detected})
    
    # print the results
    '''if accuracy < 0.5:
        print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))'''
    if i % 1000 == 0:
        print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))
    if drift_detected:
        print(f'{datetime.now()} - accuracy: {accuracy} - drift detected: {drift_detected}')
    i += 1

Iteration 0 Accuracy 1.0
Iteration 1000 Accuracy 1.0
Iteration 2000 Accuracy 1.0
2023-12-12 18:32:45.829148 - accuracy: 1.0 - drift detected: True
Iteration 3000 Accuracy 1.0
Iteration 4000 Accuracy 1.0
Iteration 5000 Accuracy 1.0
Iteration 6000 Accuracy 1.0
Iteration 7000 Accuracy 1.0
Iteration 8000 Accuracy 1.0
Iteration 9000 Accuracy 1.0
Iteration 10000 Accuracy 1.0
2023-12-12 18:33:35.785936 - accuracy: 1.0 - drift detected: True
Iteration 11000 Accuracy 1.0
Iteration 12000 Accuracy 1.0
Iteration 13000 Accuracy 1.0
Iteration 14000 Accuracy 1.0
Iteration 15000 Accuracy 1.0
2023-12-12 18:34:05.319353 - accuracy: 1.0 - drift detected: True
Iteration 16000 Accuracy 1.0
Iteration 17000 Accuracy 1.0
Iteration 18000 Accuracy 1.0
Iteration 19000 Accuracy 1.0
Iteration 20000 Accuracy 1.0
Iteration 21000 Accuracy 1.0
2023-12-12 18:34:47.555024 - accuracy: 1.0 - drift detected: True
Iteration 22000 Accuracy 1.0
2023-12-12 18:34:54.804034 - accuracy: 1.0 - drift detected: True
Iteration 23000 

KeyboardInterrupt: 