In [11]:
import pandas as pd
from sklearn.metrics import accuracy_score
from drift_detectors.basic_window_ddm import BasicWindowDDM
from model_handlers.model_trainer import ModelTrainerTester
from sklearn.ensemble import RandomForestClassifier
from kafka import KafkaConsumer, KafkaProducer
import json
from data_loaders.train_loader import TrainLoader

In [12]:
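# Reference values noted from an earlier run:
# 0.9854401518483369 - "attack 0 ratio" (presumably the share of rows with attack == 0; TODO confirm)
# 0.9990180262327277 - validation accuracy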

## Normal data and normal model
- Load normal data
- Train a model on the normal data

In [13]:
# Relative path of the folder holding the HAI 21.03 CSV files.
data_folder = '../../hai_dataset/hai/hai-21.03'

# The three training files: train1.csv, train2.csv, train3.csv.
data_filenames = [f'train{part}.csv' for part in range(1, 4)]

# Columns treated as labels: 'attack' plus 'attack_P1' .. 'attack_P3'
# (presumably the overall flag and per-process flags; verify against the dataset docs).
label_columns = ['attack', *(f'attack_P{proc}' for proc in range(1, 4))]

In [14]:
# Configure the loader with the folder, file list and label columns set above,
# then fetch the normal data from it.
data_loader = TrainLoader(
    data_folder=data_folder,
    data_filenames=data_filenames,
    label_columns=label_columns,
)
df_normal = data_loader.get_data()

In [16]:
# Let the loader split the normal data into its four train/test parts.
(
    X_train_normal,
    X_test_normal,
    y_train_normal,
    y_test_normal,
) = data_loader.split_data(data_frame=df_normal)

In [17]:
# Train and evaluate the model for normal data.
# A fixed random_state makes the random forest reproducible: for the same
# training data, re-running this cell yields the same fitted model.
clf_normal = RandomForestClassifier(random_state=42)
model_handler = ModelTrainerTester(
    classifier=clf_normal,
    X_train=X_train_normal,
    y_train=y_train_normal,
)
clf_normal = model_handler.train_model()

# Only the second value returned by test_model is used; it is displayed below
# as the accuracy on the held-out normal split.
_, tr_accuracy = model_handler.test_model(X_test=X_test_normal, y_test=y_test_normal)

tr_accuracy

## Attack only data and attack model
- Load data containing attacks
- Keep only the rows labelled as attacks (`attack > 0`)
- Train a model on the attack data

In [15]:
# Load the data containing attacks under its own names, so that the
# `data_loader` / `data_filenames` used by the normal-data cells are not
# overwritten and the result does not depend on the order cells were run in.
data_folder = '../../hai_dataset/hai/hai-21.03'
attack_filenames = ['test1.csv']
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']

attack_loader = TrainLoader(
    data_folder=data_folder,
    data_filenames=attack_filenames,
    label_columns=label_columns,
)
df_attack_all = attack_loader.get_data()

# Keep only the rows whose 'attack' label is positive.
df_attack = df_attack_all[df_attack_all['attack'] > 0]

# Fail early with a clear message instead of an obscure error during the
# split or the model fit when the selection turns out to be empty.
if len(df_attack) == 0:
    raise ValueError(f'no rows with attack > 0 found in {attack_filenames}')

X_train_attack, X_test_attack, y_train_attack, y_test_attack = attack_loader.split_data(data_frame=df_attack)

In [None]:
# Train and evaluate the model for attack-only data.
# A fixed random_state makes the random forest reproducible: for the same
# training data, re-running this cell yields the same fitted model.
clf_attack = RandomForestClassifier(random_state=42)
model_handler = ModelTrainerTester(
    classifier=clf_attack,
    X_train=X_train_attack,
    y_train=y_train_attack,
)
clf_attack = model_handler.train_model()

# Only the second value returned by test_model is used; it is displayed below
# as the accuracy on the held-out attack split.
_, tr_accuracy = model_handler.test_model(X_test=X_test_attack, y_test=y_test_attack)
tr_accuracy

In [None]:
def _decode_json(raw):
    """Decode the UTF-8 bytes of a message value and parse them as JSON."""
    return json.loads(raw.decode('utf-8'))


def _encode_json(payload):
    """Dump a Python object to JSON and encode it as UTF-8 bytes."""
    return json.dumps(payload).encode('utf-8')


# Reads the preprocessed samples from the local broker.
# NOTE(review): no group_id is passed; according to the kafka-python docs offset
# commits are disabled when group_id is None, so enable_auto_commit probably has
# no effect here - confirm this is intended.
consumer = KafkaConsumer(
    'hai-preprocessed',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    value_deserializer=_decode_json,
)

# Publishes JSON-encoded values to the same local broker.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=_encode_json,
)

In [None]:
# Drift detector with its default settings. The streaming loop feeds it one
# value per message (add_element) and queries it for warning / change states.
ddm = BasicWindowDDM()

In [None]:
# models
clfs = {
    'normal': clf_normal, 
    'attack': clf_attack
}

In [None]:
# Consume the streamed data from Kafka, score every message with the currently
# selected model, feed the score to the drift detector and publish the outcome.
# Model selection starts with the model trained on normal data and toggles
# between 'normal' and 'attack' each time the detector reports a change.
#
# NOTE(review): the per-message accuracy is what gets passed to
# ddm.add_element; confirm BasicWindowDDM expects an accuracy rather than an
# error indicator.
# NOTE(review): the detector is not reset after a model switch; confirm that
# detected_change() does not stay true on the following messages, which would
# make the selection flip on every iteration.
model_label = 'normal'
try:
    for i, msg in enumerate(consumer):
        # payload produced by the consumer's value_deserializer
        data = msg.value

        # one sample per message: wrap features and labels as one-row frames
        # (presumably both are flat dicts of scalars - verify against the producer side)
        X = pd.DataFrame(data['features'], index=[0])
        y = pd.DataFrame(data['labels'], index=[0])

        # score the sample with the active model
        y_pred = clfs[model_label].predict(X)
        accuracy = accuracy_score(y, y_pred)

        # update the drift detector and read back its state
        ddm.add_element(accuracy)
        warning_detected = ddm.detected_warning_zone()
        drift_detected = ddm.detected_change()

        # publish the results (flags are sent as 0/1 integers)
        producer.send(
            'hai-results',
            value={
                'accuracy': accuracy,
                'warning_detected': int(warning_detected),
                'drift_detected': int(drift_detected),
            },
        )

        # periodic progress line
        if i % 1000 == 0:
            print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))

        if drift_detected:
            print(f'Iteration {i} - accuracy: {accuracy} - drift detected: {drift_detected}')
            # switch to the other model
            model_label = 'attack' if model_label == 'normal' else 'normal'
finally:
    # send() only queues the record; the loop normally ends through an
    # interrupt or an error, so push out whatever is still buffered.
    producer.flush()