In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from drift_detectors.basic_window_ddm import BasicWindowDDM
from sklearn.ensemble import RandomForestClassifier
from kafka import KafkaConsumer, KafkaProducer
import json
from static_data_handlers import TrainLoaderPython, ModelTrainerTester
from joblib import load

In [12]:
# 0.9854401518483369 attack 0 ratio
# 0.9990180262327277 val. accuracy

# Multi Model Drift Detection with Model Selection
This is an experiment in which I use multiple classifiers during drift detection.
Whenever a drift is detected, I switch from the currently active model to the other one; the next detected drift switches back.
So not only do I detect drift, but I also adapt to the concept change by switching to a model that is better suited to the new concept.

## Load the basic model trained on the normal data

In [13]:
# Restore the previously fitted classifier from disk (the model trained on normal data).
base_model_path = 'models/base_model.joblib'
clf_normal = load(base_model_path)

## Attack only data and attack model
- Load data containing attacks
- Filter the rows containing attacks
- Train a model on the attack data

This would of course count as cheating, since we are looking at test data that would normally only become available in real time.
However, it is done purely to build a demo model that performs well when attacks occur.
With that model, we can demonstrate a combined drift detection and model selection approach.

**I'm only doing this here, and not in the Spark pipeline, because it should not be part of the pipeline.**

In [15]:
# Where the CSV lives and which of its columns are labels rather than features.
data_folder = '../../hai_dataset/hai/hai-21.03'
data_filenames = ['test1.csv']
label_columns = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']

# Build the loader for the file(s) listed above.
data_loader = TrainLoaderPython(
    data_folder=data_folder,
    data_filenames=data_filenames,
    label_columns=label_columns,
)

# Load everything, then keep only the rows flagged as attacks (attack > 0).
df_attack = data_loader.get_data().loc[lambda frame: frame['attack'] > 0]

# Train/test split of the attack-only rows.
(
    X_train_attack,
    X_test_attack,
    y_train_attack,
    y_test_attack,
) = data_loader.split_data(data_frame=df_attack)

### Train the attack model

In [None]:
# train the model
# random_state pins the forest's internal randomness so that a
# Restart & Run All yields the same attack model (and the same printed
# accuracy, provided the train/test split itself is deterministic).
clf_attack = RandomForestClassifier(random_state=42)
model_handler = ModelTrainerTester(classifier=clf_attack, X_train=X_train_attack, y_train=y_train_attack)
# train_model() hands back the classifier to use from here on; rebinding
# clf_attack keeps the later `clfs` dictionary pointing at that object.
clf_attack = model_handler.train_model()
# test the model on the held-out part of the attack-only data
_, tr_accuracy = model_handler.test_model(X_test=X_test_attack, y_test=y_test_attack)
print('Model accuracy:', tr_accuracy)

### Create the models dictionary

In [None]:
# Lookup table used by the streaming loop: model label -> classifier.
clfs = dict(normal=clf_normal, attack=clf_attack)

## Define the Kafka consumer and producer

In [None]:
def _json_from_bytes(raw):
    """Decode a UTF-8 encoded JSON payload into the Python object it represents."""
    return json.loads(raw.decode('utf-8'))


def _json_to_bytes(payload):
    """Serialise a Python object to UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Reads JSON records from the 'hai-preprocessed' topic, starting at the
# earliest available offset.
# NOTE(review): no group_id is passed; kafka-python documents that offset
# commits are disabled when group_id is None, so enable_auto_commit=True
# probably has no effect here - confirm this is intended.
consumer = KafkaConsumer(
    'hai-preprocessed',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    value_deserializer=_json_from_bytes,
)

# Publishes JSON-encoded values to the same broker.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=_json_to_bytes,
)

## Define the drift detector with model selection

In [None]:
def drift_detection_with_model_selection(consumer, producer, clfs, drift_detector, starting_model='normal'):
    """Score a message stream record by record and swap classifiers on drift.

    For every message the currently active classifier predicts the labels,
    the accuracy of that single prediction is fed to ``drift_detector``, and
    the outcome is published to the ``'hai-results'`` topic. Each time the
    detector reports a change, the active model toggles between the
    ``'normal'`` and ``'attack'`` entries of ``clfs``.

    Parameters
    ----------
    consumer : iterable
        Yields messages whose ``.value`` is a mapping with a ``'features'``
        and a ``'labels'`` entry (e.g. a ``KafkaConsumer``).
    producer : object
        Must provide ``send(topic, value=...)`` and ``flush()``
        (e.g. a ``KafkaProducer``).
    clfs : mapping
        Model label -> classifier exposing ``predict``. The switching logic
        uses the labels ``'normal'`` and ``'attack'``.
    drift_detector : object
        Must provide ``add_element``, ``detected_warning_zone`` and
        ``detected_change``.
    starting_model : str, default 'normal'
        Label of the classifier that is active for the first message.

    Returns
    -------
    None
        The function returns only when ``consumer`` stops yielding messages.

    Raises
    ------
    KeyError
        If ``starting_model`` is not a key of ``clfs``.
    """
    # Fail before consuming anything instead of on the first message.
    if starting_model not in clfs:
        raise KeyError(
            'starting_model {!r} is not one of the available models: {}'.format(
                starting_model, list(clfs)
            )
        )

    model_label = starting_model
    try:
        for i, msg in enumerate(consumer):
            data = msg.value

            # Build one-row frames; index=[0] is what lets pandas accept a
            # mapping of scalar values (assumes one scalar per column - TODO confirm).
            X = pd.DataFrame(data['features'], index=[0])
            y = pd.DataFrame(data['labels'], index=[0])

            # Predict with whichever model is currently active.
            y_pred = clfs[model_label].predict(X)

            # Accuracy of a single-row batch, i.e. 0.0 or 1.0.
            accuracy = accuracy_score(y, y_pred)

            # NOTE(review): the detector is fed accuracy, not an error
            # indicator; confirm this matches BasicWindowDDM's convention.
            drift_detector.add_element(accuracy)
            warning_detected = drift_detector.detected_warning_zone()
            drift_detected = drift_detector.detected_change()

            # Publish the per-record outcome; flags are cast to int for JSON.
            producer.send(
                'hai-results',
                value={
                    'drift_detector': 'multi_model',
                    'accuracy': accuracy,
                    'warning_detected': int(warning_detected),
                    'drift_detected': int(drift_detected)
                }
            )

            # Periodic progress line.
            if i % 1000 == 0:
                print('Iteration {}'.format(i), 'Accuracy {}'.format(accuracy))

            if drift_detected:
                print(f'Iteration {i} - accuracy: {accuracy} - drift detected: {drift_detected}')
                # Toggle to the other model.
                # NOTE(review): the detector is not reset after a switch;
                # confirm it does not need an explicit reset.
                model_label = 'attack' if model_label == 'normal' else 'normal'
    finally:
        # send() only queues records; flush so nothing buffered is lost when
        # the stream ends, an error occurs, or the kernel is interrupted.
        producer.flush()

## Run the drift detection with model selection

In [None]:
# Start streaming with a fresh detector; the default starting model ('normal') is used.
drift_detector = BasicWindowDDM()
drift_detection_with_model_selection(
    consumer=consumer,
    producer=producer,
    clfs=clfs,
    drift_detector=drift_detector,
)