In [29]:
from kafka import KafkaProducer
from kafka import KafkaConsumer
from json import loads
import json
from time import sleep
import pandas as pd

In [30]:

def json_serializer(data):
    """Encode ``data`` as a JSON document and return it as UTF-8 bytes.

    Used as the ``value_serializer`` of the Kafka producer in this notebook.
    """
    payload = json.dumps(data)
    return bytes(payload, "utf-8")

In [31]:
def data_preprocessing(df):
    """Drop the timestamp and per-process attack columns from ``df``.

    The ``attack`` column is an aggregation of ``attack_P1``, ``attack_P2``
    and ``attack_P3``, so only ``attack`` is kept as the label.

    Returns:
        A ``(frame, preprocessed)`` tuple. When every column to drop is
        present, ``frame`` is a copy without them and ``preprocessed`` is
        True. Otherwise the input frame is returned unchanged together
        with False.
    """
    redundant_cols = ['time', 'attack_P1', 'attack_P2', 'attack_P3']

    # All-or-nothing: a frame missing any expected column is left untouched.
    absent = [name for name in redundant_cols if name not in df.columns]
    if absent:
        return df, False

    return df.drop(columns=redundant_cols), True

In [32]:
class KafkaDataStreamer:
    """Publish the rows of a DataFrame to a Kafka topic, one message per row.

    Each row is sent as a ``{column: value}`` dict; the producer's
    ``value_serializer`` (``json_serializer``) turns it into bytes.
    """

    def __init__(self, bootstrap_servers, topic):
        """Create the producer.

        Args:
            bootstrap_servers: broker address list handed to ``KafkaProducer``.
            topic: name of the topic every row is sent to.
        """
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        # Initialize the Kafka producer
        self.producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                      value_serializer=json_serializer,
                                      api_version=(0, 10, 1))

    def stream_data(self, data, flush=False):
        """Send every row of ``data`` to ``self.topic``.

        Args:
            data: DataFrame whose rows are sent in order.
            flush: when True, call ``producer.flush()`` after the rows have
                been handed to the producer. Defaults to False, which only
                enqueues the messages (the previous behaviour).
        """
        # ``to_dict(orient="records")`` converts column by column, so an
        # integer column stays an int. ``iterrows()`` builds one Series per
        # row, which upcasts mixed int/float rows to float (0 -> 0.0).
        for message in data.to_dict(orient="records"):
            self.producer.send(self.topic, value=message)

        if flush:
            self.producer.flush()

In [33]:
# Kafka settings used by the cells below.
bootstrap_servers = ['localhost:9092']      # broker address list
topic_producer = 'hai-preprocessed-mao-3'   # topic the preprocessed rows are sent to

In [36]:
class HaiConsumer:
    """Consume JSON rows from a Kafka topic, preprocess them and re-publish.

    Each consumed message is turned into a one-row DataFrame, passed through
    ``data_preprocessing`` and, if it was preprocessed, forwarded to the
    output topic through a ``KafkaDataStreamer``.
    """

    def __init__(self, topic, bootstrap_servers, output_topic=None):
        """Create the Kafka consumer.

        Args:
            topic: input topic to read from.
            bootstrap_servers: broker address list; also used for the
                producer created in ``consume``.
            output_topic: topic the preprocessed rows are sent to. When None
                (the default) the notebook-level ``topic_producer`` is used,
                as before.
        """
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.output_topic = output_topic
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        """Forward every preprocessable message from the input topic.

        Runs until iteration over the consumer ends or an exception (for
        example a kernel interrupt) is raised. Messages that
        ``data_preprocessing`` reports as not preprocessed are skipped.
        """
        # Fall back to the notebook-level default only when no output topic
        # was given, so the instance does not depend on hidden global state.
        destination = topic_producer if self.output_topic is None else self.output_topic
        # Use the servers this instance was configured with, not the global.
        streamer = KafkaDataStreamer(self.bootstrap_servers, destination)
        print('Started')

        try:
            for message in self.consumer:
                df, preprocessed = data_preprocessing(pd.DataFrame([message.value]))
                if not preprocessed:
                    continue

                print(df)
                streamer.stream_data(df)
        finally:
            # The producer is owned by this call: deliver what is still
            # buffered and release its connections even on interruption.
            streamer.producer.flush()
            streamer.producer.close()


            

In [37]:
# Read rows from the input topic and run the consume loop; this call does not
# return until iteration over the Kafka consumer ends.
topic = 'hai-input-mao-3'
consumer = HaiConsumer(topic=topic, bootstrap_servers=bootstrap_servers)
consumer.consume()

Started
   P1_B2004  P1_B2016  P1_B3004   P1_B3005  P1_B4002  P1_B4005   P1_B400B  \
0    0.0982    1.5937  461.9883  1099.7363      32.0   67.5559  1989.2568   

   P1_B4022  P1_FCV01D  P1_FCV01Z  ...  P4_HT_PO  P4_HT_PS     P4_LD  \
0   36.2389    29.2474    25.7507  ...   76.5878         0  450.2677   

   P4_ST_FD  P4_ST_LD  P4_ST_PO  P4_ST_PS  P4_ST_PT01  P4_ST_TT01  attack  
0    0.0027  372.7032  380.7147       0.0     10053.0     27603.0       0  

[1 rows x 60 columns]
   P1_B2004  P1_B2016  P1_B3004   P1_B3005  P1_B4002  P1_B4005   P1_B400B  \
0    0.0982    1.5913  461.9883  1099.7363      32.0   67.6609  1925.8107   

   P1_B4022  P1_FCV01D  P1_FCV01Z  ...  P4_HT_PO  P4_HT_PS     P4_LD  \
0   36.2328    28.6755    27.2919  ...   76.4612         0  446.1987   

   P4_ST_FD  P4_ST_LD  P4_ST_PO  P4_ST_PS  P4_ST_PT01  P4_ST_TT01  attack  
0    0.0003  372.2874  378.3276       0.0     10053.0     27599.0       0  

[1 rows x 60 columns]
   P1_B2004  P1_B2016  P1_B3004   P1_B3005