In [1]:
import os
import pandas as pd
from kafka import KafkaProducer
from time import sleep
import json

In [2]:

def json_serializer(data):
    """
    Turn a JSON-serializable object into the bytes payload used as a message value.

    :param data: any object accepted by ``json.dumps``
    :return: the JSON text of ``data`` encoded as UTF-8 bytes
    """
    json_text = json.dumps(data)
    return bytes(json_text, "utf-8")

In [3]:
def split_dataset(df, n_per_class=2500, random_state=None):
    """
    Build a shuffled sample containing normal and attack rows.

    The first ``n_per_class`` rows whose ``attack`` column equals 1 and the
    first ``n_per_class`` rows whose ``attack`` column does not equal 1 are
    concatenated, shuffled, and re-indexed from 0.

    :param df: DataFrame with an ``attack`` column (1 marks an attack row)
    :param n_per_class: maximum number of rows taken from each class
    :param random_state: optional seed forwarded to ``DataFrame.sample`` so the
        shuffle order can be reproduced; ``None`` keeps the shuffle random
    :return: new DataFrame with at most ``2 * n_per_class`` rows and a fresh
        0-based index
    """
    is_attack = df['attack'] == 1

    df_attack = df[is_attack].head(n_per_class)
    # Take the normal rows from the complement of the attack mask; using the
    # unfiltered frame here would let attack rows into the "normal" half and
    # duplicate them in the result.
    df_normal = df[~is_attack].head(n_per_class)

    combined = pd.concat([df_normal, df_attack])

    # frac=1 returns every row in random order, i.e. a shuffle.
    return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [4]:
class KafkaDataStreamer:
    """
    Class to stream data to Kafka
    The data is read from a CSV file and sent to Kafka topic

    A ``KafkaProducer`` is created when the object is constructed; every row
    is sent as a JSON-encoded dict (see ``json_serializer``).
    """
    def __init__(self, bootstrap_servers, topic):
        """
        Initialize the Kafka producer
        :param bootstrap_servers: bootstrap servers for the Kafka cluster
        :param topic: Kafka topic to which the data is sent
        """
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        # Initialize the Kafka producer; message values are serialized with
        # json_serializer and the broker API version is pinned to (0, 10, 1).
        self.producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                      value_serializer=json_serializer,
                                      api_version=(0, 10, 1))

    def stream_data(self, data_path, first_row=5000, last_row=7001):
        """
        Read the data from the CSV file and send a window of its rows to the
        Kafka topic.

        The CSV is read with ``;`` as separator and passed through
        ``split_dataset`` before the window is selected. Row positions are
        1-based and refer to the frame returned by ``split_dataset``.

        :param data_path: the path to the CSV file
        :param first_row: 1-based position of the first row to send
        :param last_row: 1-based position of the last row to send (inclusive)
        :return: None
        """
        data = pd.read_csv(data_path, sep=';')
        data = split_dataset(data)
        print(f'data.shape: {data.shape}')

        # NOTE(review): split_dataset as defined in this notebook returns at
        # most 5000 rows, so with the default window at most one row (row
        # 5000) is sent — TODO confirm the intended window.
        # iloc is 0-based and end-exclusive, hence the -1 on the start only.
        # The max() guards keep non-positive bounds from being interpreted as
        # "count from the end".
        window = data.iloc[max(first_row - 1, 0):max(last_row, 0)]

        for _, row in window.iterrows():
            message = row.to_dict()
            self.producer.send(self.topic, value=message)

        # send() only queues the record; block until everything queued above
        # has actually been transmitted so no message is lost if the kernel
        # stops right after this call.
        self.producer.flush()
            
            

In [5]:
# Path of the CSV file to replay; stream_data reads it with ';' as separator
data_path = '../data_loading/hai-23_05/test1.csv'
bootstrap_servers = ['localhost:9092']  # Update with your Kafka bootstrap servers
# Kafka topic that receives the streamed rows
topic = 'hai-input-albert-1'

# Stream the stored data to Kafka
# Constructing the streamer creates the KafkaProducer for the servers above
streamer = KafkaDataStreamer(bootstrap_servers, topic)
streamer.stream_data(data_path)

data.shape: (5500, 64)
{'time': '2019-10-31 10:30:38', 'P1_B2004': 0.1, 'P1_B2016': 1.2145, 'P1_B3004': 461.5749, 'P1_B3005': 1095.085, 'P1_B4002': 32.0, 'P1_B4005': 9.8324, 'P1_B400B': 228.6099, 'P1_B4022': 35.2863, 'P1_FCV01D': 15.9057, 'P1_FCV01Z': 15.1382, 'P1_FCV02D': 0.0, 'P1_FCV02Z': 52.1942, 'P1_FCV03D': 50.5623, 'P1_FCV03Z': 51.5991, 'P1_FT01': 177.9937, 'P1_FT01Z': 838.3027, 'P1_FT02': 41.2369, 'P1_FT02Z': 208.748, 'P1_FT03': 302.5436, 'P1_FT03Z': 1096.2152, 'P1_LCV01D': 15.3756, 'P1_LCV01Z': 14.7644, 'P1_LIT01': 460.0116, 'P1_PCV01D': 95.4414, 'P1_PCV01Z': 98.085, 'P1_PCV02D': 12.0, 'P1_PCV02Z': 12.2238, 'P1_PIT01': 1.3261, 'P1_PIT02': 0.2237, 'P1_TIT01': 35.4675, 'P1_TIT02': 36.2915, 'P2_24Vdc': 28.0186, 'P2_Auto': 1, 'P2_Emgy': 0, 'P2_On': 1, 'P2_SD01': 0, 'P2_SIT01': 778.0, 'P2_TripEx': 0, 'P2_VT01e': 11.7162, 'P2_VXT02': -3.8435, 'P2_VXT03': -2.0176, 'P2_VYT02': -0.55, 'P2_VYT03': 0.8379, 'P3_LCP01D': 11496, 'P3_LCV01D': 352, 'P3_LH': 70, 'P3_LL': 10, 'P3_LT01': 26.161, 