In [14]:
import os
import pandas as pd
from kafka import KafkaProducer
from time import sleep

In [15]:
class KafkaDataStreamer:
    """
    Replay the rows of a CSV file into a Kafka topic.

    Every row is serialised to a JSON object (column name -> value), encoded
    as UTF-8 and published as one Kafka message.
    """
    def __init__(self, bootstrap_servers, topic):
        """
        Initialize the Kafka producer
        :param bootstrap_servers: bootstrap servers for the Kafka cluster
        :param topic: Kafka topic to which the data is sent
        """
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        # Initialize the Kafka producer (api_version is pinned explicitly,
        # exactly as in the original configuration).
        self.producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers, api_version=(0, 10, 1))

    def stream_data(self, data_path, log_every=10000):
        """
        Read the data from the CSV file and send it to Kafka topic
        :param data_path: the path to the CSV file
        :param log_every: print a progress line every ``log_every`` rows
            (counted by row position, starting at 0); a falsy value such as
            0 or None disables progress output
        :return: None
        """
        data = pd.read_csv(data_path)
        # Count rows by position instead of relying on the DataFrame index
        # label, so progress reporting works even when the index is not a
        # plain 0..n-1 integer range.
        for position, (_, row) in enumerate(data.iterrows()):
            message = row.to_json()
            self.producer.send(self.topic, value=message.encode('utf-8'))
            if log_every and position % log_every == 0:
                print('Sent message #{}'.format(position))
        # send() only queues the record; block until everything queued above
        # has actually been handed to the broker so no rows are silently
        # dropped if the kernel/process exits right after this call.
        self.producer.flush()

    def close(self):
        """
        Flush any pending messages and release the producer's resources
        :return: None
        """
        self.producer.flush()
        self.producer.close()

In [16]:
# Kafka destination
topic = 'hai-input'
bootstrap_servers = ['localhost:9092']  # Point this at your own Kafka brokers

# Input CSV; the OS-specific separator is swapped for '/' so the resulting
# path string is the same on every platform.
data_folder = '../../hai_dataset/hai/hai-21.03'
data_path = os.path.join(data_folder, 'test1.csv').replace(os.sep, '/')

# Replay the stored rows into the Kafka topic
streamer = KafkaDataStreamer(bootstrap_servers=bootstrap_servers, topic=topic)
streamer.stream_data(data_path=data_path)

Sent message #0
Sent message #10000
Sent message #20000
Sent message #30000
Sent message #40000
