In [7]:
import os
import pandas as pd
from kafka import KafkaProducer
from time import sleep
import json

In [8]:

def json_serializer(data):
    """
    Turn a JSON-compatible Python object into UTF-8 encoded bytes.

    :param data: object accepted by ``json.dumps`` (dict, list, str, number, ...)
    :return: the JSON text for ``data`` as a ``bytes`` object
    """
    json_text = json.dumps(data)
    return json_text.encode("utf-8")

In [9]:
class KafkaDataStreamer:
    """
    Stream rows of a CSV file to a Kafka topic.

    Rows are read with pandas, converted to dicts (column name -> value) and
    handed to a ``KafkaProducer`` that serializes each value with
    ``json_serializer``.
    """

    def __init__(self, bootstrap_servers, topic):
        """
        Initialize the Kafka producer
        :param bootstrap_servers: bootstrap servers for the Kafka cluster
        :param topic: Kafka topic to which the data is sent
        """
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        # The producer is created eagerly and kept on the instance so that
        # callers can reach it (e.g. to close it) after streaming.
        self.producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                      value_serializer=json_serializer,
                                      api_version=(0, 10, 1))

    def stream_data(self, data_path, first_row=5000, last_row=7001):
        """
        Read the CSV file and send a window of its rows to the Kafka topic.

        The file is parsed with ``;`` as the field separator. Data rows are
        numbered from 1 (the header line is not counted) and both bounds are
        inclusive. The defaults reproduce the window that used to be
        hard-coded in this method: rows 5000 through 7001.

        :param data_path: the path to the CSV file
        :param first_row: 1-based number of the first data row to send
        :param last_row: 1-based number of the last data row to send
            (inclusive); ``None`` streams until the end of the file
        :return: None
        :raises ValueError: if ``first_row`` is smaller than 1
        """
        if first_row < 1:
            raise ValueError("first_row must be >= 1")
        if last_row is not None and last_row < first_row:
            # Empty window: nothing to read or send.
            return

        data = pd.read_csv(data_path, sep=';')

        # Row k sits at positional index k - 1. iloc's stop bound is exclusive,
        # so using last_row directly makes the 1-based upper bound inclusive.
        # Slicing out of range simply yields fewer (or zero) rows.
        window = data.iloc[first_row - 1:last_row]

        for _, row in window.iterrows():
            self.producer.send(self.topic, value=row.to_dict())

        # send() only queues the record for a background sender; block here
        # until everything queued above has actually been handed to the broker.
        self.producer.flush()
            
            

In [10]:
# Kafka connection settings -- point the broker list at your own cluster.
bootstrap_servers = ['localhost:9092']
topic = 'hai-input-mao-3'

# CSV file whose rows are replayed onto the topic.
data_path = '../data_loading/hai-23_05/test1.csv'

# Build the streamer and push the stored rows to Kafka.
streamer = KafkaDataStreamer(bootstrap_servers=bootstrap_servers, topic=topic)
streamer.stream_data(data_path=data_path)