# Producer 2

Write a Python program that loads all the data from
hotspot_AQUA_streaming.csv and randomly (with replacement) feeds the data to
the stream every 2 seconds. AQUA is a NASA satellite that reports the latitude,
longitude, confidence and surface temperature of a location. You will need to
append additional information to each record, such as producer information
that identifies the producer and the created date & time.

In [None]:
import random
from time import sleep
from json import dumps
from kafka3 import KafkaProducer
from pymongo import MongoClient

import datetime as dt
import pandas as pd

In [None]:
# Load the AQUA hotspot CSV into a DataFrame; its rows are the records that
# get published to the stream further down.
hotspot_streaming_df = pd.read_csv(
    filepath_or_buffer="dataset/hotspot_AQUA_streaming.csv"
)
# Show the first five rows as the cell output.
hotspot_streaming_df.head(n=5)

In [None]:
# Connect to MongoDB using the driver's default connection settings and
# select the collection that holds the historic climate documents.
client = MongoClient()
db = client.fit3182_assignment_db
collection = db.climate_historic

# Fetch only the most recent document: sort by "date" descending, keep one.
result = collection.find().sort("date", -1).limit(1)
latest_record = next(result, None)
if latest_record is None:
    # An empty collection would otherwise fail with an opaque IndexError.
    raise RuntimeError(
        "No documents found in climate_historic; cannot determine the latest date."
    )
latest_date = latest_record["date"]
latest_date

In [None]:
def process_data(df):
    """Turn every row of ``df`` into a dictionary keyed by column name.

    Args:
        df: DataFrame whose rows should be converted.

    Returns:
        A list with one ``dict`` per row, in the DataFrame's row order.
    """
    # iterrows() yields (index, Series) pairs; only the Series is needed.
    return [record.to_dict() for _, record in df.iterrows()]


def publish_message(producer_instance, topic_name, data):
    """Send ``data`` to a Kafka topic and report whether delivery succeeded.

    Args:
        producer_instance: Producer object exposing ``send`` and ``flush``.
        topic_name: Name of the topic to publish to.
        data: Message payload, handed to the producer as ``value``.

    Returns:
        None. The outcome is printed to stdout; failures are printed rather
        than raised so the caller can keep publishing.
    """
    try:
        # send() is asynchronous: it only queues the record. Keep the returned
        # future so the delivery result can be checked afterwards.
        future = producer_instance.send(topic_name, value=data)
        # Block until every queued record has been sent (or has failed).
        producer_instance.flush()
        # flush() does not raise when a delivery fails; get() re-raises the
        # delivery error, so success is only reported when it really happened.
        future.get(timeout=10)
        print("Message published successfully. Data: " + str(data))
    except Exception as ex:
        print("Exception in publishing message.")
        print(str(ex))


def connect_kafka_producer(host):
    """Create a Kafka producer that JSON-encodes message values.

    Args:
        host: Hostname of the Kafka broker; port 9092 is appended.

    Returns:
        The ``KafkaProducer`` on success, or ``None`` when creating it raised
        an ``Exception`` (the error is printed). Non-``Exception`` signals such
        as ``KeyboardInterrupt`` propagate to the caller.
    """
    # No ``return`` inside ``finally`` here: that would silently discard any
    # in-flight exception, including KeyboardInterrupt and SystemExit.
    try:
        return KafkaProducer(
            bootstrap_servers=[f"{host}:9092"],
            # Serialize each value to JSON text, then to bytes. dumps()
            # escapes non-ASCII characters by default, so "ascii" cannot fail.
            value_serializer=lambda x: dumps(x).encode("ascii"),
            api_version=(0, 10),
        )
    except Exception as ex:
        print("Exception while connecting Kafka.")
        print(str(ex))
        return None

In [None]:
TOPIC = "hotspot"
HOST = "localhost"
PRODUCER_ID = "aqua_producer"
SEND_INTERVAL_SECONDS = 2
# 10 real seconds represent 24 simulated hours, so each 2-second send
# interval advances the simulated clock by (24 / 10) * 2 = 4.8 hours.
SIMULATED_STEP = dt.timedelta(hours=4.8)

producer = connect_kafka_producer(HOST)
if producer is None:
    # Without a producer every publish attempt below would fail forever.
    raise RuntimeError(f"Could not create a Kafka producer for host {HOST!r}.")

dataset = process_data(hotspot_streaming_df)
# Simulated timestamps continue from the newest date in the historic data.
current_date = latest_date

print("Publishing records..")
while True:
    # Advance the simulated clock once per published message.
    current_date += SIMULATED_STEP

    # random.choice draws independently each time, i.e. with replacement.
    record = random.choice(dataset)

    # Build a fresh message instead of writing the metadata into the shared
    # dataset entry, so the loaded records stay unmodified.
    selection = {
        **record,
        "datetime": current_date.isoformat(),
        "producer_id": PRODUCER_ID,
    }

    publish_message(producer, TOPIC, selection)

    # Wait before publishing the next message.
    sleep(SEND_INTERVAL_SECONDS)