# Producer 1

Write a Python program that loads all the data from
climate_streaming.csv and randomly (with replacement) feeds the data to the
stream every 10 seconds. You will need to append additional information, such as producer information to identify the producer and the created date.

In [None]:
import random

from time import sleep
from json import dumps
from kafka3 import KafkaProducer
from pymongo import MongoClient

import datetime as dt
import pandas as pd

In [None]:
# load the streaming climate observations from disk
climate_streaming_df = pd.read_csv("dataset/climate_streaming.csv")
print(climate_streaming_df.dtypes)

# the raw header carries a trailing space; normalise the column name first
climate_streaming_df.rename(columns={"precipitation ": "precipitation"}, inplace=True)

# strip the surrounding whitespace once, then split each value into its
# trailing one-character type flag and the leading numeric amount
_precipitation_raw = climate_streaming_df["precipitation"].str.strip()
climate_streaming_df["precipitation_type"] = _precipitation_raw.str[-1]
climate_streaming_df["precipitation"] = _precipitation_raw.str[:-1].astype(float)

climate_streaming_df.head()

In [None]:
# initializing mongo connection and setting up database
client = MongoClient()
db = client.fit3182_assignment_db
collection = db.climate_historic

# finding latest date: fetch the single document with the greatest "date"
latest_record = collection.find_one(sort=[("date", -1)])
if latest_record is None:
    # an empty collection has no date to continue from; say so explicitly
    # instead of failing with a bare IndexError on an empty list
    raise LookupError(
        "Collection 'climate_historic' is empty; cannot determine the latest date."
    )
latest_date = latest_record["date"]
latest_date

In [None]:
def process_data():
    """Return every row of ``climate_streaming_df`` as a list of dicts.

    Rows are visited in DataFrame order via ``iterrows()`` and each one is
    converted with ``Series.to_dict()``, so every dict maps column name to
    that row's value.
    """
    return [record.to_dict() for _, record in climate_streaming_df.iterrows()]


def publish_message(producer_instance, topic_name, data):
    """Send ``data`` to ``topic_name`` and flush the producer, reporting on stdout.

    Any ``Exception`` raised while sending, flushing or printing the success
    line is caught and printed rather than re-raised. Returns ``None``.
    """
    try:
        # hand the record to the producer, then block in flush() before
        # reporting success
        producer_instance.send(topic_name, value=data)
        producer_instance.flush()
        print(f"Message published successfully. Data: {data!s}")
    except Exception as error:
        print("Exception in publishing message.")
        print(error)


def connect_kafka_producer(host):
    """Create a Kafka producer that JSON-encodes values for the broker at ``host``.

    Args:
        host: Hostname or IP address of the Kafka broker; port 9092 is appended.

    Returns:
        The constructed ``KafkaProducer``, or ``None`` when construction
        raised an ``Exception`` (the error is printed to stdout).

    Note:
        The result is returned from the ``try``/``except`` branches rather
        than from a ``finally`` clause. A ``return`` inside ``finally``
        discards any in-flight exception, including ``KeyboardInterrupt``
        and ``SystemExit``, which must be allowed to propagate.
    """
    try:
        return KafkaProducer(
            bootstrap_servers=[f"{host}:9092"],
            # serializer to serialize data to json instead of string
            value_serializer=lambda x: dumps(x).encode("ascii"),
            api_version=(0, 10),
        )
    except Exception as ex:
        print("Exception while connecting Kafka.")
        print(str(ex))
        return None

In [None]:
TOPIC = "climate"
HOST = "localhost"

producer = connect_kafka_producer(HOST)
if producer is None:
    # connect_kafka_producer returns None when the producer could not be
    # created; without this check the loop below would spin forever,
    # printing a publishing error every 10 seconds
    raise RuntimeError(f"Could not connect to Kafka at {HOST}:9092.")

dataset = process_data()
current_date = latest_date

print("Publishing records..")
while True:
    # increment date after every iteration
    current_date += dt.timedelta(days=1)

    # get random row (with replacement) and build a NEW dict for the message,
    # so the metadata below never mutates the shared rows held in `dataset`
    selection = {
        **random.choice(dataset),
        # transform date to string to be serialized
        "date": current_date.strftime("%Y-%m-%d"),
        # producer id to identify type of producer
        "producer_id": "climate_producer",
        # station - set as constant to conform to data model
        "station": 948700,
    }

    publish_message(producer, TOPIC, selection)

    sleep(10)  # sleep for 10 seconds before publishing next message