# Producer 1

Write a python program that loads all the data from
climate_streaming.csv and randomly (with replacement) feed the data to the
stream every 10 seconds. You will need to append additional information such as producer information to identify the producer and created date.

In [1]:
import random

from time import sleep
from json import dumps
from kafka3 import KafkaProducer
from pymongo import MongoClient

import datetime as dt
import pandas as pd

In [2]:
# read csv
climate_streaming_df = pd.read_csv("dataset/climate_streaming.csv")
print(climate_streaming_df.dtypes)

# striping whitespace from column and separating precipitation type
climate_streaming_df.rename(columns={"precipitation ": "precipitation"}, inplace=True)
climate_streaming_df["precipitation_type"] = (
    climate_streaming_df["precipitation"].str.strip().str[-1]
)
climate_streaming_df["precipitation"] = (
    climate_streaming_df["precipitation"].str.strip().str[:-1].astype(float)
)

climate_streaming_df.head()

latitude                   float64
longitude                  float64
air_temperature_celcius      int64
relative_humidity          float64
windspeed_knots            float64
max_wind_speed             float64
precipitation               object
GHI_w/m2                     int64
dtype: object


Unnamed: 0,latitude,longitude,air_temperature_celcius,relative_humidity,windspeed_knots,max_wind_speed,precipitation,GHI_w/m2,precipitation_type
0,-37.623,149.323,19,56.8,7.9,11.1,0.0,154,I
1,-38.038,142.986,15,50.7,9.2,13.0,0.02,128,G
2,-37.95,142.366,16,53.6,8.1,15.0,0.0,133,G
3,-38.231,147.172,24,61.6,7.7,14.0,0.0,186,I
4,-37.903,145.25,24,62.3,7.0,13.0,0.0,185,I


In [3]:
# initializing mongo connection to database
client = MongoClient()
db = client.fit3182_assignment_db
collection = db.climate_historic

# finding latest date
result = collection.find().sort("date", -1).limit(1)
latest_date = list(result)[0]["date"]
latest_date

datetime.datetime(2023, 3, 24, 0, 0)

In [4]:
def process_data(df):
    dataset = []
    for index, row in df.iterrows():
        # convert each row to dictionary
        data = row.to_dict()
        dataset.append(data)
    return dataset


def publish_message(producer_instance, topic_name, data):
    try:
        # send message to kafka asynchronously
        producer_instance.send(topic_name, value=data)
        # wait for all outstanding message to be persisted to disk
        producer_instance.flush()
        print("Message published successfully. Data: " + str(data))
    except Exception as ex:
        print("Exception in publishing message.")
        print(str(ex))


def connect_kafka_producer(host):
    _producer = None
    try:
        # serializer used to serialize data to json instead of string
        _producer = KafkaProducer(
            bootstrap_servers=[f"{host}:9092"],
            value_serializer=lambda x: dumps(x).encode("ascii"),
            api_version=(0, 10),
        )
    except Exception as ex:
        print("Exception while connecting Kafka.")
        print(str(ex))
    finally:
        return _producer

In [5]:
TOPIC = "climate"
HOST = "localhost"

producer = connect_kafka_producer(HOST)
dataset = process_data(climate_streaming_df)
current_date = latest_date

print("Publishing records..")
while True:
    # increment date after every iteration
    current_date += dt.timedelta(days=1)

    # get random row and add metadata with replacement
    selection = random.choice(dataset)
    # transform date to string to be serialized
    selection["date"] = current_date.strftime("%Y-%m-%d")

    # adding metadata field
    # producer id to identify type of producer
    # station - set as constant to conform to data model
    selection["producer_id"] = "climate_producer"
    selection["station"] = 948700

    publish_message(producer, TOPIC, selection)

    sleep(10)  # sleep for 10 seconds before publishing next message

Publishing records..
Message published successfully. Data: {'latitude': -37.236, 'longitude': 141.176, 'air_temperature_celcius': 14, 'relative_humidity': 48.1, 'windspeed_knots': 13.7, 'max_wind_speed': 19.0, 'precipitation': 0.35, 'GHI_w/m2': 122, 'precipitation_type': 'G', 'date': '2023-03-25', 'producer_id': 'climate_producer', 'station': 948700}
Message published successfully. Data: {'latitude': -36.779, 'longitude': 146.108, 'air_temperature_celcius': 13, 'relative_humidity': 42.0, 'windspeed_knots': 11.4, 'max_wind_speed': 16.9, 'precipitation': 0.0, 'GHI_w/m2': 119, 'precipitation_type': 'G', 'date': '2023-03-26', 'producer_id': 'climate_producer', 'station': 948700}
Message published successfully. Data: {'latitude': -35.321, 'longitude': 143.502, 'air_temperature_celcius': 12, 'relative_humidity': 44.0, 'windspeed_knots': 4.7, 'max_wind_speed': 7.0, 'precipitation': 0.0, 'GHI_w/m2': 108, 'precipitation_type': 'G', 'date': '2023-03-27', 'producer_id': 'climate_producer', 'stati

KeyboardInterrupt: 