# Streaming Application

In [None]:
import json
import os

from datetime import datetime
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pprint import pprint

import pygeohash as pgh

# Arguments PySpark passes on submit: request the two Kafka connector
# packages (Scala 2.12 builds, version 3.3.0) and launch the pyspark shell.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,"
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 "
    "pyspark-shell"
)

In [None]:
# Streaming configuration constants used by the cells below.
_TOPICS = ("climate", "hotspot")
TOPIC = ",".join(_TOPICS)  # comma-separated list: the stream reads both topics
HOST = "localhost"  # host name of the Kafka broker
BATCH_INTERVAL = 10  # micro-batch trigger interval, in seconds

In [None]:
# Start (or reuse) a Spark session with a local master for this application.
_session_builder = SparkSession.builder.master("local[*]").appName(
    "Climate-Hotspot-Analysis"
)
spark = _session_builder.getOrCreate()

# Streaming DataFrame over the Kafka topics named in TOPIC, served by the
# broker at HOST:9092.
_kafka_options = {
    "kafka.bootstrap.servers": f"{HOST}:9092",
    "subscribe": TOPIC,
}
_kafka_reader = spark.readStream.format("kafka")
for _option_name, _option_value in _kafka_options.items():
    _kafka_reader = _kafka_reader.option(_option_name, _option_value)
kafka_sdf = _kafka_reader.load()

In [None]:
def submit_datapoint(measurement):
    """Insert one measurement document into MongoDB.

    Opens a client with the default ``MongoClient()`` settings and writes
    ``measurement`` to the ``climate_historic`` collection of the
    ``fit3182_assignment_db`` database. Insert failures are reported on
    stdout instead of being raised.

    Args:
        measurement: The document (dict) to insert.
    """
    client = MongoClient()
    try:
        # insert document into db per batch
        client.fit3182_assignment_db.climate_historic.insert_one(measurement)
    except Exception as ex:
        print("Exception Occured. Message: {0}".format(str(ex)))
    finally:
        # Always release the connection, even when the insert is interrupted
        # by something that is not an ``Exception`` (e.g. KeyboardInterrupt).
        client.close()


def group_hotspots(aqua, terra):
    """Merge hotspot records by geohash and flag cross-satellite detections.

    Records sharing the same ``"geohash"`` value are treated as the same
    location. A location becomes a fire event only when it appears in both
    ``aqua`` and ``terra``; repeated records from a single satellite do not
    count.

    Args:
        aqua: Iterable of hotspot dicts from the Aqua producer. Each needs
            the keys ``"geohash"``, ``"surface_temperature_celcius"`` and
            ``"confidence"`` (the latter two only when records are merged).
        terra: Iterable of hotspot dicts from the Terra producer, same shape.

    Returns:
        tuple: ``(fire_events, hotspots)`` where ``fire_events`` is the set
        of geohashes seen by both satellites and ``hotspots`` is a list with
        one dict per distinct geohash. When several records share a geohash,
        the result is a copy of the first record with the surface temperature
        and confidence replaced by the group averages rounded to 2 decimals;
        the input records are left unmodified.
    """
    groups = {}
    for record in aqua:
        groups.setdefault(record["geohash"], []).append(record)

    # Snapshot the Aqua locations now. Testing against `groups` itself would
    # also match locations that only Terra reported more than once, because
    # `groups` keeps growing while the Terra records are processed.
    aqua_geohashes = set(groups)

    fire_events = set()
    for record in terra:
        geohash = record["geohash"]
        # fire event: the same location was also reported by the other satellite
        if geohash in aqua_geohashes:
            fire_events.add(geohash)
        groups.setdefault(geohash, []).append(record)

    hotspots = []
    for group in groups.values():
        if len(group) == 1:
            hotspots.append(group[0])
            continue

        # Several records for one location: average their measurements on a
        # copy so the caller's first record is not overwritten.
        count = len(group)
        merged = dict(group[0])
        merged["surface_temperature_celcius"] = round(
            sum(record["surface_temperature_celcius"] for record in group) / count,
            2,
        )
        merged["confidence"] = round(
            sum(record["confidence"] for record in group) / count, 2
        )
        hotspots.append(merged)

    return fire_events, hotspots


def find_correlation(climate, hotspots, fire_events):
    """Embed nearby hotspots into a climate record and tidy it for storage.

    A hotspot is attached only when it is close to the climate reading, i.e.
    both geohashes share the same first 3 characters; other hotspots are
    ignored. An attached hotspot whose full geohash is in ``fire_events``
    gets a ``"cause"``: ``"natural"`` when the air temperature is above 20
    and the GHI above 180, otherwise ``"other"``.

    Both ``climate`` and the attached hotspot dicts are modified in place:
    bookkeeping keys are removed, each hotspot's ``"datetime"`` is replaced
    by a ``"timestamp"`` string (``HH:MM:SS``), and the climate ``"date"``
    string is parsed into a ``datetime``.

    Args:
        climate: Climate dict with at least ``"geohash"`` (when hotspots are
            given), ``"date"`` (``YYYY-MM-DD``), ``"air_temperature_celcius"``
            and ``"GHI_w/m2"``.
        hotspots: List of hotspot dicts, each with ``"geohash"`` and
            ``"datetime"`` (``YYYY-MM-DDTHH:MM:SS``); may be empty or None.
        fire_events: Collection of geohashes that count as fire events.

    Returns:
        The same ``climate`` dict, now carrying a ``"hotspots"`` list.
    """
    climate["hotspots"] = []

    if hotspots:
        climate_area = climate["geohash"][:3]
        for hotspot in hotspots:
            # relate climate and hotspot data only if they are close, i.e.
            # same geohash at precision 3; distant hotspots are skipped
            if hotspot["geohash"][:3] != climate_area:
                continue

            # a cause is assigned only to hotspots that are fire events
            if hotspot["geohash"] in fire_events:
                is_natural = (
                    climate["air_temperature_celcius"] > 20
                    and climate["GHI_w/m2"] > 180
                )
                hotspot["cause"] = "natural" if is_natural else "other"
                print(f"{hotspot['cause']} fire detected!")

            # clean up hotspot data: keep only the time of day
            hotspot["timestamp"] = datetime.strptime(
                hotspot["datetime"], "%Y-%m-%dT%H:%M:%S"
            ).strftime("%H:%M:%S")
            for key in ("producer_id", "geohash", "datetime"):
                # tolerate records that never carried the bookkeeping key
                hotspot.pop(key, None)

            climate["hotspots"].append(hotspot)

    # clean up climate data
    for key in ("producer_id", "latitude", "longitude", "geohash"):
        climate.pop(key, None)

    climate["date"] = datetime.strptime(climate["date"], "%Y-%m-%d")
    return climate


def process_batch(df, epoch_id):
    """Process one micro-batch of Kafka records.

    Every record value is decoded from JSON, tagged with a precision-5
    geohash of its latitude/longitude, and sorted by ``producer_id`` into
    climate, Aqua or Terra data. When the batch holds a climate record (the
    last one wins), the hotspots are grouped, correlated with it and the
    resulting document is stored; otherwise nothing is written.

    Args:
        df: The batch DataFrame; its rows expose the message in ``value``.
        epoch_id: Batch identifier supplied by the caller (unused).
    """
    latest_climate = None
    aqua_records, terra_records = [], []

    for row in df.collect():
        payload = json.loads(row.value)

        # derive the location key from the coordinates
        payload["geohash"] = pgh.encode(
            payload["latitude"], payload["longitude"], precision=5
        )

        # route the record according to who produced it
        if payload["producer_id"] == "climate_producer":
            latest_climate = payload
            continue
        if payload["producer_id"] == "aqua_producer":
            aqua_records.append(payload)
            continue
        if payload["producer_id"] == "terra_producer":
            terra_records.append(payload)
            continue
        print("Invalid producer_id....Skipping")

    climate_count = 1 if latest_climate else 0
    print(
        f"received {climate_count} climate records, {len(aqua_records)} aqua records, {len(terra_records)} terra records"
    )

    # A climate record is the basis of the stored document, so a batch
    # without one is not processed any further.
    if not latest_climate:
        return

    fire_events, hotspots = group_hotspots(aqua_records, terra_records)
    submit_datapoint(find_correlation(latest_climate, hotspots, fire_events))

In [None]:
# Configure the stream writer to hand the stream to process_batch in
# mini batches, one every BATCH_INTERVAL seconds.
# No .format(...) is set: foreachBatch installs its own sink, so a format
# chosen before it would be overridden and never used.
writer = (
    kafka_sdf.writeStream.option("checkpointLocation", "climate-hotspots-checkpoint")
    .outputMode("append")
    .trigger(processingTime=f"{BATCH_INTERVAL} seconds")  # trigger action in batches
    .foreachBatch(process_batch)
)

In [None]:
# Run the streaming query until it terminates or the user interrupts it.
query = None  # remains None when writer.start() itself fails
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted by CTRL-C. Stopping query.")
finally:
    # Only stop a query that was actually started, so a failure in
    # writer.start() is not masked by a second error raised during cleanup.
    if query is not None:
        query.stop()