In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Working with Strings & Dates")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .master("local[*]")
    .getOrCreate()
)

spark

In [2]:
# !pip install python-dotenv

In [3]:
import requests
import json
from os import getenv
from dotenv import load_dotenv
from pprint import pprint
from typing import List
load_dotenv() 

api_key = getenv("OPENWEATHER_API_KEY", "b8f10c8c959340f989701949250801")

In [4]:
api_key

'b8f10c8c959340f989701949250801'

In [5]:
# define a function as a base to get the weather data
# This uses the base url and takes the city and endpoint as an argument

def get_endpoint_weather_data(city, endpoint, days=None):
    base_url = "http://api.weatherapi.com/v1/"
   
    if days:
        target_url = f"{base_url}{endpoint}?key={api_key}&q={city}&days={days}"
    else:
        target_url = f"{base_url}{endpoint}?key={api_key}&q={city}"

    # print("Target URL", target_url)
    response = requests.get(target_url)
    response.raise_for_status()

    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print("Error in GET request", response.status_code, response.text)

## Get current weather data

In [6]:

def get_current_weather_data(city="Boulder"):

    endpoint = "current.json"
    data = get_endpoint_weather_data(city, endpoint)

    # Extracting data
    location = data.get("location").get("name")
    region = data.get("location").get("region")
    country = data.get("location").get("country")
    temp_c = data.get("current").get("temp_c")
    temp_f = data.get("current").get("temp_f")

    feels_like_c = data.get("current").get("feelslike_c")
    feels_like_f = data.get("current").get("feelslike_f")
    
    local_time = data.get("location").get("localtime")
    last_updated_time = data.get("current").get("last_updated")
    current_condition = data.get("current").get("condition").get("text")

    wind_speed_kph = data.get("current").get("wind_kph")
    wind_speed_mph = data.get("current").get("wind_mph")
    wind_direction = data.get("current").get("wind_dir")

    humidity = data.get("current").get("humidity")

    precipitation = data.get("current").get("precip_mm")

    uv = data.get("current").get("uv")

    # return the extracted data as a dictionary
    weather_data = {
        "location": location,
        "region": region,
        "country": country,
        "temp_c": temp_c,
        "temp_f": temp_f,
        "feels_like_c": feels_like_c,
        "feels_like_f": feels_like_f,
        "local_time": local_time,
        "last_updated_time": last_updated_time,
        "current_condition": current_condition,
        "wind_speed_kph": wind_speed_kph,
        "wind_speed_mph": wind_speed_mph,
        "wind_direction": wind_direction,
        "humidity": humidity,
        "precipitation": precipitation,
        "uv": uv
    }
    return weather_data

## Get Alerts data

In [7]:

def get_alerts_data(city="Boulder"):

    endpoint = "alerts.json"

    data = get_endpoint_weather_data(city, endpoint)

    # Check if there are any alerts
    alerts_exist = data.get("alerts").get("alert")
 
    if len(alerts_exist) == 0:
        return {
            "alerts": "No alerts for this location"
        }
    return data.get("alerts").get("alert")

## Get forecast data

In [8]:
   
### Define a function to get the forecast data
def get_forecasted_data(days:int, city="Boulder")-> List[dict]:

    endpoint = "forecast.json"

    data = get_endpoint_weather_data(city, endpoint, days=2)

    forecasted_data = data.get("forecast").get("forecastday")
    
    # print(f"Forecasted data for the next {days} days", len(forecasted_data))
    # Extracting the forecasted data for the next 3 days
    # forecasted_data = forecasted_data[:days]

    return [
        {
            "date": day.get("date"),
            "day_condition": day.get("day").get("condition").get("text"),
            "max_temp_c": day.get("day").get("maxtemp_c"),
            "min_temp_c": day.get("day").get("mintemp_c"),
            "max_temp_f": day.get("day").get("maxtemp_f"),
            "min_temp_f": day.get("day").get("mintemp_f"),
            "precipitation": day.get("day").get("totalprecip_mm"),
            "wind_speed_kph": day.get("day").get("maxwind_kph"),
            "wind_speed_mph": day.get("day").get("maxwind_mph"),
            "humidity": day.get("day").get("avghumidity"),
            "snow_cm": day.get("day").get("totalsnow_cm"),
            "uv": day.get("day").get("uv"),
            "chances_of_rain": day.get("day").get("daily_chance_of_rain"),
            "chances_of_snow": day.get("day").get("daily_chance_of_snow"),
        }
        for day in forecasted_data
    ]


## Flatten the data

In [9]:

### Merge the functions into one main function
### This will be the flattened dictionary that will be used to store the data

def flatten_data():
    current_weather_data = get_current_weather_data(city="Boulder")
    alerts_data = get_alerts_data(city="Boulder")
    forecasted_data = get_forecasted_data(days=3, city="Boulder")

    return {
        "current_weather": current_weather_data,
        "alerts": alerts_data,
        "forecast": forecasted_data
    }

## use spark streaming to process the data

In [10]:
    

def process_batch_data(batch_df, batch_id):
    try:
        print("Processing the batch id: {}".format(batch_id))

        # Flatten the data
        flattened_data = flatten_data()
        from pyspark.sql import Row
        rows = [Row(**flattened_data)] 
        flattened_df = spark.createDataFrame(rows)
        
     # Write the JSON data to a file
        (
            flattened_df.write
            .mode("append")
            .format("json") 
            .option("path", "output_dir/json") 
            .save()
        )

        print(f"Batch {batch_id} written successfully!")
        # Write to console
        # flattened_data.show(truncate=False)
        
        return flattened_data
    
    except Exception as e:
        print("Error in processing the batch data", e)
        raise e
    

In [11]:

streaming_df = (
            spark
            .readStream
            .format("rate")
            .option("rowsPerSecond", 1)
            .load()
)


In [12]:

(
            streaming_df
            .writeStream
            .foreachBatch(process_batch_data)
            .trigger(processingTime="30 seconds")
            .outputMode("append")
            .start()
            .awaitTermination()
)

Processing the batch id: 0
Batch 0 written successfully!
Processing the batch id: 1
Batch 1 written successfully!
Processing the batch id: 2
Batch 2 written successfully!


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 