### Connections

In [None]:
# Blob storage credentials; storage_account_name is reused in every wasbs:// path below.
# NOTE(review): these are placeholders — avoid committing a real access key here.
storage_account_name = "your-account-name"
storage_account_access_key = "your-access-key"

# Register the account key in the Spark session configuration
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    storage_account_access_key,
)

In [None]:
# Smoke test: list the root of the bronze container and print each entry name.
# Any failure is reported as a message rather than raised.
try:
    bronze_root = f"wasbs://bronze@{storage_account_name}.blob.core.windows.net/"
    files = dbutils.fs.ls(bronze_root)
    print("Connection successful. Files in container:")
    for entry in files:
        print(entry.name)
except Exception as err:
    print("Connection failed:", str(err))

Connection successful. Files in container:
geocode/
pollution/
weather/


### Transformations

In [None]:
# import necessary packages for creating Spark session, schema def, and transform
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, MapType, ArrayType, IntegerType
from pyspark.sql.functions import col, to_timestamp, when, broadcast

#### Schemas

In [None]:
# Geocode record schema: name, lat, lon and country are declared non-nullable,
# local_names and state are optional.
# local_names is a string -> string map because the set of keys
# (languages, ascii variants, ...) varies from record to record.
# NOTE(review): Spark's file-based readers may relax nullable=False to nullable —
# confirm before relying on these flags for validation.
geocode_schema = (
    StructType()
    .add("name", StringType(), False)
    .add("local_names", MapType(StringType(), StringType()), True)
    .add("lat", DoubleType(), False)
    .add("lon", DoubleType(), False)
    .add("country", StringType(), False)
    .add("state", StringType(), True)
)

In [None]:
# Schema for the current-weather payload. Every nested JSON object gets its own
# StructType; IDs, coordinates and the temperature reading are declared
# nullable=False, everything else is optional.

# coord: nested object
coord_schema = (
    StructType()
    .add("lon", DoubleType(), False)
    .add("lat", DoubleType(), False)
)

# weather: array of objects inside the top-level object
weather_schema = ArrayType(
    StructType()
    .add("id", IntegerType(), False)
    .add("main", StringType(), True)
    .add("description", StringType(), True)
    .add("icon", StringType(), True),
    True,
)

# main: nested object
main_schema = (
    StructType()
    .add("temp", DoubleType(), False)
    .add("feels_like", DoubleType(), True)
    .add("temp_min", DoubleType(), True)
    .add("temp_max", DoubleType(), True)
    .add("pressure", IntegerType(), True)
    .add("humidity", IntegerType(), True)
    .add("sea_level", IntegerType(), True)
    .add("grnd_level", IntegerType(), True)
)

# wind: nested object
wind_schema = (
    StructType()
    .add("speed", DoubleType(), True)
    .add("deg", IntegerType(), True)
    .add("gust", DoubleType(), True)
)

# rain: nested object
rain_schema = StructType().add("1h", DoubleType(), True)

# clouds: nested object
cloud_schema = StructType().add("all", IntegerType(), True)

# sys: nested object
sys_schema = (
    StructType()
    .add("type", IntegerType(), True)
    .add("id", IntegerType(), False)
    .add("country", StringType(), True)
    .add("sunrise", LongType(), True)
    .add("sunset", LongType(), True)
)

# top-level object, assembled from the pieces above
weather_curr_schema = (
    StructType()
    .add("coord", coord_schema, True)
    .add("weather", weather_schema, True)
    .add("base", StringType(), True)
    .add("main", main_schema, True)
    .add("visibility", IntegerType(), True)
    .add("wind", wind_schema, True)
    .add("rain", rain_schema, True)
    .add("clouds", cloud_schema, True)
    .add("dt", LongType(), True)
    .add("sys", sys_schema, True)
    .add("timezone", IntegerType(), True)
    .add("id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("cod", IntegerType(), True)
)

In [None]:
# define schema for 3 hr forecast api
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType, ArrayType, MapType

# Schema for the 3-hour forecast payload: a top-level object holding a "list"
# array of forecast entries plus a "city" object.
# NOTE: weather_schema, wind_schema, rain_schema, sys_schema and coord_schema are
# rebound here; after this cell runs they no longer refer to the current-weather
# versions defined in the previous schema cell.

# list.main schema (nested object within list); includes the forecast-only temp_kf field
list_main_schema = StructType([
    StructField("temp", DoubleType(), False),
    StructField("feels_like", DoubleType(), True),
    StructField("temp_min", DoubleType(), True),
    StructField("temp_max", DoubleType(), True),
    StructField("pressure", IntegerType(), True),
    StructField("sea_level", IntegerType(), True),
    StructField("grnd_level", IntegerType(), True),
    StructField("humidity", IntegerType(), True),
    StructField("temp_kf", DoubleType(), True)
])

# list.weather (nested array within list containing an object)
weather_schema = ArrayType(StructType([
    StructField("id", IntegerType(), True),
    StructField("main", StringType(), True),
    StructField("description", StringType(), True),
    StructField("icon", StringType(), True)
]))

# list.clouds schema (nested object within list)
clouds_schema = StructType([
    StructField("all", IntegerType(), True)
])

# list.wind schema (nested object within list)
wind_schema = StructType([
    StructField("speed", DoubleType(), True),
    StructField("deg", IntegerType(), True),
    StructField("gust", DoubleType(), True)
])

# list.rain schema (nested object within list)
rain_schema = StructType([
    StructField("3h", DoubleType(), True)
])

# list.sys schema (nested object within list)
sys_schema = StructType([
    StructField("pod", StringType(), True)
])

# list schema within main object (array)
# "main" must use list_main_schema (defined above), not the current-weather
# main_schema, otherwise temp_kf is missing from the parsed forecast entries
list_schema = ArrayType(StructType([
    StructField("dt", LongType(), True),
    StructField("main", list_main_schema, True),
    StructField("weather", weather_schema, True),
    StructField("clouds", clouds_schema, True),
    StructField("wind", wind_schema, True),
    StructField("visibility", IntegerType(), True),
    StructField("pop", DoubleType(), True),
    StructField("rain", rain_schema, True),
    StructField("sys", sys_schema, True),
    StructField("dt_txt", StringType(), True)
]))

# Define the schema for the "coord" field inside "city"
coord_schema = StructType([
    StructField("lat", DoubleType(), True),
    StructField("lon", DoubleType(), True)
])

# Define the schema for the "city" field
city_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("coord", coord_schema, True),
    StructField("country", StringType(), True),
    StructField("population", IntegerType(), True),
    StructField("timezone", IntegerType(), True),
    StructField("sunrise", LongType(), True),
    StructField("sunset", LongType(), True)
])

# Main schema (top-level forecast object)
schema = StructType([
    StructField("cod", StringType(), True),
    StructField("message", IntegerType(), True),
    StructField("cnt", IntegerType(), True),
    StructField("list", list_schema, True),
    StructField("city", city_schema, True)
])

#### Load Datasets

In [None]:
# fetch geocode json --> parse and read into spark df
# geocode is the small static side of the later broadcast join, so it is loaded
# as a batch DataFrame (spark.read) rather than as a stream: a left outer join
# between two streaming DataFrames is not supported without watermarks
geocode_path = f"wasbs://bronze@{storage_account_name}.blob.core.windows.net/geocode/Batch-GeocodingAPI/geocoding.json"
geocode_df = spark.read.schema(geocode_schema).json(geocode_path)

In [None]:
# fetch current weather json --> parse and read into a streaming spark df
# the schema is defined as weather_curr_schema; the previous name
# (curr_weather_schema) does not exist and raised a NameError
weather_curr_path = f"wasbs://bronze@{storage_account_name}.blob.core.windows.net/weather/RT-CurrentAPI/Modena_current_weather.json"
weather_curr_df = spark.readStream.schema(weather_curr_schema).json(weather_curr_path)

In [None]:
# fetch 3 hr forecast json --> parse and read into a streaming spark df
# use the forecast schema (`schema`, defined in the 3 hr forecast schema cell);
# the current-weather schema has no "list"/"city" fields and the name previously
# referenced here (curr_weather_schema) is undefined
weather_3_hr_path = f"wasbs://bronze@{storage_account_name}.blob.core.windows.net/weather/RT-3HrForecastAPI/Modena_three_hour_forecast.json"
weather_3_hr_df = spark.readStream.schema(schema).json(weather_3_hr_path)

#### Geocode Transformations

In [None]:
# remove the local_names map column from the geocode frame
unused_columns = ["local_names"]
geocode_df = geocode_df.drop(*unused_columns)

#### Current Weather Transformations

In [None]:
# Promote coord.lat / coord.lon to top-level columns so they can serve as join
# keys against geocode (lat first, then lon, matching the geocode column order).
# The nested coord struct is redundant once flattened, so it is dropped.
lat_col = col("coord.lat").alias("lat")
lon_col = col("coord.lon").alias("lon")

weather_curr_df = weather_curr_df.select(
    lat_col,
    lon_col,
    weather_curr_df["*"],
).drop("coord")

#### 3-Hr Forecast Transformations

#### Fast-Slow Joins

In [None]:
# enrich current weather with geocode
# geocode can be broadcast joined for query optimization: copy it to all worker nodes since it's a small static dataset
# left join for enrichment: every weather row is kept, geocode columns are null when no match
# the weather frame is named weather_curr_df; curr_weather_df was undefined (NameError)
# NOTE(review): the join keys are doubles compared for exact equality — confirm both
# sources report lat/lon with identical precision, otherwise rows will not match
enriched_curr_weather = weather_curr_df.join(
    broadcast(geocode_df),
    on=["lat", "lon"],
    how="left",
)
