### Preprocess Weather Data


In [5]:
import os
import sys
sys.path.append("../")
from scripts.weather_scrape  import get_weather_data
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, stddev, mean, col, to_date, concat
import numpy as np

Scrape the weather data from the NOAA website

In [6]:
get_weather_data()

Begin 2022 for weather
https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2022/psv/GHCNh_USW00094728_2022.psv
../data/landing/noaa_data/2022.psv
Completed 2022 for weather
Begin 2023 for weather
https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2023/psv/GHCNh_USW00094728_2023.psv
../data/landing/noaa_data/2023.psv
Completed 2023 for weather
Begin 2022 for weather
https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2022/psv/GHCNh_USW00094728_2022.psv
../data/raw/noaa_data/2022.psv
Completed 2022 for weather
Begin 2023 for weather
https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2023/psv/GHCNh_USW00094728_2023.psv
../data/raw/noaa_data/2023.psv
Completed 2023 for weather


In [7]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Preprocess Weather Data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

years = ["2022", "2023"]

weather_sdf_list = []
for year in years:
    # Read the PSV file into a DataFrame
    psv_df = spark.read.format("csv") \
        .option("delimiter", "|") \
        .option("header", "true") \
        .load(f"../data/raw/noaa_data/{year}.psv")
    weather_sdf_list.append(psv_df)

# Join the data frames together to create a 2022 & 2023 dataframe
weather_sdf = weather_sdf_list[0]
weather_df = weather_sdf.union(weather_sdf_list[1])

num_instances, num_features = weather_df.count(), len(weather_df.columns)
print(f"The shape of the weather dataframe: {num_instances} x {num_features}")

The shape of the weather dataframe: 15825 x 190


There are 8 primary numerical features in the dataset with each attribute having several columns describing specific information about the main primary feature. These columns were often empty or beyond the scope of the research question and thus were dropped.

In [8]:
# Filter the relevant attributes
attributes = [
    "Year",
    "Month",
    "Day",
    "temperature",
    "dew_point_temperature",
    "station_level_pressure",
    "sea_level_pressure",
    "wind_speed",
    "precipitation",
    "relative_humidity",
    "wet_bulb_temperature"
]

weather_df = weather_df.select(attributes)

# Feature engineered a date column
weather_df = weather_df.withColumn(
    "date",
    to_date(
        concat(
            col("Year"), 
            lit("-"), 
            col("Month"), 
            lit("-"), 
            col("Day")
        ), 
        "yyyy-MM-dd"
    )
)

weather_df = weather_df.orderBy("date")

num_instances, num_features = weather_df.count(), len(weather_df.columns)
print(f"The shape of the weather dataframe: {num_instances} x {num_features}")

The shape of the weather dataframe: 15825 x 12


Gain general outline of the data

In [9]:
weather_df.describe()

24/08/25 00:29:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

summary,Year,Month,Day,temperature,dew_point_temperature,station_level_pressure,sea_level_pressure,wind_speed,precipitation,relative_humidity,wet_bulb_temperature
count,15825.0,15825.0,15825.0,15823.0,15822.0,15703.0,12370.0,15082.0,13881.0,15822.0,15703.0
mean,2022.28897314376,5.465023696682464,15.479810426540284,12.1707767174366,4.836581974465858,1010.9803668088772,1016.8801050929648,2.369897891525936,0.2798645630718247,64.89072177980027,8.808590715149998
stddev,0.4532997358330133,3.422023281663908,8.877999213496155,8.992139178474988,10.06296335827768,8.174583374180722,7.920357844551943,1.6676380759587226,1.3536896464437305,22.002522477733766,8.215600892214656
min,2022.0,1.0,1.0,-0.6,-0.6,1000.0,1000.0,0.0,0.0,10.0,-0.1
max,2023.0,12.0,31.0,9.4,9.4,999.7,999.9,9.8,9.9,97.0,9.9


Filter out the dates outside of 2022-10-01 to 2023-03-31

In [10]:
# Define the start and end dates
start_date = "2022-10-01"
end_date = "2023-03-31"

# Filter df to only include the specified date range
weather_df = weather_df.filter(
    (col("date") >= start_date) & (col("date") <= end_date)
)

num_instances, num_features = weather_df.count(), len(weather_df.columns)
print(f"The shape of the weather dataframe: {num_instances} x {num_features}")

The shape of the weather dataframe: 5603 x 12


The weather was generally distributed so we removed any instances beyond sqrt(2*log(N)) standard deviations away from the mean

In [11]:
for column in attributes:
    stats = weather_df.agg(
        mean(column).alias("mean"),
        stddev(column).alias("stddev")
    ).collect()[0]
    
    column_mean = stats["mean"]
    column_stddev = stats["stddev"]

    bound_sd = np.sqrt(2*np.log(weather_df.count()))

    weather_df = weather_df.filter(
        (col(column) >= column_mean - bound_sd * column_stddev) &
        (col(column) <= column_mean + bound_sd * column_stddev)
    )


num_instances, num_features = weather_df.count(), len(weather_df.columns)
print(f"The shape of the weather dataframe: {num_instances} x {num_features}")



The shape of the weather dataframe: 3950 x 12


                                                                                

Aggregate the weather data into daily averages and construct the dataframe

In [12]:
daily_weather_df = weather_df.groupBy("date").agg(
    mean("temperature").alias("mean_temperature"),
    mean("dew_point_temperature").alias("mean_dew_point_temperature"),
    mean("station_level_pressure").alias("mean_station_level_pressure"),
    mean("sea_level_pressure").alias("mean_sea_level_pressure"),
    mean("wind_speed").alias("mean_wind_speed"),
    mean("precipitation").alias("mean_precipitation"),
    mean("relative_humidity").alias("mean_relative_humidity"),
    mean("wet_bulb_temperature").alias("mean_wet_bulb_temperature")
)

daily_weather_df = daily_weather_df.orderBy("date")

num_instances, num_features = weather_df.count(), len(weather_df.columns)
print(f"The shape of the weather dataframe: {num_instances} x {num_features}")

The shape of the weather dataframe: 3950 x 12


In [13]:
# Export into data/curated
daily_weather_df.write.mode('overwrite').parquet('../data/curated/weather_data.parquet')

                                                                                