This notebook preprocesses and filters the weather dataset to align with the research objectives.

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the raw 2023 weather export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The warning echoed under the recorded run of this
# cell looks like pandas' mixed-type DtypeWarning, which this option avoids
# (NOTE(review): confirm on the next fresh run).
weather_2023_df = pd.read_csv("../data/weather_2023.csv", low_memory=False)
# Parse timestamps with an explicit format so values that do not match raise
# instead of being silently guessed.
weather_2023_df['DATE'] = pd.to_datetime(weather_2023_df['DATE'], format="%Y-%m-%dT%H:%M:%S")
# Displayed as the cell output: (rows, columns) of the raw frame.
weather_2023_df.shape


  weather_2023_df = pd.read_csv("../data/weather_2023.csv")


(11842, 91)

In [15]:
# List every column of the raw frame to see which fields are available.
weather_2023_df.columns

Index(['STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL', 'WND', 'CIG',
       'VIS', 'TMP', 'DEW', 'SLP', 'AA1', 'AA2', 'AA3', 'AB1', 'AD1', 'AE1',
       'AH1', 'AH2', 'AH3', 'AH4', 'AH5', 'AH6', 'AI1', 'AI2', 'AI3', 'AI4',
       'AI5', 'AI6', 'AJ1', 'AK1', 'AL1', 'AM1', 'AN1', 'AT1', 'AT2', 'AT3',
       'AT4', 'AT5', 'AU1', 'AU2', 'AW1', 'AW2', 'AW3', 'AX1', 'AX2', 'AX3',
       'AX4', 'GA1', 'GA2', 'GA3', 'GD1', 'GD2', 'GD3', 'GE1', 'GF1', 'KA1',
       'KA2', 'KB1', 'KB2', 'KB3', 'KC1', 'KC2', 'KD1', 'KD2', 'KE1', 'KG1',
       'KG2', 'MA1', 'MD1', 'MF1', 'MG1', 'MH1', 'MK1', 'MW1', 'OC1', 'OD1',
       'OE1', 'OE2', 'OE3', 'RH1', 'RH2', 'RH3', 'WA1', 'REM', 'EQD'],
      dtype='object')

Keep only the hourly reports (rows whose `REPORT_TYPE` is `FM-15`).

In [16]:
# 'FM-15' is the REPORT_TYPE value this notebook treats as the hourly report.
is_hourly_report = weather_2023_df['REPORT_TYPE'] == 'FM-15'
hourly_weather = weather_2023_df.loc[is_hourly_report, :]
# Only the last expression of a cell is displayed, so a bare `.shape` placed
# before this line would be evaluated and discarded; the row count is already
# visible as `Length` in the Series output below.
hourly_weather['DATE']

5       2023-01-01 00:51:00
8       2023-01-01 01:51:00
10      2023-01-01 02:51:00
13      2023-01-01 03:51:00
16      2023-01-01 04:51:00
                ...        
11837   2023-12-31 19:51:00
11838   2023-12-31 20:51:00
11839   2023-12-31 21:51:00
11840   2023-12-31 22:51:00
11841   2023-12-31 23:51:00
Name: DATE, Length: 8757, dtype: datetime64[ns]

This part generates a CSV file, `hourly_weather_2023.csv`, in the `data` directory for further use.

In [17]:
# Column layout of the cleaned hourly table, in output order.
columns = [
    'date',
    'hour',
    'wind_speed',
    'dew_point',
    'atmospheric_pressure',
    'temperature',
]
# Start from an empty, schema-only frame; its columns are populated afterwards.
hourly_weather_2023_filtered = pd.DataFrame(data=None, columns=columns)


In [18]:

def _decode_isd_field(raw, position, missing_code, scale=10):
    """Decode one integer component of a comma-separated weather field.

    Args:
        raw: Field text as held in the WND / TMP / DEW / SLP columns; several
            comma-separated components in one string.
        position: Index of the wanted component after splitting on ','.
        missing_code: Unscaled integer sentinel that marks a missing reading.
        scale: Divisor applied to the integer component to get the value.

    Returns:
        The selected component divided by ``scale`` as a float, or NaN when
        the cell is empty or holds the missing-value sentinel.

    Raises:
        ValueError: If the selected component is not an integer.
        IndexError: If the field has no component at ``position``.
    """
    if pd.isna(raw):
        # An empty cell has nothing to split; treat it as a missing reading.
        return np.nan
    code = int(str(raw).split(',')[position])
    # Test the sentinel on the raw integer: matching it after the division
    # would rely on exact float equality (e.g. 9999 / 10 == 999.9).
    return np.nan if code == missing_code else code / scale


# Output column -> (source column, component index, missing-value sentinel).
# Every component is an integer scaled by 10; 9999 / 99999 are the sentinels
# that would otherwise surface as 999.9 / 9999.9 after scaling.
# NOTE(review): field layout and units presumably follow the NOAA ISD format
# (wind speed in m/s, temperatures in deg C, pressure in hPa) - confirm.
_FIELD_SPECS = {
    'wind_speed': ('WND', -2, 9999),
    'temperature': ('TMP', 0, 9999),
    'dew_point': ('DEW', 0, 9999),
    'atmospheric_pressure': ('SLP', 0, 99999),
}
for _target, (_source, _position, _missing) in _FIELD_SPECS.items():
    hourly_weather_2023_filtered[_target] = hourly_weather[_source].apply(
        _decode_isd_field, args=(_position, _missing)
    )

# Split the timestamp into a calendar date and a zero-padded hour string
# ('00'..'23').
hourly_weather_2023_filtered['date'] = hourly_weather['DATE'].dt.date
hourly_weather_2023_filtered['hour'] = hourly_weather['DATE'].dt.strftime('%H')




In [19]:
# Preview the first ten rows to sanity-check the extracted values.
hourly_weather_2023_filtered.head(10)

Unnamed: 0,date,hour,wind_speed,dew_point,atmospheric_pressure,temperature
5,2023-01-01,0,0.0,9.4,1009.8,10.0
8,2023-01-01,1,3.1,11.7,1008.6,12.8
10,2023-01-01,2,0.0,11.7,1008.6,12.2
13,2023-01-01,3,2.6,11.1,1007.8,12.8
16,2023-01-01,4,0.0,11.1,1007.0,12.2
22,2023-01-01,5,2.6,10.6,1007.2,11.7
23,2023-01-01,6,3.6,10.0,1006.9,11.7
24,2023-01-01,7,2.6,10.0,1007.8,11.1
25,2023-01-01,8,4.1,9.4,1008.1,11.7
26,2023-01-01,9,2.1,8.9,1008.5,11.1


In [20]:
# Impute missing values by carrying the most recent earlier row's value
# forward, column by column. Filling follows the frame's current row order,
# and NaNs that precede the first valid value in a column have nothing to copy
# from, so they remain NaN.
hourly_weather_2023_filtered = hourly_weather_2023_filtered.ffill()

In [21]:
# Persist the cleaned hourly table; index=False leaves the row index out of the file.
output_csv_path = "../data/hourly_weather_2023.csv"
hourly_weather_2023_filtered.to_csv(path_or_buf=output_csv_path, index=False)
