In [1]:
import numpy as np
import pandas as pd
import os
import netCDF4

from datetime import datetime, timedelta
from dateutil import tz
import math

In [2]:
# Weather stations to process, keyed by display name. Each entry carries the
# numeric station id that is used to build the per-station CSV file names.
aws_locations = {
    station_name: {'station_id': station_id}
    for station_name, station_id in (
        ('Melbourne Airport', 86282),
        ('Moorabbin Airport', 86077),
        ('Laverton RAAF', 87031),
    )
}

# Input folders: weather-station CSV files and the AU-Preston netCDF time series
aws_data_dir = 'data/raw_data/weather_station_data/'
plumber_data_dir = "data/raw_data/AU-Preston/timeseries/"

# Output folder for the cleaned CSV files
out_data_dir = 'data/cleaned_data/'


## Additional functions

In [3]:
## Small function to return the date/time from the nc file, as the nc file stores time as seconds from a baseline
## Need to convert to UTC time

def return_date_from_seconds(nc_data, seconds_to_add):
    """Return the datetime lying ``seconds_to_add`` seconds after the nc time baseline.

    The baseline is parsed from ``nc_data.variables["time"].units``: the first
    14 characters are skipped (presumably a "seconds since " prefix -- confirm
    against the file) and the remainder is read as ``YYYY-mm-dd HH:MM:SS``.

    Args:
        nc_data: object exposing ``variables["time"].units`` (e.g. an open
            netCDF dataset).
        seconds_to_add: offset from the baseline in seconds; truncated to an
            integer before it is applied.

    Returns:
        A naive ``datetime`` (no timezone attached) in the baseline's time frame.
    """
    units_text = nc_data.variables["time"].units
    baseline = datetime.strptime(units_text[14:], '%Y-%m-%d %H:%M:%S')
    return baseline + timedelta(seconds=int(seconds_to_add))

def return_seconds_since_date(nc_data, date):
    """Return the whole seconds elapsed between the nc time baseline and ``date``.

    The baseline is parsed from ``nc_data.variables["time"].units`` (the first
    14 characters are skipped, the rest is read as ``YYYY-mm-dd HH:MM:SS``) and
    is treated as UTC.

    A naive ``date`` is interpreted as Australia/Melbourne wall-clock time and
    converted with the UTC offset that applies on *that* date.  The previous
    implementation shifted the baseline to local time once and subtracted naive
    datetimes, which is off by 3600 s whenever ``date`` and the baseline fall
    in different daylight-saving regimes (AEST vs AEDT).

    Args:
        nc_data: object exposing ``variables["time"].units`` (e.g. an open
            netCDF dataset).
        date: ``datetime`` or ``pandas.Timestamp``.  If it is naive it is taken
            as Melbourne local time; if it already carries a timezone that
            timezone is respected.

    Returns:
        int: elapsed seconds since the baseline, truncated toward zero.
    """
    utc_zone = tz.gettz('UTC')
    local_zone = tz.gettz("Australia/Melbourne")

    nc_start = datetime.strptime(nc_data.variables["time"].units[14:], '%Y-%m-%d %H:%M:%S')
    nc_start = nc_start.replace(tzinfo=utc_zone)

    # Attach the local zone so the UTC offset is resolved per date (handles DST)
    if date.tzinfo is None:
        date = date.replace(tzinfo=local_zone)

    # Subtracting aware datetimes with different zones compares them in UTC
    time_diff = (date - nc_start).total_seconds()

    return int(time_diff)

# Clean data for each station

In [4]:
## Read the plumber data (.nc)
plumber_fpath = f"{plumber_data_dir}AU-Preston_metforcing_v1.nc"
preston_data = netCDF4.Dataset(plumber_fpath)

## The netCDF arrays are the same for every station, so read them once here
## instead of once per loop iteration
nc_swdown = preston_data.variables["SWdown"][:]
nc_lwdown = preston_data.variables["LWdown"][:]
nc_windN = preston_data.variables["Wind_N"][:]
nc_windE = preston_data.variables["Wind_E"][:]
time_values = preston_data.variables["time"][:]

## AWS measurement columns whose gaps are filled below
aws_measurement_columns = [
    'Precipitation_since_9am_local_time_in_mm',
    'Air_Temperature_in_degrees_C',
    'Relative_humidity_in_percentage',
    'Wind_speed_in_kmh',
    'Mean_sea_level_pressure_in_hPa',
]

## Loop through each location to produce a clean dataset
for loc in aws_locations:
    station_id = aws_locations[loc]['station_id']
    print(loc, station_id)

    # Read AWS data (csv)
    csv_file = f"{aws_data_dir}{station_id}_data.csv"
    aws_df = pd.read_csv(csv_file)

    # Fill missing values: linear interpolation first, then the mean of the
    # observed values for anything interpolation leaves empty; store as float
    for column in aws_measurement_columns:
        observed = aws_df[column]
        aws_df[column] = observed.interpolate(method='linear').fillna(observed.mean()).astype(float)

    # drop the standard time column as we will be sticking to local time (AEST/AEDT)
    aws_df = aws_df.drop(columns=['standard_time'])

    # Converting the time stamps to date time
    aws_df["local_time"] = pd.to_datetime(aws_df["local_time"])

    # temporary arrays to store the needed values
    temp_swdown = []
    temp_lwdown = []
    temp_windN = []
    temp_windE = []

    ## For every AWS time stamp, find the netCDF record with the same time offset
    for local_time in aws_df['local_time']:
        seconds_since_start = return_seconds_since_date(preston_data, local_time)

        idx = np.where(time_values == seconds_since_start)
        if idx[0].size == 0:
            # Fail with context instead of an opaque "index 0 is out of bounds"
            raise IndexError(
                f"{loc} ({station_id}): no record in {plumber_fpath} at "
                f"{seconds_since_start} s since start (local time {local_time})"
            )

        # First matching record; the remaining two axes are indexed at 0
        temp_swdown.append(nc_swdown[idx][0][0][0])
        temp_lwdown.append(nc_lwdown[idx][0][0][0])
        temp_windN.append(nc_windN[idx][0][0][0])
        temp_windE.append(nc_windE[idx][0][0][0])

    aws_df['swdown'] = temp_swdown
    aws_df['lwdown'] = temp_lwdown
    aws_df['windN'] = temp_windN
    aws_df['windE'] = temp_windE

    ## Extract month and time of day (hour of day)
    aws_df['month_of_year'] = aws_df['local_time'].dt.month
    aws_df['hour_of_day'] = aws_df['local_time'].dt.hour

    ## Calculating wind direction based on windN and windE
    # https://stackoverflow.com/questions/21484558/how-to-calculate-wind-direction-from-u-and-v-wind-components-in-r
    # https://www.eol.ucar.edu/content/wind-direction-quick-reference
    # NOTE(review): worked through by hand, windN > 0 with windE == 0 yields 0
    # and windE > 0 with windN == 0 yields 90, i.e. the result is the compass
    # bearing (0 = N, 90 = E) that the (windE, windN) vector points toward.
    # Meteorological convention reports the bearing the wind comes from --
    # confirm which one is intended for 'wind_direction'.
    wind_dir_trig_to = np.arctan2(-aws_df['windN'].values, -aws_df['windE'].values)
    wind_dir_trig_to_degrees = wind_dir_trig_to * 180/math.pi
    wind_dir_trig_from_degrees = wind_dir_trig_to_degrees + 180
    wind_dir_cardinal = 90 - wind_dir_trig_from_degrees

    # make sure direction is computed from N>E>S>W: wrap negative angles into [0, 360)
    wind_dir_cardinal = np.where(wind_dir_cardinal < 0, wind_dir_cardinal + 360, wind_dir_cardinal)

    aws_df['wind_direction'] = wind_dir_cardinal

    # Drop the wind components
    aws_df = aws_df.drop(columns=['windE', 'windN'])

    # Save the processed file
    aws_df.to_csv(f"{out_data_dir}{station_id}_aws_plumber_data.csv", index=False)


Melbourne Airport 86282
Moorabbin Airport 86077
Laverton RAAF 87031


Instead of saving the direction and the speed of the wind, it is better to save the north and east components. The direction can be misleading: the angles 0.01° and -0.01° are close, but if the direction is defined as a number between 0 and 360 they are stored as 0.01 and 359.99 and appear far apart.

I see that you are assuming the direction of the wind is the same at all locations. I am going to keep that assumption, but I am going to rescale the components so that their magnitude matches the wind speed measured at each station. The CSV file will contain the components.

In [6]:
## Read the plumber data (.nc)
plumber_fpath = f"{plumber_data_dir}AU-Preston_metforcing_v1.nc"
preston_data = netCDF4.Dataset(plumber_fpath)

## The netCDF arrays are the same for every station, so read them once here
## instead of once per loop iteration
nc_swdown = preston_data.variables["SWdown"][:]
nc_lwdown = preston_data.variables["LWdown"][:]
nc_windN = preston_data.variables["Wind_N"][:]
nc_windE = preston_data.variables["Wind_E"][:]
time_values = preston_data.variables["time"][:]

## AWS measurement columns whose gaps are filled below
aws_measurement_columns = [
    'Precipitation_since_9am_local_time_in_mm',
    'Air_Temperature_in_degrees_C',
    'Relative_humidity_in_percentage',
    'Wind_speed_in_kmh',
    'Mean_sea_level_pressure_in_hPa',
]

## Loop through each location to produce a clean dataset
for loc in aws_locations:
    station_id = aws_locations[loc]['station_id']
    print(loc, station_id)

    # Read AWS data (csv)
    csv_file = f"{aws_data_dir}{station_id}_data.csv"
    aws_df = pd.read_csv(csv_file)

    # Fill missing values: linear interpolation first, then the mean of the
    # observed values for anything interpolation leaves empty; store as float
    for column in aws_measurement_columns:
        observed = aws_df[column]
        aws_df[column] = observed.interpolate(method='linear').fillna(observed.mean()).astype(float)

    # drop the standard time column as we will be sticking to local time (AEST/AEDT)
    aws_df = aws_df.drop(columns=['standard_time'])

    # Converting the time stamps to date time
    aws_df["local_time"] = pd.to_datetime(aws_df["local_time"])

    # temporary arrays to store the needed values
    temp_swdown = []
    temp_lwdown = []
    temp_windN = []
    temp_windE = []

    ## For every AWS time stamp, find the netCDF record with the same time offset
    for local_time in aws_df['local_time']:
        seconds_since_start = return_seconds_since_date(preston_data, local_time)

        idx = np.where(time_values == seconds_since_start)
        if idx[0].size == 0:
            # Fail with context instead of an opaque "index 0 is out of bounds"
            raise IndexError(
                f"{loc} ({station_id}): no record in {plumber_fpath} at "
                f"{seconds_since_start} s since start (local time {local_time})"
            )

        # First matching record; the remaining two axes are indexed at 0
        temp_swdown.append(nc_swdown[idx][0][0][0])
        temp_lwdown.append(nc_lwdown[idx][0][0][0])
        temp_windN.append(nc_windN[idx][0][0][0])
        temp_windE.append(nc_windE[idx][0][0][0])

    aws_df['swdown'] = temp_swdown
    aws_df['lwdown'] = temp_lwdown
    aws_df['windN'] = temp_windN
    aws_df['windE'] = temp_windE

    ## Extract month and time of day (hour of day)
    aws_df['month_of_year'] = aws_df['local_time'].dt.month
    aws_df['hour_of_day'] = aws_df['local_time'].dt.hour

    # Adjust the components: keep the direction of (windE, windN) but rescale
    # the vector so its magnitude equals the station's measured wind speed
    # (the components therefore end up in the units of 'Wind_speed_in_kmh').
    # The magnitude must be taken BEFORE either column is overwritten;
    # otherwise windE would be divided by a magnitude that already contains
    # the rescaled windN.
    # NOTE: rows where both components are 0 have no direction and yield NaN.
    wind_magnitude = np.sqrt(aws_df["windN"]**2 + aws_df["windE"]**2)
    aws_df["windN"] = aws_df["windN"]*aws_df['Wind_speed_in_kmh']/wind_magnitude
    aws_df["windE"] = aws_df["windE"]*aws_df['Wind_speed_in_kmh']/wind_magnitude

    # Drop the scalar wind speed; it is now carried by the rescaled components
    aws_df = aws_df.drop(columns=['Wind_speed_in_kmh'])

    # Save the processed file
    aws_df.to_csv(f"{out_data_dir}{station_id}_aws_plumber_data_wind.csv", index=False)


Melbourne Airport 86282
Moorabbin Airport 86077
Laverton RAAF 87031
