In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from config import WEATHER_PATH

# Get weekly data

Parameter
---------
          Unit                                 Description
fu3010d1  km/h                                 Gust peak (one second); daily maximum
tre200dx  °C                                   Air temperature 2 m above ground; daily maximum
tre200dn  °C                                   Air temperature 2 m above ground; daily minimum
tre200d0  °C                                   Air temperature 2 m above ground; daily mean
rka150d0  mm                                   Precipitation; daily total 0 UTC - 0 UTC
ure200dx  %                                    Relative air humidity 2 m above ground; daily maximum
ure200dn  %                                    Relative air humidity 2 m above ground; daily minimum
ure200d0  %                                    Relative air humidity 2 m above ground; daily mean
sre000d0  min                                  Sunshine duration; daily total
fu3010d0  km/h                                 Wind speed scalar; daily mean

In [15]:
# Import weather data.
# The export appears to repeat its header line inside the data (those rows are
# dropped in the cleaning step), so every column is read as text here and the
# measurements are converted to numeric types explicitly after cleaning.
weather = pd.read_csv(f'{WEATHER_PATH}/order_116861_data.txt', sep=';', dtype=str)

# Keep the station id, the date, and the daily mean / max / min of air
# temperature (tre200d0 / tre200dx / tre200dn, °C) and relative air humidity
# (ure200d0 / ure200dx / ure200dn, %), renamed to readable column names.
selected_cols = {
    'stn': 'stn',
    'time': 'date',
    'tre200d0': 'avg_daily_temp',
    'ure200d0': 'avg_daily_hum',
    'tre200dx': 'max_daily_temp',
    'tre200dn': 'min_daily_temp',
    'ure200dx': 'max_daily_hum',
    'ure200dn': 'min_daily_hum',
}
# Select and rename in one chain so the result is a new frame instead of a
# column subset that is then renamed in place.
weather = weather[list(selected_cols)].rename(columns=selected_cols)


In [16]:
# Locate inconsistent values
# Drop repeated header rows: only rows whose date field consists of digits are kept
weather = weather.loc[weather['date'].str.isnumeric()].copy()
# Missing values are marked by "-" according to the documentation; turn them into NaN
weather = weather.replace('-', np.nan)
# Convert the temperature and humidity columns (everything except stn/date) to numeric type
measurement_cols = [name for name in weather.columns if name not in ('stn', 'date')]
weather = weather.assign(**{name: pd.to_numeric(weather[name]) for name in measurement_cols})

In [17]:
# Flag every row that has at least one missing value
mask = weather.isnull().any(axis='columns')

# Show the incomplete rows: only max_daily_hum contains missing values
weather.loc[mask]

Unnamed: 0,stn,date,avg_daily_temp,avg_daily_hum,max_daily_temp,min_daily_temp,max_daily_hum,min_daily_hum
3667,BAS,20230116,4.0,74.0,7.0,1.0,,50.0
16179,CDF,20150818,11.5,69.5,15.8,7.8,,49.3
16180,CDF,20150819,11.2,78.0,15.4,6.5,,53.0
38711,STG,20141024,5.3,85.0,8.0,1.7,,70.0
38712,STG,20141025,8.1,85.0,10.6,4.3,,70.0
38713,STG,20141026,9.7,86.0,14.0,7.0,,68.0
38714,STG,20141027,5.9,88.0,7.4,5.1,,80.0
38715,STG,20141028,7.8,83.0,11.5,5.2,,70.0
38716,STG,20141029,10.0,75.0,12.8,8.0,,65.0
38717,STG,20141030,9.4,76.0,12.7,6.7,,65.0


In [18]:
# Impute missing max daily humidity with the same station's previous value.
# Filling per station keeps a gap at the start of one station's rows from being
# filled with the last reading of the station listed before it, and avoids the
# deprecated fillna(method='ffill') form.
# NOTE(review): assumes rows are in chronological order within each station — confirm.
weather['max_daily_hum'] = weather.groupby('stn')['max_daily_hum'].ffill()

In [19]:
# Convert date to datetime. The raw values look like 20230116, so state the
# format explicitly instead of relying on inference (integer input without a
# format would be interpreted as nanoseconds since the epoch).
weather['date'] = pd.to_datetime(weather['date'], format='%Y%m%d')

# Per weather station, resample the daily rows into weekly bins ('W') and take
# the mean of every numeric column; stn and date come back as regular columns.
weather = (
    weather
    .set_index('date')
    .groupby('stn')
    .resample('W')
    .mean(numeric_only=True)
    .reset_index()
)

In [20]:
# Station code -> region the station is assigned to.
# NOTE(review): these two dicts are defined again, verbatim, in the cell that
# assigns regions and computes the weights; one of the copies is redundant.
stn_dict = {
    # Region_1
    'GVE': 'Region_1',
    'CDF': 'Region_1',
    'NEU': 'Region_1',
    'SIO': 'Region_1',
    # Region_2 .. Region_4 (one station each)
    'BER': 'Region_2',
    'BAS': 'Region_3',
    'LUZ': 'Region_4',
    # Region_5
    'STG': 'Region_5',
    'SMA': 'Region_5',
    # Region_6
    'DAV': 'Region_6',
    'OTL': 'Region_6',
    'SAM': 'Region_6',
}

# Population figure per station, used to weigh stations within a region.
# NOTE(review): source and reference year of these figures are not recorded here.
populations = {
    'GVE': 484736,
    'CDF': 40143,
    'NEU': 53778,
    'SIO': 45932,
    'BER': 1017483,
    'BAS': 191817,
    'LUZ': 398762,
    'STG': 499065,
    'SMA': 1466424,
    'DAV': 11109,
    'OTL': 63688,
    'SAM': 3014,
}

In [21]:
# Sanity check after the weekly aggregation: list the dtype of every column
weather.dtypes

stn                       object
date              datetime64[ns]
avg_daily_temp           float64
avg_daily_hum            float64
max_daily_temp           float64
min_daily_temp           float64
max_daily_hum            float64
min_daily_hum            float64
dtype: object

In [22]:
# Station code -> region the station is assigned to.
# NOTE(review): stn_dict and populations are also defined, with identical
# contents, in an earlier cell; one of the copies is redundant.
stn_dict = {
    # Region_1
    'GVE': 'Region_1',
    'CDF': 'Region_1',
    'NEU': 'Region_1',
    'SIO': 'Region_1',
    # Region_2 .. Region_4 (one station each)
    'BER': 'Region_2',
    'BAS': 'Region_3',
    'LUZ': 'Region_4',
    # Region_5
    'STG': 'Region_5',
    'SMA': 'Region_5',
    # Region_6
    'DAV': 'Region_6',
    'OTL': 'Region_6',
    'SAM': 'Region_6',
}

# Population figure per station, used to weigh stations within a region
populations = {
    'GVE': 484736,
    'CDF': 40143,
    'NEU': 53778,
    'SIO': 45932,
    'BER': 1017483,
    'BAS': 191817,
    'LUZ': 398762,
    'STG': 499065,
    'SMA': 1466424,
    'DAV': 11109,
    'OTL': 63688,
    'SAM': 3014,
}

# Look up the region of every row's station (an unknown station code raises KeyError)
weather['region'] = [stn_dict[station] for station in weather['stn']]

# Total population per region, summed over the stations assigned to it;
# these totals are the denominators of the station weights
regional_populations = {}
for station, inhabitants in populations.items():
    home_region = stn_dict[station]
    regional_populations.setdefault(home_region, 0)
    regional_populations[home_region] += inhabitants

# Weight of a station = its population divided by the total of its region,
# so the weights of the stations in one region add up to 1
city_weights = {}
for station, inhabitants in populations.items():
    city_weights[station] = inhabitants / regional_populations[stn_dict[station]]


In [23]:
# Inspect the station weights (each station's share of its region's summed population)
city_weights

{'GVE': 0.776087955439497,
 'CDF': 0.06427106465211523,
 'NEU': 0.08610142029398532,
 'SIO': 0.07353955961440244,
 'BER': 1.0,
 'BAS': 1.0,
 'LUZ': 1.0,
 'STG': 0.2539139114998863,
 'SMA': 0.7460860885001137,
 'DAV': 0.14276901723406715,
 'OTL': 0.8184960995232037,
 'SAM': 0.038734883242729175}

In [24]:
# Mapping weights to the 'stn' values in weather DataFrame
weather['weight'] = weather['stn'].map(city_weights)

# Measurement columns to aggregate. Derived 'weighted_*' columns are excluded
# so that re-running this cell does not pile up 'weighted_weighted_*' columns.
selected_cols = [
    col for col in weather.columns
    if col not in ['stn', 'date', 'region', 'weight'] and not col.startswith('weighted_')
]
weighted_cols = [f'weighted_{col}' for col in selected_cols]

# Apply weights
for col, weighted_col in zip(selected_cols, weighted_cols):
    weather[weighted_col] = weather[col] * weather['weight']

# Weighted sum per region and week; min_count=1 leaves a group with no data at
# all as NaN instead of reporting it as 0.
weighted_sums = weather.groupby(['region', 'date'])[weighted_cols].sum(min_count=1)

# Sum of the weights of the stations that actually reported a value. When every
# station of a region is present this is 1 and the division changes nothing;
# when a station is missing for a week it rescales the remaining weights so the
# regional value is still a proper weighted mean rather than a partial sum.
present_weights = weather[selected_cols].notna().mul(weather['weight'], axis=0)
present_weights.columns = weighted_cols
present_weights = present_weights.groupby([weather['region'], weather['date']]).sum()

reg_weather = weighted_sums / present_weights
reg_weather

Unnamed: 0_level_0,Unnamed: 1_level_0,weighted_avg_daily_temp,weighted_avg_daily_hum,weighted_max_daily_temp,weighted_min_daily_temp,weighted_max_daily_hum,weighted_min_daily_hum
region,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Region_1,2013-01-06,3.044123,82.918337,6.597668,-1.041057,95.043840,63.721907
Region_1,2013-01-13,1.899121,85.732108,4.005557,-0.232400,94.972297,74.365709
Region_1,2013-01-20,-1.508554,77.031993,0.870462,-4.233867,90.875307,63.741066
Region_1,2013-01-27,-0.759157,77.826791,2.469204,-4.891194,90.988158,60.461963
Region_1,2013-02-03,4.551592,78.802057,7.993836,0.627968,92.950733,58.114673
...,...,...,...,...,...,...,...
Region_6,2023-05-07,14.939863,67.277623,20.080837,10.631480,90.256137,44.080142
Region_6,2023-05-14,12.484502,81.365646,16.467443,10.039742,95.725453,61.647225
Region_6,2023-05-21,13.802783,67.059148,17.759773,10.523481,88.191809,46.933802
Region_6,2023-05-28,16.966840,70.844560,22.765740,12.086358,93.296484,46.150461


In [25]:
# Check weighting: list the station rows that feed one regional weekly value
weather.loc[(weather['region'] == 'Region_1') & (weather['date'] == '2013-01-06')]

Unnamed: 0,stn,date,avg_daily_temp,avg_daily_hum,max_daily_temp,min_daily_temp,max_daily_hum,min_daily_hum,region,weight,weighted_avg_daily_temp,weighted_avg_daily_hum,weighted_max_daily_temp,weighted_min_daily_temp,weighted_max_daily_hum,weighted_min_daily_hum
1088,CDF,2013-01-06,0.433333,90.4,3.583333,-3.733333,99.0,76.016667,Region_1,0.064271,0.027851,5.810104,0.230305,-0.239945,6.362835,4.885672
2176,GVE,2013-01-06,3.266667,82.766667,6.983333,-1.033333,95.466667,62.566667,Region_1,0.776088,2.535221,64.234213,5.419681,-0.801958,74.09053,48.557236
3264,NEU,2013-01-06,3.95,83.983333,6.166667,1.433333,92.416667,69.616667,Region_1,0.086101,0.340101,7.231084,0.530959,0.123412,7.957206,5.994094
4896,SIO,2013-01-06,1.916667,76.733333,5.666667,-1.666667,90.2,58.266667,Region_1,0.07354,0.140951,5.642936,0.416724,-0.122566,6.633268,4.284905


In [27]:
# Write the regional weekly table to the weather data folder
# (to_csv keeps the index by default, so region and date become the leading columns)
reg_weather.to_csv(f"{WEATHER_PATH}/reg_weather.csv")