In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import kagglehub

In [None]:
historical_hourly_weather_data_path = kagglehub.dataset_download('selfishgene/historical-hourly-weather-data')

city = "Portland"


def _read_dataset_csv(filename):
    """Read one CSV file located under the downloaded dataset path into a DataFrame."""
    return pd.read_csv(f"{historical_hourly_weather_data_path}/{filename}")


# One table per measurement, plus the table describing the available cities.
city_attributes = _read_dataset_csv("city_attributes.csv")
humidity = _read_dataset_csv("humidity.csv")
pressure = _read_dataset_csv("pressure.csv")
temperature = _read_dataset_csv("temperature.csv")
weather_description = _read_dataset_csv("weather_description.csv")
wind_speed = _read_dataset_csv("wind_speed.csv")
wind_direction = _read_dataset_csv("wind_direction.csv")

In [None]:
# Fail fast, listing the valid names, when the requested city is unknown.
if city not in city_attributes['City'].values:
    raise ValueError(f"City '{city}' does not exist in the data. Available cities are: {city_attributes['City'].unique()}")

# Row label of the city inside city_attributes; still exposed for any cell that reads it.
selected_city = city_attributes[city_attributes['City'] == city].index[0]

data_frames = []
for df in (humidity, pressure, temperature, weather_description, wind_speed, wind_direction):
    # Guarded so that re-running this cell does not raise KeyError once
    # 'datetime' has already been promoted to the index.
    if 'datetime' in df.columns:
        df.set_index('datetime', inplace=True)
    # Select the city by column label instead of by position: the positional
    # lookup only works while every file lists the cities in exactly the same
    # order as city_attributes. Assumes each measurement file has one column
    # per city, named as in city_attributes['City'].
    data_frames.append(df[city])

In [None]:
def _most_frequent(values):
    """Return the first mode of `values`, or NaN when `mode()` yields nothing."""
    modes = values.mode()
    return np.nan if modes.empty else modes[0]


# Put the six per-city series side by side, one column per measurement.
combined_data = pd.concat(data_frames, axis=1)
combined_data.columns = ['humidity', 'pressure', 'temperature', 'weather_description', 'wind_speed', 'wind_direction']
combined_data.index = pd.to_datetime(combined_data.index)

# Collapse the observations into one row per calendar day.
daily_aggregations = {
    'temperature': 'mean',
    'humidity': 'mean',
    'wind_speed': 'max',  # maximum wind speed of the day
    'pressure': 'mean',
    'weather_description': _most_frequent,
}
aggregated_data = combined_data.resample('D').agg(daily_aggregations)

In [None]:
def get_wind_direction_for_max_speed(group):
    """Return the wind direction recorded at the moment of highest wind speed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group (one day when used with the daily groupby); must
        contain the columns 'wind_speed' and 'wind_direction'.

    Returns
    -------
    The 'wind_direction' value of the row whose 'wind_speed' is largest
    (first such row on ties), or NaN when the group has no valid wind speed
    (empty group or all values missing).
    """
    valid_speeds = group['wind_speed'].dropna()
    # Without this guard idxmax() has no label to return for an all-missing
    # day and the .loc lookup below fails, aborting the whole groupby.
    if valid_speeds.empty:
        return np.nan
    return group.loc[valid_speeds.idxmax(), 'wind_direction']

In [None]:
# Rebind instead of renaming in place so the frame is never half-mutated.
aggregated_data = aggregated_data.rename(columns={'wind_speed': 'wind_speed_max'})

# Direction of the strongest wind of each day. Grouping by the index
# normalized to midnight keeps the group keys as timestamps, so the result
# aligns with the daily resample index by dtype instead of relying on
# datetime.date objects being coerced during assignment.
daily_wind_direction = combined_data.groupby(combined_data.index.normalize()).apply(get_wind_direction_for_max_speed)
aggregated_data['wind_direction'] = daily_wind_direction

aggregated_data['wind_speed_mean'] = combined_data['wind_speed'].resample('D').mean()

# Replace the textual weather description by integer codes; the fitted
# encoder keeps the code -> label correspondence in `encoder.classes_`.
encoder = LabelEncoder()
aggregated_data['weather_description'] = encoder.fit_transform(aggregated_data['weather_description'])

In [None]:
# Integer code -> original label, in the order stored by the fitted encoder.
weather_mapping = {code: label for code, label in enumerate(encoder.classes_)}
print("Mapping for weather_description:", weather_mapping)

In [None]:
# Targets: shift(-1) moves each following row's value onto the current row.
next_day_targets = {
    'mean_temperature_next_day': aggregated_data['temperature'].shift(-1),
    'max_wind_speed_next_day': aggregated_data['wind_speed_max'].shift(-1),
}

# Append the targets, then discard every row that still has a missing value.
aggregated_data = aggregated_data.assign(**next_day_targets).dropna()

In [None]:
# Separate the two target columns from the remaining feature columns.
target_columns = ['mean_temperature_next_day', 'max_wind_speed_next_day']
X = aggregated_data.drop(columns=target_columns)
y = aggregated_data[target_columns]

# Split by position without shuffling: first 80% of the rows train, the rest test.
train_size = int(0.8 * len(aggregated_data))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [None]:
# Report the shapes of the feature and target partitions.
print("Train data:", *(part.shape for part in (X_train, y_train)))
print("Test data:", *(part.shape for part in (X_test, y_test)))

In [None]:
# Inspect the training features.
print(X_train)

In [None]:
# Inspect the training targets.
print(y_train)

In [None]:
def create_time_windows(data, window_size=5):
    """Build sliding windows of daily features and their next-day targets.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per day, in chronological order, containing the feature
        columns 'temperature', 'humidity', 'pressure', 'wind_speed_max',
        'wind_speed_mean', 'wind_direction', 'weather_description' and the
        target columns 'mean_temperature_next_day', 'max_wind_speed_next_day'.
    window_size : int, default 5
        Number of consecutive rows in each window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, 7, window_size)
        Each window transposed so that features are rows and days are columns.
    y : numpy.ndarray, shape (n_windows, 2)
        Targets taken from the last row of each window.
    n_windows is max(len(data) - window_size + 1, 0).

    Raises
    ------
    ValueError
        If window_size is smaller than 1.
    """
    feature_columns = ['temperature', 'humidity', 'pressure', 'wind_speed_max',
                       'wind_speed_mean', 'wind_direction', 'weather_description']
    target_columns = ['mean_temperature_next_day', 'max_wind_speed_next_day']

    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Extract the arrays once; slicing them is far cheaper than indexing the
    # frame row by row inside the loop.
    features = data[feature_columns].values
    targets = data[target_columns].values

    # `end` is the exclusive end of a window, so the window covers rows
    # end - window_size .. end - 1.
    window_ends = range(window_size, len(data) + 1)
    if len(window_ends) == 0:
        # Not enough rows for a single window: keep the trailing dimensions so
        # downstream shape handling still works.
        return (
            np.empty((0, len(feature_columns), window_size), dtype=features.dtype),
            np.empty((0, len(target_columns)), dtype=targets.dtype),
        )

    X = np.stack([features[end - window_size:end].T for end in window_ends])
    # The target columns already hold next-day values, so the row that closes
    # a window carries the label for the day right after that window. Reading
    # the label one row later would skip a day between window and target.
    y = targets[window_size - 1:].copy()
    return X, y

In [None]:
# Turn the daily table into windowed samples.
X, y = create_time_windows(aggregated_data)

# Split by position without shuffling: the first 80% of the windows train, the rest test.
train_size = int(0.8 * len(X))
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

In [None]:
# Report the shapes of the windowed feature and target partitions.
print("Training data windowed:", *(part.shape for part in (X_train, y_train)))
print("Test data windowed:", *(part.shape for part in (X_test, y_test)))