In [1]:
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install kagglehub
# %pip install cupy

In [2]:
import pandas as pd
import numpy as np
# from sklearn.preprocessing import LabelEncoder
import kagglehub
import cupy as cp


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# def get_aggregated_data_for_single_city():
#     historical_hourly_weather_data_path = kagglehub.dataset_download('selfishgene/historical-hourly-weather-data')

#     city = "Portland"

#     city_attributes = pd.read_csv(f"{historical_hourly_weather_data_path}/city_attributes.csv")
#     humidity = pd.read_csv(f"{historical_hourly_weather_data_path}/humidity.csv")
#     pressure = pd.read_csv(f"{historical_hourly_weather_data_path}/pressure.csv")
#     temperature = pd.read_csv(f"{historical_hourly_weather_data_path}/temperature.csv")
#     weather_description = pd.read_csv(f"{historical_hourly_weather_data_path}/weather_description.csv")
#     wind_speed = pd.read_csv(f"{historical_hourly_weather_data_path}/wind_speed.csv")
#     wind_direction = pd.read_csv(f"{historical_hourly_weather_data_path}/wind_direction.csv")
#     if city not in city_attributes['City'].values:
#         raise ValueError(f"City '{city}' does not exist in the data. Available cities are: {city_attributes['City'].unique()}")

#     selected_city = city_attributes[city_attributes['City'] == city].index[0]
#     data_frames = [humidity, pressure, temperature, weather_description, wind_speed, wind_direction]

#     for i, df in enumerate(data_frames):
#         df.set_index('datetime', inplace=True)
#         data_frames[i] = df.iloc[:, selected_city]
#     combined_data = pd.concat(data_frames, axis=1)
#     combined_data.columns = [
#         'humidity', 'pressure', 'temperature', 'weather_description', 
#         'wind_speed', 'wind_direction'
#     ]
#     combined_data.index = pd.to_datetime(combined_data.index)

#     # aggregate daily
#     aggregated_data = combined_data.resample('D').agg({
#         'temperature': 'mean',
#         'humidity': 'mean',
#         'wind_speed': ['max', 'mean'],
#         'pressure': 'mean',
#         'weather_description': lambda x: x.mode()[0] if not x.mode().empty else np.nan,
#         'wind_direction': 'mean'
#     })
#     aggregated_data.columns = ['_'.join(col).strip('_') for col in aggregated_data.columns.values]

#     return aggregated_data

In [4]:
# encoder = LabelEncoder()
# aggregated_data['weather_description'] = encoder.fit_transform(aggregated_data['weather_description'])
# weather_mapping = dict(enumerate(encoder.classes_))
# print("Mapping for weather_description:", weather_mapping)

In [5]:
historical_hourly_weather_data_path = kagglehub.dataset_download('selfishgene/historical-hourly-weather-data')

city_attributes = pd.read_csv(f"{historical_hourly_weather_data_path}/city_attributes.csv")
humidity = pd.read_csv(f"{historical_hourly_weather_data_path}/humidity.csv")
pressure = pd.read_csv(f"{historical_hourly_weather_data_path}/pressure.csv")
temperature = pd.read_csv(f"{historical_hourly_weather_data_path}/temperature.csv")
weather_description = pd.read_csv(f"{historical_hourly_weather_data_path}/weather_description.csv")
wind_speed = pd.read_csv(f"{historical_hourly_weather_data_path}/wind_speed.csv")
wind_direction = pd.read_csv(f"{historical_hourly_weather_data_path}/wind_direction.csv")

data_frames = []
for city in city_attributes['City']:
    city_data = pd.DataFrame({
        'datetime': pd.to_datetime(humidity['datetime']),
        'humidity': humidity[city],
        'pressure': pressure[city],
        'temperature': temperature[city],
        'weather_description': weather_description[city],
        'wind_speed': wind_speed[city],
        'wind_direction': wind_direction[city],
        'latitude': city_attributes.loc[city_attributes['City'] == city, 'Latitude'].values[0],
        'longitude': city_attributes.loc[city_attributes['City'] == city, 'Longitude'].values[0],
        'city': city
    })
    city_data.set_index('datetime', inplace=True)
    data_frames.append(city_data)

combined_data = pd.concat(data_frames)

combined_data = combined_data.ffill().bfill().interpolate()

aggregated_data = combined_data.groupby(['city']).resample('D').agg({
    'temperature': 'mean',
    'humidity': 'mean',
    'wind_speed': ['max', 'mean'],
    'pressure': 'mean',
    'weather_description': lambda x: x.mode()[0] if not x.mode().empty else np.nan,
    'wind_direction': 'mean',
    'latitude': 'mean',
    'longitude': 'mean'
}).reset_index()

aggregated_data.columns = [
    '_'.join(col).strip('_') if isinstance(col, tuple) else col for col in aggregated_data.columns
]

  combined_data = combined_data.ffill().bfill().interpolate()


In [6]:
# def preprocess_weather_data_for_single_city(data, window_size=3):
#     X, y = [], []
#     for i in range(window_size, len(data) - 1):
#         X_window = data.iloc[i-window_size:i][[
#             'temperature_mean', 'humidity_mean', 'pressure_mean', 'wind_speed_max', 'wind_speed_mean', 'wind_direction_mean'
#         ]].values
#         y_target = data.iloc[i + 1][['temperature_mean', 'wind_speed_max']].values

#         # encode wind speed >= 6 as binary
#         y_target[1] = 1 if y_target[1] >= 6 else 0
#         X.append(X_window)
#         y.append(y_target)

#     X = np.array(X)
#     y = np.array(y)

#     # normalize X
#     X_mean = X.mean(axis=(0, 1), keepdims=True)
#     X_std = X.std(axis=(0, 1), keepdims=True)
#     X_normalized = (X - X_mean) / (X_std + 1e-9)

#     return X_normalized, y

def preprocess_weather_data(data, window_size=3):
    X, y = [], []
    for i in range(window_size, len(data) - 1):
        X_window = data.iloc[i-window_size:i][[
            'temperature_mean', 'humidity_mean', 'pressure_mean', 'wind_speed_max', 'wind_speed_mean', 'wind_direction_mean', 'latitude_mean', 'longitude_mean'
        ]].values
        y_target = data.iloc[i + 1][['temperature_mean', 'wind_speed_max']].values

        # encode wind speed >= 6 as binary
        y_target[1] = 1 if y_target[1] >= 6 else 0
        X.append(X_window)
        y.append(y_target)

    X = np.array(X)
    y = np.array(y)

    # normalize X
    X_mean = X.mean(axis=(0, 1), keepdims=True)
    X_std = X.std(axis=(0, 1), keepdims=True)
    X_normalized = (X - X_mean) / (X_std + 1e-9)

    return X_normalized, y

In [7]:
X, y = preprocess_weather_data(aggregated_data, window_size=3)

# split into train/test (0.7 or 0.8)
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [8]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(54342, 3, 8)
(54342, 2)
(13586, 3, 8)
(13586, 2)


In [9]:
# print(X_train)
# print(y_train)

In [10]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, epochs=1000, learning_rate=0.001, bath_size=128):
    X_train_cp = cp.array(X_train.reshape(X_train.shape[0], -1), dtype=cp.float32)
    y_train_cp = cp.array(y_train, dtype=cp.float32)
    X_test_cp = cp.array(X_test.reshape(X_test.shape[0], -1), dtype=cp.float32)
    y_test_cp = cp.array(y_test, dtype=cp.float32)

    model.train(X_train_cp, y_train_cp, X_test_cp, y_test_cp, epochs, learning_rate, batch_size=bath_size)

    predictions = model.predict(X_test_cp)
    
    mae = cp.mean(cp.abs(predictions[:, 0] - y_test_cp[:, 0]))

    from sklearn.metrics import roc_auc_score
    auc = roc_auc_score(cp.asnumpy(y_test_cp[:, 1]), cp.asnumpy(predictions[:, 1]))

    print(f"Test Regression MAE: {mae}")
    print(f"Test Classification AUC: {auc}")

    print("Predictions:" , predictions[:5, :])
    print("True values:", y_test_cp[:5, :])

    return mae, auc

In [11]:
from weather_prediction import WeatherPredictionNetwork

layers = [X_train.shape[1] * X_train.shape[2], 128, 64, 2]
activations = ["relu", "relu"]
model = WeatherPredictionNetwork(layers, activations, seed=21)

train_and_evaluate(model, X_train, y_train, X_test, y_test, epochs=5000, learning_rate=0.01, bath_size=256)

Epoch 0, Regression Loss: 29.0032901763916, Classification AUC: 0.5704383964646578, Learning Rate: 0.001
Test Regression MAE: 39.77280044555664, Test Classification AUC: 0.5
Epoch 100, Regression Loss: 3.3196463584899902, Classification AUC: 0.6779207694597101, Learning Rate: 0.001
Epoch 200, Regression Loss: 3.443655252456665, Classification AUC: 0.7159709576368334, Learning Rate: 0.001
Epoch 300, Regression Loss: 3.4750566482543945, Classification AUC: 0.7178535418710441, Learning Rate: 0.001
Epoch 400, Regression Loss: 3.313891887664795, Classification AUC: 0.7231865495033699, Learning Rate: 0.001
Epoch 500, Regression Loss: 3.3306355476379395, Classification AUC: 0.7305777628048009, Learning Rate: 0.0001
Epoch 600, Regression Loss: 2.5916759967803955, Classification AUC: 0.7412842597836671, Learning Rate: 0.0001
Epoch 700, Regression Loss: 2.581341028213501, Classification AUC: 0.7445192796402623, Learning Rate: 0.0001
Epoch 800, Regression Loss: 2.5990467071533203, Classification 

KeyboardInterrupt: 