In [8]:
import pandas as pd
import numpy as np
import random
from joblib import dump, load
from utils.utils import sliding_window

from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

In [9]:
city_data = pd.read_csv("data/extra_info.csv", index_col=0)

In [10]:
air = pd.read_csv("data/air_data.csv")
weather = pd.read_csv("data/weather_data.csv")    

In [11]:
weather_np = []
air_np = []

# take some minutes to run
for city_id in ['Hà Nội', 'Hưng Yên', 'Bắc Ninh']:          
    # load air quality and weather data files 
    air_df = air.loc[air['province'] == city_id].drop(columns=['province'])
    weather_df = weather.loc[weather['province'] == city_id].drop(columns=['province'])   
    
    # air quality data preprocessing
    air_df = air_df.loc[(air_df.iloc[:, 1:] >= 0).all(axis=1)]
    air_df.drop("aqi", axis=1, inplace=True)
    air_df.reset_index(drop=True, inplace=True)
    
    # weather data preprocessing
    weather_df.dropna(axis=0, inplace=True)
    weather_df.reset_index(drop=True, inplace=True)
    
    # making sliding windows
    X, y = sliding_window(weather_df, air_df, target_size="same")
    
    # flatten the windows and concanate extra attibutes
    m = X.shape[0]
    X = X.reshape((m, -1))
    
    # add to main dataset arrays
    weather_np.append(X)
    air_np.append(y)
    
weather_np = np.vstack(weather_np)
air_np = np.vstack(air_np)
air_np = air_np[:, -1]

In [16]:
weather_np = weather_np.astype("float32")
air_np = air_np.astype("float32")

random.seed(42)
idx = list(range(len(weather_np)))
random.shuffle(idx)

train_ratio = 0.8  # 80% train, 20% test
split_point = int(train_ratio * len(X))

train_idx = idx[:split_point]
test_idx = idx[split_point:]
X_train, X_test, y_train, y_test = weather_np[train_idx], weather_np[test_idx], air_np[train_idx], air_np[test_idx]

In [17]:
y_train

array([[6.74250e+02, 1.64500e+01, 6.79500e+01, 1.55000e+01, 1.05120e+02,
        1.30650e+02],
       [1.05476e+03, 5.07200e+01, 1.16200e+01, 3.57600e+01, 5.45900e+01,
        8.11300e+01],
       [6.40870e+02, 1.55900e+01, 9.15500e+01, 1.66900e+01, 7.83500e+01,
        8.92200e+01],
       ...,
       [1.60217e+03, 4.52400e+01, 1.07000e+00, 1.43100e+01, 8.21300e+01,
        1.06140e+02],
       [1.32179e+03, 5.34700e+01, 3.29000e+01, 3.43300e+01, 1.14360e+02,
        1.18500e+02],
       [8.67840e+02, 2.14200e+01, 1.05860e+02, 1.74000e+01, 5.77400e+01,
        6.92600e+01]], dtype=float32)

In [14]:
X_train

array([[ 14.4,  64. ,   7.6, ..., 100. ,  10.5,  22. ],
       [ 22.6,  93. ,  21.4, ...,  35. ,   4. ,  95. ],
       [ 13.5,  88. ,  11.6, ...,  12. ,   1.1,  18. ],
       ...,
       [ 25.7,  91. ,  24.2, ...,  98. ,   3.8, 319. ],
       [ 27.1,  94. ,  26.1, ...,  99. ,   2.2,  81. ],
       [ 26.5,  92. ,  25.1, ...,  74. ,   3.9, 202. ]])

In [18]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestRegressor(n_estimators=30, max_depth=60, min_samples_split=2, min_samples_leaf=2, n_jobs=-1))    
])

model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

def custom_scorer(y_true, y_pred):
    scaler = StandardScaler()
    scaled_y_true = scaler.fit_transform(y_true)
    return -root_mean_squared_error(
        scaled_y_true,
        scaler.transform(y_pred),
        multioutput="uniform_average"
    )

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = custom_scorer(y_test.reshape(-1, 1), y_pred.reshape(-1, 1))
print("Custom RMSE score:", score)



Custom RMSE score: -0.539310617595306


In [None]:
pd.Series(root_mean_squared_error(model.predict(X_test), y_test, multioutput="raw_values"), 
          index=["co", "no2", "o3", "so2", "pm2_5", "pm10"])

co       1087.405532
no2        16.536725
o3         25.473569
so2        14.957574
pm2_5      62.968041
pm10       69.606491
dtype: float64