In [79]:
import pandas as pd
import numpy as np
import random
from joblib import dump, load
from utils.utils import sliding_window
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

In [80]:
air_df = pd.read_csv("data/air_data.csv")
weather_df = pd.read_csv("data/weather_data.csv")    

In [81]:
air_df = air_df.loc[(air_df.iloc[:, 1:] >= 0).all(axis=1)]
air_df.drop("aqi", axis=1, inplace=True)
#air_df.reset_index(drop=True, inplace=True)
air_df = air_df.reset_index(drop=True).sort_values(by=['time'])

weather_df.dropna(axis=0, inplace=True)
#weather_df.reset_index(drop=True, inplace=True)
weather_df = weather_df.reset_index(drop=True).sort_values(by=['time'])  

# making sliding windows
X, y = sliding_window(weather_df, air_df, target_size="same")
m = X.shape[0]
X = X.reshape((m, -1))
y = y[:,-1]

In [82]:
random.seed(42)
idx = list(range(len(X)))
random.shuffle(idx)

train_ratio = 0.8  # 80% train, 20% test
split_point = int(train_ratio * len(X))

train_idx = idx[:split_point]
test_idx = idx[split_point:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [83]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestRegressor(n_estimators=30, max_depth=60, min_samples_split=2, min_samples_leaf=2, n_jobs=-1))    
])

model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

def custom_scorer(y_true, y_pred):
    scaler = StandardScaler()
    scaled_y_true = scaler.fit_transform(y_true)
    return -root_mean_squared_error(
        scaled_y_true,
        scaler.transform(y_pred),
        multioutput="uniform_average"
    )

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = custom_scorer(y_test.reshape(-1, 1), y_pred.reshape(-1, 1))
print("Custom RMSE score:", score)

Custom RMSE score: -0.4759632787048874


In [85]:
pd.Series(root_mean_squared_error(model.predict(X_test), y_test, multioutput="raw_values"), 
          index=["co", "no2", "o3", "so2", "pm2_5", "pm10"])

co       1087.405532
no2        16.536725
o3         25.473569
so2        14.957574
pm2_5      62.968041
pm10       69.606491
dtype: float64