In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
def read_weather_data(path):
    df = pd.read_csv(path, sep=',')
    df.columns=['time', 'temp', 'hum', 'press', 'wind']
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    return df

In [3]:
weather_2025 = read_weather_data('../downloading_data/warsaw_weather_2025_hourly.csv')

In [4]:
def read_smog_data(path):
    df = pd.read_csv(path, sep=',', header=None)
    df.drop(columns=[2], inplace=True)
    df.columns = ['time', 'pm25']
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    df.dropna(inplace=True)
    return df

In [5]:
pm25_2025 = read_smog_data('../downloading_data/gios-pjp-data.csv')

In [6]:
pm25_2025.head()

Unnamed: 0,time,pm25
0,2025-01-01 01:00:00,65.7
1,2025-01-01 02:00:00,31.5
2,2025-01-01 03:00:00,20.2
3,2025-01-01 04:00:00,21.7
4,2025-01-01 05:00:00,20.8


In [7]:
df2025 = pd.merge(weather_2025, pm25_2025, on='time', how='inner')

In [8]:
df2025['month'] = df2025['time'].dt.month

In [9]:
df2025

Unnamed: 0,time,temp,hum,press,wind,pm25,month
0,2025-01-01 01:00:00,2,68,1020,18,65.7,1
1,2025-01-01 02:00:00,2,67,1020,19,31.5,1
2,2025-01-01 03:00:00,2,67,1019,20,20.2,1
3,2025-01-01 04:00:00,3,66,1018,21,21.7,1
4,2025-01-01 05:00:00,3,64,1018,22,20.8,1
...,...,...,...,...,...,...,...
6525,2025-10-28 20:00:00,8,73,1005,18,7.7,10
6526,2025-10-28 21:00:00,7,77,1004,17,6.9,10
6527,2025-10-28 22:00:00,7,83,1004,17,6.7,10
6528,2025-10-28 23:00:00,8,89,1003,17,7.4,10


In [10]:
df2025.describe()

Unnamed: 0,time,temp,hum,press,wind,pm25,month
count,6530,6530.0,6530.0,6530.0,6530.0,6530.0,6530.0
mean,2025-06-11 08:00:48.514548480,13.099847,65.509495,1015.679173,12.957427,12.855191,5.840276
min,2025-01-01 01:00:00,-4.0,16.0,986.0,0.0,2.4,1.0
25%,2025-04-06 18:15:00,8.0,51.0,1011.0,9.0,7.0,4.0
50%,2025-06-14 22:30:00,13.0,66.0,1016.0,12.0,9.9,6.0
75%,2025-08-21 22:45:00,19.0,82.0,1021.0,17.0,14.8,8.0
max,2025-10-29 00:00:00,35.0,100.0,1040.0,32.0,91.6,10.0
std,,7.403506,18.903297,7.536086,5.753876,9.974677,2.762083


In [11]:
X = df2025[['temp', 'press', 'hum', 'wind', 'month']]
y = df2025['pm25']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [14]:
param_dist = {
    'regressor__n_estimators': np.arange(200, 1200, 200), #number of trees
    'regressor__max_depth': np.arange(3, 11), #maximum depth of each tree
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2], #step size shrinkage - how much the model is updated at each step
    'regressor__subsample': [0.5, 0.7, 1.0], #fraction of samples used for fitting the individual base learners
    'regressor__colsample_bytree': [0.5, 0.7, 1.0] #fraction of features used for fitting the individual base learners
}
pipeline1 = Pipeline(steps=[
    ('regressor', XGBRegressor(
        random_state=42,
    ))
])

search = RandomizedSearchCV(
    pipeline1,
    param_dist,
    n_iter=40,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [15]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=400, regressor__subsample=1.0; total time=   0.2s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=400, regressor__subsample=1.0; total time=   0.1s
[CV] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=6, regressor__n_estimators=400, regressor__subsample=0.5; total time=   0.4s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=400, regressor__subsample=1.0; total time=   0.1s
[CV] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=6, regressor__n_estimators=400, regressor__subsample=0.5; total time=   0.4s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.05, regressor_

In [16]:

param_grid2 = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30]
}

pipeline2 = Pipeline(steps=[
    ('regressor', RandomForestRegressor(
        random_state=42,
    ))
])

grid = GridSearchCV(
    pipeline2,
    param_grid2,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [17]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END regressor__max_depth=None, regressor__n_estimators=100; total time=   1.1s
[CV] END regressor__max_depth=None, regressor__n_estimators=100; total time=   1.1s
[CV] END regressor__max_depth=None, regressor__n_estimators=100; total time=   1.1s
[CV] END regressor__max_depth=None, regressor__n_estimators=100; total time=   1.1s
[CV] END regressor__max_depth=None, regressor__n_estimators=100; total time=   1.2s
[CV] END regressor__max_depth=None, regressor__n_estimators=200; total time=   2.2s
[CV] END regressor__max_depth=None, regressor__n_estimators=200; total time=   2.4s
[CV] END regressor__max_depth=None, regressor__n_estimators=200; total time=   2.5s
[CV] END regressor__max_depth=10, regressor__n_estimators=100; total time=   0.7s
[CV] END regressor__max_depth=None, regressor__n_estimators=200; total time=   2.2s
[CV] END regressor__max_depth=None, regressor__n_estimators=200; total time=   2.3s
[CV] END regresso

In [18]:
RF = grid.best_estimator_

In [19]:
y_pred = RF.predict(X_test)

mse = round(mean_squared_error(y_test, y_pred),2)
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),2)
mae = round(mean_absolute_error(y_test, y_pred),2)
r2 = round(r2_score(y_test, y_pred),2)

print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}")

MSE: 17.63, RMSE: 4.2, MAE: 2.64, R2: 0.82


In [20]:
y_pred = RF.predict(X_test)

mse = round(mean_squared_error(y_test, y_pred),2)
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),2)
mae = round(mean_absolute_error(y_test, y_pred),2)
r2 = round(r2_score(y_test, y_pred),2)

print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}")

MSE: 17.63, RMSE: 4.2, MAE: 2.64, R2: 0.82


In [21]:
XGB = search.best_estimator_

In [22]:
y_pred = XGB.predict(X_test)

mse = round(mean_squared_error(y_test, y_pred),2)
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),2)
mae = round(mean_absolute_error(y_test, y_pred),2)
r2 = round(r2_score(y_test, y_pred),2)

print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}")

MSE: 16.45, RMSE: 4.06, MAE: 2.53, R2: 0.83


In [2]:
import pandas as pd
print(f"Evaluated on date: {pd.Timestamp.now()}")

Evaluated on date: 2026-01-14 12:28:20.803716
