In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load dataset
df = pd.read_csv("LSTM-Multivariate_pollution.csv")

# 1. Date parsing & indexing: turn the 'date' column into the frame's index.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# 2. Imputation: replace missing values in the pollution column with 0.
# The result is assigned back to the column rather than calling
# `df['pollution'].fillna(0, inplace=True)`: that form operates on an
# intermediate object, emits a FutureWarning on current pandas, and will no
# longer update `df` at all from pandas 3.0 onwards.
df['pollution'] = df['pollution'].fillna(0)

# 3. Feature removal: as observed from visualization, 'wnd_dir' has no strong
# numerical relationship with pollution.
df = df.drop(columns=['wnd_dir'])

# 4. Feature selection (also fixes the column order used by the scaler below)
selected_features = ['dew', 'temp', 'press', 'wnd_spd', 'rain', 'snow', 'pollution']
df = df[selected_features]

# 6. Scaling (for LSTM) in the range of [0, 1]
# NOTE(review): the scaler is fitted on the whole series, so the min/max of the
# later (test) period influence the scaled training inputs. Consider fitting on
# the training portion only — confirm whether this matters for the comparison.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

# 7. Lag Feature for random forest
def add_lag_features(data, target='pollution', lags=3):
    """Return a new frame with lagged copies of ``target`` appended.

    For each ``k`` in ``1..lags`` a column named ``f'{target}_lag{k}'`` is
    added containing ``data[target].shift(k)``. Rows that hold any missing
    value after the lag columns are added are dropped. ``data`` itself is
    left untouched.
    """
    # Build every lag column up front, keyed by its final column name.
    lagged_columns = {
        f'{target}_lag{k}': data[target].shift(k)
        for k in range(1, lags + 1)
    }
    # `assign` works on a copy, so the caller's frame is not modified.
    return data.assign(**lagged_columns).dropna()

# Lagged tabular frame consumed by the tree-based models (Random Forest or XGBoost)
rf_df = add_lag_features(df.copy())

# 8. Outputs: one summary line per prepared object, emitted in a single call
print(
    "✅ Preprocessing Complete:",
    "- `scaled_df`: for LSTM or deep learning models (scaled multivariate time series)",
    "- `rf_df`: for Random Forest (tabular with lag features)",
    "- `df['pollution']`: for univariate ARIMA models",
    sep="\n",
)

✅ Preprocessing Complete:
- `scaled_df`: for LSTM or deep learning models (scaled multivariate time series)
- `rf_df`: for Random Forest (tabular with lag features)
- `df['pollution']`: for univariate ARIMA models


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['pollution'].fillna(0, inplace=True)


**LSTM**

In [2]:
# Importing modules needed for this model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Prepare sequences for LSTM
n_past = 24  # past 24 hours
n_future = 1  # next hour prediction

# Seed Python's `random`, NumPy and TensorFlow so weight initialisation and
# batch shuffling are repeatable on a fresh kernel (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Index the underlying NumPy array instead of calling `.iloc[...]` (and building
# a row Series) once per window; the extracted values are the same.
scaled_values = scaled_df.to_numpy()
pollution_idx = scaled_df.columns.get_loc('pollution')

X, y = [], []
for i in range(n_past, len(scaled_values) - n_future + 1):
    # Input: the n_past rows immediately before position i (all features).
    X.append(scaled_values[i - n_past:i])
    # Target: scaled pollution at the last step of the n_future horizon.
    y.append(scaled_values[i + n_future - 1, pollution_idx])

X, y = np.array(X), np.array(y)

# Train-test split (shuffle disabled so the last 20% of windows form the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Build LSTM model
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first layer triggers a Keras UserWarning.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(50, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Predict and evaluate
# `predict` returns shape (n, 1); flatten so it lines up with the 1-D y_test.
y_pred = model.predict(X_test).ravel()
rmse_scaled = np.sqrt(mean_squared_error(y_test, y_pred))

# Undo the min-max scaling of the pollution column so the RMSE is expressed in
# the original pollution units, like the Random Forest and ARIMA RMSEs.
# MinMaxScaler transforms as x_scaled = x * scale_ + min_, hence the inverse below.
pollution_scale = scaler.scale_[pollution_idx]
pollution_offset = scaler.min_[pollution_idx]
y_test_orig = (y_test - pollution_offset) / pollution_scale
y_pred_orig = (y_pred - pollution_offset) / pollution_scale
rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))

print("Root Mean Squared Error (RMSE):", rmse)
print("RMSE on the [0, 1] scaled target:", rmse_scaled)

  super().__init__(**kwargs)


Epoch 1/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 13s 17ms/step - loss: 0.0060 - val_loss: 7.6292e-04
Epoch 2/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 7s 17ms/step - loss: 0.0012 - val_loss: 6.4172e-04
Epoch 3/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 6s 14ms/step - loss: 9.4599e-04 - val_loss: 6.1771e-04
Epoch 4/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 10s 14ms/step - loss: 8.2211e-04 - val_loss: 6.1020e-04
Epoch 5/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 7s 16ms/step - loss: 8.8329e-04 - val_loss: 6.0134e-04
Epoch 6/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 10s 16ms/step - loss: 7.8857e-04 - val_loss: 5.6426e-04
Epoch 7/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 9s 14ms/step - loss: 9.8592e-04 - val_loss: 5.2923e-04
Epoch 8/10
438/438 ━━━━━━━━━━━━━━━━━━━━ 8s 17ms/step - loss: 8.4881e

**Random Forest**

In [3]:
# Importing modules needed for this model
from sklearn.ensemble import RandomForestRegressor

# Create lag features (use past 3 hours)
def add_lag_features(data, target='pollution', lags=3):
    """Return a copy of ``data`` with lagged versions of ``target`` appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``target`` column. It is not modified.
    target : str, default 'pollution'
        Name of the column to lag.
    lags : int, default 3
        Number of lag columns to add; lag ``k`` is ``data[target].shift(k)``
        and is stored under ``f'{target}_lag{k}'``.

    Returns
    -------
    pandas.DataFrame
        The extended frame with every row containing a missing value removed.
    """
    # Copy first: writing the lag columns straight into `data` would silently
    # add columns to whatever frame the caller passed in.
    data_with_lags = data.copy()
    for lag in range(1, lags + 1):
        data_with_lags[f'{target}_lag{lag}'] = data_with_lags[target].shift(lag)
    return data_with_lags.dropna()

# Tabular frame: the selected features plus the lagged pollution columns
rf_df = add_lag_features(df.copy())

# Target is the current pollution value; every other column is a predictor
y = rf_df['pollution']
X = rf_df.drop(columns=['pollution'])

# Hold out the last 20% of rows (no shuffling, so row order is preserved)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Fit the forest with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Rank the predictors by the forest's importance scores (optional)
importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("✅ Random Forest RMSE:", rmse)
print("\nTop Features:")
print(importance_df)

✅ Random Forest RMSE: 23.948320878022166

Top Features:
          Feature  Importance
6  pollution_lag1    0.919107
7  pollution_lag2    0.021901
8  pollution_lag3    0.015082
0             dew    0.012987
3         wnd_spd    0.010745
1            temp    0.009221
2           press    0.008870
4            rain    0.001869
5            snow    0.000216


**ARIMA**

In [4]:
# Importing modules needed for this model
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")

# Load data into a dedicated frame. Re-reading the CSV into `df` would overwrite
# the preprocessed `df` built earlier in the notebook (bringing back the
# 'wnd_dir' column that was dropped), so re-running earlier cells afterwards
# would see different data.
arima_df = pd.read_csv("LSTM-Multivariate_pollution.csv")
arima_df['date'] = pd.to_datetime(arima_df['date'])
arima_df = arima_df.set_index('date')

# Use only the target variable
pollution_series = arima_df['pollution'].fillna(0)

# Split into train/test (80/20), keeping chronological order
train_size = int(len(pollution_series) * 0.8)
train, test = pollution_series.iloc[:train_size], pollution_series.iloc[train_size:]

# Fit ARIMA model (p=5, d=1, q=2 is a common starting point; use grid search for optimal)
model = ARIMA(train, order=(5, 1, 2))
model_fit = model.fit()

# Forecast the whole test period in one go: no test observation is fed back,
# so this measures a len(test)-step-ahead forecast.
forecast = model_fit.forecast(steps=len(test))
rmse = np.sqrt(mean_squared_error(test, forecast))

# One-step-ahead evaluation: extend the fitted model with the test observations
# without re-estimating the parameters, then read the in-sample (non-dynamic)
# predictions for the test positions. Each prediction uses the true history up
# to the previous hour, which is the same setting the LSTM and Random Forest
# are scored in.
model_fit_extended = model_fit.append(test, refit=False)
one_step_pred = model_fit_extended.predict(start=train_size, end=len(pollution_series) - 1)
rmse_one_step = np.sqrt(mean_squared_error(test.to_numpy(), np.asarray(one_step_pred)))

print("✅ ARIMA RMSE:", rmse)
print("ARIMA one-step-ahead RMSE:", rmse_one_step)

✅ ARIMA RMSE: 93.67847467359053
