In [4]:
!pip install mlflow keras-tuner optuna dvc -q
!pip install dvc[s3]
!apt install git -y
!pip install git+https://github.com/philipperemy/keras-tcn.git -q

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [10]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.keras
import os
import subprocess
from tcn import TCN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam

In [None]:
MLFLOW_TRACKING_URI = ""
EXPERIMENT_NAME = "AQI hyperparameter & model testing (Manual)"
WINDOW_SIZE = 24
PREDICT_HORIZON = 72  # Predict next 3 days = 72 hours

In [7]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/umair/mlruns/3', creation_time=1752022350617, experiment_id='3', last_update_time=1752022350617, lifecycle_stage='active', name='AQI hyperparameter & model testing (Manual)', tags={}>

In [8]:
!git init
!dvc init
!dvc remote add -d myremote s3://s3-bucket-umairrr

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org

In [9]:
!git clone https://github.com/uma1r111/10pearls-AQI-Project-
%cd 10pearls-AQI-Project-

Cloning into '10pearls-AQI-Project-'...
remote: Enumerating objects: 410, done.[K
remote: Counting objects: 100% (186/186), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 410 (delta 84), reused 148 (delta 50), pack-reused 224 (from 1)[K
Receiving objects: 100% (410/410), 5.60 MiB | 22.50 MiB/s, done.
Resolving deltas: 100% (192/192), done.
/content/10pearls-AQI-Project-


In [None]:
# Load and preprocess data
print("\nPulling latest feature_selection.csv from DVC remote (S3)...")
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_DEFAULT_REGION"] = ""
subprocess.run(["dvc", "pull"], check=True)


Pulling latest feature_selection.csv from DVC remote (S3)...


CompletedProcess(args=['dvc', 'pull'], returncode=0)

In [12]:
for root, dirs, files in os.walk(".", topdown=True):
    for name in files:
        print(os.path.join(root, name))

./README.md
./.dvcignore
./requirements.txt
./.gitignore
./full_preprocessed_aqi_weather_data_with_all_features.csv
./karachi_weather_apr1_to_current.csv
./fetch_daily_data.py
./feature_selection.csv.dvc
./feature_selection.csv
./Exploratory Visualization Analysis/Exploratory Visualization Analysis.ipynb
./Exploratory Visualization Analysis/Exploratory Visualization Analysis.txt
./.dvc/config
./.dvc/.gitignore
./.dvc/cache/files/md5/a3/42be088d6bbebcb8002a166df6f909
./.dvc/tmp/btime
./.dvc/tmp/lock
./Data Preprocessing/Data_PreProcessing.ipynb
./Data Preprocessing/data_quality_check.py
./Data Preprocessing/run_preprocessing.py
./Data Collection/karachi_weather_jun2025.ipynb
./Data Collection/pollutants info Apr - Jun.ipynb
./Data Collection/karachi_weather_apr1_jun14.ipynb
./.github/workflows/feature_engineering.yml
./.github/workflows/feature_selection.yml
./.github/workflows/update_data.yml
./Feature Selection/initial_feature_extraction.ipynb
./Feature Selection/feature_selection.py


In [13]:
df = pd.read_csv("feature_selection.csv")

In [14]:
df['datetime'] = pd.to_datetime(df['datetime'])
df = df[(df["datetime"] >= "2025-04-01") & (df["datetime"] <= "2025-07-07")]
df = df.sort_values("datetime")

features = df.drop(columns=["datetime"])
num_features = features.shape[1]  # total features including AQI

In [15]:
print(df["datetime"].head())
print("Filtered Rows:", len(df))

0   2025-04-01 00:00:00
1   2025-04-01 01:00:00
2   2025-04-01 02:00:00
3   2025-04-01 03:00:00
4   2025-04-01 04:00:00
Name: datetime, dtype: datetime64[ns]
Filtered Rows: 2329


# **Sequence Building for Walk forward Validation**

In [16]:
def create_sequences(X, window_size, horizon):
    Xs, ys = [], []
    for i in range(len(X) - window_size - horizon):
        Xs.append(X[i:(i + window_size)].values)
        ys.append(X[(i + window_size):(i + window_size + horizon)].values)
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(features, WINDOW_SIZE, PREDICT_HORIZON)
X_train, X_val, y_train, y_val = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

In [17]:
print("Total Samples:", len(features))
print("X_seq shape:", X_seq.shape)
print("y_seq shape:", y_seq.shape)

Total Samples: 2329
X_seq shape: (2233, 24, 13)
y_seq shape: (2233, 72, 13)


# **Keras Hyperparameter Tuning**

In [18]:
def build_tcn_model(hp):
    model = Sequential()
    model.add(TCN(
        input_shape=(X_train.shape[1], X_train.shape[2]),
        nb_filters=hp.Int('nb_filters', 32, 128, step=32),
        kernel_size=hp.Choice('kernel_size', [2, 3, 4]),
        nb_stacks=hp.Int('nb_stacks', 1, 3),
        dilations=[1, 2, 4, 8],
        activation=hp.Choice("activation", ["relu", "tanh"]),
        dropout_rate=hp.Float("dropout_rate", 0.1, 0.4, step=0.1),
        use_skip_connections=True
    ))
    model.add(Dense(PREDICT_HORIZON * num_features))
    model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', 1e-5, 1e-2, sampling='log')),
        loss='mse'
    )
    return model

tuner = kt.RandomSearch(
    build_tcn_model,
    objective="val_loss",
    max_trials=20,
    executions_per_trial=1,
    directory="tcn_tuner",
    project_name="aqi_tcn_tuning"
)

with mlflow.start_run(run_name="Best_TCN_Keras_Hyperparams"):
    tuner.search(
        X_train, y_train.reshape(y_train.shape[0], -1),
        validation_data=(X_val, y_val.reshape(y_val.shape[0], -1)),
        epochs=50,
        batch_size=32,
        callbacks=[EarlyStopping(patience=5)],
        verbose=0
    )
    best_hps = tuner.get_best_hyperparameters(1)[0]
    mlflow.log_params(best_hps.values)
    best_model = tuner.get_best_models(1)[0]
    preds = best_model.predict(X_val).reshape(y_val.shape)
    rmse = sqrt(mean_squared_error(y_val.flatten(), preds.flatten()))
    mlflow.log_metric("val_rmse", rmse)
    print("Best TCN RMSE:", rmse)

    print("Best hyperparameters:", best_hps.values)
    mlflow.log_metric("best_rmse", rmse)


  super(TCN, self).__init__(**kwargs)
  super(TCN, self).__init__(**kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step
Best TCN RMSE: 6.08134501169222
Best hyperparameters: {'nb_filters': 128, 'kernel_size': 4, 'nb_stacks': 1, 'activation': 'relu', 'dropout_rate': 0.30000000000000004, 'learning_rate': 0.0009243894680341854}
🏃 View run Best_TCN_Keras_Hyperparams at: http://172.174.154.85:8000/#/experiments/3/runs/5d8f7024a81a43068387921c951472f3
🧪 View experiment at: http://172.174.154.85:8000/#/experiments/3


# **TCN Model Fit**

In [19]:
with mlflow.start_run(run_name="Final_TCN_Model_Keras"):
    model = build_tcn_model(best_hps)
    model.fit(
        X_train, y_train.reshape(y_train.shape[0], -1),
        validation_data=(X_val, y_val.reshape(y_val.shape[0], -1)),
        epochs=best_hps["epochs"] if "epochs" in best_hps.values else 50,
        batch_size=32,
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=1
    )

    # Evaluation
    preds = model.predict(X_val).reshape(y_val.shape)
    rmse = sqrt(mean_squared_error(y_val.flatten(), preds.flatten()))
    mae = mean_absolute_error(y_val.flatten(), preds.flatten())

    mlflow.log_params(best_hps.values)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.log_metric("final_mae", mae)
    mlflow.set_tag("model_type", "TCN_MultiOutput")
    mlflow.set_tag("tuner", "Keras_Tuner")
    mlflow.set_tag("train_start", "2025-04-01")
    mlflow.set_tag("train_end", "2025-07-07")
    mlflow.keras.log_model(model, artifact_path="model")


Epoch 1/50


  super(TCN, self).__init__(**kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 87ms/step - loss: 4962.4985 - val_loss: 158.2843
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 93ms/step - loss: 310.7373 - val_loss: 104.7337
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 80ms/step - loss: 209.1150 - val_loss: 59.0348
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - loss: 161.9523 - val_loss: 53.2892
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 80ms/step - loss: 141.0963 - val_loss: 79.3271
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - loss: 132.1711 - val_loss: 49.1738
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - loss: 133.4793 - val_loss: 53.3091
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 79ms/step - loss: 112.6213 - val_loss: 45.9514
Epoch 9/50
[1m56/56[0m [32



🏃 View run Final_TCN_Model_Keras at: http://172.174.154.85:8000/#/experiments/3/runs/64150fd605134870a10295465f398445
🧪 View experiment at: http://172.174.154.85:8000/#/experiments/3


# **Predictions**

In [20]:
last_sequence = features.values[-WINDOW_SIZE:].reshape(1, WINDOW_SIZE, num_features)
future_preds = model.predict(last_sequence).reshape(PREDICT_HORIZON, num_features)

future_dates = pd.date_range(start=df['datetime'].iloc[-1] + pd.Timedelta(hours=1),
                              periods=PREDICT_HORIZON, freq='H')

future_df = pd.DataFrame(future_preds, columns=features.columns)
future_df["datetime"] = future_dates
future_df.to_csv("future_predictions_tcn_keras.csv", index=False)
mlflow.log_artifact("future_predictions_tcn_keras.csv")

print("✅ TCN model training complete. Predictions logged to MLflow.")
print(future_df.head())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
✅ TCN model training complete. Predictions logged to MLflow.
   aqi_us_lag1  aqi_us_lag12  aqi_us_lag24      pm2_5  log_pm10  \
0    87.116890     88.207314     89.619804  29.181417  4.123623   
1    89.522728     88.895256     89.223045  28.299263  3.566315   
2    88.369255     86.463753     89.427956  29.306366  4.861019   
3    88.795662     88.871552     88.662865  29.159832  4.854623   
4    89.121201     88.895714     88.507774  29.235826  4.331135   

   scaled_humidity_%  scaled_temp_C_scaled_log_windspeed_kph   log_so2  \
0          -0.063496                                1.014492  2.422512   
1          -0.030410                                1.046001  1.975366   
2           0.175540                                0.491965  3.089325   
3           0.517245                                0.898655  3.132502   
4          -1.180160                                0.848134  2.732738   

   day_of_week  sca

  future_dates = pd.date_range(start=df['datetime'].iloc[-1] + pd.Timedelta(hours=1),
