In [8]:
# Import libraries
import warnings
from pprint import PrettyPrinter
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from IPython.display import VimeoVideo
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error, accuracy_score
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg
import inspect
import time
from sklearn.metrics import precision_score, recall_score, f1_score,r2_score
from statsmodels.tsa.arima.model import ARIMA
warnings.simplefilter(action="ignore", category=FutureWarning)


In [9]:
# Open a client for the MongoDB server on localhost:27017, then select the
# "air-quality" database and bind its "dar-es-salaam" collection to `dar`.
client = MongoClient("localhost", 27017)
db = client["air-quality"]
dar = db["dar-es-salaam"]

In [10]:
def wrangle(collection):
    """Build an hourly, gap-filled "P2" series from a MongoDB-style collection.

    Parameters
    ----------
    collection
        Any object exposing ``find(filter, projection=...)`` that yields
        documents with a ``"P2"`` value and a ``"timestamp"``. The query
        selects documents where ``metadata.site`` is 11 and
        ``metadata.measurement`` is ``"P2"``.

    Returns
    -------
    pandas.Series
        The ``"P2"`` readings averaged per hour, indexed by timestamps
        converted to the "Africa/Dar_es_Salaam" timezone. Hours without
        readings are forward-filled from the previous hour.

    Notes
    -----
    Timestamps are localized as UTC before conversion, so they are expected
    to be timezone-naive. Readings above 100 are dropped as outliers before
    resampling.
    """
    results = collection.find(
        {"metadata.site": 11, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into a DataFrame indexed by timestamp
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize the naive timestamps as UTC, then convert to local time
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")

    # Remove outliers
    df = df[df["P2"] <= 100]

    # Resample to hourly means and forward-fill empty hours.
    # `.ffill()` replaces the deprecated `fillna(method="ffill")`, and the
    # lowercase "h" alias replaces the deprecated "H".
    y = df["P2"].resample("1h").mean().ffill()
    return y

In [None]:
# Build the hourly series from the Dar es Salaam collection and preview it
y = wrangle(collection=dar)
y.head(5)

In [None]:
# Line plot of the full hourly series
fig, ax = plt.subplots(figsize=(15, 6))

y.plot(ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("PM2.5 Level");

In [None]:
# 168 hourly observations = 7 days, matching the "7-Day" in the title
fig, ax = plt.subplots(figsize=(15, 6))
y.rolling(window=168).mean().plot(ax=ax)
ax.set_ylabel("PM2.5 Level")
ax.set_title("Dar es Salaam PM2.5 Levels, 7-Day Rolling Average");

In [None]:
# Autocorrelation of the hourly series, labelled through the Axes object
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
ax.set(
    xlabel="Lag [hours]",
    ylabel="Correlation Coefficient",
    title="Dar es Salaam PM2.5 Readings, ACF",
);

In [None]:
# Partial autocorrelation of the hourly series, labelled through the Axes object
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
ax.set(
    xlabel="Lag [hours]",
    ylabel="Correlation Coefficient",
    title="Dar es Salaam PM2.5 Readings, PACF",
);

In [None]:
# Chronological split: the first 90% of observations train, the rest test
cutoff_test = int(0.9 * len(y))
y_train, y_test = y.iloc[:cutoff_test], y.iloc[cutoff_test:]
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


In [None]:
# Baseline: predict the training mean for every training observation
y_train_mean = y_train.mean()
y_pred_baseline = len(y_train) * [y_train_mean]
mae_baseline = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)

print("Mean P2 Reading:", y_train_mean)
print("Baseline MAE:", mae_baseline)

In [None]:
# Search for the best lag order (hyperparameter p) for the AR model
p_params = range(1, 31)
maes = []
for p in p_params:
    # Time only the fitting step
    start_time = time.time()
    model = AutoReg(y_train, lags=p).fit()
    elapsed_time = round(time.time() - start_time, 2)
    print(f"Trained AR {p} in {elapsed_time} seconds.")

    # In-sample predictions; missing values are dropped so the predictions
    # line up with the training observations from position p onward
    y_pred = model.predict().dropna()

    # Training MAE for this lag order, stored in the same order as p_params
    mae = mean_absolute_error(y_train.iloc[p:], y_pred)
    maes.append(mae)

# Report the lag order with the lowest training MAE, and that MAE
print(p_params[maes.index(min(maes))], min(maes))

In [None]:
# Refit the AR model at the lag order that gave the lowest training MAE.
# The lag is looked up in p_params instead of using `index + 1`, so the
# result stays correct if the search range no longer starts at 1.
best_p = p_params[maes.index(min(maes))]
best_model = AutoReg(y_train, lags=best_p).fit()

# Display the selected lag order. The former bare `best_model` line was
# dropped: a mid-cell expression is never displayed, so it had no effect.
best_p

In [None]:
# Take the residual series exposed by the fitted best_model
y_train_resid = best_model.resid
# Label the series so the name shows up in displays and plots
y_train_resid.name = "residuals"
# Preview the first residuals
y_train_resid.head()


In [None]:
# Walk-forward validation: refit on everything seen so far, forecast one step
# ahead, then reveal the true test observation before the next iteration.
history = y_train.copy()
wfv_steps = []
for i in range(len(y_test)):
    model = AutoReg(history, lags=best_p).fit()
    next_pred = model.forecast()
    wfv_steps.append(next_pred)
    # Series.append was removed in pandas 2.0; pd.concat is the supported
    # way to extend the history with the newly observed value.
    history = pd.concat([history, y_test.loc[next_pred.index]])

# Join the one-step forecasts once, rather than growing a Series in the loop.
# An explicit dtype keeps the empty-test-set case well defined.
y_pred_wfv = pd.concat(wfv_steps) if wfv_steps else pd.Series(dtype="float64")
y_pred_wfv.name = "prediction"
y_pred_wfv.index.name = "timestamp"
y_pred_wfv.head()

In [None]:
# Score the walk-forward predictions against the held-out observations
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 4))

In [None]:
# r2_score is the coefficient of determination, not precision (a
# classification metric), so the label now names the value actually printed.
print("R2 Score: \t {0:.4f}".format(r2_score(y_test, y_pred_wfv)))