### Air Quality Data - Models 

In [None]:
import pandas as pd
import numpy as np

In [None]:
#import math
#import os
#import glob
#import datetime
#import re
#import openpyxl
#import xlrd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#### Load the data

In [None]:
# Load the cleaned air-quality table from disk.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so each column's dtype is inferred from the whole column.
nb_air_quality = pd.read_csv(
    "data/nb_air_quality.csv",
    low_memory=False,
)

In [None]:
# Restrict the data to the rows of station 2 and to the two columns used for
# modelling, then take an independent copy so later edits do not touch
# nb_air_quality.
cols = ['DATE_TIME', 'PM_25_API']
is_station_2 = nb_air_quality["STATION_ID"] == 2

aq_1_station = nb_air_quality.loc[is_station_2, cols].copy()

In [None]:
# Parse DATE_TIME into datetimes, use it as the index, and order the rows by
# that index.
aq_1_station = (
    aq_1_station
    .assign(DATE_TIME=pd.to_datetime(aq_1_station['DATE_TIME']))
    .set_index('DATE_TIME')
    .sort_index()
)

#### Explore the dataset

In [None]:
# Print a concise summary of the frame: index type, column dtypes and
# non-null counts (useful for spotting missing PM_25_API values).
aq_1_station.info()

In [None]:
# Line plot of the PM_25_API column against the DATE_TIME index.
aq_1_station.plot()

In [None]:
#aq_1_station = np.log(aq_1_station) # don't forget to transform the data back when making real predictions

#aq_1_station.plot()

In [None]:
# Show the first two rows as a quick sanity check of index and values.
aq_1_station.head(2)

In [None]:
# Replace missing PM_25_API readings with the column mean.
# NOTE(review): the mean is taken over the whole series, i.e. before the
# train/test split further down, so values from the test period influence the
# imputed training values - confirm this is acceptable for the evaluation.
mean_value = aq_1_station['PM_25_API'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form is chained assignment, which warns in
# recent pandas and leaves the frame unchanged when Copy-on-Write is enabled.
aq_1_station['PM_25_API'] = aq_1_station['PM_25_API'].fillna(mean_value)

In [None]:
# Chronological train/test split: rows whose timestamp is strictly earlier
# than the cutoff form the training set, every other row forms the test set.
cutoff = pd.Timestamp("2021-07-01 00:00:00")
split_condition = aq_1_station.index < cutoff

aq_1_station_train = aq_1_station.loc[split_condition].copy()
aq_1_station_test = aq_1_station.loc[~split_condition].copy()

#### Check for stationarity of time series

Method #1: time series plot

Method #2: ACF plot and PACF plot

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

In [None]:
# Autocorrelation and partial autocorrelation plots of the undifferenced
# training series.
# NOTE(review): the returned figures are bound to names, presumably so the
# notebook does not render each figure a second time as the cell output.
acf_original = plot_acf(aq_1_station_train)

pacf_original = plot_pacf(aq_1_station_train)

Method #3: ADF test

In [None]:
# Augmented Dickey-Fuller test on the undifferenced training series.
# The second element of the result is the p-value; the test's null hypothesis
# is that the series has a unit root (is non-stationary).
adf_test = adfuller(aq_1_station_train)
p_value = adf_test[1]
print(f'p-value: {p_value}')

#### Transform to stationary: differencing

In [None]:
# First-order differencing of the training series. The first row has no
# predecessor and becomes NaN, so it is dropped before plotting.
aq_diff = aq_1_station_train.diff(periods=1)
aq_diff = aq_diff.dropna()
aq_diff.plot()

In [None]:
# Autocorrelation and partial autocorrelation plots of the differenced
# training series, for comparison with the plots of the original series.
acf_diff = plot_acf(aq_diff)

pacf_diff = plot_pacf(aq_diff)

In [None]:
# Augmented Dickey-Fuller test repeated on the differenced series.
# The second element of the result is the p-value; the test's null hypothesis
# is that the series has a unit root (is non-stationary).
adf_test = adfuller(aq_diff)
p_value = adf_test[1]
print(f'p-value: {p_value}')

#### Determine ARIMA model parameters p, q

Fit the ARIMA model

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Fit an ARIMA model on the training series.
# order is (p, d, q): 2 autoregressive lags, 1 round of differencing,
# no moving-average terms.
arima_order = (2, 1, 0)
model = ARIMA(aq_1_station_train, order=arima_order)
model_fit = model.fit()

fit_summary = model_fit.summary()
print(fit_summary)

#### Check residuals and make time series predictions

In [None]:
# Residual diagnostics for the fitted model: residuals over time (left) and
# their kernel density estimate (right).
# The first residual is skipped by position.
# NOTE(review): presumably because, with d=1, the first residual is an
# initialisation artifact rather than a real one-step error - confirm.
# matplotlib.pyplot is already imported as plt near the top of the notebook,
# so it is not re-imported here.
residuals = model_fit.resid.iloc[1:]
fig, ax = plt.subplots(1, 2)
residuals.plot(title='Residuals', ax=ax[0])
residuals.plot(title='Density', kind='kde', ax=ax[1])
plt.show()

In [None]:
# Autocorrelation and partial autocorrelation plots of the model residuals.
# NOTE(review): these are presumably used to check that no significant
# autocorrelation is left in the residuals - confirm the intended reading.
acf_res = plot_acf(residuals)

pacf_res = plot_pacf(residuals)

In [None]:
# Forecast one value per row of the test set.
n_train = len(aq_1_station_train)
n_test = len(aq_1_station_test)
forecast_test = model_fit.forecast(steps=n_test)

# Build the 'forecast' column by position: blanks for the training rows,
# followed by the forecasts for the test rows. This relies on aq_1_station
# being sorted so that the training rows come first.
train_padding = [None] * n_train
aq_1_station['forecast'] = train_padding + forecast_test.tolist()

# Plot the observed series together with the forecast.
aq_1_station.plot()