### ARIMA 

In [1]:
import pandas as pd
# Import ARIMA and datetime
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

In [130]:
column_names = ['date', 'hour', 'AQSID', 'sitename', 'GMT offset', 'parameter name', 'reporting units', 'value', 'datasource']

df = pd.read_csv('./data/HourlyData_2022030100.dat',  sep='|', names=column_names)
df = df.rename(columns={'date ': 'date'})  # renames 'date ' to 'date'
df['date'] = df['date'].str.strip()  # removes any leading/trailing whitespace in 'date' column
df=df[:-1]
df

Unnamed: 0,date,hour,AQSID,sitename,GMT offset,parameter name,reporting units,value,datasource
0,03/01/22,00:00,000010102,St. John's,-4.0,OZONE,PPB,29.0,Newfoundland & Labrador DEC
1,03/01/22,00:00,000010401,Mount Pearl,-4.0,OZONE,PPB,30.0,Newfoundland & Labrador DEC
2,03/01/22,00:00,000010401,Mount Pearl,-4.0,PM2.5,UG/M3,13.0,Newfoundland & Labrador DEC
3,03/01/22,00:00,000010601,Goose Bay,-4.0,OZONE,PPB,37.0,Canadian Air and Precipitation Monitoring Network
4,03/01/22,00:00,000010602,MacPherson Avenue -,-4.0,PM2.5,UG/M3,6.0,Environment Canada
...,...,...,...,...,...,...,...,...,...
8733,03/01/22,00:00,240190004,Horn Point,-5.0,BARPR,MILLIBAR,1023.4,Maryland Department of the Environment
8734,03/01/22,00:00,240190004,Horn Point,-5.0,OZONE,PPB,43.0,Maryland Department of the Environment
8735,03/01/22,00:00,240190004,Horn Point,-5.0,PM2.5,UG/M3,6.0,Maryland Department of the Environment
8736,03/01/22,00:00,240190004,Horn Point,-5.0,PRECIP,MM,0.0,Maryland Department of the Environment


In [3]:
df = pd.read_csv('./data/6data.csv',  sep=',')
df_ozone = df[df['parameter name']== 'OZONE'].reset_index(drop = True)
df_ozone['date'] = pd.to_datetime(df_ozone['date'])
df = df_ozone.sort_values(by=['date']).reset_index(drop = True)
df.tail(2)

  df_ozone['date'] = pd.to_datetime(df_ozone['date'])


Unnamed: 0,date,hour,AQSID,sitename,GMT offset,parameter name,reporting units,value,datasource
166696,2023-03-31,09:00,170314007,DESPLNS,-6.0,OZONE,PPB,48.0,Illinois EPA
166697,2023-03-31,09:00,20401,SOUTHAMPTON,-4.0,OZONE,PPB,42.0,Canada-Prince Edward Island1


In [4]:
df['hour'] = pd.to_datetime(df['hour'])
df['hour'] = df['hour'].dt.hour
last_dates = df['date'].iloc[-1]
print(last_dates)

2023-03-31 00:00:00


  df['hour'] = pd.to_datetime(df['hour'])


In [5]:
df['date'] = pd.to_datetime(df['date'])
df['hour'] = df['hour'].astype(str).str.zfill(6)
df['hour'] = df['hour'].str[:2] + ':' + df['hour'].str[2:4] + ':' + df['hour'].str[4:]
df['datetime'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['hour'])

In [6]:
df.tail(2)

Unnamed: 0,date,hour,AQSID,sitename,GMT offset,parameter name,reporting units,value,datasource,datetime
166696,2023-03-31,00:00:09,170314007,DESPLNS,-6.0,OZONE,PPB,48.0,Illinois EPA,2023-03-31 00:00:09
166697,2023-03-31,00:00:09,20401,SOUTHAMPTON,-4.0,OZONE,PPB,42.0,Canada-Prince Edward Island1,2023-03-31 00:00:09


In [7]:
last_date = df['datetime'].iloc[-1]
print(last_date)
last_date = last_date + timedelta(days=1)
print(last_date)

2023-03-31 00:00:09
2023-04-01 00:00:09


In [9]:
# Define the number of days to forecast
# forecast_horizon = 7
forecast_horizon = 24

# Split data into training and testing sets
train, test = train_test_split(df, test_size=forecast_horizon, shuffle=False)

# Generate the date range for the forecast
forecast_dates = [last_date + timedelta(hours=i) for i in range(1, forecast_horizon+1)]
print(forecast_dates)

# Fit ARIMA model
model = ARIMA(df['value'], order=(1, 0, 0))
# Split data into training and testing sets
results = model.fit()

# Make predictions on test set
print(test.index[0], test.index[-1])
predictions = results.predict(start=test.index[0], end=test.index[-1])

# Evaluate model performance on test set
mse = ((predictions - test['value']) ** 2).mean()
print('MSE:', mse)

# Forecast future values
forecast_values = results.forecast(steps=forecast_horizon)

# Combine the forecasted values with the corresponding dates
forecast = pd.DataFrame({
    'date': forecast_dates,
    'value': forecast_values
})

# Set the date as the index
forecast = forecast.set_index('date')


[Timestamp('2023-04-01 01:00:09'), Timestamp('2023-04-01 02:00:09'), Timestamp('2023-04-01 03:00:09'), Timestamp('2023-04-01 04:00:09'), Timestamp('2023-04-01 05:00:09'), Timestamp('2023-04-01 06:00:09'), Timestamp('2023-04-01 07:00:09'), Timestamp('2023-04-01 08:00:09'), Timestamp('2023-04-01 09:00:09'), Timestamp('2023-04-01 10:00:09'), Timestamp('2023-04-01 11:00:09'), Timestamp('2023-04-01 12:00:09'), Timestamp('2023-04-01 13:00:09'), Timestamp('2023-04-01 14:00:09'), Timestamp('2023-04-01 15:00:09'), Timestamp('2023-04-01 16:00:09'), Timestamp('2023-04-01 17:00:09'), Timestamp('2023-04-01 18:00:09'), Timestamp('2023-04-01 19:00:09'), Timestamp('2023-04-01 20:00:09'), Timestamp('2023-04-01 21:00:09'), Timestamp('2023-04-01 22:00:09'), Timestamp('2023-04-01 23:00:09'), Timestamp('2023-04-02 00:00:09')]
166674 166697
MSE: 50.63697238786555


In [15]:
print(forecast)

                         value
date                          
2023-04-01 01:00:09  38.405929
2023-04-01 02:00:09  36.406365
2023-04-01 03:00:09  35.293906
2023-04-01 04:00:09  34.674989
2023-04-01 05:00:09  34.330654
2023-04-01 06:00:09  34.139083
2023-04-01 07:00:09  34.032503
2023-04-01 08:00:09  33.973206
2023-04-01 09:00:09  33.940217
2023-04-01 10:00:09  33.921863
2023-04-01 11:00:09  33.911652
2023-04-01 12:00:09  33.905971
2023-04-01 13:00:09  33.902811
2023-04-01 14:00:09  33.901052
2023-04-01 15:00:09  33.900074
2023-04-01 16:00:09  33.899530
2023-04-01 17:00:09  33.899227
2023-04-01 18:00:09  33.899058
2023-04-01 19:00:09  33.898965
2023-04-01 20:00:09  33.898912
2023-04-01 21:00:09  33.898883
2023-04-01 22:00:09  33.898867
2023-04-01 23:00:09  33.898858
2023-04-02 00:00:09  33.898853


 The data shows ozone level predictions for air quality at different times on March 31st and April 1st.

In [18]:
from sklearn.metrics import accuracy_score
acs = accuracy_score(forecast['value'].astype(int), test['value'].astype(int))


In [19]:
acs

0.0

In [20]:
print(forecast['value'].astype(int))

date
2023-04-01 01:00:09    38
2023-04-01 02:00:09    36
2023-04-01 03:00:09    35
2023-04-01 04:00:09    34
2023-04-01 05:00:09    34
2023-04-01 06:00:09    34
2023-04-01 07:00:09    34
2023-04-01 08:00:09    33
2023-04-01 09:00:09    33
2023-04-01 10:00:09    33
2023-04-01 11:00:09    33
2023-04-01 12:00:09    33
2023-04-01 13:00:09    33
2023-04-01 14:00:09    33
2023-04-01 15:00:09    33
2023-04-01 16:00:09    33
2023-04-01 17:00:09    33
2023-04-01 18:00:09    33
2023-04-01 19:00:09    33
2023-04-01 20:00:09    33
2023-04-01 21:00:09    33
2023-04-01 22:00:09    33
2023-04-01 23:00:09    33
2023-04-02 00:00:09    33
Name: value, dtype: int32


In [21]:
print(test['value'].astype(int))

166674    47
166675    44
166676    47
166677    49
166678    51
166679    52
166680    47
166681    52
166682    46
166683    48
166684    40
166685    36
166686    48
166687    48
166688    45
166689    48
166690    55
166691    54
166692    46
166693    43
166694    50
166695    46
166696    48
166697    42
Name: value, dtype: int32


In [25]:
test['pred'] = forecast.values

In [26]:
test['pred']

166674    38.405929
166675    36.406365
166676    35.293906
166677    34.674989
166678    34.330654
166679    34.139083
166680    34.032503
166681    33.973206
166682    33.940217
166683    33.921863
166684    33.911652
166685    33.905971
166686    33.902811
166687    33.901052
166688    33.900074
166689    33.899530
166690    33.899227
166691    33.899058
166692    33.898965
166693    33.898912
166694    33.898883
166695    33.898867
166696    33.898858
166697    33.898853
Name: pred, dtype: float64

In [27]:
from sklearn.metrics import accuracy_score
acs = accuracy_score(test['pred'].astype(int), test['value'].astype(int))
print(acs)

0.0
