In [130]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from statsmodels.tsa.arima_model import ARIMA

In [131]:
# Location of the monthly Dow Jones price CSV, relative to the notebook.
data_file = 'DowJones.csv'

In [132]:
def dateparse(x):
    """Parse ``dd-mm-YYYY`` date text into pandas datetime values.

    Uses ``pd.to_datetime`` rather than ``pd.datetime.strptime``: the
    ``pd.datetime`` alias was deprecated in pandas 1.0 and removed in
    pandas 2.0, so the old form raises ``AttributeError`` there.

    Parameters
    ----------
    x : str or array-like of str
        Date text in day-month-year order, e.g. ``'01-12-2003'``.

    Returns
    -------
    pandas.Timestamp for a single string (a ``datetime.datetime`` subclass),
    or the datetime-typed equivalent of an array-like input.

    Raises
    ------
    ValueError
        If the text does not match the ``%d-%m-%Y`` format.
    """
    return pd.to_datetime(x, format='%d-%m-%Y')

In [133]:
# Load the raw CSV, then convert the 'Date' column explicitly with dateparse.
# read_csv's `date_parser=` argument is deprecated since pandas 2.0, so the
# conversion is done after loading; this form works on old and new pandas.
# The outer to_datetime guarantees a datetime64 column whatever dateparse returns.
data = pd.read_csv(data_file)
data['Date'] = pd.to_datetime(data['Date'].map(dateparse))

In [134]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-01,,,,,,
1,2004-01-01,10452.74023,10705.17969,10367.41016,10488.07031,10488.07031,4380510000.0
2,2004-02-01,10487.78027,10753.62988,10434.66992,10583.91992,10583.91992,3894640000.0
3,2004-03-01,10582.25,10695.54981,10007.49023,10357.7002,10357.7002,5009640000.0
4,2004-04-01,10357.51953,10570.80957,10219.17969,10225.57031,10225.57031,4530000000.0


In [135]:
# Keep only the required Date and Open columns, remove the first row
# (it holds NaN values) and use the dates as the index.
df = (
    data.loc[:, ['Date', 'Open']]
    .drop(index=data.index[0])
    .set_index('Date')
)

In [136]:
# Preview the first five rows of the date-indexed Open series.
df.head(n=5)

Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2004-01-01,10452.74023
2004-02-01,10487.78027
2004-03-01,10582.25
2004-04-01,10357.51953
2004-05-01,10227.26953


In [137]:
# Training input for the ARIMA model: the frame's values as a 2-D array,
# plus a plain Python list of its rows (each row is a one-element array)
# that later cells extend with forecasts.
train_data = df.to_numpy()
history = list(train_data)

In [138]:
# Month-start dates for which output is required
# (01 Sept 2019 -> 01 Sept 2021 inclusive, i.e. 25 dates).
# The 'MS' (month start) frequency produces these stamps directly, so there
# is no need to build month-end dates and shift them back; it also avoids
# the 'M' alias, which is deprecated in recent pandas releases.
dates = pd.date_range('2019-09-01', '2021-09-01', freq='MS')
dates_list = list(dates)

In [139]:
# Forecast window parameters: start_index is the position right after the
# last value currently in `history`; end_index adds one position per
# requested date.
start_index, end_index = len(history), len(history) + len(dates)
(start_index, end_index)

(188, 213)

In [140]:
def inverse_difference(history, yhat, interval = 1):
    """Undo differencing for one predicted value.

    Adds the predicted difference ``yhat`` to the value found ``interval``
    positions from the end of ``history`` and returns the sum.

    Parameters
    ----------
    history : sequence
        Values seen so far; must hold at least ``interval`` items.
    yhat : number
        Predicted difference to convert back.
    interval : int, default 1
        How far back from the end of ``history`` the reference value sits.
    """
    reference_value = history[-interval]
    return yhat + reference_value

In [141]:
# Implement ARIMA with p=5, d=1, q=0.
# Fit on the first `start_index` entries only, so re-running this cell after
# `history` has been extended with forecasts still fits on the same values.
model = ARIMA(history[:start_index], order=(5,1,0))
model_fit = model.fit(disp=0)
# This forecasts the differences from the previous time period.
# `end` is inclusive in predict(), so stop one position before end_index to
# get exactly end_index - start_index forecasts (one per requested date).
forecast = model_fit.predict(start = start_index, end = end_index - 1)
# Discard forecasts appended by a previous run of this cell (no-op on a fresh run)
# so the cumulative inversion below always starts from the last observed value.
del history[start_index:]
for yhat in forecast:
    # Each predicted difference is added to the latest value in `history`
    # to turn it back into a usable level, which is then appended.
    inverted = inverse_difference(history, yhat, 1)
    history.append(inverted)

In [142]:
# Dates to be predicted for.
# The year/month/day frame gets its own name: binding it to `df` would
# overwrite the price frame and break a re-run of the earlier cells that read it.
target_date_parts = pd.DataFrame({'year': [2020, 2021], 'month': [9, 9], 'day': [1, 1]})
pred = pd.to_datetime(target_date_parts)
pred

0   2020-09-01
1   2021-09-01
dtype: datetime64[ns]

In [143]:
# Position of each requested date within the forecast date list
index = [dates_list.index(target_date) for target_date in pred]
index

[12, 24]

In [145]:
# Report the value stored in `history` for each requested date: its position
# is start_index plus the date's offset in the forecast order.
for label, position in (("01 Sept 2020 : ", 0), ("01 Sept 2021 : ", 1)):
    forecast_value = history[start_index + index[position]][0]
    print(label + str(forecast_value))

01 Sept 2020 : 28101.27399855731
01 Sept 2021 : 29159.70967930367
