In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from statsmodels.tsa.arima_model import ARIMA

In [2]:
data_file = r'NASDAQ.csv'

In [3]:
dateparse = lambda x: pd.datetime.strptime(x, '%d-%m-%Y')

In [4]:
data = pd.read_csv(data_file, parse_dates=['Date'], date_parser=dateparse)

In [5]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-01,,,,,,
1,2004-01-01,2011.079956,2153.830078,1999.77002,2066.149902,2066.149902,45972690000.0
2,2004-02-01,2072.129883,2094.919922,1991.050049,2029.819946,2029.819946,35916840000.0
3,2004-03-01,2036.920044,2069.02002,1896.910034,1994.219971,1994.219971,42497500000.0
4,2004-04-01,1996.449951,2079.120117,1919.390015,1920.150024,1920.150024,40153120000.0


In [6]:
# Collecting only reqd parameters Date and Open from data and deleting the first row containing NaN values
df = data[['Date', 'Open']].drop(data.index[0]).set_index('Date')

In [7]:
df.head()

Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2004-01-01,2011.079956
2004-02-01,2072.129883
2004-03-01,2036.920044
2004-04-01,1996.449951
2004-05-01,1928.719971


In [8]:
# Preparing input training data for the ARIMA model
train_data = df.values
history = [x for x in train_data]

In [9]:
# Generating a list of dates for which output is reqd (From 01 Sept 2019 -> 01 Sept 2021)
dates = pd.date_range('2019-09-01', '2021-10-01', freq = '1M') - pd.offsets.MonthBegin(1)
dates_list = [x for x in dates]

In [10]:
# Create parameters to run forecasting
start_index = len(history)
end_index = start_index + len(dates)
start_index, end_index

(188, 213)

In [11]:
# Function to invert differenced values after running the ARIMA model
def inverse_difference(history, yhat, interval = 1):
    return yhat + history[-interval]

In [12]:
# Implement ARIMA with p=5, d=1, q=0
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit(disp=0)
# This forecasts the differences from the previous time period
forecast = model_fit.predict(start = start_index, end = end_index)
for yhat in forecast:
    # The inverse difference function has to be used to generate the usable value from the differences obtained
    inverted = inverse_difference(history, yhat, 1)
    history.append(inverted)

  out_full[ind] += zi
  out = out_full[ind]
  zf = out_full[ind]


In [13]:
# Dates to be predicted for
df = pd.DataFrame({'year': [2020, 2021], 'month': [9, 9], 'day': [1, 1]})
pred = pd.to_datetime(df)
pred

0   2020-09-01
1   2021-09-01
dtype: datetime64[ns]

In [14]:
# Find index of said dates in the forecast order
index = []
for date in pred:
    index.append(dates_list.index(date))
index

[12, 24]

In [16]:
# Generate output for these dates from the forecast
print("01 Sept 2020 : " + str(history[start_index + index[0]][0]))
print("01 Sept 2021 : " + str(history[start_index + index[1]][0]))

01 Sept 2020 : 8650.88503464945
01 Sept 2021 : 9053.880857286356
