# Using ARIMA to Predict Close Prices from 2018 to 2020 of NUS Fintech Pte Ltd

---

## Data
The dataset consists of the Low, High, Open and Close prices and the Volume of NUS Fintech Pte Ltd stock over a period of 11 years, from 2010 to 2021.

## Your task
Predict the Close price from 2018 to 2020. The exact dates are already present in the sample_submission.csv file. Replace the 0s in the Predicted column with your forecasted values!

In [None]:
import os, time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.arima.model import ARIMA

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

## Read Competition CSV Files and Parse only Date & Close Columns

In [None]:
def parser(s):
    """Convert a 'YYYY-MM-DD' date string into a ``datetime`` object."""
    date_format = '%Y-%m-%d'
    return datetime.strptime(s, date_format)

# Load the submission template and the price history, indexed by the parsed date column.
# `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the resulting
# single-column DataFrame with `.squeeze('columns')` yields the same Series.
submission = pd.read_csv(
    '/kaggle/input/nus-fintech-society-project-1/sample_submission.csv',
    parse_dates=[0],
    index_col=0,
).squeeze('columns')
df = pd.read_csv(
    '/kaggle/input/nus-fintech-society-project-1/data.csv',
    parse_dates=[0],
    index_col=0,
    usecols=['Date', 'Close'],
).squeeze('columns')

# Resample to a daily frequency, forward-filling days without a row with the last known close.
# NOTE(review): the int32 cast drops any fractional part of the price - confirm this is intended.
df = df.resample('D').ffill().astype('int32')
df

## Simple plot function that can plot multiple Pandas Series to avoid duplicating code

In [None]:
def plot(series_list, title, ylabel, drawlines=False):
    """Draw one or more series on a single wide figure.

    Parameters
    ----------
    series_list : iterable of pandas Series
        Series to draw on the same axes, in order.
    title : str
        Figure title.
    ylabel : str
        Label for the y-axis.
    drawlines : bool, default False
        When True, draw a dashed vertical line at 1 January of every year
        after the earliest start year, up to the latest end year across
        all non-empty series. Assumes each series has a sorted
        datetime-like index.
    """
    # Materialise once so the input can be walked again for the year markers.
    series_list = list(series_list)

    plt.figure(figsize=(15, 5))

    for series in series_list:
        plt.plot(series)

    plt.title(title, fontsize=20)
    plt.ylabel(ylabel, fontsize=16)

    if not drawlines:
        return

    # Previously this relied on the loop variable leaking out of the for-loop,
    # which raised NameError for an empty list and only covered the last series.
    non_empty = [series for series in series_list if len(series) > 0]
    if not non_empty:
        return

    first_year = min(series.index[0].year for series in non_empty)
    last_year = max(series.index[-1].year for series in non_empty)
    for year in range(first_year + 1, last_year + 1):
        plt.axvline(pd.to_datetime(f'{year}-01-01'), color='k', linestyle='--', alpha=0.2)

In [None]:
# Full closing-price history with dashed markers at each new year.
plot(
    [df],
    title='Closing Price over Time',
    ylabel='Close',
    drawlines=True,
)

# 

# Finding Stationarity of the Series

---

## Check using the Augmented Dickey-Fuller test to test for a unit root in a univariate process in the presence of serial correlation

The null hypothesis of the ADF test (that the series has a unit root, i.e. is non-stationary) cannot be rejected since:
* p-value > 0.05
* the test statistic is greater than the critical values

In [None]:
# Augmented Dickey-Fuller test on the raw closing prices.
# adfuller returns (test statistic, p-value, lags used, n observations, critical values, ...).
adf = adfuller(df)
# Print the comparison that actually holds instead of a hard-coded '>'.
print(f"p-value: {adf[1]} {'>' if adf[1] > 0.05 else '<='} 0.05")
# adf[0] is the test statistic (not a critical value); adf[4] holds the critical values.
print(f'test statistic: {adf[0]} vs critical values: {adf[4]}')

# Get Returns (Daily Percent Change) to Obtain Stationarity

---

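The null hypothesis of the ADF test (that the series has a unit root, i.e. is non-stationary) can now be rejected, so the returns series is treated as stationary, since:
* p-value < 0.05
* the test statistic is smaller than all of the critical values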

In [None]:
# Day-over-day fractional change of the close; the first row has no
# previous value and is dropped.
df_returns = df.pct_change()
df_returns = df_returns.dropna()
df_returns

In [None]:
# Augmented Dickey-Fuller test on the daily returns.
# adfuller returns (test statistic, p-value, lags used, n observations, critical values, ...).
adf_returns = adfuller(df_returns)
# Print the comparison that actually holds instead of a hard-coded '<'.
print(f"p-value: {adf_returns[1]} {'<' if adf_returns[1] < 0.05 else '>='} 0.05")
# Compare against this test's own critical values; the original printed the
# critical values from the price-series test (`adf`) by mistake.
print(f'test statistic: {adf_returns[0]} vs critical values: {adf_returns[4]}')

In [None]:
# Daily returns over the whole history, with year markers.
plot(
    [df_returns],
    title='Returns over Time',
    ylabel='% Change',
    drawlines=True,
)

# Finding the AR(p) and MA(q) values using PACF and ACF respectively

---

* We observe p,q = 4,5,21,33 are statistically significant (> 0.05)
* Choose p,q = 5 since sharp drop observed after 5th lag

In [None]:
# Partial autocorrelation of the returns for lags 1..50 (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_pacf(x=df_returns, lags=50, zero=False);

In [None]:
# Autocorrelation of the returns for lags 1..50 (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_acf(x=df_returns, lags=50, zero=False);

# Perform Train/Test Split to Evaluate Model Performance

---

* Short-term prediction of 180 days (about 6 months, set by `days_to_pred`) is used to maximise predictive capabilities

In [None]:
# Hold out the final `days_to_pred` calendar days as the test window.
days_to_pred = 180
traintest_splitdate = df.index[-1] - timedelta(days=days_to_pred)

# Date-label slices include both endpoints, so the test window starts one day
# after the split date to keep train and test disjoint.
train = df[:traintest_splitdate]
test = df[traintest_splitdate + timedelta(days=1):]

train_returns = df_returns[:traintest_splitdate]
test_returns = df_returns[traintest_splitdate + timedelta(days=1):]

# Last price of the training window. `.iloc[-1]` makes the positional lookup
# explicit; a bare `[-1]` on a date-indexed Series is deprecated in pandas 2.x.
most_recent_price = train.iloc[-1]

In [None]:
# Show the sizes of the train and test price series.
(train.shape, test.shape)

# Attempt #1: Train an ARIMA Model using (p,d,q) of (5,0,5) on the Returns Series

---

In [None]:
# ARIMA(5, 0, 5) on the training returns, then predict returns for every date
# of the test window.
model = ARIMA(endog=train_returns, order=(5, 0, 5))
fit_model = model.fit()
pred = fit_model.predict(
    start=test_returns.index[0],
    end=test_returns.index[-1],
)

## Get Predicted Returns over the Number of Days to Predict and Compare with Actual Returns

In [None]:
# Returns predicted by the model over the test window.
plot([pred], title='Predicted Returns', ylabel='Returns')

In [None]:
# Observed returns over the same test window, for visual comparison.
plot([test_returns], title='Actual Returns', ylabel='Returns')

## Get the Residuals (Errors) of the Predicted Returns over Actual Returns

In [None]:
# Residuals of the fitted model; only the last `days_to_pred` values are plotted.
# NOTE(review): `fit_model` was fitted on the training returns, so these look like
# in-sample residuals rather than forecast errors on the test window - confirm.
residuals = fit_model.resid
# Title typo fixed: 'ARMIA' -> 'ARIMA'.
plot([residuals[-days_to_pred:]], 'ARIMA Model Residuals', 'Error')

## Get Actual Closing Prices by Cumulatively Multiplying Returns with Last Known Price

In [None]:
# Compound the predicted returns and scale by the last training price to
# rebuild a price path.
pred_price = most_recent_price * (1 + pred).cumprod()

# Show training prices from 2015 onwards alongside the prediction and the
# actual test prices.
plot(
    [train.loc[datetime(2015, 1, 1):], pred_price, test],
    title='Actual vs Prediction',
    ylabel='Closing Price',
)

## Evaluate Model using RMSE on Returns and Closing Prices

* Close Price RMSE = $980 is a decent prediction since the forecast only covers a short horizon (180 days)

In [None]:
# RMSE between actual and predicted closing prices. The `squared=False` flag
# was deprecated and later removed from scikit-learn's mean_squared_error, so
# take the square root of the MSE explicitly.
np.sqrt(metrics.mean_squared_error(test, pred_price))

# Attempt #2: Try using only data from 1 year ago onwards since it looks like a clearer trend

---

In [None]:
# Keep only the last 365 daily rows of the price series.
df_recent = df.iloc[-365:]
plot(
    [df_recent],
    title='Closing Price over Time from 1 year ago',
    ylabel='Close',
)

In [None]:
# Daily returns of the recent window; the first row has no previous value and is dropped.
df_returns_recent = df_recent.pct_change()
df_returns_recent = df_returns_recent.dropna()

In [None]:
# Split the recent window at the same split date as before; the test part
# starts the day after so the two label-based slices do not overlap.
train_recent = df_recent.loc[:traintest_splitdate]
test_recent = df_recent.loc[traintest_splitdate + timedelta(days=1):]

train_returns_recent = df_returns_recent.loc[:traintest_splitdate]
test_returns_recent = df_returns_recent.loc[traintest_splitdate + timedelta(days=1):]

In [None]:
# ARIMA(5, 0, 5) fitted on the recent training returns only, predicting the
# recent test window.
model_recent = ARIMA(endog=train_returns_recent, order=(5, 0, 5))
fit_model_recent = model_recent.fit()
pred_recent = fit_model_recent.predict(
    start=test_returns_recent.index[0],
    end=test_returns_recent.index[-1],
)

## The model did fit better since the past 1 year had a clear uptrend but might perform worse on longer predictions

In [None]:
# Rebuild prices by compounding predicted returns from the last recent-training price.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
pred_recent_price = train_recent.iloc[-1] * (1 + pred_recent).cumprod()
plot([train_recent, test_recent, pred_recent_price], 'Actual vs Prediction using only Recent Data', 'Closing Price')

In [None]:
# RMSE between actual and predicted closing prices for the recent-data model.
# The `squared=False` flag was deprecated and later removed from scikit-learn's
# mean_squared_error, so take the square root of the MSE explicitly.
np.sqrt(metrics.mean_squared_error(test_recent, pred_recent_price))

# Attempt #3: Try choosing only the date range with the most stationarity

---

## It seems that sometime between 2012-2013 and 2015-2016 the prices fluctuate about a roughly constant mean. Slicing the series helps pin down the exact months.

In [None]:
# Re-draw the full price history with year markers to pick a sideways period.
plot(
    [df],
    title='Closing Price over Time',
    ylabel='Close',
    drawlines=True,
)

In [None]:
# Zoom in on 1 Jan 2012 - 1 Jun 2012 (label slice, both ends included).
plot(
    [df.loc[datetime(2012, 1, 1):datetime(2012, 6, 1)]],
    title='Jan 2012 to Jun 2012',
    ylabel='Close',
)

In [None]:
# Zoom in on 1 Jun 2015 - 1 Jan 2016 (label slice, both ends included).
plot(
    [df.loc[datetime(2015, 6, 1):datetime(2016, 1, 1)]],
    title='Jun 2015 to Jan 2016',
    ylabel='Close',
)

In [None]:
# Boundaries of the hand-picked period used for training.
train_start = datetime(2012, 5, 1)
test_start = datetime(2015, 11, 1)

# Train on [train_start, test_start]; everything from the following day
# onwards is the test set.
train_chosen = df.loc[train_start:test_start]
test_chosen = df.loc[test_start + timedelta(days=1):]

train_returns_chosen = df_returns.loc[train_start:test_start]
test_returns_chosen = df_returns.loc[test_start + timedelta(days=1):]

In [None]:
# Prices over the chosen training period, with year markers.
plot(
    [train_chosen],
    title='Chosen Stationary Period',
    ylabel='Close',
    drawlines=True,
)

## Although the PACF graph looks good with strong correlation for lag 1, the ACF graph indicates non-stationarity so using Returns will probably be better

In [None]:
# Partial autocorrelation of the chosen-period prices (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_pacf(x=train_chosen, zero=False);

In [None]:
# Autocorrelation of the chosen-period prices (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_acf(x=train_chosen, zero=False);

## It looks much better with good p,q value of 5

In [None]:
# Returns over the chosen training period. The title and y-axis label were
# copied from the price plot ('Chosen Stationary Period', 'Close'); this series
# holds returns, so label it the same way as the earlier returns plot.
plot([train_returns_chosen], 'Returns over Chosen Stationary Period', '% Change', True)

In [None]:
# Partial autocorrelation of the chosen-period returns (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_pacf(x=train_returns_chosen, zero=False);

In [None]:
# Autocorrelation of the chosen-period returns (lag 0 omitted);
# the trailing semicolon suppresses the notebook's echo of the returned figure.
plot_acf(x=train_returns_chosen, zero=False);

In [None]:
# ARIMA(5, 0, 5) fitted on the chosen-period returns, predicting every date
# of the chosen test window.
model_chosen = ARIMA(endog=train_returns_chosen, order=(5, 0, 5))
fit_model_chosen = model_chosen.fit()
pred_chosen = fit_model_chosen.predict(
    start=test_returns_chosen.index[0],
    end=test_returns_chosen.index[-1],
)

## Unsurprisingly, the long-term predictions are very bad because the model cannot predict the strong uptrend from 2016-2017. Nonetheless, it might perform better in the long run if prices move sideways (which happens more often than the clear uptrends from mid 2012 - mid 2013 and 2016 - 2017).

In [None]:
# Rebuild prices by compounding predicted returns from the last chosen-training price.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
pred_chosen_price = train_chosen.iloc[-1] * (1 + pred_chosen).cumprod()
# Title corrected: it was copied from the "Recent Data" attempt, but this
# figure shows the model trained on the chosen period.
plot([train_chosen, test_chosen, pred_chosen_price], 'Actual vs Prediction using only Chosen Semi-Stationary Data', 'Closing Price')

In [None]:
# RMSE between actual and predicted closing prices for the chosen-period model.
# The `squared=False` flag was deprecated and later removed from scikit-learn's
# mean_squared_error, so take the square root of the MSE explicitly.
np.sqrt(metrics.mean_squared_error(test_chosen, pred_chosen_price))

# Now Predict 3 years of Prices (100% Terrible)

---

## Very Basic Linear Uptrend: Might perform fine considering a 3-year prediction from only 6.5 years of data

In [None]:
# Refit ARIMA(5, 0, 5) on the complete returns series (this rebinds `model`
# and `fit_model`) and predict returns across the submission date range.
model = ARIMA(endog=df_returns, order=(5, 0, 5))
fit_model = model.fit()
submission_pred = fit_model.predict(
    start=submission.index[0],
    end=submission.index[-1],
)

In [None]:
# Compound the predicted returns from the last known close to get a price path.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
submission_pred_price = df.iloc[-1] * (1 + submission_pred).cumprod()
plot([df, submission_pred_price, pred_price], 'Submission Prediction using Full Data', 'Closing Price')

## Exponential Growth: Stonks only go up

In [None]:
# Refit ARIMA(5, 0, 5) on only the last 365 daily returns and predict returns
# across the submission date range.
model_recent = ARIMA(endog=df_returns.iloc[-365:], order=(5, 0, 5))
fit_model_recent = model_recent.fit()
submission_pred_recent = fit_model_recent.predict(
    start=submission.index[0],
    end=submission.index[-1],
)

In [None]:
# Compound the predicted returns from the last known close to get a price path.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
submission_pred_recent_price = df.iloc[-1] * (1 + submission_pred_recent).cumprod()
plot([df, submission_pred_recent_price, pred_recent_price], 'Submission Prediction using only Recent Data', 'Closing Price')

## Barely any Growth: Since only trained on the sideways data (ARIMA/AR/MA Models produce around the same results)

In [None]:
# ARIMA(5, 0, 5) on the chosen-period returns, predicting returns across the
# submission date range.
model_chosen = ARIMA(endog=train_returns_chosen, order=(5, 0, 5))
fit_model_chosen = model_chosen.fit()
submission_pred_chosen = fit_model_chosen.predict(
    start=submission.index[0],
    end=submission.index[-1],
)

In [None]:
# Compound the predicted returns from the last known close to get a price path.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
submission_pred_chosen_price = df.iloc[-1] * (1 + submission_pred_chosen).cumprod()
plot([df, submission_pred_chosen_price, pred_chosen_price], 'ARIMA Prediction using only Chosen Semi-Stationary Data', 'Closing Price')

### AR(5) Model instead

In [None]:
# Pure autoregressive variant: order (5, 0, 0) on the chosen-period returns,
# predicting returns across the submission date range.
model_chosen = ARIMA(endog=train_returns_chosen, order=(5, 0, 0))
fit_model_chosen = model_chosen.fit()
submission_pred_chosen = fit_model_chosen.predict(
    start=submission.index[0],
    end=submission.index[-1],
)

In [None]:
# Compound the predicted returns from the last known close to get a price path.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
submission_pred_chosen_price = df.iloc[-1] * (1 + submission_pred_chosen).cumprod()
plot([df, submission_pred_chosen_price, pred_chosen_price], 'AR Prediction using only Chosen Semi-Stationary Data', 'Closing Price')

### MA(5) Model instead

In [None]:
# Pure moving-average variant: order (0, 0, 5) on the chosen-period returns,
# predicting returns across the submission date range.
model_chosen = ARIMA(endog=train_returns_chosen, order=(0, 0, 5))
fit_model_chosen = model_chosen.fit()
submission_pred_chosen = fit_model_chosen.predict(
    start=submission.index[0],
    end=submission.index[-1],
)

In [None]:
# Compound the predicted returns from the last known close to get a price path.
# `.iloc[-1]` makes the positional lookup explicit; a bare `[-1]` on a
# date-indexed Series is deprecated in pandas 2.x.
submission_pred_chosen_price = df.iloc[-1] * (1 + submission_pred_chosen).cumprod()
plot([df, submission_pred_chosen_price, pred_chosen_price], 'MA Prediction using only Chosen Semi-Stationary Data', 'Closing Price')

# Conclusion

---

* They are all terrible. But the ones trained on all available data and on sideways data look the most sensible. 

In [None]:
# Work on a copy of the most recently computed chosen-period price forecast.
to_submit = submission_pred_chosen_price.copy()

# Keep only the dates listed in the submission template, then turn the date
# index into a regular column.
my_submission = to_submit.loc[to_submit.index.isin(submission.index)].reset_index()
my_submission.columns = ['Date', 'Predicted']

# Write the result without the row index and display it.
my_submission.to_csv('submission.csv', index=False)
my_submission