In [None]:
# Load the competition data. Every later cell reads `train` and `test`, so the
# notebook cannot run from a fresh kernel unless this cell actually executes.
# The first CSV column becomes the row index; "Date" stays a regular column
# until it is parsed and promoted to the index further down.
import pandas as pd  # imported here because this cell runs before the main import cell

train = pd.read_csv("/kaggle/input/ue21cs342aa2/train.csv", index_col=0)
test = pd.read_csv("/kaggle/input/ue21cs342aa2/test.csv", index_col=0)

In [None]:
# Installing the required libraries
!pip install pmdarima

# Importing the required libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pmdarima import auto_arima
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
warnings.filterwarnings("ignore")

In [None]:
# The Date column is read from CSV as strings; convert it to pandas timestamps
# in both frames so it can later serve as a time index.
train["Date"] = pd.to_datetime(train["Date"])
test["Date"] = pd.to_datetime(test["Date"])

# Sanity check: show the element type after conversion. `.iloc[0]` picks the
# first row by position, so it works whatever labels the row index carries
# (a plain `[0]` is a label lookup and depends on a row labelled 0 existing).
type(train["Date"].iloc[0])

In [None]:
# Promote the parsed Date column to the row index of both frames.
train = train.set_index("Date")
test = test.set_index("Date")

In [None]:
# Line chart of the closing price over the training period.
fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(train.index, train['Close'])
ax.set_title('Stock Close Price Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
plt.show()

In [None]:
# Additive seasonal decomposition of the closing price with a period of 7 rows.
result = seasonal_decompose(train['Close'], model='additive', period=7)

# One stacked panel per series: (values, line label, panel title).
panels = [
    (train['Close'], 'Original Data', 'Original Data'),
    (result.trend, 'Trend', 'Trend Component'),
    (result.seasonal, 'Seasonal', 'Seasonal Component'),
    (result.resid, 'Residual', 'Residual Component'),
]

plt.figure(figsize=(20, 10))
for position, (series, label, title) in enumerate(panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(train.index, series, label=label)
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
# Correlograms of the raw closing price, used to read off candidate p and q values.
close_series = train['Close']
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# (plotting function, target axes, panel title) for the top and bottom panels.
correlograms = (
    (plot_acf, ax1, 'Autocorrelation Function (ACF)'),
    (plot_pacf, ax2, 'Partial Autocorrelation Function (PACF)'),
)
for draw, axis, heading in correlograms:
    draw(close_series, lags=100, ax=axis, title=heading)
plt.show()

In [None]:
def report_adf(series, name, alpha=0.05):
    """Run the Augmented Dickey-Fuller test on ``series`` and print a verdict.

    Parameters
    ----------
    series : array-like
        The values to test.
    name : str
        Column name shown in the report header.
    alpha : float, default 0.05
        A p-value above this threshold is reported as non-stationary.

    Returns
    -------
    tuple
        The raw ``adfuller`` result; element 0 is the test statistic and
        element 1 is the p-value.
    """
    outcome = adfuller(series)
    print(f"Column:{name}")
    print('ADF Statistic:', outcome[0])
    print('p-value:', outcome[1])

    if outcome[1] > alpha:
        print("The data is non-stationary. Differencing is required.")
    else:
        print("The data is stationary.")
    return outcome


# Check Close, Open and Volume for stationarity to decide whether differencing
# is needed. Each result is kept under its own column name.
adf_results = {}
for position, column in enumerate(("Close", "Open", "Volume")):
    if position:
        print()  # blank line between consecutive reports
    adf_results[column] = report_adf(train[column], column)

# Per-column names for later inspection. The Volume result now has its own
# variable instead of overwriting `result_open`.
result_close = adf_results["Close"]
result_open = adf_results["Open"]
result_volume = adf_results["Volume"]

In [None]:
# Model target: natural log of Close. NOTE: despite its name, "Close_diff" holds
# log(Close), not a differenced series; the name is kept because later cells
# refer to it.
train["Close_diff"] = np.log(train["Close"])

span = 7  # span of the exponential moving average, in rows

# Build the Open/Volume features on train and test stacked end to end, so the
# first test rows take their lags and EMA state from the end of train rather
# than restarting the EMA and being back-filled from *later* test rows.
# Assumes test directly follows train in time - TODO confirm.
n_train = len(train)
history = pd.concat([train[["Open", "Volume"]], test[["Open", "Volume"]]])
open_all = history["Open"]
volume_all = history["Volume"]
feature_values = {
    "Open_Lag1": open_all.shift(1),
    "Open_Lag2": open_all.shift(2),
    "Volume_Lag1": volume_all.shift(1),
    "Open_EMA": open_all.ewm(span=span, adjust=False).mean(),
    "Volume_Ratio": volume_all / open_all,
}
for name, values in feature_values.items():
    # Assign by position so index labels play no part in the split.
    values = values.to_numpy()
    train[name] = values[:n_train]
    test[name] = values[n_train:]

# The first two training rows have no lag history; drop them.
train = train.dropna()

# Lags in test are now seeded from train, so this back-fill only touches gaps
# that were already present in the raw test columns.
test = test.bfill()

# Preview only; the full frame is too long to display usefully.
train.head()

In [None]:
# Correlograms of the transformed Close column (Close_diff), to see how the
# transformation changed the ACF and PACF.
transformed_close = train['Close_diff']
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
plot_acf(
    transformed_close,
    lags=50,
    ax=ax1,
    title='Autocorrelation Function (ACF)',
)
plot_pacf(
    transformed_close,
    lags=100,
    ax=ax2,
    title='Partial Autocorrelation Function (PACF)',
)
plt.show()

In [None]:
def hyperparameter_tuning(train_data, exogenous_data, m=7):
    """Search for SARIMA orders for the ``Close_diff`` target with ``auto_arima``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain a ``Close_diff`` column, which is the series being modelled.
    exogenous_data : pandas.DataFrame
        Exogenous regressors, one row per row of ``train_data``.
    m : int, default 7
        Seasonal period handed to ``auto_arima``. The default matches the
        period of 7 used by the SARIMAX models elsewhere in this notebook.

    Returns
    -------
    tuple
        ``(order, seasonal_order)`` of the model selected by the stepwise search.
    """
    stepwise_fit = auto_arima(
        train_data['Close_diff'],
        # Current pmdarima takes the regressors as `X`; the older `exogenous`
        # keyword was deprecated and later removed.
        X=exogenous_data,
        # The seasonal period must be given through `m`. `seasonal_order` is
        # not a search parameter, and with the default m=1 no seasonal terms
        # are searched at all.
        m=m,
        seasonal=True,
        stepwise=True,
        suppress_warnings=True,
        error_action="ignore",
    )
    return stepwise_fit.order, stepwise_fit.seasonal_order

In [None]:
# Walk-forward validation. TimeSeriesSplit yields `n_splits` (train, hold-out)
# index pairs over the training frame; every fold is fitted AND scored inside
# the loop, and the per-fold scores are averaged at the end.
n_splits = 5
EXOG_COLUMNS = ['Open', 'Volume', 'Open_EMA', 'Volume_Ratio']
FIXED_ORDER = (2, 1, 1)              # (p, d, q)
FIXED_SEASONAL_ORDER = (1, 1, 2, 7)  # (P, D, Q, S)
# The folds are fitted with the fixed orders above. Set to True to run the
# (slow) auto_arima search on every fold and use its orders instead.
TUNE_PER_FOLD = False

smape_results = []
mse_results = []
tscv = TimeSeriesSplit(n_splits=n_splits)
for fold, (train_index, test_index) in enumerate(tscv.split(train), start=1):
    train_data = train.iloc[train_index]
    test_data = train.iloc[test_index]
    exogenous_data = train_data[EXOG_COLUMNS]
    exogenous_test_data = test_data[EXOG_COLUMNS]

    if TUNE_PER_FOLD:
        order, seasonal_order = hyperparameter_tuning(train_data, exogenous_data)
    else:
        order, seasonal_order = FIXED_ORDER, FIXED_SEASONAL_ORDER

    model = SARIMAX(
        train_data['Close_diff'],
        order=order,
        seasonal_order=seasonal_order,
        exog=exogenous_data,
    )
    result = model.fit(disp=False)

    forecast_steps = len(test_data)
    forecast = result.get_forecast(steps=forecast_steps, exog=exogenous_test_data)
    forecasted_values = forecast.predicted_mean
    confidence_intervals = forecast.conf_int()
    actual_close = test_data['Close_diff']

    # Compare by position: the forecast may carry a different index from the
    # hold-out rows, and index-aligned subtraction would then yield only NaN.
    predicted = np.asarray(forecasted_values)
    observed = np.asarray(actual_close)
    smape = 100 * 2 * np.mean(
        np.abs(predicted - observed) / (np.abs(predicted) + np.abs(observed))
    )
    mse = mean_squared_error(observed, predicted)
    smape_results.append(smape)
    mse_results.append(mse)
    print(f"Fold {fold}: SMAPE = {smape}, Mean Squared Error = {mse}")

# After the loop, test_data / forecasted_values / smape / mse still hold the
# LAST fold's values, which is what the plotting cell below reads.
print("SMAPE:", np.mean(smape_results), "(mean over folds)")
print("Mean Squared Error:", np.mean(mse_results), "(mean over folds)")

In [None]:
# Forecast vs. actual for the most recent validation fold. Both series are
# Close_diff values, i.e. log(Close), so the labels say so explicitly.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data.index, test_data['Close_diff'], label='Actual log(Close)', color='blue')
ax.plot(test_data.index, forecasted_values, label='Forecasted log(Close)', color='red')
ax.set_xlabel('Date')
ax.set_ylabel('log(Close Price)')
ax.legend()
ax.set_title('Forecasts vs. Actual Data')
plt.show()

In [None]:
# Final model: search SARIMA orders on the full training set, refit with
# SARIMAX, then forecast one step per row of the test frame.
exog_columns = ['Open', 'Volume', 'Open_EMA', 'Volume_Ratio']
exogenous_data = train[exog_columns]
stepwise_fit = auto_arima(
    train['Close_diff'],
    X=exogenous_data,  # current pmdarima names the regressors `X`, not `exogenous`
    seasonal=True,
    stepwise=True,
    suppress_warnings=True,
    error_action="ignore",
    m=7,  # Seasonal period = 7 (one week)
    # Upper bounds for the seasonal orders P, D and Q. `max_order` is a single
    # integer cap and cannot carry these bounds, so they are passed explicitly.
    max_P=5,
    max_D=2,
    max_Q=5,
)

# Orders selected by the search.
best_order, best_seasonal_order = stepwise_fit.order, stepwise_fit.seasonal_order

# Refit a statsmodels SARIMAX with the selected orders.
model = SARIMAX(
    train['Close_diff'],
    order=best_order,
    seasonal_order=best_seasonal_order,
    exog=exogenous_data,
)
result_new = model.fit(disp=False)

# Forecast exactly as many steps as there are test rows; a hard-coded horizon
# fails as soon as the test set has a different length.
exog_future = test[exog_columns]
forecast_future = result_new.get_forecast(steps=len(exog_future), exog=exog_future)

forecasted_values_final = forecast_future.predicted_mean
confidence_intervals = forecast_future.conf_int()  # on the same log scale as Close_diff

# Close_diff is log(Close), so exponentiate to get back to price units.
forecasted_values_og = np.exp(forecasted_values_final)
forecasted_values_og

In [None]:
# Attach the forecast to the test frame by position. Assigning the Series
# directly aligns on index labels, which silently yields NaN wherever the
# forecast index and the test index differ; a length mismatch now raises.
test['Close'] = np.asarray(forecasted_values_og)
test