<a href="https://colab.research.google.com/github/theekshanamadumal/MachineLearning/blob/main/time-series/Multi_variant_stocks_prediction_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Multi variant time series problem using LSTM on Yahoo stock data
https://youtu.be/tepxdcepTbY

dataset: https://finance.yahoo.com/quote/GE/history/

Also try S&P: https://finance.yahoo.com/quote/%5EGSPC/history?p=%5EGSPC

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , LSTM , Dropout


In [4]:
df = pd.read_csv('/content/GE.csv')

In [5]:
df.shape

(251, 7)

In [6]:
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-05-24,58.337238,58.446526,56.252926,58.173302,57.922199,7851377
1,2022-05-25,57.35363,58.227947,56.690086,58.18111,57.929977,9089976
2,2022-05-26,58.719749,60.663544,58.313816,60.117096,59.857605,6008274
3,2022-05-27,60.335674,61.498829,60.101482,61.483215,61.217827,6240904
4,2022-05-31,60.850899,61.701794,60.702576,61.116314,60.852512,6649671
5,2022-06-01,61.733021,62.014053,59.86729,60.515221,60.254017,7593768
6,2022-06-02,60.905544,61.405151,60.241997,60.889931,60.627106,7898774
7,2022-06-03,59.906322,60.226387,59.071037,60.085869,59.826515,6754201
8,2022-06-06,60.819672,61.116314,59.929741,60.109291,59.849834,6077192
9,2022-06-07,60.039032,61.007027,59.461357,60.889931,60.627106,6566150


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       251 non-null    object 
 1   Open       251 non-null    float64
 2   High       251 non-null    float64
 3   Low        251 non-null    float64
 4   Close      251 non-null    float64
 5   Adj Close  251 non-null    float64
 6   Volume     251 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 13.9+ KB


In [8]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,251.0,251.0,251.0,251.0,251.0,251.0
mean,69.983615,70.962377,69.21228,70.16496,70.05959,7686597.0
std,17.007339,17.144271,16.956269,17.059839,17.121118,3052999.0
min,47.619049,47.74395,46.783764,47.689304,47.540154,2426342.0
25%,56.986729,57.611243,55.647931,56.916472,56.804752,5804350.0
50%,64.207649,65.253708,63.48946,64.699455,64.608337,6960698.0
75%,83.549999,84.475003,82.335003,83.889999,83.812306,8832002.0
max,104.519997,105.940002,104.099998,104.550003,104.550003,28592300.0


In [9]:
train_dates = pd.to_datetime(df['Date'])
train_dates.tail(15)

236   2023-05-03
237   2023-05-04
238   2023-05-05
239   2023-05-08
240   2023-05-09
241   2023-05-10
242   2023-05-11
243   2023-05-12
244   2023-05-15
245   2023-05-16
246   2023-05-17
247   2023-05-18
248   2023-05-19
249   2023-05-22
250   2023-05-23
Name: Date, dtype: datetime64[ns]

In [10]:
train_dates

0     2022-05-24
1     2022-05-25
2     2022-05-26
3     2022-05-27
4     2022-05-31
         ...    
246   2023-05-17
247   2023-05-18
248   2023-05-19
249   2023-05-22
250   2023-05-23
Name: Date, Length: 251, dtype: datetime64[ns]

In [None]:

#Variables for training
cols = list(df)[1:6]
#Date and volume columns are not used in training. 
print(cols) #['Open', 'High', 'Low', 'Close', 'Adj Close']

#New dataframe with only training data - 5 columns
df_for_training = df[cols].astype(float)

# df_for_plot=df_for_training.tail(5000)
# df_for_plot.plot.line()

#LSTM uses sigmoid and tanh that are sensitive to magnitude so values need to be normalized
# normalize the dataset
scaler = StandardScaler()
scaler = scaler.fit(df_for_training)
df_for_training_scaled = scaler.transform(df_for_training)


#As required for LSTM networks, we require to reshape an input data into n_samples x timesteps x n_features. 
#In this example, the n_features is 5. We will make timesteps = 14 (past days data used for training). 

#Empty lists to be populated using formatted training data
trainX = []
trainY = []

n_future = 1   # Number of days we want to look into the future based on the past days.
n_past = 14  # Number of past days we want to use to predict the future.

#Reformat input data into a shape: (n_samples x timesteps x n_features)
#In my example, my df_for_training_scaled has a shape (12823, 5)
#12823 refers to the number of data points and 5 refers to the columns (multi-variables).
for i in range(n_past, len(df_for_training_scaled) - n_future +1):
    trainX.append(df_for_training_scaled[i - n_past:i, 0:df_for_training.shape[1]])
    trainY.append(df_for_training_scaled[i + n_future - 1:i + n_future, 0])

trainX, trainY = np.array(trainX), np.array(trainY)

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

#In my case, trainX has a shape (12809, 14, 5). 
#12809 because we are looking back 14 days (12823 - 14 = 12809). 
#Remember that we cannot look back 14 days until we get to the 15th day. 
#Also, trainY has a shape (12809, 1). Our model only predicts a single value, but 
#it needs multiple variables (5 in my example) to make this prediction. 
#This is why we can only predict a single day after our training, the day after where our data ends.
#To predict more days in future, we need all the 5 variables which we do not have. 
#We need to predict all variables if we want to do that. 

# define the Autoencoder model

model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(trainY.shape[1]))

model.compile(optimizer='adam', loss='mse')
model.summary()


# fit the model
history = model.fit(trainX, trainY, epochs=5, batch_size=16, validation_split=0.1, verbose=1)

plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()

#Predicting...
#Libraries that will help us extract only business days in the US.
#Otherwise our dates would be wrong when we look back (or forward).  
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())
#Remember that we can only predict one day in future as our model needs 5 variables
#as inputs for prediction. We only have all 5 variables until the last day in our dataset.
n_past = 16
n_days_for_prediction=15  #let us predict past 15 days

predict_period_dates = pd.date_range(list(train_dates)[-n_past], periods=n_days_for_prediction, freq=us_bd).tolist()
print(predict_period_dates)

#Make prediction
prediction = model.predict(trainX[-n_days_for_prediction:]) #shape = (n, 1) where n is the n_days_for_prediction

#Perform inverse transformation to rescale back to original range
#Since we used 5 variables for transform, the inverse expects same dimensions
#Therefore, let us copy our values 5 times and discard them after inverse transform
prediction_copies = np.repeat(prediction, df_for_training.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,0]


# Convert timestamp to date
forecast_dates = []
for time_i in predict_period_dates:
    forecast_dates.append(time_i.date())
    
df_forecast = pd.DataFrame({'Date':np.array(forecast_dates), 'Open':y_pred_future})
df_forecast['Date']=pd.to_datetime(df_forecast['Date'])


original = df[['Date', 'Open']]
original['Date']=pd.to_datetime(original['Date'])
original = original.loc[original['Date'] >= '2020-5-1']

sns.lineplot(original['Date'], original['Open'])
sns.lineplot(df_forecast['Date'], df_forecast['Open'])

IndentationError: ignored