# Predict stock prices  with Long short-term memory (LSTM)

This simple example will show you how LSTM models predict time series data. Stock market data is a great choice for this because it's quite regular and widely available via the Internet.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install requirements
We install Tensorflow 2.0 with GPU support first

In [2]:
!apt install graphviz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-6).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [3]:
!pip install tensorflow-gpu==2.9.2 pandas-datareader alpha_vantage pydot pydot-ng

Collecting tensorflow-gpu==2.9.2
  Downloading tensorflow_gpu-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.8/511.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting alpha_vantage
  Downloading alpha_vantage-2.3.1-py3-none-any.whl (31 kB)
Collecting flatbuffers<2,>=1.12 (from tensorflow-gpu==2.9.2)
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-gpu==2.9.2)
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras<2.10.0,>=2.9.0rc0 (from tensorflow-gpu==2.9.2)
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-preprocessing>=1.1.1 (from tensorflow-gpu==2.9.2)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Introduction

LSTMs are very powerful in sequence prediction problems. They can store past information.

## Loading the dataset
I use pandas-datareader to get the historical stock prices from Yahoo! finance. For this example, I get only the historical data till the end of *training_end_data*.  

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests, json, os
import alpha_vantage.alphavantage as ave
from pandas_datareader import data
from google.colab import userdata
from alpha_vantage.timeseries import TimeSeries
from alpha_vantage.fundamentaldata import FundamentalData
#from alpha_vantage.earningscalendar import EarningsCalendar
import pandas_datareader as pdr

def change_key_names(dictionary, old_to_new):
  """
  Changes the key names in a dictionary.

  Args:
    dictionary: The dictionary to modify.
    old_to_new: A dictionary mapping old key names to new key names.

  Returns:
    A new dictionary with the updated key names.
  """

  new_dict = {}
  for old_key, new_key in old_to_new.items():
    if old_key in dictionary:
      new_dict[new_key] = dictionary[old_key]

  return new_dict


def change_column_names(df, old_to_new):
  """
  Changes the column names in a Pandas DataFrame.

  Args:
      df: The DataFrame to modify.
      old_to_new: A dictionary mapping old column names to new column names.

  Returns:
      A new DataFrame with the updated column names.
  """

  df.rename(columns=old_to_new, inplace=True)
  return df


def filter_dataframe_by_date(df, start_date, end_date,date_column_name='date'):
  """
  Filters a Pandas DataFrame based on a start and end date.

  Args:
      df: The Pandas DataFrame to filter.
      start_date: The start date for filtering.
      end_date: The end date for filtering.

  Returns:
      A new Pandas DataFrame containing the filtered data.
  """

  # Convert the start and end dates to datetime objects
  start_date = pd.to_datetime(start_date)
  end_date = pd.to_datetime(end_date)

  # Filter the DataFrame based on the date range
  filtered_df = df[(df[date_column_name] >= start_date) & (df[date_column_name] <= end_date)]

  return filtered_df


def save_dataframe_to_file(df, filename, file_format="json", orientation='table'):
  """
  Saves a DataFrame to a file in the specified format.Saving indexes, too

  Args:
      df: The Pandas DataFrame to save.
      filename: The filename to save the DataFrame to.
      file_format: The file format to save the DataFrame in (default: "json").
  """

  if file_format == "csv":
    df.to_csv(filename, index=True)
  elif file_format == "json":
    df.to_json(filename, orient=orientation)
  else:
    raise ValueError(f"Unsupported file format: {file_format}")
  return df


def load_dataframe_from_file(filename, file_format="json",orientation='index'):
  """
  Loads a Pandas DataFrame from a file in the specified format.

  Args:
      filename: The path to the file.
      file_format: The file format (default: "json").

  Returns:
      A Pandas DataFrame containing the data from the file.
  """

  if not os.path.isfile(filename):
    print(f"File '{filename}' does not exist.")
    return None

  if file_format == "csv":
    df = pd.read_csv(filename)
  elif file_format == "json":
    df = pd.read_json(filename,orient=orientation)
  else:
    raise ValueError(f"Unsupported file format: {file_format}")

  return df



In [2]:
modelDir = '/content/drive/MyDrive/ai_models' # @param {type:"string"}
dataFileDir = '/content/drive/MyDrive/dataset' #md hold data already loaded from API @param {type:"string"}

#Change names in the results from get_daily
av_daily_map = {
    '1. open': 'open',
    '2. high': 'high',
    '3. low': 'low',
    '4. close': 'close',
    '5. volume': 'volume',
}
av_earnings_map = {
    'reportedDate': 'date',
}
# Set the ticker
ticker = 'KLAC'
dataFile = dataFileDir+'/' + ticker + '.json'
testFrame = load_dataframe_from_file(dataFile,file_format='json',orientation='index')

start_date = '2010-12-01'
end_date = '2023-12-31'
api_key= userdata.get('alphavantage')
# Initialize the TimeSeries object
ts = TimeSeries(key=api_key, output_format='pandas')
# Initialize the EarningsCalendar object
fd = FundamentalData(key=api_key, output_format='pandas')
if testFrame is None:
  # Get the daily adjusted time series data
  stock_data0, meta_data = ts.get_daily(ticker, outputsize='full')
  original_stock_data = stock_data0.copy()
  save_dataframe_to_file(stock_data0, dataFile,orientation='index')
  print("Loaded data AlphaVantage - used up an api call.")
else:
  stock_data0 = testFrame
  print(f"Loaded data from {dataFile} - saving an api call.")
#make sure the datetime index is called 'date'. That disappears when loading from file.
stock_data0.index.name='date'
stock_data0.head()
stock_data = change_column_names(stock_data0, av_daily_map)
stock_data['symbol'] = ticker

#Alphavantage returns data indexed by date. Use reset_index to make date a column
stock_data.reset_index(inplace=True)
stock_data.head()





Loaded data from /content/drive/MyDrive/dataset/KLAC.json - saving an api call.


Unnamed: 0,date,open,high,low,close,volume,symbol
0,2024-03-18,696.33,705.17,689.27,690.7,798087,KLAC
1,2024-03-15,686.83,691.892,679.71,683.9,1669472,KLAC
2,2024-03-14,698.46,701.0,687.77,692.33,933660,KLAC
3,2024-03-13,690.6,695.66,682.17,688.71,1062944,KLAC
4,2024-03-12,691.72,698.85,683.35,698.4,837017,KLAC


In [3]:
earningsDataFile = dataFileDir+'/' + ticker + '_earnings.json'
if not os.path.isfile(earningsDataFile):
  # Call alphavantage api to get earnings events
  url = 'https://www.alphavantage.co/query?function=EARNINGS&symbol='+ticker+'&apikey='+ api_key
  r = requests.get(url)
  data = r.json()
  #save json response for later work
  with open(dataFileDir+'/earnings.json', 'w') as f:
    json.dump(data, f)
  #We want the quarterly earnings part of the response
  earningsFrame = pd.DataFrame(data['quarterlyEarnings'])
  save_dataframe_to_file(earningsFrame, earningsDataFile,orientation='index')
  print("Loaded data AlphaVantage - used up an api call.")
else:
  earningsFrame = load_dataframe_from_file(earningsDataFile,file_format='json')
  print(f"Loaded data from {earningsDataFile} - saving an api call.")


earnings_data = change_column_names(earningsFrame, av_earnings_map)
earnings_data['symbol'] = ticker
#Convert earningsDate to have datetime instead of object on 'date' so we can merge
earnings_data['date'] = earnings_data['date'].astype('datetime64[ns]')
stockEarnings = pd.merge(stock_data, earnings_data, on=['symbol', 'date'],how='inner')
stockEarnings.head()


Loaded data from /content/drive/MyDrive/dataset/KLAC_earnings.json - saving an api call.


Unnamed: 0,date,open,high,low,close,volume,symbol,fiscalDateEnding,reportedEPS,estimatedEPS,surprise,surprisePercentage
0,2024-01-25,651.5,658.783,639.15,641.69,1716810,KLAC,2023-12-31,6.16,5.91,0.25,4.2301
1,2023-10-25,464.01,467.21,452.45,454.84,1515063,KLAC,2023-09-30,5.74,5.41,0.33,6.0998
2,2023-07-27,475.0,495.29,475.0,482.35,1951211,KLAC,2023-06-30,5.4,4.85,0.55,11.3402
3,2023-04-26,358.0,360.46,355.88,357.36,1267888,KLAC,2023-03-31,5.49,5.32,0.17,3.1955
4,2023-01-26,424.01,429.46,415.0,428.76,1646299,KLAC,2022-12-31,7.38,7.1,0.28,3.9437


In [4]:
#Separate into base and training dataframes
stock_data_train = filter_dataframe_by_date(stockEarnings, start_date=start_date, end_date='2020-12-31')
stock_data_test = filter_dataframe_by_date(stockEarnings, '2021-01-01', '2023-12-31')
stock_data_train.head()

Unnamed: 0,date,open,high,low,close,volume,symbol,fiscalDateEnding,reportedEPS,estimatedEPS,surprise,surprisePercentage
13,2020-10-28,194.52,196.24,190.48,190.55,1554791,KLAC,2020-09-30,3.03,2.77,0.26,9.3863
14,2020-08-03,202.59,208.04,202.5,207.65,1809921,KLAC,2020-06-30,2.73,2.41,0.32,13.278
15,2020-05-05,156.46,160.69,153.96,155.63,1868984,KLAC,2020-03-31,2.47,2.28,0.19,8.3333
16,2020-02-04,174.33,178.8,173.84,178.8,1735912,KLAC,2019-12-31,2.66,2.58,0.08,3.1008
17,2019-10-30,171.82,173.94,169.21,172.96,1706475,KLAC,2019-09-30,2.48,2.2,0.28,12.7273


In [5]:
stock_data_len = stock_data['close'].count()
print(stock_data_len)

6133


I'm only interested in *close* prices

In [19]:
close_prices = stock_data.iloc[:, 1:2].values
print(close_prices)

[[696.33]
 [686.83]
 [698.46]
 ...
 [ 84.25]
 [ 79.69]
 [ 78.75]]


dtype('float64')

Of course, some of the weekdays might be public holidays in which case no price will be available. For this reason, we will fill the missing prices with the latest available prices

In [7]:
all_bussinessdays = pd.date_range(start=start_date, end=end_date, freq='B')
print(all_bussinessdays)

DatetimeIndex(['2010-12-01', '2010-12-02', '2010-12-03', '2010-12-06',
               '2010-12-07', '2010-12-08', '2010-12-09', '2010-12-10',
               '2010-12-13', '2010-12-14',
               ...
               '2023-12-18', '2023-12-19', '2023-12-20', '2023-12-21',
               '2023-12-22', '2023-12-25', '2023-12-26', '2023-12-27',
               '2023-12-28', '2023-12-29'],
              dtype='datetime64[ns]', length=3413, freq='B')


In [8]:
close_prices = stock_data_train.reindex(all_bussinessdays)
close_prices = stock_data_train.fillna(method='ffill')

In [16]:
close_prices.head()
print(close_prices)

         date      open     high       low   close    volume symbol  \
13 2020-10-28  194.5200  196.240  190.4800  190.55   1554791   KLAC   
14 2020-08-03  202.5900  208.040  202.5000  207.65   1809921   KLAC   
15 2020-05-05  156.4600  160.690  153.9600  155.63   1868984   KLAC   
16 2020-02-04  174.3300  178.800  173.8400  178.80   1735912   KLAC   
17 2019-10-30  171.8200  173.940  169.2100  172.96   1706475   KLAC   
18 2019-08-05  130.1000  130.960  125.1900  126.46   3480471   KLAC   
19 2019-05-06  125.0900  125.610  123.8700  124.61   1458737   KLAC   
20 2019-01-29  102.4000  102.430  100.3500  100.99   1839656   KLAC   
21 2018-10-29   88.6700   89.040   84.5900   85.13   2113180   KLAC   
22 2018-07-30  107.3200  108.030  105.4200  106.30   1863505   KLAC   
23 2018-04-26   99.4700  101.250   99.2300  100.83   1673964   KLAC   
24 2018-01-25  117.2900  121.650  112.5900  113.09   1814082   KLAC   
25 2017-10-26  107.0400  109.780  107.0036  108.85   2326560   KLAC   
26 201

The dataset is now complete and free of missing values. Let's have a look to the data frame summary:

## Feature scaling

In [11]:
training_set = close_prices.iloc[:, 1:2].values

In [12]:
print(training_set)

[[194.52  ]
 [202.59  ]
 [156.46  ]
 [174.33  ]
 [171.82  ]
 [130.1   ]
 [125.09  ]
 [102.4   ]
 [ 88.67  ]
 [107.32  ]
 [ 99.47  ]
 [117.29  ]
 [107.04  ]
 [100.89  ]
 [101.29  ]
 [ 84.65  ]
 [ 71.93  ]
 [ 77.38  ]
 [ 71.68  ]
 [ 64.38  ]
 [ 64.39  ]
 [ 51.72  ]
 [ 59.68  ]
 [ 69.49  ]
 [ 70.74  ]
 [ 73.46  ]
 [ 68.02  ]
 [ 63.4   ]
 [ 64.15  ]
 [ 59.95  ]
 [ 55.8   ]
 [ 52.22  ]
 [ 47.29  ]
 [ 49.7   ]
 [ 53.42  ]
 [ 51.26  ]
 [ 46.5201]
 [ 41.61  ]
 [ 43.69  ]
 [ 44.45  ]]


In [14]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)
print(training_set_scaled.shape)

(40, 1)


LSTMs expect the data in a specific format, usually a 3D tensor. I start by creating data with 60 days and converting it into an array using NumPy. Next, I convert the data into a 3D dimension array with feature_set samples, 60 days and one feature at each step.

In [15]:
features = []
labels = []
for i in range(60, stock_data_len):
    features.append(training_set_scaled[i-60:i, 0])
    labels.append(training_set_scaled[i, 0])

features = np.array(features)
labels = np.array(labels)

features = np.reshape(features, (features.shape[0], features.shape[1], 1))

IndexError: index 60 is out of bounds for axis 0 with size 40

In [None]:
print(labels)

[0.80899538 0.80499011 0.79966611 ... 0.0977259  0.09421866 0.09264741]


In [None]:
print(features)

[[[0.94773081]
  [0.96050841]
  [0.95301693]
  ...
  [0.79377394]
  [0.78865336]
  [0.79498043]]

 [[0.96050841]
  [0.95301693]
  [0.95749218]
  ...
  [0.78865336]
  [0.79498043]
  [0.80899538]]

 [[0.95301693]
  [0.95749218]
  [0.94708267]
  ...
  [0.79498043]
  [0.80899538]
  [0.80499011]]

 ...

 [[0.06168544]
  [0.0582764 ]
  [0.06405634]
  ...
  [0.1029026 ]
  [0.10026515]
  [0.09710863]]

 [[0.0582764 ]
  [0.06405634]
  [0.06475779]
  ...
  [0.10026515]
  [0.09710863]
  [0.0977259 ]]

 [[0.06405634]
  [0.06475779]
  [0.06423872]
  ...
  [0.09710863]
  [0.0977259 ]
  [0.09421866]]]


Feature tensor with three dimension: features[0] contains the ..., features[1] contains the last 60 days of values and features [2] contains the  ...

In [None]:
print(features.shape)

(6072, 60, 1)


## Create the LSTM network
Let's create a sequenced LSTM network with 50 units. Also the net includes some dropout layers with 0.2 which means that 20% of the neurons will be dropped.

In [None]:
import tensorflow as tf

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(units = 50, return_sequences = True, input_shape = (features.shape[1], 1)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units = 50, return_sequences = True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units = 50, return_sequences = True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units = 50),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units = 1)
])

In [None]:

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 60, 50)            10400     
                                                                 
 dropout (Dropout)           (None, 60, 50)            0         
                                                                 
 lstm_1 (LSTM)               (None, 60, 50)            20200     
                                                                 
 dropout_1 (Dropout)         (None, 60, 50)            0         
                                                                 
 lstm_2 (LSTM)               (None, 60, 50)            20200     
                                                                 
 dropout_2 (Dropout)         (None, 60, 50)            0         
                                                                 
 lstm_3 (LSTM)               (None, 50)                2

In [None]:
#tf.keras.utils.plot_model(model, to_file='my_model.png')


In [None]:
# Run tensorboard with the logdir
#import os
#LOG_BASE_DIR = './log'
#os.makedirs(LOG_BASE_DIR, exist_ok=True)

In [None]:
#!ls -l log

## Load the Colab TensorBoard extention and start TensorBoard inline

In [None]:
#%load_ext tensorboard.notebook
#%tensorboard --logdir {LOG_BASE_DIR}

## Define a TensorBoard callback

In [None]:
#import datetime
#logdir = os.path.join(LOG_BASE_DIR, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
#from tensorflow.keras.callbacks import TensorBoard

#tbCallBack = TensorBoard(logdir,histogram_freq=1)

The model will be compiled and optimize by the adam optimizer and set the loss function as mean_squarred_error

In [None]:
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
#import os
#print(os.environ)

In [None]:
#tf.test.gpu_device_name()

In [None]:
#from tensorflow.python.client import device_lib
#device_lib.list_local_devices()

In [None]:
from time import time
start = time()
history = model.fit(features, labels, epochs = 20, batch_size = 32, verbose = 1)
end = time()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
print('Total training time {} seconds'.format(end - start))
print('Saving model named "model"')
model.save('/content/drive/MyDrive/ai_models/stock_close_history_model.h5')

Total training time 400.5100076198578 seconds
Saving model named "model"


In [None]:
#  [samples, days, features]
print(features.shape)

(6072, 60, 1)


In [None]:
testing_start_date = '2019-01-01'
testing_end_date = '2019-04-10'

test_stock_data = stock_data_test

NameError: name 'stock_data_test' is not defined

In [None]:
test_stock_data.tail()

In [None]:
test_stock_data_processed = test_stock_data.iloc[:, 1:2].values


In [None]:
print(test_stock_data_processed.shape)

In [None]:
all_stock_data = pd.concat((stock_data['Close'], test_stock_data['Close']), axis = 0)

In [None]:
inputs = all_stock_data[len(all_stock_data) - len(test_stock_data) - 60:].values
inputs = inputs.reshape(-1,1)
inputs = sc.transform(inputs)

In [None]:
X_test = []
for i in range(60, 129):
    X_test.append(inputs[i-60:i, 0])

In [None]:
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
predicted_stock_price = model.predict(X_test)
predicted_stock_price = sc.inverse_transform(predicted_stock_price)

In [None]:
plt.figure(figsize=(10,6))
plt.plot(test_stock_data_processed, color='blue', label='Actual Apple Stock Price')
plt.plot(predicted_stock_price , color='red', label='Predicted Apple Stock Price')
plt.title('Apple Stock Price Prediction')
plt.xlabel('Date')
plt.ylabel('Apple Stock Price')
plt.legend()
plt.show()

In [None]:
#inputs = inputs.reshape(-1,1)
#inputs = sc.transform(inputs)


test_inputs = test_stock_data_processed.reshape(-1,1)
test_inputs = sc.transform(test_inputs)


print(test_inputs.shape)

In [None]:
test_features = []
for i in range(60, 291):
    test_features.append(test_inputs[i-60:i, 0])

test_features = np.array(test_features)

In [None]:
test_features = np.reshape(test_features, (test_features.shape[0], test_features.shape[1], 1))
print(test_features.shape)

In [None]:
predicted_stock_price = model.predict(test_features)

In [None]:
predicted_stock_price = sc.inverse_transform(predicted_stock_price)
print(predicted_stock_price.shape)

In [None]:
print(test_stock_data_processed.shape)

In [None]:
plt.figure(figsize=(10,6))
plt.plot(test_stock_data_processed, color='blue', label='Actual Apple Stock Price')
plt.plot(predicted_stock_price , color='red', label='Predicted Apple Stock Price')
plt.title('Apple Stock Price Prediction')
plt.xlabel('Date')
plt.ylabel('Apple Stock Price')
plt.legend()
plt.show()

## Download the model and the weights

In [None]:
from google.colab import files

In [None]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
  json_file.write(model_json)

In [None]:
files.download("model.json")

In [None]:
model.save('weights.h5')
files.download('weights.h5')