## Stock Price Prediction with LSTM

In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import pandas_ta as ta
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import tensorflow as tf 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Input

tf.random.set_seed(42)

In [3]:
# get the stock
thyao = yf.Ticker("THYAO.IS")
# get stock info
thyao.info

{'zip': '34149',
 'sector': 'Industrials',
 'fullTimeEmployees': 38097,
 'longBusinessSummary': 'Türk Hava Yollari Anonim Ortakligi provides air transport and aircraft technical maintenance services in Turkey and internationally. The company offers domestic and international passenger and cargo air transportation services; repair and maintenance, and technical and infrastructure support services related to aviation sector; catering and aviation ground handling services; software system maintenance and information technology consulting services. It is also involved in training, airport operation, and investment activities; the manufacture and trading of cabin interior accessories and aircraft seats; and the trading of aviation fuel. The company was incorporated in 1933 and is headquartered in Istanbul, Turkey.',
 'city': 'Istanbul',
 'phone': '90 212 463 63 63',
 'country': 'Turkey',
 'companyOfficers': [],
 'website': 'https://www.turkishairlines.com',
 'maxAge': 1,
 'address1': 'Türk 

In [4]:
# get stock historical market data
hist = thyao.history(period="max")
hist.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-05-10 00:00:00+03:00,0.001723,0.001723,0.001664,0.001694,212630653,0.0,0.0
2000-05-11 00:00:00+03:00,0.001664,0.001664,0.001577,0.001636,211439905,0.0,0.0
2000-05-12 00:00:00+03:00,0.001664,0.001664,0.001636,0.001664,123850733,0.0,0.0
2000-05-15 00:00:00+03:00,0.001636,0.001636,0.001577,0.001577,145015837,0.0,0.0
2000-05-16 00:00:00+03:00,0.001606,0.001606,0.001548,0.001606,150023501,0.0,0.0


In [5]:
# Filter all rows for which the volume is greater than 0
df = hist[hist['Volume'] > 0]
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-05-10 00:00:00+03:00,0.001723,0.001723,0.001664,0.001694,212630653,0.0,0.0
2000-05-11 00:00:00+03:00,0.001664,0.001664,0.001577,0.001636,211439905,0.0,0.0
2000-05-12 00:00:00+03:00,0.001664,0.001664,0.001636,0.001664,123850733,0.0,0.0
2000-05-15 00:00:00+03:00,0.001636,0.001636,0.001577,0.001577,145015837,0.0,0.0
2000-05-16 00:00:00+03:00,0.001606,0.001606,0.001548,0.001606,150023501,0.0,0.0


In [6]:
# delete Dividends and Stock Splits column because they don't provide any info
del df["Dividends"]
del df["Stock Splits"]
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-05-10 00:00:00+03:00,0.001723,0.001723,0.001664,0.001694,212630653
2000-05-11 00:00:00+03:00,0.001664,0.001664,0.001577,0.001636,211439905
2000-05-12 00:00:00+03:00,0.001664,0.001664,0.001636,0.001664,123850733
2000-05-15 00:00:00+03:00,0.001636,0.001636,0.001577,0.001577,145015837
2000-05-16 00:00:00+03:00,0.001606,0.001606,0.001548,0.001606,150023501


In [7]:
# Adding indicators as features
df['RSI']=ta.rsi(df.Close, length=15)
df['EMAF']=ta.ema(df.Close, length=20)
df['EMAM']=ta.ema(df.Close, length=100)
df['EMAS']=ta.ema(df.Close, length=150)
df.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RSI']=ta.rsi(df.Close, length=15)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EMAF']=ta.ema(df.Close, length=20)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EMAM']=ta.ema(df.Close, length=100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI,EMAF,EMAM,EMAS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-11-24 00:00:00+03:00,112.800003,114.5,111.300003,111.599998,86424863,65.358343,106.275527,81.447552,71.502184
2022-11-25 00:00:00+03:00,112.0,114.0,110.5,113.800003,84417963,67.871615,106.992144,82.088195,72.06242
2022-11-28 00:00:00+03:00,114.5,117.5,113.800003,115.099998,86237156,69.282551,107.764321,82.741894,72.632454
2022-11-29 00:00:00+03:00,115.699997,119.199997,113.400002,118.900002,92899813,72.99656,108.824862,83.457896,73.245269
2022-12-02 00:00:00+03:00,124.400002,127.0,123.699997,125.199997,30720432,77.770772,110.384398,84.284472,73.933411


In [8]:
# add daily change as a feature
df['Change'] = hist['Close']-hist.Open

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Change'] = hist['Close']-hist.Open


In [9]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI,EMAF,EMAM,EMAS,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-05-10 00:00:00+03:00,0.001723,0.001723,0.001664,0.001694,212630653,,,,,-2.9e-05
2000-05-11 00:00:00+03:00,0.001664,0.001664,0.001577,0.001636,211439905,,,,,-2.9e-05
2000-05-12 00:00:00+03:00,0.001664,0.001664,0.001636,0.001664,123850733,,,,,0.0
2000-05-15 00:00:00+03:00,0.001636,0.001636,0.001577,0.001577,145015837,,,,,-5.9e-05
2000-05-16 00:00:00+03:00,0.001606,0.001606,0.001548,0.001606,150023501,,,,,0.0


In [10]:
# drop rows with null values
df.dropna(inplace=True)
df.reset_index(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [11]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,EMAF,EMAM,EMAS,Change
0,2001-02-05 00:00:00+02:00,0.001017,0.001017,0.000982,0.000982,67824874,39.813759,0.001088,0.001247,0.001313,-3.5e-05
1,2001-02-06 00:00:00+02:00,0.00097,0.00097,0.0009,0.000923,67767180,36.750274,0.001073,0.001241,0.001307,-4.7e-05
2,2001-02-08 00:00:00+02:00,0.000947,0.000947,0.000888,0.000911,264907778,36.190844,0.001057,0.001235,0.001302,-3.5e-05
3,2001-02-09 00:00:00+02:00,0.000888,0.000888,0.000853,0.000864,228221042,33.844834,0.001039,0.001227,0.001296,-2.3e-05
4,2001-02-12 00:00:00+02:00,0.000876,0.000876,0.000829,0.000841,110251782,32.730501,0.00102,0.00122,0.00129,-3.5e-05


In [12]:
df.shape

(5395, 11)

In [13]:
df.columns[df.columns[:]!= "Volume"]

Index(['Date', 'Open', 'High', 'Low', 'Close', 'RSI', 'EMAF', 'EMAM', 'EMAS',
       'Change'],
      dtype='object')

In [14]:
# Range slider
figure = px.line(df, x=df['Date'], y=df.columns[df.columns[:]!= "Volume"])
figure.update_xaxes(rangeslider_visible=True)
figure.show()

In [15]:
# Range slider
figure = px.line(df, x=df['Date'], y=df.columns[-1], title="Change by time")
figure.update_xaxes(rangeslider_visible=True)
figure.show()

In [16]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,EMAF,EMAM,EMAS,Change
0,2001-02-05 00:00:00+02:00,0.001017,0.001017,0.000982,0.000982,67824874,39.813759,0.001088,0.001247,0.001313,-3.5e-05
1,2001-02-06 00:00:00+02:00,0.00097,0.00097,0.0009,0.000923,67767180,36.750274,0.001073,0.001241,0.001307,-4.7e-05
2,2001-02-08 00:00:00+02:00,0.000947,0.000947,0.000888,0.000911,264907778,36.190844,0.001057,0.001235,0.001302,-3.5e-05
3,2001-02-09 00:00:00+02:00,0.000888,0.000888,0.000853,0.000864,228221042,33.844834,0.001039,0.001227,0.001296,-2.3e-05
4,2001-02-12 00:00:00+02:00,0.000876,0.000876,0.000829,0.000841,110251782,32.730501,0.00102,0.00122,0.00129,-3.5e-05


In [17]:
# Split the time-series data into training seq X and output value Y
def extract_seq_featuresX_targetY(df, window_size, offset):
    """
    Split time-series into training sequence features X and target value Y
    Args:
        df - dataset 
        window_size - window size of past values, e.g., 30 for 30 days 
        offset - position to start the split
    """
    X, y = [], []
    
    for i in range(offset, len(df)):
        X.append(df.iloc[i-window_size:i, 1:])
        y.append(df.iloc[i, 4])
    
    return np.array(X), np.array(y)

In [18]:
window_size = 60 # 60 days windows
X, y = extract_seq_featuresX_targetY(df, window_size, window_size)

In [19]:
X.shape

(5335, 60, 10)

In [20]:
y.shape

(5335,)

In [21]:
#### Train-Test split for time-series ####
training_ratio = 0.8 # and test_ratio = 0.2
validation_ratio = 0.1
test_ratio = 0.1

train_size = int(training_ratio * len(X))
validation_size = int(validation_ratio * len(X))
test_size = int(test_ratio * len(X))

print("train_size: " + str(train_size))
print("val_size: " + str(validation_size))
print("test_size: " + str(test_size))

X_train = X[:train_size]
y_train = y[:train_size]
X_val = X[train_size:train_size+validation_size]
y_val = y[train_size:train_size+validation_size]
X_test = X[train_size+validation_size:]
y_test = y[train_size+validation_size:]

train_size: 4268
val_size: 533
test_size: 533


In [22]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((4268, 60, 10), (4268,), (533, 60, 10), (533,), (534, 60, 10), (534,))

In [23]:
# scale 
target_scaler = StandardScaler()
feature_scaler = StandardScaler()

# Standart scaler expects 2d or less input thus we reshape our 3d X_train to 2d and 1d y_train to 2d
n_samples, nx, ny = X_train.shape
X_train = X_train.reshape((n_samples, nx*ny))
X_train = feature_scaler.fit_transform(X_train)
X_train = X_train.reshape((n_samples, nx, ny)) # reshape back

y_train = y_train.reshape((y_train.shape[0], 1))
y_train = target_scaler.fit_transform(y_train)
y_train = y_train.reshape((y_train.shape[0]))

n_samples, nx, ny = X_val.shape
X_val = X_val.reshape((n_samples, nx*ny))
X_val = feature_scaler.transform(X_val)
X_val = X_val.reshape((n_samples, nx, ny)) # reshape back

y_val = y_val.reshape((y_val.shape[0], 1))
y_val = target_scaler.transform(y_val)
y_val = y_val.reshape((y_val.shape[0]))

n_samples, nx, ny = X_test.shape
X_test = X_test.reshape((n_samples, nx*ny))
X_test = feature_scaler.transform(X_test)
X_test = X_test.reshape((n_samples, nx, ny))

y_test = y_test.reshape((y_test.shape[0], 1))
y_test = target_scaler.transform(y_test)
y_test = y_test.reshape((y_test.shape[0]))

In [24]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((4268, 60, 10), (4268,), (533, 60, 10), (533,), (534, 60, 10), (534,))

In [25]:
### Build a LSTM model   
def LSTM_model(X_train):     
    inp = Input(shape=(X_train.shape[1], X_train.shape[2]))
    
    x = LSTM(units=256, return_sequences=True)(inp)
    x = LSTM(units=256, return_sequences=True)(x)
    x = LSTM(units=256)(x)
    out = Dense(1, activation='linear')(x)
    model = Model(inp, out)
    
    # Compile the model
    model.compile(loss = 'mean_squared_error', optimizer = 'adam')

    return model

In [26]:
model = LSTM_model(X_train=X_train)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 60, 10)]          0         
                                                                 
 lstm (LSTM)                 (None, 60, 256)           273408    
                                                                 
 lstm_1 (LSTM)               (None, 60, 256)           525312    
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 1,324,289
Trainable params: 1,324,289
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(X_train, y_train, epochs=30, batch_size=32, 
                        verbose=1, validation_data=(X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [28]:
# Calculate the Root Mean Squared Error (RMSE)  
def rmse(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true-y_pred)**2))                   
    return rmse

# Calculate the Mean Absolute Percentage Error (MAPE) %
def mape(y_true, y_pred): 
    y_pred, y_true = np.array(y_pred), np.array(y_true)    
    mape = np.mean(np.abs((y_true-y_pred) / y_true))*100    
    return mape

In [29]:
# Test the model
predicted_price_ = model.predict(X_test)
predicted_price_.shape



(534, 1)

In [30]:
# Scale back
predicted_price = target_scaler.inverse_transform(predicted_price_)
predicted_price.shape

(534, 1)

In [31]:
results = pd.DataFrame(y_test, columns = ['Close'])
results.head()

Unnamed: 0,Close
0,1.696089
1,1.710899
2,1.72571
3,1.710899
4,1.688683


In [32]:
# Plot predicted price vs actual closing price 
results['Predictions'] = predicted_price
results.head()

Unnamed: 0,Close,Predictions
0,1.696089,10.482559
1,1.710899,10.325397
2,1.72571,10.375472
3,1.710899,10.472921
4,1.688683,10.419193


In [33]:
# Evaluate performance
rmse_lstm = rmse(np.array(results['Close']), np.array(results['Predictions']))
mape_lstm = mape(np.array(results['Close']), np.array(results['Predictions']))
rmse_lstm, mape_lstm

(11.87686455980596, 319.1191219543452)

In [34]:
# Range slider
figure = px.line(results, x=results.index, y=results.columns[:], title="Actual Close prices vs predictions")
figure.update_xaxes(rangeslider_visible=True)
figure.show()