In [2]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU

from itertools import cycle

# ! pip install plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots




# Import dataset

In [3]:
# Set the working directory so the relative data path used when loading the CSV resolves.
# NOTE(review): this is a machine-specific absolute path; os.chdir raises if the
# directory does not exist, so the notebook cannot run elsewhere without editing it.
import os
os.chdir('C:/Users/ASUS/Desktop/Daming/VSC/stock_price_prediction_with_realtime_evaluation')

In [4]:
# Import dataset
# Read the prepared price table (path is relative to the current working directory)
# and preview its first rows.
mandiri = pd.read_csv("data/processed/data_model.csv")
mandiri.head()

Unnamed: 0,tanggal,harga
0,2020-01-01,7.75
1,2020-01-02,7.75
2,2020-01-03,7.725
3,2020-01-06,7.6
4,2020-01-07,7.6


# Preprocessing

## Convert date from string to date format

In [5]:
# Parse the date strings in 'tanggal' into pandas datetimes, then show the column dtypes.
mandiri["tanggal"] = pd.to_datetime(mandiri["tanggal"])
mandiri.dtypes

tanggal    datetime64[ns]
harga             float64
dtype: object

In [6]:
# (rows, columns) of the loaded table.
mandiri.shape

(1083, 2)

## Get the duration of dataset

In [7]:
# Report the first and last value of the first column (the dates) and the span between them.
# `iloc[row, col]` is used instead of `iloc[row][col]`: the chained form indexes a
# label-indexed row Series with an integer key, which pandas has deprecated.
start_date = mandiri.iloc[0, 0]
end_date = mandiri.iloc[-1, 0]
print("Starting date: ", start_date)
print("Ending date: ", end_date)
print("Duration: ", end_date - start_date)

Starting date:  2020-01-01 00:00:00
Ending date:  2024-02-23 00:00:00
Duration:  1514 days 00:00:00


  print("Starting date: ",mandiri.iloc[0][0])
  print("Ending date: ", mandiri.iloc[-1][0])
  print("Duration: ", mandiri.iloc[-1][0]-mandiri.iloc[0][0])


## Normalizing / scaling close value between 0 to 1

In [8]:
# Rescale the 'harga' column into [0, 1].
# NOTE(review): the scaler is fitted on the full series before the train/test split,
# so the test range influences the scaling parameters.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(mandiri[["harga"]])
mandiri_norm = scaler.transform(mandiri[["harga"]])
print(mandiri_norm.shape)

(1083, 1)


## Split data

In [9]:
# Chronological split: the first 65% of rows for training, the remainder for testing.
training_size = int(len(mandiri_norm) * 0.65)
test_size = len(mandiri_norm) - training_size

train_data = mandiri_norm[:training_size, :]
test_data = mandiri_norm[training_size:, :]
print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)

train_data:  (703, 1)
test_data:  (380, 1)


## Create new dataset

In [10]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, time_step=1, include_last=False):
    """Turn a single-column series into supervised (window, next value) pairs.

    Parameters
    ----------
    dataset : 2-D array indexed as ``dataset[row, 0]``
        The series; only column 0 is read.
    time_step : int, default 1
        Number of consecutive values in each input window.
    include_last : bool, default False
        The historical loop bound ``len(dataset) - time_step - 1`` stops one
        window early, so the final value of the series is never used as a
        target. Pass ``True`` to include that last window. The default keeps
        the old count because other cells compute plot offsets from it.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X[i]`` is ``dataset[i:i + time_step, 0]`` and ``y[i]`` is
        ``dataset[i + time_step, 0]``. Both are empty when the series is too
        short to produce a window.
    """
    n_windows = len(dataset) - time_step
    if not include_last:
        n_windows -= 1  # legacy behaviour: drop the last valid window

    # range() of a non-positive count is empty, so short inputs yield empty arrays.
    dataX = [dataset[start:start + time_step, 0] for start in range(n_windows)]
    dataY = [dataset[start + time_step, 0] for start in range(n_windows)]
    return np.array(dataX), np.array(dataY)

In [11]:
# Build sliding windows: each X row holds `time_step` consecutive values and y is the value that follows.
time_step = 15
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

for label, arr in (("X_train: ", X_train), ("y_train: ", y_train),
                   ("X_test: ", X_test), ("y_test", y_test)):
    print(label, arr.shape)

X_train:  (687, 15)
y_train:  (687,)
X_test:  (364, 15)
y_test (364,)


# LSTM

In [12]:
# Add a trailing feature axis: the LSTM expects [samples, time steps, features].
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

for label, arr in (("X_train: ", X_train), ("X_test: ", X_test)):
    print(label, arr.shape)

X_train:  (687, 15, 1)
X_test:  (364, 15, 1)


## LSTM model structure

In [13]:
# Three stacked LSTM layers of 32 units (the first two emit full sequences so the
# next LSTM can consume them), followed by a single-unit dense output.
model = Sequential([
    LSTM(32, return_sequences=True, input_shape=(time_step, 1)),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')





In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 32)            4352      
                                                                 
 lstm_1 (LSTM)               (None, 15, 32)            8320      
                                                                 
 lstm_2 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 21025 (82.13 KB)
Trainable params: 21025 (82.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Train for 20 epochs in mini-batches of 5, passing (X_test, y_test) as validation data.
# NOTE(review): the test split doubles as the validation set here, so the later
# "test" metrics are computed on data that was already monitored during training.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=5,verbose=1)

Epoch 1/20

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x20813135de0>

In [16]:
# Predict on both splits (still in the scaled space) and show the result shapes.
train_predict, test_predict = (model.predict(split) for split in (X_train, X_test))
train_predict.shape, test_predict.shape



((687, 1), (364, 1))

In [17]:
# Map predictions and targets from the [0, 1] scale back to price units.
# NOTE(review): train_predict / test_predict are overwritten in place, so running
# this cell twice applies the inverse transform twice.
to_price = scaler.inverse_transform

train_predict = to_price(train_predict)
test_predict = to_price(test_predict)
original_ytrain = to_price(y_train.reshape(-1, 1))
original_ytest = to_price(y_test.reshape(-1, 1))

## Evaluation metrics: RMSE, MSE and MAE
Root Mean Square Error (RMSE), Mean Square Error (MSE) and Mean Absolute Error (MAE) are standard ways to measure the error of a model when predicting quantitative data.

In [18]:
# Evaluation metrics (RMSE, MSE, MAE) in price units for both splits.
# The third line is computed on the training split; it was previously printed
# under the label "Test data MAE".
train_mse = mean_squared_error(original_ytrain, train_predict)
test_mse = mean_squared_error(original_ytest, test_predict)

print("Train data RMSE: ", math.sqrt(train_mse))
print("Train data MSE: ", train_mse)
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(test_mse))
print("Test data MSE: ", test_mse)
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict))

Train data RMSE:  0.19138394438429973
Train data MSE:  0.036627814168092736
Test data MAE:  0.14244943997745432
-------------------------------------------------------------------------------------
Test data RMSE:  0.34431585754929767
Test data MSE:  0.11855340975990823
Test data MAE:  0.14737259204571063


In [19]:
# Coefficient of determination (R^2) of the price-scale predictions on each split.
print("Train data R2 score:", r2_score(original_ytrain, train_predict))
print("Test data R2 score:", r2_score(original_ytest, test_predict))

Train data R2 score: 0.971982815150482
Test data R2 score: 0.9730196308702272


### Comparison between original stock close price vs predicted close price

In [24]:
# Display the scaled price array (numpy elides the middle rows of large arrays).
mandiri_norm

array([[0.56128134],
       [0.56128134],
       [0.55779944],
       ...,
       [0.49164345],
       [0.47075209],
       [0.4637883 ]])

In [22]:
# shift train predictions for plotting
# create_dataset pairs window i with the value at index i + time_step, so the k-th
# train prediction belongs at row k + look_back of the full series.
look_back=time_step
trainPredictPlot = np.empty_like(mandiri_norm)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

#shift test predictions for plotting
# create_dataset yields len(train_data) - look_back - 1 windows, so
# len(train_predict) + 2*look_back + 1 equals len(train_data) + look_back:
# the row of the first test target in the full series.
testPredictPlot = np.empty_like(mandiri_norm)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(train_predict)+(look_back*2)+1:len(train_predict)+(look_back*2)+1+len(test_predict), :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)

# Legend labels, consumed in trace order by for_each_trace below.
names = cycle(['Original harga price','Train predicted harga price','Test predicted harga price'])


# One row per date; prediction columns are NaN where no prediction exists.
plotdf = pd.DataFrame({'date': mandiri['tanggal'],
                       'original_harga': mandiri['harga'],
                      'train_predicted_harga': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_harga': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_harga'],plotdf['train_predicted_harga'],
                                          plotdf['test_predicted_harga']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original harga price vs predicted harga price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
# Rename each trace with the next label from `names`.
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (1083, 1)
Test predicted data:  (1083, 1)


## Let's forecast

In [21]:
# Recursive multi-step forecast: start from the last `time_step` scaled values of the
# test split, predict one step, append the prediction to the window and repeat.
temp_input = test_data[len(test_data)-time_step:].reshape(1,-1)[0].tolist()

from numpy import array

lst_output = []
n_steps = time_step
pred_days = 10
for i in range(pred_days):
    # Always feed the most recent n_steps values (observed and/or predicted).
    x_input = np.array(temp_input[-n_steps:]).reshape((1, n_steps, 1))
    yhat = model.predict(x_input, verbose=0)
    temp_input = temp_input[-n_steps:] + yhat[0].tolist()
    lst_output.extend(yhat.tolist())

print("Output of predicted next days: ", len(lst_output))

Output of predicted next days:  10


In [22]:
# Build two independent series on a shared integer axis:
#   - the last `time_step` observed prices, padded with NaN over the forecast horizon
#   - NaN over the observed part, followed by the forecast prices
# Both names used to be bound to the same list object, so the two slice assignments
# modified one shared list and both plotted traces were identical.
observed_prices = scaler.inverse_transform(mandiri_norm[len(mandiri_norm)-time_step:]).reshape(1,-1).tolist()[0]
forecast_prices = scaler.inverse_transform(np.array(lst_output).reshape(-1,1)).reshape(1,-1).tolist()[0]

last_original_days_value = observed_prices + [np.nan] * len(forecast_prices)
next_predicted_days_value = [np.nan] * len(observed_prices) + forecast_prices

new_pred_plot = pd.DataFrame({
    'last_original_days_value':last_original_days_value,
    'next_predicted_days_value':next_predicted_days_value
})
# Legend labels, consumed in trace order by for_each_trace below.
names = cycle(['Last 15 days close price','Predicted next 10 days close price'])

fig = px.line(new_pred_plot,x=new_pred_plot.index, y=[new_pred_plot['last_original_days_value'],
                                                      new_pred_plot['next_predicted_days_value']],
              labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(title_text='Compare last 15 days vs next 10 days',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Meskipun GRU+LSTM memiliki akurasi yang lebih tinggi pada data train, perbedaan lebih dari 10% antara hasil train dan test merupakan indikasi overfitting. Oleh karena itu, dalam kasus ini saya lebih memilih menggunakan LSTM saja, dan di sini saya akan melanjutkan workflow dengan model tersebut sebagai default.

# Simulasikan

In [27]:
# misal ada data baru yang masuk
def data_processing(df1, start='1/1/2020', end='2/23/2024'):
    """Align a price table to a business-day calendar and fill the gaps.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain 'tanggal' (dates or date strings) and 'harga' (prices).
        Prices given as the literal string 'unk' are treated as missing.
        The frame is not modified.
    start, end : date-like, defaults '1/1/2020' and '2/23/2024'
        Bounds of the business-day calendar (``freq='B'``) merged with ``df1``.
        The defaults reproduce the previously hard-coded range.

    Returns
    -------
    pandas.DataFrame
        Columns 'tanggal' and 'harga' (float), one row per date in the union of
        the calendar and ``df1``, sorted by date. Missing prices are linearly
        interpolated; anything still missing is forward-filled, then
        back-filled.
    """
    calendar = pd.DataFrame({'tanggal': pd.date_range(start=start, end=end, freq='B')})

    # Work on a copy so the caller's frame keeps its original dtypes.
    prices = df1[['tanggal', 'harga']].copy()
    prices['tanggal'] = pd.to_datetime(prices['tanggal'])
    prices['harga'] = pd.to_numeric(prices['harga'].replace('unk', np.nan))

    # The outer merge adds calendar days that have no price; sort explicitly so
    # interpolation always runs in chronological order.
    df = (
        pd.merge(prices, calendar, on='tanggal', how='outer')
        .sort_values('tanggal')
        .reset_index(drop=True)
    )

    # interpolate fills interior gaps, ffill any remaining trailing gap and
    # bfill the leading gap.
    df['harga'] = df['harga'].interpolate().ffill().bfill().astype(float)

    return df

In [9]:
# Hold out the last 10% of rows as "out of fold" data that simulates new arrivals.
# Explicit copies are taken so that adding columns to `data` / `oof` later does not
# raise SettingWithCopyWarning or depend on `mandiri`.
split_at = len(mandiri)*9//10
data = mandiri.iloc[:split_at].copy()
oof = mandiri.iloc[split_at:].copy()
print("Data: ", data.shape)
print("Out of fold: ", oof.shape)

Data:  (974, 2)
Out of fold:  (109, 2)


In [10]:
# Normalise prices: fit the scaler on `data` only and reuse it for `oof`.
# This rebinds `scaler`, replacing any scaler fitted earlier in the notebook.
# `assign` returns new frames, so no column is written onto a slice of `mandiri`
# (the direct assignment used before triggered SettingWithCopyWarning).
scaler=MinMaxScaler(feature_range=(0,1))
data = data.assign(norm=scaler.fit_transform(data[["harga"]]).ravel())
oof = oof.assign(norm=scaler.transform(oof[["harga"]]).ravel())

print("data: ", data["norm"].shape)
print("Out of fold: ", oof["norm"].shape)

data:  (974,)
Out of fold:  (109,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["norm"]=scaler.fit_transform(data[["harga"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof["norm"]=scaler.transform(oof[["harga"]])


In [13]:
# Take the normalised column as an (n, 1) array for each split.
data_norm = data["norm"].to_numpy().reshape(-1, 1)
oof_norm = oof["norm"].to_numpy().reshape(-1, 1)

# Build the sliding-window samples for the LSTM.
time_step = 15
X_train, y_train = create_dataset(data_norm, time_step)
X_test, y_test = create_dataset(oof_norm, time_step)

for label, arr in (("X_train: ", X_train), ("y_train: ", y_train),
                   ("X_test: ", X_test), ("y_test", y_test)):
    print(label, arr.shape)

X_train:  (958, 15)
y_train:  (958,)
X_test:  (93, 15)
y_test (93,)


In [14]:
# Add a trailing feature axis: the LSTM expects [samples, time steps, features].
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

for label, arr in (("X_train: ", X_train), ("X_test: ", X_test)):
    print(label, arr.shape)

X_train:  (958, 15, 1)
X_test:  (93, 15, 1)


In [15]:
# Same stacked architecture as before, rebuilt from scratch: three 32-unit LSTM
# layers (the first two emit full sequences) and a single-unit dense output.
model = Sequential([
    LSTM(32, return_sequences=True, input_shape=(time_step, 1)),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')







In [16]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 32)            4352      
                                                                 
 lstm_1 (LSTM)               (None, 15, 32)            8320      
                                                                 
 lstm_2 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 21025 (82.13 KB)
Trainable params: 21025 (82.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Train for 200 epochs in mini-batches of 5; no validation data is passed here.
model.fit(X_train,y_train,epochs=200,batch_size=5,verbose=1)

Epoch 1/200

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 7

<keras.src.callbacks.History at 0x20e79bcedd0>

In [18]:
### Lets Do the prediction and check performance metrics
# Predictions are produced for the training windows only and are still on the scaled range.
train_predict=model.predict(X_train)
train_predict.shape



(958, 1)

In [19]:
# Transform back to original form
# NOTE(review): train_predict is overwritten in place, so running this cell twice
# applies the inverse transform twice.
train_predict = scaler.inverse_transform(train_predict)
original_ytrain = scaler.inverse_transform(y_train.reshape(-1,1)) 

In [20]:
# Coefficient of determination (R^2) of the price-scale predictions on the training windows.
print("Train data R2 score:", r2_score(original_ytrain, train_predict))

Train data R2 score: 0.982583506566741


In [21]:
# shift train predictions for plotting
# create_dataset pairs window i with the value at index i + time_step, so the k-th
# prediction belongs at row k + look_back of `data`.
look_back=time_step
trainPredictPlot = np.empty_like(data_norm)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# Legend labels, consumed in trace order; only two traces are plotted below, so the
# third label is never used.
names = cycle(['Original harga price','Train predicted harga price','Test predicted harga price'])


# One row per date; the prediction column is NaN where no prediction exists.
plotdf = pd.DataFrame({'date': data['tanggal'],
                       'original_harga': data['harga'],
                      'train_predicted_harga': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      })

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_harga'],plotdf['train_predicted_harga'],
                                          ],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original harga price vs predicted harga price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
# Rename each trace with the next label from `names`.
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (974, 1)


# Real time evaluation

In [22]:
# Work on copies: `data_lama` is the history the model saw, `data_baru` the newly arrived rows.
data_lama = data.copy()
data_baru = oof.copy()

# Number of new rows; the model will be asked to forecast this many steps.
banyak_data_baru = len(data_baru)

# Seed window: the last `time_step` normalised values of the history,
# kept both as a (1, time_step) array and as a plain list.
x_input = data_lama["norm"].to_numpy()[len(data_lama)-time_step:].reshape(1, -1)
temp_input = x_input[0].tolist()

In [23]:
# Forecast as many steps as there are new rows, feeding each prediction back
# into the input window (recursive multi-step forecast).
from numpy import array

lst_output = []
n_steps = time_step
pred_days = banyak_data_baru
for i in range(pred_days):
    # Always feed the most recent n_steps values (observed and/or predicted).
    x_input = np.array(temp_input[-n_steps:]).reshape((1, n_steps, 1))
    yhat = model.predict(x_input, verbose=0)
    temp_input = temp_input[-n_steps:] + yhat[0].tolist()
    lst_output.extend(yhat.tolist())

# Convert the predictions back to price units and attach them to the new-data frame.
lst_output = scaler.inverse_transform(np.array(lst_output).reshape(-1,1)).reshape(1,-1).tolist()[0]
data_baru["prediksi"] = lst_output
print("Output of predicted next days: ", len(lst_output))

Output of predicted next days:  109


In [24]:
# Plot actual vs forecast prices for the new (held-out) rows.
# Legend labels, consumed in trace order by for_each_trace below.
names = cycle(['Original harga price','Predicted harga price'])
plotdf = pd.DataFrame({'date': data_baru['tanggal'],                   
                      'original_harga': data_baru['harga'],
                      'predicted_harga': data_baru['prediksi']})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_harga'],plotdf['predicted_harga']],
                labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original harga price vs predicted harga price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
# Rename each trace with the next label from `names`.
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

In [25]:
# NOTE(review): these imports repeat names already imported in the first cell.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from itertools import cycle

# Assumes 'data' and 'data_baru' were defined in earlier cells
# 'trainPredictPlot' must also have been prepared earlier

# Prepare the DataFrame for plotting
plotdf = pd.DataFrame({
    'date': data['tanggal'],  # make sure 'data' is already defined
    'original_harga': data['harga'],  # make sure 'data' is already defined
    'train_predicted_harga': trainPredictPlot.reshape(-1).tolist(),  # adjust to your data
})

# Create the plot with Plotly
fig = px.line(plotdf, x='date', y=['original_harga', 'train_predicted_harga'], labels={'value': 'Harga', 'variable': 'Kategori'})

# Legend names for each trace, supplied through a cycle
names = cycle(['Harga Original', 'Harga Prediksi Pelatihan'])

# Update the name of each trace
fig.for_each_trace(lambda t: t.update(name=next(names)))

# Add the forecast for the new data as a line (the colour set below is green)
fig.add_trace(go.Scatter(
    x=data_baru['tanggal'], y=data_baru['prediksi'], mode='lines',
    name='Prediksi Harga Baru (Data baru)', line=dict(color='green'),
    hovertemplate='Kategori :Prediksi Harga Baru (Data baru)<br>Tanggal: %{x}<br>Harga Prediksi: %{y}<extra></extra>'
))

# Add the actual prices of the new data as black markers
fig.add_trace(go.Scatter(
    x=data_baru['tanggal'], y=data_baru['harga'], mode='markers',
    name='Harga Original (Data Baru)', marker=dict(color='black'),
    hovertemplate='Kategori :Harga Original (Data Baru)<br>Tanggal: %{x}<br>Harga Original: %{y}<extra></extra>'
))

# Update layout
fig.update_layout(
    title='Perbandingan Harga Original vs Prediksi',
    plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Harga'
)

# Update axis appearance
fig.update_xaxes(showgrid=False, title_text='Tanggal')
fig.update_yaxes(showgrid=False, title_text='Harga')

# Show the plot
fig.show()
