In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler
from pycaret.utils import version
version()
from pycaret.time_series import TSForecastingExperiment
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima.utils import nsdiffs, ndiffs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.seasonal import STL
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import callbacks
import tensorflow as tf
import math
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
import keras_tuner as kt  # Import Keras Tuner as kt
import yfinance as yf

# Download Bitcoin data
df_BTC = yf.download(
    tickers=["BTC-USD"],
    start="2020-01-01",
    end="2025-01-01"
)
df_BTC.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
df_BTC.shape
df_BTC.info()
print('Null Values:', df_BTC.isnull().values.sum())
print(df_BTC.columns)
df_BTC.reset_index(inplace=True)
df_BTC['Date'] = pd.to_datetime(df_BTC['Date'], format='%Y-%m-%d')
df_BTC = df_BTC[['Date', 'Close']]
df_BTC.head()
# Setting the Date column as the index
df_BTC.set_index('Date', inplace=True)
print(df_BTC.index)
df_BTC = df_BTC.asfreq('D')  # Resample to daily frequency
# Verifying the frequency of the index
print(f"Frequency of the index: {df_BTC.index.freq}")
df_BTC.head(10)

# Plot the close price
plt.figure(figsize=(16,8))
plt.plot(df_BTC['Close'], color='black')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.ylabel('Price ($)')
plt.title('BTC USD Price')
plt.grid(True)
plt.show()

# Extract the 'Close' column from the data
close_prices = df_BTC['Close']

# Get the values from the 'Close' column as a NumPy array
values = close_prices.values

# Calculate the length of the training data by taking 60% of the total length of the 'values' array
training_data_len = int(len(values) * 0.6)
validation_data_len = int(len(values) * 0.2)
test_data_len = len(values) - training_data_len - validation_data_len

# Create a MinMaxScaler object and scale the values to the range [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(values.reshape(-1, 1))

# Split the scaled data into training, validation, and test sets
train_data = scaled_data[0:training_data_len, :]
val_data = scaled_data[training_data_len - 60: training_data_len + validation_data_len, :]
test_data = scaled_data[training_data_len + validation_data_len - 60:, :]

# Initialize empty lists for the training, validation, and test inputs and outputs
x_train, y_train = [], []
x_val, y_val = [], []
x_test = []

# Loop through the training data and create input/output pairs
for i in range(60, len(train_data)):
    x_train.append(train_data[i - 60:i, 0])
    y_train.append(train_data[i, 0])

# Convert the training inputs and outputs to NumPy arrays
x_train = np.array(x_train)
y_train = np.array(y_train)

# Reshape the training inputs to be 3D for use with an LSTM model
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Loop through the validation data and create input/output pairs
for i in range(60, len(val_data)):
    x_val.append(val_data[i - 60:i, 0])
    y_val.append(val_data[i, 0])

# Convert the validation inputs and outputs to NumPy arrays
x_val = np.array(x_val)
y_val = np.array(y_val)

# Reshape the validation inputs to be 3D for use with an LSTM model
x_val = np.reshape(x_val, (x_val.shape[0], x_val.shape[1], 1))

# Loop through the test data and create input sequences
for i in range(60, len(test_data)):
    x_test.append(test_data[i - 60:i, 0])

# Convert the test inputs to a NumPy array and reshape to be 3D
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

# Get the test outputs as a NumPy array from the 'values' array
y_test = values[training_data_len + validation_data_len:]

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_val shape:', x_val.shape)
print('y_val shape:', y_val.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)

# Define the model building function for Keras Tuner
def build_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), return_sequences=False))
    model.add(Dense(units=hp.Int('dense_units', min_value=10, max_value=100, step=10)))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='mean_squared_error')
    return model


# Initialize the tuner
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=3,
    directory='tuner_dir',
    project_name='btc_lstm_tuning'
)

# Perform hyperparameter tuning
tuner.search(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

# Evaluate the best model on the test set
predictions = best_model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
rmse = np.sqrt(np.mean(predictions - y_test)**2)
print('Root mean squared error:', rmse)

# Evaluate LSTM Model Performance
rmse_lstm = np.sqrt(mean_squared_error(y_test, predictions))
mae_lstm = mean_absolute_error(y_test, predictions)
mape_lstm = mean_absolute_percentage_error(y_test, predictions)
r2_lstm = r2_score(y_test, predictions)

# Print Evaluation Metrics
print("\n--- LSTM Model Evaluation Metrics (Hold-out Set) ---")
print(f"Root Mean Squared Error (RMSE): {rmse_lstm:.4f}")
print(f"Mean Absolute Error (MAE): {mae_lstm:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_lstm:.4%}")
print(f"R-squared (R²): {r2_lstm:.4f}")

# Print Metrics for Comparison Table in Thesis
print("\n--- LSTM Hold-out Metrics (for Thesis Table) ---")
print(f"LSTM Hold-out RMSE:  {rmse_lstm:.4f}")
print(f"LSTM Hold-out MAE:   {mae_lstm:.4f}")
print(f"LSTM Hold-out MAPE:  {mape_lstm:.4%}")
print(f"LSTM Hold-out R²:    {r2_lstm:.4f}")

# Prepare data for plotting
df_BTC = df_BTC.filter(['Close'])
train = df_BTC[:training_data_len]
validation = df_BTC[training_data_len:training_data_len + validation_data_len]
test = df_BTC[training_data_len + validation_data_len:]
test['Predictions'] = predictions
train.reset_index(inplace=True)
validation.reset_index(inplace=True)
test.reset_index(inplace=True)

# Plot the results
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=train.Date, y=train.Close, mode='lines', name='Actual Price (train)'))
fig.add_trace(go.Scatter(x=validation.Date, y=validation.Close, mode='lines', name='Actual Price (validation)'))
fig.add_trace(go.Scatter(x=test.Date, y=test.Close, mode='lines', name='Actual Price (test)'))
fig.add_trace(go.Scatter(x=test.Date, y=test.Predictions, mode='lines', name='Predicted price'))
fig.update_layout(
    title="Time series Forecasting using LSTM",
    xaxis_title="Date-Time",
    yaxis_title="Values",
    legend_title="Legend",
)
fig.show()


Trial 8 Complete [00h 02m 51s]
val_loss: 0.0001354873626648138

Best val_loss So Far: 5.068084525798137e-05
Total elapsed time: 5d 11h 58m 23s


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step
Root mean squared error: 1018.2089416837432

--- LSTM Model Evaluation Metrics (Hold-out Set) ---
Root Mean Squared Error (RMSE): 2246.9289
Mean Absolute Error (MAE): 1641.4320
Mean Absolute Percentage Error (MAPE): 2.3992%
R-squared (R²): 0.9765

--- LSTM Hold-out Metrics (for Thesis Table) ---
LSTM Hold-out RMSE:  2246.9289
LSTM Hold-out MAE:   1641.4320
LSTM Hold-out MAPE:  2.3992%
LSTM Hold-out R²:    0.9765


In [5]:
# Show counts and date ranges for training, validation, and test sets

# Training set
train_start_date = df_BTC.index[0]
train_end_date = df_BTC.index[training_data_len - 1]
train_count = training_data_len

# Validation set
val_start_date = df_BTC.index[training_data_len - 60]
val_end_date = df_BTC.index[training_data_len + validation_data_len - 1]
val_count = validation_data_len

# Test set
test_start_date = df_BTC.index[training_data_len + validation_data_len - 60]
test_end_date = df_BTC.index[-1]
test_count = test_data_len

print(f"Training Set: {train_count} data points from {train_start_date} to {train_end_date}")
print(f"Validation Set: {val_count} data points from {val_start_date} to {val_end_date}")
print(f"Test Set: {test_count} data points from {test_start_date} to {test_end_date}")

Training Set: 1096 data points from 2020-01-01 00:00:00 to 2022-12-31 00:00:00
Validation Set: 365 data points from 2022-11-02 00:00:00 to 2023-12-31 00:00:00
Test Set: 366 data points from 2023-11-02 00:00:00 to 2024-12-31 00:00:00
