<a href="https://colab.research.google.com/github/uquant0507/predicting-btc-price-with-lstm/blob/main/lstm_4h.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Load Python libraries

In [None]:
# pip install numpy, pandas, pprint
import numpy as np
import pandas as pd
import pprint
# pip install torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.autograd import Variable 
import torch.nn.init as init

# pip install matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# pip install ccxt
!pip install ccxt
import ccxt

Collecting ccxt
  Downloading ccxt-1.72.87-py2.py3-none-any.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 5.5 MB/s 
[?25hCollecting aiohttp>=3.8
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 33.2 MB/s 
Collecting cryptography>=2.6.1
  Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 18.8 MB/s 
[?25hCollecting aiodns>=1.1.1
  Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)
Collecting yarl==1.7.2
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)
[K     |████████████████████████████████| 271 kB 45.7 MB/s 
Collecting multidict>=4.0
  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
[K     |████████████████████████████████| 94 kB 3

In [None]:
# Central configuration dictionary read by the data-preparation, model and
# training cells below.
config = {
    "data": {
        "window_size": 14,        # number of consecutive candles in one input window
        "train_split_size": 0.5,  # leading fraction of the samples used for training
        "val_split_size": 0.2,    # following fraction used for validation; the rest is the test set
        "thres_frac": 1,          # label threshold as a multiple of the per-window std of price changes
        "change_time": 3          # number of future candles whose price changes are summed for the label
    },
    "model": {
        "input_size": 2, # features per time step: normalized price and volume log change rates
        "num_lstm_layers": 1,
        "hidden_size": 64,
        "num_classes" : 3, # 0 = neither threshold crossed, 1 = up, 2 = down
        "dropout": 0.2,
    },
    "training": {
        # Use the GPU when one is present, otherwise fall back to the CPU so
        # that model.to(device) does not fail on CPU-only runtimes.
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "batch_size": 32,
        "epoch": 100,
        "learning_rate": 0.01,
        "scheduler_step_size": 40,
    }
}

2. Data Preparation

(1)Fetch Data

In [None]:
# ccxt Binance client created without credentials.
binance = ccxt.binance()
# Number of candles requested per fetch_ohlcv call.
fetch_num = 1000
# Candle interval passed to fetch_ohlcv.
timeframe = '4h'
# Latest page of BTC/USDT candles; the first field of each row is the
# timestamp that is later parsed as milliseconds.
ticker = binance.fetch_ohlcv("BTC/USDT", timeframe, limit=fetch_num)
# Timestamp of the first row of that page (presumably the oldest candle --
# confirm the exchange returns rows in ascending time order). Earlier pages
# are positioned relative to this value.
startfrom = ticker[0][0]
class ohlcv:
  """One page of BTC/USDT candles starting `index` pages before `startfrom`.

  Attributes:
    index: number of pages this page lies before `startfrom`.
    df: DataFrame of the fetched candles with the columns
      datetime, open, high, low, close, volume.
  """

  def __init__(self, index):
    self.index = index
    # Offset of one page: 3600000000 * 4 ms, i.e. 1000 candles of 4 hours,
    # matching the module-level fetch_num and timeframe.
    page_start = startfrom - 3600000000 * 4 * self.index
    candles = binance.fetch_ohlcv("BTC/USDT", timeframe, since=page_start, limit=fetch_num)
    column_names = ['datetime', 'open', 'high', 'low', 'close', 'volume']
    self.df = pd.DataFrame(candles, columns=column_names)

# Stitch pages 12..0 together, oldest page first. ignore_index=True gives every
# row a unique label; without it each page keeps its own 0..N-1 labels, so any
# label-based drop would remove unrelated rows from every page.
ohlcv_df = pd.concat([ohlcv(page).df for page in range(12, -1, -1)], ignore_index=True)
# Neighbouring pages can overlap; keep one row per candle timestamp.
ohlcv_df = ohlcv_df.drop_duplicates(['datetime'])
# shift(1) below must refer to the previous candle, so enforce time order.
ohlcv_df = ohlcv_df.sort_values('datetime')
# Zero close/volume would make the log ratios below undefined; filter with a
# boolean mask so only the offending rows are removed.
ohlcv_df = ohlcv_df[(ohlcv_df['close'] != 0) & (ohlcv_df['volume'] != 0)].reset_index(drop=True)
# Absolute price change and log change rates relative to the previous candle.
ohlcv_df['p_change'] = ohlcv_df['close'] - ohlcv_df['close'].shift(1)
ohlcv_df['p_changerate'] = np.log(ohlcv_df['close']/ ohlcv_df['close'].shift(1))
ohlcv_df['v_changerate'] = np.log(ohlcv_df['volume']/ ohlcv_df['volume'].shift(1))
# The first candle has no predecessor, so its change columns are NaN: drop
# exactly that one row (by position, not by label).
ohlcv_df = ohlcv_df.iloc[1:].copy()
ohlcv_df['datetime'] = pd.to_datetime(ohlcv_df['datetime'], unit='ms')
datetime = ohlcv_df['datetime'].to_numpy()
ohlcv_df = ohlcv_df.set_index('datetime')
ohlcv_df.to_excel("ohlcv3.xlsx")
print(ohlcv_df.describe())
# Plain numpy views of the series consumed by the windowing functions.
p_change = ohlcv_df['p_change'].to_numpy()
p_changerate = ohlcv_df['p_changerate'].to_numpy()
v_changerate = ohlcv_df['v_changerate'].to_numpy()
print(p_change.shape)

               open          high  ...  p_changerate  v_changerate
count   9827.000000   9827.000000  ...   9827.000000   9827.000000
mean   18045.832621  18265.798387  ...      0.000237      0.000116
std    17475.770244  17684.119669  ...      0.017781      0.484282
min     2870.900000   3148.000000  ...     -0.229366     -6.994578
25%     6762.985000   6841.545000  ...     -0.005773     -0.304961
50%     9370.000000   9448.680000  ...      0.000319     -0.017231
75%    28271.385000  28749.395000  ...      0.006598      0.280314
max    68490.000000  69000.000000  ...      0.271624      7.488635

[8 rows x 8 columns]
(9827,)


# New section

(2) Split data into train, validation, and test sets

In [None]:
# normalize
def normalize(x):
  """Standardize `x` to zero mean and unit standard deviation.

  The mean and then the standard deviation used for the scaling are printed,
  one per line, before the standardized values are returned.
  """
  center, spread = np.average(x), np.std(x)
  standardized = (x - center) / spread
  for statistic in (center, spread):
    print(statistic)
  return standardized

# Replace both log change-rate series with their standardized versions.
# NOTE(review): the mean/std are computed over the full series, so they also
# cover the samples later assigned to the validation and test splits.
p_changerate = normalize(p_changerate)
v_changerate = normalize(v_changerate)

#perform windowing
def prepare_data_x(p_changerate, v_changerate, window_size, change_time=None):
  """Build sliding-window model inputs from the two feature series.

  Args:
    p_changerate: 1-D array of price change rates.
    v_changerate: 1-D array of volume change rates, same length as
      `p_changerate`.
    window_size: number of consecutive time steps per window.
    change_time: number of trailing windows to discard (they have no complete
      label horizon). Defaults to config["data"]["change_time"].

  Returns:
    float32 array of shape (n_windows - change_time, window_size, 2); the last
    axis holds (price, volume) in that order.

  Raises:
    ValueError: if the inputs are not 1-D arrays of equal length, or if
      `window_size` / `change_time` do not fit the series length.
  """
  if change_time is None:
    change_time = config["data"]["change_time"]
  p_changerate = np.asarray(p_changerate)
  v_changerate = np.asarray(v_changerate)
  # as_strided performs no bounds checking, so the shapes must be validated
  # here; otherwise a shorter volume series would be read past its end.
  if p_changerate.ndim != 1 or p_changerate.shape != v_changerate.shape:
    raise ValueError("p_changerate and v_changerate must be 1-D arrays of equal length")
  n_row = p_changerate.shape[0] - window_size + 1
  if window_size < 1 or n_row < 0:
    raise ValueError("window_size must be between 1 and len(p_changerate) + 1")
  if not 0 <= change_time <= n_row:
    raise ValueError("change_time must be between 0 and the number of windows")

  def windows(series):
    # Row i is series[i:i + window_size]; a read-only view, no data copied.
    step = series.strides[0]
    return np.lib.stride_tricks.as_strided(
        series, shape=(n_row, window_size), strides=(step, step), writeable=False)

  keep = n_row - change_time
  x = np.dstack([windows(p_changerate)[:keep], windows(v_changerate)[:keep]])
  return x.astype(np.float32)

#calculate deviation by window
def prepare_data_y(p_change, window_size, fraction, change_time=None):
  """Label each window by the price move over the candles that follow it.

  For window i (p_change[i:i + window_size]) the threshold is `fraction` times
  the standard deviation of the window, and the move is the sum of the next
  `change_time` values after index window_size + i.

  Args:
    p_change: 1-D array of per-candle price changes.
    window_size: number of consecutive values per window.
    fraction: multiplier applied to the per-window standard deviation.
    change_time: number of future values summed into the move. Defaults to
      config["data"]["change_time"].

  Returns:
    Float array of length n_windows - change_time holding 1 where the move is
    >= threshold, 2 where it is <= -threshold, and 0 otherwise.

  Raises:
    ValueError: if `p_change` is not 1-D, or if `window_size` / `change_time`
      do not fit the series length.
  """
  if change_time is None:
    change_time = config["data"]["change_time"]
  p_change = np.asarray(p_change)
  if p_change.ndim != 1:
    raise ValueError("p_change must be a 1-D array")
  n_row = p_change.shape[0] - window_size + 1
  # as_strided performs no bounds checking, so validate before building views.
  if window_size < 1 or n_row < 0:
    raise ValueError("window_size must be between 1 and len(p_change) + 1")
  if not 0 <= change_time <= n_row:
    raise ValueError("change_time must be between 0 and the number of windows")
  step = p_change.strides[0]
  windowed_change = np.lib.stride_tricks.as_strided(
      p_change, shape=(n_row, window_size), strides=(step, step), writeable=False)
  threshold = fraction * np.std(windowed_change, axis=1)
  print(threshold.shape)
  n_label = n_row - change_time
  yn = np.zeros(n_label)
  for i in range(n_label):
    start = window_size + i
    change = sum(p_change[start:start + change_time])
    if change >= threshold[i]:
      yn[i] = 1
    elif change <= -threshold[i]:
      yn[i] = 2
  return yn


# Windowed inputs built from the two standardized change-rate series.
x = prepare_data_x(p_changerate, v_changerate, config["data"]["window_size"])
# Class labels derived from the raw (unnormalized) price changes.
y = prepare_data_y(p_change, config["data"]["window_size"],  config["data"]["thres_frac"])

# split dataset
def split_dataset(x, y, shuffle=False, train_split_size=None, val_split_size=None):
  """Split samples, in their given order, into train / validation / test parts.

  Args:
    x: array of samples, indexed along the first axis.
    y: array of labels with the same length as `x`.
    shuffle: accepted for backward compatibility but not used; the order of
      the samples is always preserved.
    train_split_size: fraction of samples in the training part. Defaults to
      config["data"]["train_split_size"].
    val_split_size: fraction of samples in the validation part. Defaults to
      config["data"]["val_split_size"].

  Returns:
    Tuple (x_train, y_train, x_val, y_val, x_test, y_test); the test part is
    whatever remains after the train and validation parts.

  Raises:
    ValueError: if `x` and `y` differ in length or the fractions are not
      non-negative with a sum of at most 1.
  """
  if train_split_size is None:
    train_split_size = config["data"]["train_split_size"]
  if val_split_size is None:
    val_split_size = config["data"]["val_split_size"]
  n_sample = x.shape[0]
  if n_sample != len(y):
    raise ValueError("x and y must contain the same number of samples")
  if train_split_size < 0 or val_split_size < 0 or train_split_size + val_split_size > 1:
    raise ValueError("split sizes must be non-negative and sum to at most 1")
  index = int(n_sample * train_split_size)
  index2 = int(n_sample * (train_split_size + val_split_size))
  return (x[:index], y[:index],
          x[index:index2], y[index:index2],
          x[index2:], y[index2:])

# Split once and unpack all six parts, instead of recomputing the split for
# every element.
x_train, y_train, x_val, y_val, x_test, y_test = split_dataset(x, y)

class TimeSeriesDataset(Dataset):
    """Array-backed dataset that pairs each sample with its label.

    Samples are stored as float32 in `self.x`, labels as int64 in `self.y`.
    """

    def __init__(self, x, y):
        features, labels = x, y
        self.x = features.astype(np.float32)
        self.y = labels.astype(np.int64)

    def __len__(self):
        """Number of samples, taken from the stored feature array."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the (sample, label) pair stored at position `idx`."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target

dataset_train = TimeSeriesDataset(x_train, y_train)
dataset_val = TimeSeriesDataset(x_val, y_val)
dataset_test = TimeSeriesDataset(x_test, y_test)

# All loaders keep the original sample order (shuffle=False).
train_loader = DataLoader(dataset_train, batch_size=config["training"]["batch_size"], shuffle=False)
val_loader = DataLoader(dataset_val, batch_size=config["training"]["batch_size"], shuffle=False)
test_loader = DataLoader(dataset_test, batch_size=config["training"]["batch_size"], shuffle=False)

# Sanity-check the shape and dtype of the first training batch. Dedicated loop
# names are used so the x_train / y_train arrays are not overwritten by batch
# tensors, which would break this cell when it is run a second time.
for (x_batch, y_batch) in train_loader:
  print(x_batch.size(), x_batch.type())
  print(y_batch.size(), y_batch.type())
  break

0.00023652873749213717
0.017780092687123123
0.00011561125948424708
0.48425696334908436
(9814,)
torch.Size([32, 14, 2]) torch.FloatTensor
torch.Size([32]) torch.LongTensor


3. Defining Model

In [None]:
class LSTMModel(nn.Module):
  """LSTM encoder followed by a three-layer fully connected classifier.

  The final hidden state of the last LSTM layer is batch-normalized and passed
  through Linear(hidden_size, 512) -> Linear(512, 128) -> Linear(128,
  num_classes), with batch norm, ReLU and dropout after the first two linear
  layers.

  Args:
    num_classes: number of output classes.
    input_size: number of features per time step.
    hidden_size: size of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
    dropout: dropout probability used in the classifier head (default 0.2).
  """

  def __init__(self, num_classes, input_size, hidden_size, num_layers, dropout=0.2):
    super().__init__()
    self.num_classes = num_classes #number of classes
    self.num_layers = num_layers #number of layers
    self.input_size = input_size #input size
    self.hidden_size = hidden_size #hidden state
    self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm1
    self.fc1 = nn.Linear(hidden_size, 512) #fully connected 1
    self.fc2 = nn.Linear(512, 128) #fully connected 2
    self.fc3 = nn.Linear(128, num_classes) #fully connected last layer
    self.relu = nn.ReLU()
    self.batch_norm1 = nn.BatchNorm1d(hidden_size)
    self.batch_norm2 = nn.BatchNorm1d(512)
    self.batch_norm3 = nn.BatchNorm1d(128)
    self.dropout_prob = dropout

  def forward(self, x):
    """Return per-class log-probabilities, one row per input sequence.

    `x` is fed to a batch_first LSTM, i.e. (batch, seq_len, input_size).
    """
    # Propagate input through LSTM; only the final hidden state is used.
    _, (hn, _) = self.lstm(x)
    # hn stacks one final state per LSTM layer. Take the last layer only:
    # flattening all layers with view(-1, hidden) would fold the layer axis
    # into the batch axis whenever num_layers > 1.
    x = hn[-1].reshape(-1, self.hidden_size)
    x = self.batch_norm1(x)
    x = self.fc1(x) #first Dense
    x = self.batch_norm2(x)
    x = self.relu(x)
    x = F.dropout(x, training=self.training, p=self.dropout_prob)
    x = self.fc2(x) #second Dense
    x = self.batch_norm3(x)
    x = self.relu(x)
    x = F.dropout(x, training=self.training, p=self.dropout_prob)
    x = self.fc3(x) #Final Output
    # Explicit class dimension; relying on the implicit dim is deprecated.
    x = F.log_softmax(x, dim=1)
    return x

# Build the classifier from the "model" section of config, then move its
# parameters to the configured training device.
model = LSTMModel(
    num_classes=config["model"]["num_classes"],
    input_size=config["model"]["input_size"],
    hidden_size=config["model"]["hidden_size"],
    num_layers=config["model"]["num_lstm_layers"],
)
model = model.to(config["training"]["device"])

4. Set Up the Optimizer and Objective Function

In [None]:
# Adam over all model parameters, with the learning rate taken from config.
optimizer = optim.Adam(model.parameters(), lr=config["training"]["learning_rate"])
# Classification loss shared by train() and evaluate().
# NOTE(review): the model's forward already ends in log_softmax, while
# CrossEntropyLoss is documented to take unnormalized logits -- confirm the
# pairing is intended (nn.NLLLoss is the usual partner of log_softmax).
criterion = nn.CrossEntropyLoss()
def weight_init(m):
  """Re-initialize the weight of an nn.Linear module with Kaiming-uniform values.

  Intended to be passed to `Module.apply`. Modules that are not nn.Linear, and
  the bias of nn.Linear modules, are left untouched.
  """
  if isinstance(m, nn.Linear):
    # kaiming_uniform_ is the in-place initializer; the name without the
    # trailing underscore is a deprecated alias.
    nn.init.kaiming_uniform_(m.weight)
# Re-initialize every nn.Linear weight in the model; as the last expression of
# the cell this also displays the model structure. (The dangling `"""` that
# followed opened a string that is never closed and has been removed.)
model.apply(weight_init)


LSTMModel(
  (lstm): LSTM(2, 32, batch_first=True)
  (fc1): Linear(in_features=32, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
  (relu): ReLU()
  (batch_norm1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

5. Train model

In [None]:
def train(model, train_loader, optimizer, log_interval = 20):
  """Run one optimization pass over `train_loader`.

  Relies on the module-level `criterion`, on `config` for the device, and on
  the module-level `epoch`, which is used only in the progress message.

  Args:
    model: network to optimize; it is switched to training mode.
    train_loader: iterable of (inputs, targets) batches.
    optimizer: optimizer stepped once per batch.
    log_interval: a progress line is printed every `log_interval` batches.

  Returns:
    The loss of the final batch as a float, or None if no batch reached the
    final position.
  """
  model.train()
  for step, (inputs, targets) in enumerate(train_loader):
    inputs = inputs.to(config["training"]["device"])
    targets = targets.to(config["training"]["device"])
    optimizer.zero_grad()
    batch_loss = criterion(model(inputs), targets)
    batch_loss.backward()
    optimizer.step()
    if step % log_interval == 0:
      samples_seen = step * len(inputs)
      samples_total = len(train_loader.dataset)
      percent_done = 100. * step / len(train_loader)
      print("Train Epoch: {} [{}/{} ({:.0f}%)] Train Loss: {:.6f}".format(
          epoch, samples_seen, samples_total, percent_done, batch_loss.item()))
    is_final_batch = step == len(train_loader) - 1
    if is_final_batch:
      return batch_loss.item()

6. Evaluate Model

In [None]:
def evaluate(model, test_loader):
  """Evaluate `model` on `test_loader` without gradient tracking.

  Relies on the module-level `criterion` and on `config` for the device.

  Returns:
    Tuple (test_loss, test_accuracy, prediction, out, up_accuracy,
    down_accuracy, zero_accuracy) where
      test_loss: mean loss per sample,
      test_accuracy: percentage of correctly classified samples,
      prediction, out: predicted classes and raw model output of the LAST
        batch only,
      up_accuracy / down_accuracy / zero_accuracy: percentage of the samples
        predicted as class 1 / class 2 / any other class whose prediction was
        correct.
  """
  model.eval()
  device = config["training"]["device"]
  test_loss = 0
  correct = 0
  ups = 0
  downs = 0
  zeros = 0
  up_true_positive = 0
  down_true_positive = 0
  zero_true_positive = 0

  with torch.no_grad():
    for x, y in test_loader:
      x = x.to(device)
      y = y.to(device)
      out = model(x)
      # The criterion returns the mean over the batch (assumes the default
      # 'mean' reduction of nn.CrossEntropyLoss). Weight it by the batch size
      # so that dividing by the dataset size below yields a true per-sample
      # mean, rather than a value shrunk by roughly the batch size.
      test_loss += criterion(out, y).item() * y.size(0)
      prediction = out.max(1, True)[1]
      label = y.view_as(prediction)
      hit = prediction.eq(label)
      correct += hit.sum().item()
      is_up = prediction == 1
      is_down = prediction == 2
      ups += is_up.sum().item()
      downs += is_down.sum().item()
      zeros += (prediction == 0).sum().item()
      # Correct predictions per predicted class, counted with tensor masks
      # instead of a per-element Python loop.
      up_true_positive += (hit & is_up).sum().item()
      down_true_positive += (hit & is_down).sum().item()
      zero_true_positive += (hit & ~(is_up | is_down)).sum().item()

  # The small epsilon avoids a division by zero when a class is never predicted.
  up_accuracy = 100 * up_true_positive / (ups + 1e-7)
  down_accuracy = 100 * down_true_positive / (downs + 1e-7)
  zero_accuracy = 100 * zero_true_positive / (zeros + 1e-7)
  test_loss /= len(test_loader.dataset)
  test_accuracy = 100. * correct / len(test_loader.dataset)
  return test_loss, test_accuracy, prediction, out, up_accuracy, down_accuracy, zero_accuracy

7. Run Training and Evaluation

In [None]:
# Per-epoch history of the training loss and the validation metrics.
trainloss = []
valloss = []
valaccuracy = []
upaccuracy = []
downaccuracy = []
zeroaccuracy = []
for epoch in range(1, config["training"]["epoch"] + 1):
  # Keep the returned loss in its own name: assigning it to `x` would
  # overwrite the module-level feature array `x` built during data preparation.
  train_loss = train(model, train_loader, optimizer, log_interval=100)
  trainloss.append(train_loss)
  # The metrics below come from the validation loader, although the variable
  # names and the printed message say "test".
  test_loss, test_accuracy, prediction, out, up_accuracy, down_accuracy, zero_accuracy = evaluate(model, val_loader)
  valloss.append(test_loss)
  valaccuracy.append(test_accuracy)
  upaccuracy.append(up_accuracy)
  downaccuracy.append(down_accuracy)
  zeroaccuracy.append(zero_accuracy)
  print("[EPOCH: {}], Test Loss: {:.4f}, Test Accuracy: {:.2f} %".format(epoch, test_loss, test_accuracy))
# Persist the per-epoch history as a spreadsheet.
val_data = {'train loss' : trainloss, 'val loss' : valloss, 'val accuracy' : valaccuracy, 'up accuracy' : upaccuracy, 'down accuracy' : downaccuracy, 'zero accuracy' : zeroaccuracy}
val_result = pd.DataFrame(val_data)
val_result.to_excel("val_result.xlsx")




[EPOCH: 1], Test Loss: 0.0398, Test Accuracy: 49.13 %
[EPOCH: 2], Test Loss: 0.0362, Test Accuracy: 52.55 %
[EPOCH: 3], Test Loss: 0.0332, Test Accuracy: 52.09 %
[EPOCH: 4], Test Loss: 0.0377, Test Accuracy: 51.27 %
[EPOCH: 5], Test Loss: 0.0412, Test Accuracy: 44.85 %
[EPOCH: 6], Test Loss: 0.0384, Test Accuracy: 47.66 %
[EPOCH: 7], Test Loss: 0.0391, Test Accuracy: 48.67 %
[EPOCH: 8], Test Loss: 0.0509, Test Accuracy: 46.84 %
[EPOCH: 9], Test Loss: 0.0489, Test Accuracy: 46.64 %
[EPOCH: 10], Test Loss: 0.0439, Test Accuracy: 50.10 %
[EPOCH: 11], Test Loss: 0.0433, Test Accuracy: 48.11 %
[EPOCH: 12], Test Loss: 0.0446, Test Accuracy: 45.92 %
[EPOCH: 13], Test Loss: 0.0412, Test Accuracy: 47.96 %
[EPOCH: 14], Test Loss: 0.0431, Test Accuracy: 49.13 %
[EPOCH: 15], Test Loss: 0.0470, Test Accuracy: 48.78 %
[EPOCH: 16], Test Loss: 0.0431, Test Accuracy: 47.81 %
[EPOCH: 17], Test Loss: 0.0550, Test Accuracy: 45.26 %
[EPOCH: 18], Test Loss: 0.0507, Test Accuracy: 48.37 %
[EPOCH: 19], Test L

In [None]:
# Number of validation labels in class 0, class 1 and class 2, one per line.
print(*[(y_val==class_id).sum() for class_id in (0, 1, 2)], sep="\n")

1031
496
435
