<a href="https://colab.research.google.com/github/yashpatel5400/crypto-prediction/blob/main/stats_601_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import datetime
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
import urllib.request

url = "https://media.githubusercontent.com/media/yashpatel5400/crypto-prediction/main/log_pr.csv"
urllib.request.urlretrieve(url, "log_pr.csv")

url = "https://media.githubusercontent.com/media/yashpatel5400/crypto-prediction/main/volu.csv"
urllib.request.urlretrieve(url, "volu.csv")

('volu.csv', <http.client.HTTPMessage at 0x7f98d6c0bf50>)

In [3]:
log_pr = pd.read_csv("log_pr.csv", index_col= 0)
volu = pd.read_csv("volu.csv", index_col= 0)

log_pr.index = pd.to_datetime(log_pr.index)
volu.index = pd.to_datetime(volu.index)


In [4]:
def construct_dataset(window_size, dataset):
    """
    window: look-back window size for constructing X (in minutes)
    """
    window_dt = datetime.timedelta(minutes=window_size)
    predict_dt = datetime.timedelta(minutes=30)

    window_X = []
    window_y = []

    for t in dataset.index[window_size:-window_size:10]: # compute the predictions every 10 minutes
      window_X.append(dataset.loc[(t - window_dt):t])
      window_y.append(dataset.loc[t + predict_dt] - dataset.loc[t])
        
    return np.array(window_X), np.array(window_y)

In [5]:
def construct_pr_vol_datasets(window_size):
    """
    window: look-back window size for constructing X (in minutes)
    """
    return construct_dataset(window_size, log_pr), construct_dataset(window_size, volu)

In [6]:
window_size = 100 # in minutes
log_pr_ds, volu_ds = construct_pr_vol_datasets(window_size)
log_pr_X, log_pr_y = log_pr_ds
volu_X, volu_y = volu_ds

print(log_pr_X.shape)
print(log_pr_y.shape)

num_test = 5000
X_train = log_pr_X[:-num_test, :, :]
y_train = log_pr_y[:-num_test, :]

X_test = log_pr_X[-num_test:, :, :]
y_test = log_pr_y[-num_test:, :]

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(26476, 101, 10)
(26476, 10)
(21476, 101, 10)
(21476, 10)
(5000, 101, 10)
(5000, 10)


#LSTMs

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable 

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(device)

cuda


In [8]:
X_train_tensors = Variable(torch.Tensor(X_train).to(device))
X_test_tensors = Variable(torch.Tensor(X_test))

y_train_tensors = Variable(torch.Tensor(y_train).to(device))
y_test_tensors = Variable(torch.Tensor(y_test)) 
print("Training Shape", X_train_tensors.shape, y_train_tensors.shape)
print("Testing Shape", X_test_tensors.shape, y_test_tensors.shape) 

Training Shape torch.Size([21476, 101, 10]) torch.Size([21476, 10])
Testing Shape torch.Size([5000, 101, 10]) torch.Size([5000, 10])


In [9]:
class LSTM1(nn.Module):
    def __init__(self, output_size, input_size, hidden_size, num_layers, seq_length):
        super(LSTM1, self).__init__()
        self.output_size = output_size #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm
        self.fc1 = nn.Linear(hidden_size, 32)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(32, output_size) #fully connected last layer
    
    def forward(self,x):
        output, (hn, cn) = self.lstm(x) #lstm with input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        out = self.fc1(hn)
        out = self.relu(out)
        out = self.fc(out) #Final Output
        return out

In [10]:
num_epochs = 100 #1000 epochs
learning_rate = 0.001 #0.001 lr

input_size = 10 #number of features
hidden_size = 128 #number of features in hidden state
num_layers = 1 #number of stacked lstm layers

output_size = 10 #number of output classes

lstm1 = LSTM1(output_size, input_size, hidden_size, num_layers, X_train_tensors.shape[1]) 
lstm1 = lstm1.to(device)
criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)

In [11]:
for epoch in range(num_epochs):
  outputs = lstm1.forward(X_train_tensors) 
  optimizer.zero_grad() 
 
  # obtain the loss function
  loss = criterion(outputs, y_train_tensors)
 
  loss.backward() 
 
  optimizer.step() #improve from loss, i.e backprop
  if epoch % 2 == 0:
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item())) 

Epoch: 0, loss: 0.01893
Epoch: 2, loss: 0.01448
Epoch: 4, loss: 0.01080
Epoch: 6, loss: 0.00739
Epoch: 8, loss: 0.00442
Epoch: 10, loss: 0.00275
Epoch: 12, loss: 0.00333
Epoch: 14, loss: 0.00220
Epoch: 16, loss: 0.00121
Epoch: 18, loss: 0.00091
Epoch: 20, loss: 0.00081
Epoch: 22, loss: 0.00065
Epoch: 24, loss: 0.00043
Epoch: 26, loss: 0.00025
Epoch: 28, loss: 0.00019
Epoch: 30, loss: 0.00022
Epoch: 32, loss: 0.00021
Epoch: 34, loss: 0.00016
Epoch: 36, loss: 0.00011
Epoch: 38, loss: 0.00009
Epoch: 40, loss: 0.00009
Epoch: 42, loss: 0.00008
Epoch: 44, loss: 0.00007
Epoch: 46, loss: 0.00005
Epoch: 48, loss: 0.00005
Epoch: 50, loss: 0.00005
Epoch: 52, loss: 0.00004
Epoch: 54, loss: 0.00004
Epoch: 56, loss: 0.00004
Epoch: 58, loss: 0.00004
Epoch: 60, loss: 0.00003
Epoch: 62, loss: 0.00003
Epoch: 64, loss: 0.00003
Epoch: 66, loss: 0.00003
Epoch: 68, loss: 0.00003
Epoch: 70, loss: 0.00003
Epoch: 72, loss: 0.00003
Epoch: 74, loss: 0.00003
Epoch: 76, loss: 0.00003
Epoch: 78, loss: 0.00003
Epoch

In [12]:
lstm1 = lstm1.to(torch.device("cpu"))
torch.save(lstm1.state_dict(), "/content/lstm_model.pth")

In [13]:
print( np.expand_dims(log_pr.iloc[-31:, :].to_numpy(), 0).shape)

(1, 31, 10)


In [14]:
lstm1.load_state_dict(torch.load("/content/lstm_model.pth"))
lstm1.eval()
lstm1 = lstm1.to(torch.device("cpu"))

#GBoost

In [15]:
class GBoost:
    def __init__(self, num_assets=10):
        self.models = []
        self.num_assets = num_assets

    def fit(self, X, y):
        for asset_index in range(10):
            asset_X = np.array(X[:, :, asset_index])
            asset_y = np.array(y[:, asset_index])

            # X_train = np.array(one_asset_X[:-5000, :])
            # y_train = np.array(one_asset_y[:-5000])

            model = lgb.LGBMRegressor()
            model.fit(asset_X, asset_y)

            self.models.append(model)

    def predict_one(self, X): #given data for just one sequence.
        final_predictions = []
        for asset_index in range(10):
            features = np.expand_dims(X[-31:, asset_index], 0)
            pred = self.models[asset_index].predict(features)
            final_predictions.append(pred)
        final_predictions = np.array(final_predictions).squeeze()
        return final_predictions

    def predict(self, X): #multiple sequences:
        final_predictions = []
        for i in range(len(X)):
            tmp_predictions = []
            for asset_index in range(10):
                features = np.array(np.expand_dims(X[i, -31:, asset_index], 0))
                pred = self.models[asset_index].predict(features)[0]
                tmp_predictions.append(pred)
            final_predictions.append(tmp_predictions)
            
        return np.array(final_predictions)


In [16]:
#train 
boost_model = GBoost()
boost_model.fit(X_train, y_train)

#Evaluation

In [17]:
# Use the negative 30-minutes backward log-returns to predict the 30-minutes forward log-returns
#predict the log price, and then do correlation

def get_r_hat_baseline(A, B):
    return -(A.iloc[-1] - A.iloc[-30]).values 

In [18]:
def get_r_hat_gboost(A, B):
    preds = boost_model.predict_one(A.to_numpy())
    return preds

In [19]:
def get_r_hat_lstm(A, B):
  input = np.expand_dims( A.iloc[-31:, :].to_numpy(), 0)
  input = Variable(torch.Tensor(input))
  pred = lstm1(input).detach().cpu().numpy()
  return pred.squeeze()

In [29]:
# An example of get_r_hat

ACTIVE_R_HAT = "lstm"

r_hat_implementations = {
    "baseline": get_r_hat_baseline, # 0.040118694541047606
    "gboost": get_r_hat_gboost,
    "lstm": get_r_hat_lstm
}

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    # print(A.shape)
    return r_hat_implementations[ACTIVE_R_HAT](A, B)

In [30]:
def get_model_corr(input_df):
    t0 = time.time()
    dt = datetime.timedelta(days=1)
    r_hat = pd.DataFrame(index=input_df.index[1440::10], columns=np.arange(10), dtype=np.float64)
    for t in input_df.index[1440::10]: # compute the predictions every 10 minutes
        r_hat.loc[t, :] = get_r_hat(input_df.loc[(t - dt):t], volu.loc[(t - dt):t])
    t_used = time.time() - t0
    
    r_fwd = (input_df.shift(-30+1440) - input_df).iloc[1440::10].rename(columns={f"input_df_{i}": i for i in range(10)})
    r_fwd.corrwith(r_hat)
    
    r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
    r_hat_all = r_hat.iloc[:-3].values.ravel()
    
    return np.corrcoef(r_fwd_all, r_hat_all)[0, 1]
# def get_model_accuracy():
#     t0 = time.time()
#     dt = datetime.timedelta(days=1)
#     r_hat = pd.DataFrame(index=log_pr.index[30::10], columns=np.arange(10), dtype=np.float64)
#     for t in log_pr.index[30::10]: # compute the predictions every 10 minutes
#         r_hat.loc[t, :] = get_r_hat(log_pr.loc[(t - dt):t], volu.loc[(t - dt):t])
#     t_used = time.time() - t0
    
#     r_fwd = (log_pr.shift(-30) - log_pr).iloc[30::10].rename(columns={f"log_pr_{i}": i for i in range(10)})
#     r_fwd.corrwith(r_hat)
    
#     r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
#     print(r_fwd_all.shape)
#     r_hat_all = r_hat.iloc[:-3].values.ravel()
    
#     return np.corrcoef(r_fwd_all, r_hat_all)[0, 1]

In [31]:
log_pr_test = log_pr.iloc[-num_test:, :]
print(log_pr_test.shape)
get_model_corr(log_pr_test)

(5000, 10)


0.029624934743779387