<a href="https://colab.research.google.com/github/yashpatel5400/101days/blob/master/Copy_of_stats_601_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
import os
import datetime
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
import urllib.request

url = "https://media.githubusercontent.com/media/yashpatel5400/crypto-prediction/main/log_pr.csv"
urllib.request.urlretrieve(url, "log_pr.csv")

url = "https://media.githubusercontent.com/media/yashpatel5400/crypto-prediction/main/volu.csv"
urllib.request.urlretrieve(url, "volu.csv")

('volu.csv', <http.client.HTTPMessage at 0x7f995cc9ced0>)

In [95]:
log_pr = pd.read_csv("log_pr.csv", index_col= 0)
volu = pd.read_csv("volu.csv", index_col= 0)

log_pr.index = pd.to_datetime(log_pr.index)
volu.index = pd.to_datetime(volu.index)

In [96]:
# produces the *indices* of **original** dataset that will be used for training/test
# these are then fed into the feature constructor, which pulls the windows from those indices

prediction_pts = log_pr.index[1440::10].values.copy() # predictions every 10 minutes
permutation = np.random.permutation(len(prediction_pts)) + 1440 # shift all indices, since we index starting from 1440

num_test = 5000
train_pts = permutation[:-num_test]
test_pts = permutation[-num_test:]

In [97]:
def construct_features(log_pr_df, vol_df):
    df = log_pr_df.copy()
    ema21 = log_pr_df.ewm(span=21, min_periods=5, adjust=False).mean().fillna(1)
    ema35 = log_pr_df.ewm(span=35, min_periods=10, adjust=False).mean().fillna(1)
    ema80 = log_pr_df.ewm(span=80, min_periods=20, adjust=False).mean().fillna(1)
    ema250 = log_pr_df.ewm(span=250, min_periods=30, adjust=False).mean().fillna(1)

    df = pd.concat([df, np.log(vol_df + 1), ema21, ema35, ema80, ema250], axis=1)
    return df

In [98]:
def construct_dataset(window_size, features, log_prices, train_indices):
    """
    window: look-back window size for constructing X (in minutes)
    features: feature engineered df
    log_prices: original log prices df
    train_indices: indices of rows to be pulled for training dataset
    """
    window_dt = datetime.timedelta(minutes=window_size)
    predict_dt = datetime.timedelta(minutes=30)

    window_X = []
    window_y = []

    for t in features.index[train_indices]: # compute the predictions every 10 minutes
      window_X.append(features.loc[(t - window_dt):t])
      window_y.append(log_prices.loc[t + predict_dt] - log_prices.loc[t])
        
    return np.array(window_X), np.array(window_y)

In [99]:
features = construct_features(log_pr, volu)
window_size = 60 # in minutes
X_train, y_train = construct_dataset(window_size, features, log_pr, train_pts)

#LSTMs

In [100]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable 

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(device)

cuda


In [101]:
X_train_tensors = Variable(torch.Tensor(X_train).to(device))
y_train_tensors = Variable(torch.Tensor(y_train).to(device))

print("Training Shape", X_train_tensors.shape, y_train_tensors.shape)

Training Shape torch.Size([21352, 61, 60]) torch.Size([21352, 10])


In [131]:
class LSTM1(nn.Module):
    def __init__(self, output_size, input_size, hidden_size, num_layers):
        super(LSTM1, self).__init__()
        self.output_size = output_size #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm
        self.fc1 = nn.Linear(hidden_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(32, output_size) #fully connected last layer
    
    def forward(self,x):
        output, (hn, cn) = self.lstm(x) #lstm with input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        out = self.fc1(hn)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc(out) #Final Output
        return out

In [132]:
num_epochs = 100 #1000 epochs
learning_rate = 0.001 #0.001 lr

input_size = X_train.shape[-1] #number of features
print(input_size)
hidden_size = 16 #number of features in hidden state
num_layers = 1 #number of stacked lstm layers

output_size = 10 #number of output classes

lstm1 = LSTM1(output_size, input_size, hidden_size, num_layers) 
lstm1 = lstm1.to(device)
criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)

print(X_train_tensors.shape)

60
torch.Size([21352, 61, 60])


In [133]:
for epoch in range(num_epochs):
  outputs = lstm1.forward(X_train_tensors) 
  optimizer.zero_grad() 
 
  # obtain the loss function
  loss = criterion(outputs, y_train_tensors)
 
  loss.backward() 
 
  optimizer.step() #improve from loss, i.e backprop
  if epoch % 2 == 0:
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item())) 

Epoch: 0, loss: 0.01106
Epoch: 2, loss: 0.00881
Epoch: 4, loss: 0.00697
Epoch: 6, loss: 0.00547
Epoch: 8, loss: 0.00427
Epoch: 10, loss: 0.00347
Epoch: 12, loss: 0.00285
Epoch: 14, loss: 0.00233
Epoch: 16, loss: 0.00184
Epoch: 18, loss: 0.00141
Epoch: 20, loss: 0.00102
Epoch: 22, loss: 0.00072
Epoch: 24, loss: 0.00051
Epoch: 26, loss: 0.00034
Epoch: 28, loss: 0.00022
Epoch: 30, loss: 0.00013
Epoch: 32, loss: 0.00008
Epoch: 34, loss: 0.00006
Epoch: 36, loss: 0.00006
Epoch: 38, loss: 0.00007
Epoch: 40, loss: 0.00007
Epoch: 42, loss: 0.00007
Epoch: 44, loss: 0.00006
Epoch: 46, loss: 0.00004
Epoch: 48, loss: 0.00004
Epoch: 50, loss: 0.00003
Epoch: 52, loss: 0.00003
Epoch: 54, loss: 0.00003
Epoch: 56, loss: 0.00002
Epoch: 58, loss: 0.00002
Epoch: 60, loss: 0.00002
Epoch: 62, loss: 0.00002
Epoch: 64, loss: 0.00002
Epoch: 66, loss: 0.00002
Epoch: 68, loss: 0.00002
Epoch: 70, loss: 0.00002
Epoch: 72, loss: 0.00002
Epoch: 74, loss: 0.00002
Epoch: 76, loss: 0.00002
Epoch: 78, loss: 0.00002
Epoch

In [134]:
lstm1 = lstm1.to(torch.device("cpu"))
torch.save(lstm1.state_dict(), "/content/lstm_model.pth")

In [135]:
print( np.expand_dims(log_pr.iloc[-31:, :].to_numpy(), 0).shape)

(1, 31, 10)


In [136]:
lstm1.load_state_dict(torch.load("/content/lstm_model.pth"))
lstm1.eval()
lstm1 = lstm1.to(torch.device("cpu"))

#GBoost

In [None]:
class GBoost:
    def __init__(self, num_assets=10):
        self.models = []
        self.num_assets = num_assets

    def fit(self, X, y):
        for asset_index in range(10):
            asset_X = np.array(X[:, :, asset_index])
            asset_y = np.array(y[:, asset_index])

            # X_train = np.array(one_asset_X[:-5000, :])
            # y_train = np.array(one_asset_y[:-5000])

            model = lgb.LGBMRegressor()
            model.fit(asset_X, asset_y)

            self.models.append(model)

    def predict_one(self, X): #given data for just one sequence.
        final_predictions = []
        for asset_index in range(10):
            features = np.expand_dims(X[-31:, asset_index], 0)
            pred = self.models[asset_index].predict(features)
            final_predictions.append(pred)
        final_predictions = np.array(final_predictions).squeeze()
        return final_predictions

    def predict(self, X): #multiple sequences:
        final_predictions = []
        for i in range(len(X)):
            tmp_predictions = []
            for asset_index in range(10):
                features = np.array(np.expand_dims(X[i, -31:, asset_index], 0))
                pred = self.models[asset_index].predict(features)[0]
                tmp_predictions.append(pred)
            final_predictions.append(tmp_predictions)
            
        return np.array(final_predictions)


In [None]:
#train 
boost_model = GBoost()
boost_model.fit(X_train, y_train)

#Evaluation

In [137]:
# Use the negative 30-minutes backward log-returns to predict the 30-minutes forward log-returns
#predict the log price, and then do correlation

def get_r_hat_baseline(A, B):
    return -(A.iloc[-1] - A.iloc[-30]).values 

In [138]:
def get_r_hat_gboost(A, B):
    preds = boost_model.predict_one(A.to_numpy())
    return preds

In [139]:
def get_r_hat_lstm(A, B):
  input = np.expand_dims(construct_features(A, B).values, axis=0)
  input = Variable(torch.Tensor(input))
  pred = lstm1(input).detach().cpu().numpy()
  return pred.squeeze()

In [140]:
# An example of get_r_hat

ACTIVE_R_HAT = "lstm"

r_hat_implementations = {
    "baseline": get_r_hat_baseline, 
    "gboost": get_r_hat_gboost,
    "lstm": get_r_hat_lstm,
}

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    return r_hat_implementations[ACTIVE_R_HAT](A, B)

In [141]:
def get_model_corr(log_pr_df, volu_df, test_indices):
    t0 = time.time()
    dt = datetime.timedelta(days=1)
    r_hat = pd.DataFrame(index=log_pr_df.index[test_indices], columns=np.arange(10), dtype=np.float64)
    idx = 0
    for t in log_pr_df.index[test_indices]:
        if idx % 100 == 0:
          print(f"Completed: {idx}/{len(test_indices)}")
        idx += 1  
        r_hat.loc[t, :] = get_r_hat(log_pr_df.loc[(t - dt):t], volu_df.loc[(t - dt):t])
    t_used = time.time() - t0
    
    r_fwd = (log_pr_df.shift(-30) - log_pr_df).iloc[test_indices].rename(columns={f"input_df_{i}": i for i in range(10)})
    r_fwd.corrwith(r_hat)
    
    r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final "ignore_rows" rows are NaNs. 
    r_hat_all = r_hat.iloc[:-3].values.ravel()
    return np.corrcoef(r_fwd_all, r_hat_all)[0, 1]

In [143]:
get_model_corr(log_pr, volu, test_pts)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


0.00432405446260245