In [62]:
# import packages
import numpy as np
import pandas as pd
from tools import FeatureEngineer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [63]:
# load data
bnb = pd.read_csv('bnb.csv')
bnb.drop(columns='Asset_ID', inplace=True)

In [64]:
fe = FeatureEngineer()
bnb = fe.build_technical_indicators(bnb)
bnb.head()

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,PSAR-,STC,TRIX,VI,VI+,VI-,WMA,CR,DLR,DR
0,1523956260,7.0,12.4195,12.4195,12.4101,12.4195,794.7,12.411386,-0.004366,0.0,...,,,,,,,,0.0,,-84.853892
1,1523956320,33.0,12.4195,12.4195,12.4001,12.415,1117.73,12.407532,-0.00394,0.0045,...,,,,,,,,-0.036233,-0.03624,-0.036233
2,1523956380,32.0,12.415,12.4195,12.4003,12.41,1062.37,12.401607,-0.003153,0.005,...,12.4195,,,,,,,-0.076493,-0.040282,-0.040274
3,1523956440,38.0,12.4006,12.41,12.3931,12.4,2259.55,12.399251,-0.003429,0.0006,...,12.4195,,,,,,,-0.157011,-0.080613,-0.08058
4,1523956500,79.0,12.4,12.421,12.393,12.3942,7113.37,12.406144,-0.002187,0.0058,...,,,,,,,,-0.203712,-0.046785,-0.046774


In [67]:
# inspect missing values
temp = bnb.isna().sum()
print([i for i in temp.index if temp[i] > 1000])
print([temp[i] for i in temp.index if temp[i] > 1000])

['KAMA', 'RSI_stoch', 'RSI_stoch_d', 'RSI_stoch_k', 'MFI', 'PSAR+', 'PSAR-']
[1703401, 1905, 2193, 2049, 3153, 888055, 915717]


In [68]:
# drop columns with too many NAs
bnb.drop(columns=['KAMA', 'PSAR+', 'PSAR-'], inplace=True)

In [69]:
# drop NAs
bnb.dropna(axis=0, inplace=True)
bnb.isna().sum()

timestamp    0
Count        0
Open         0
High         0
Low          0
            ..
VI-          0
WMA          0
CR           0
DLR          0
DR           0
Length: 75, dtype: int64

In [81]:
# check if df contains infinite values
r = bnb.index[np.isinf(bnb).any(1)]
print(r)

Int64Index([ 697988,  697989,  697990,  697991,  697992,  697993,  697994,
             697995,  697996,  787255,  787256,  787257,  787259,  787260,
             787261,  787262,  787263,  787264,  787289,  787291,  787292,
             787293,  787294,  787317,  787318,  787319,  787320,  787321,
             787322,  787323,  787324,  787325,  787326,  787327,  787532,
             787533,  787534,  787570,  787664,  787665,  787955,  787957,
             787958,  787959,  788065,  788066,  788080,  788082,  798090,
             798091,  798092,  954316,  954319,  954321,  954323,  954324,
             954325,  969425,  969426,  969427, 1063757, 1063758, 1063759,
            1063760, 1063764, 1063765, 1063768, 1063769, 1063771, 1063772,
            1063798, 1063799, 1063800, 1063809, 1063810, 1063811, 1063872,
            1063873, 1155936, 1155937, 1155972, 1155974, 1155976, 1155978,
            1156057, 1156073, 1156076, 1156077, 1156080, 1156081, 1156083],
           dtype='int64'

In [85]:
bnb.drop([i for i in r], axis=0, inplace=True)

In [7]:
bnb.corr()

Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,close_ma21,close_ma50,close_ma200,vol_ma8,vol_ma21,vol_ma50,vol_ma200,RSI,MACD,OBV
timestamp,1.0,0.454724,0.653013,0.652999,0.653034,0.653012,0.109059,0.653016,0.002535,0.00297,...,0.65301,0.653007,0.652986,0.140959,0.153261,0.164915,0.189431,0.012558,0.004155,-0.49941
Count,0.454724,1.0,0.600312,0.601155,0.599297,0.600234,0.50284,0.600233,0.015029,0.03793,...,0.600611,0.600898,0.601463,0.505004,0.490549,0.475029,0.443689,-0.000916,-0.05911,0.130466
Open,0.653013,0.600312,1.0,0.999998,0.999998,0.999998,0.06328,0.999999,-0.001432,0.004484,...,0.999986,0.999963,0.999853,0.081799,0.089024,0.095909,0.110726,0.010516,0.010007,0.236152
High,0.652999,0.601155,0.999998,1.0,0.999994,0.999998,0.063815,0.999998,-0.001433,0.003359,...,0.999985,0.999964,0.999857,0.082411,0.089637,0.09651,0.111297,0.010632,0.009732,0.236234
Low,0.653034,0.599297,0.999998,0.999994,1.0,0.999998,0.062665,0.999999,-0.00144,0.003226,...,0.999982,0.999958,0.999845,0.081195,0.088428,0.095325,0.110177,0.010749,0.010285,0.236062
Close,0.653012,0.600234,0.999998,0.999998,0.999998,1.0,0.063256,0.999999,-0.001435,0.002315,...,0.999984,0.999961,0.999851,0.081835,0.089063,0.095947,0.110759,0.010839,0.010012,0.236154
Volume,0.109059,0.50284,0.06328,0.063815,0.062665,0.063256,1.0,0.063251,0.030766,0.011217,...,0.063433,0.063546,0.063705,0.74646,0.672651,0.613494,0.512325,0.024208,-0.024404,-0.031881
VWAP,0.653016,0.600233,0.999999,0.999998,0.999999,0.999999,0.063251,1.0,-0.001435,0.003217,...,0.999985,0.999962,0.999852,0.081817,0.089046,0.095932,0.110748,0.010717,0.010027,0.23615
Target,0.002535,0.015029,-0.001432,-0.001433,-0.00144,-0.001435,0.030766,-0.001435,1.0,0.001508,...,-0.001396,-0.001389,-0.001406,0.031918,0.029137,0.031965,0.024592,0.00052,-0.003931,0.000494
open_sub_close,0.00297,0.03793,0.004484,0.003359,0.003226,0.002315,0.011217,0.003217,0.001508,1.0,...,0.004415,0.004409,0.004348,-0.016302,-0.017591,-0.017167,-0.014671,-0.148911,-0.002442,-0.000505


In [71]:
def model_training(x, y, split_ratio, scaler, model_list):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)
    # x_train = scaler.fit_transform(x_train)
    # x_test = scaler.transform(x_test)
    for model_name, clf in model_list.items():
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print(model_name)
        print('R^2 Score:', r2_score(y_test, y_pred))
        print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print()

In [87]:
split_ratio = 0.2
scaler = MinMaxScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low', 'VWAP'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.0009784882888579371
Mean Absolute Error: 0.0027135264111865427
Root Mean Squared Error: 0.00434814021944598



  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


ridge
R^2 Score: 0.0029583655487391836
Mean Absolute Error: 0.002711682413673992
Root Mean Squared Error: 0.004343829474716559

elastic net
R^2 Score: 0.00025167105262813294
Mean Absolute Error: 0.0027136954461279774
Root Mean Squared Error: 0.0043497216311702705



  model = cd_fast.enet_coordinate_descent(


In [88]:
split_ratio = 0.2
scaler = StandardScaler()
x = bnb.drop(columns=['Target'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.000931803937923692
Mean Absolute Error: 0.0027138873669398634
Root Mean Squared Error: 0.004348241812720076



  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


ridge
R^2 Score: 0.0030704102644859566
Mean Absolute Error: 0.0027116205808644027
Root Mean Squared Error: 0.004343585394230711

elastic net
R^2 Score: 0.00025167105262813294
Mean Absolute Error: 0.0027136954461279774
Root Mean Squared Error: 0.0043497216311702705



  model = cd_fast.enet_coordinate_descent(


In [89]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn import datasets

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# split data
x, y = np.array(bnb.drop(columns='Target')), np.array(bnb['Target'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# normalization
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# convert to tensor
x_train, x_test = torch.from_numpy(x_train).float(), torch.from_numpy(x_test).float()
y_train, y_test = torch.from_numpy(y_train).float(), torch.from_numpy(y_test).float()
y_train = y_train.view(y_train.shape[0], 1)

n_samples, n_features = x_train.shape

In [90]:
# construct model
input_size, output_size = n_features, 1
model = nn.Linear(input_size, output_size)

In [91]:
# loss & optimizer
learning_rate = 0.01
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [92]:
# training loop
num_epochs = 2000
for epoch in range(num_epochs):
    
    # forward pass and loss
    y_pred = model(x_train)
    loss = criterion(y_pred, y_train)
    
    # backward pass and update
    loss.backward()
    optimizer.step()
    
    # zero grad before new step
    optimizer.zero_grad()
    
    if (epoch+1) % 50 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

epoch: 50, loss = 0.0052
epoch: 100, loss = 0.0020
epoch: 150, loss = 0.0013
epoch: 200, loss = 0.0010
epoch: 250, loss = 0.0009
epoch: 300, loss = 0.0008
epoch: 350, loss = 0.0007
epoch: 400, loss = 0.0006
epoch: 450, loss = 0.0006
epoch: 500, loss = 0.0005
epoch: 550, loss = 0.0005
epoch: 600, loss = 0.0005
epoch: 650, loss = 0.0005
epoch: 700, loss = 0.0004
epoch: 750, loss = 0.0004
epoch: 800, loss = 0.0004
epoch: 850, loss = 0.0004
epoch: 900, loss = 0.0004
epoch: 950, loss = 0.0004
epoch: 1000, loss = 0.0004
epoch: 1050, loss = 0.0004
epoch: 1100, loss = 0.0004
epoch: 1150, loss = 0.0004
epoch: 1200, loss = 0.0003
epoch: 1250, loss = 0.0003
epoch: 1300, loss = 0.0003
epoch: 1350, loss = 0.0003
epoch: 1400, loss = 0.0003
epoch: 1450, loss = 0.0003
epoch: 1500, loss = 0.0003
epoch: 1550, loss = 0.0003
epoch: 1600, loss = 0.0003
epoch: 1650, loss = 0.0003
epoch: 1700, loss = 0.0003
epoch: 1750, loss = 0.0003
epoch: 1800, loss = 0.0003
epoch: 1850, loss = 0.0003
epoch: 1900, loss = 0

In [93]:
# evaluation
pred = model(x_test).detach().numpy()
pred = [pred[i] for i in range(len(pred))]
print('R^2 Score:', r2_score(y_test, pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred)))

R^2 Score: -13.63350215834949
Mean Absolute Error: 0.012821098
Root Mean Squared Error: 0.016641414
