In [9]:
# import packages
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [13]:
# load data
bnb = pd.read_csv('bnb.csv')
bnb.drop(columns='Asset_ID', inplace=True)

In [14]:
# add new features
def baseline_features(df):
    df['open_sub_close'] = df['Open'] - df['Close']
    df['high_div_low'] = df['High'] / df['Low']
    
    # MA8, 21, 50, 200 - close price
    df['close_ma8'] = df['Close'].rolling(8).mean()
    df['close_ma21'] = df['Close'].rolling(21).mean()
    df['close_ma50'] = df['Close'].rolling(50).mean()
    df['close_ma200'] = df['Close'].rolling(200).mean()
    
    # MA8, 21, 50, 200 - volume
    df['vol_ma8'] = df['Volume'].rolling(8).mean()
    df['vol_ma21'] = df['Volume'].rolling(21).mean()
    df['vol_ma50'] = df['Volume'].rolling(50).mean()
    df['vol_ma200'] = df['Volume'].rolling(200).mean()
    
    # RSI
    delta = df['Close'].diff()
    up, down = delta.clip(lower=0), -1*delta.clip(upper=0)
    ema_up, ema_down = up.ewm(com=13, adjust=False).mean(), down.ewm(com=13, adjust=False).mean()
    rs = ema_up / ema_down
    df['RSI'] = 100 - (100/(1+rs))
    
    # MACD
    exp1, exp2 = df['Close'].ewm(span=12, adjust=False).mean(), df['Close'].ewm(span=26, adjust=False).mean()
    macd = exp1 - exp2
    df['MACD'] = macd.ewm(span=9, adjust=False).mean()
    
    # OBV
    df['OBV'] = (np.sign(df['Close'].diff())*df['Volume']).fillna(0).cumsum()
    
    return df

In [15]:
bnb = baseline_features(bnb)
bnb.head()

Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,close_ma21,close_ma50,close_ma200,vol_ma8,vol_ma21,vol_ma50,vol_ma200,RSI,MACD,OBV
0,1523956260,7.0,12.4195,12.4195,12.4101,12.4195,794.7,12.411386,-0.004366,0.0,...,,,,,,,,,0.0,0.0
1,1523956320,33.0,12.4195,12.4195,12.4001,12.415,1117.73,12.407532,-0.00394,0.0045,...,,,,,,,,0.0,-7.2e-05,-1117.73
2,1523956380,32.0,12.415,12.4195,12.4003,12.41,1062.37,12.401607,-0.003153,0.005,...,,,,,,,,0.0,-0.000264,-2180.1
3,1523956440,38.0,12.4006,12.41,12.3931,12.4,2259.55,12.399251,-0.003429,0.0006,...,,,,,,,,0.0,-0.000682,-4439.65
4,1523956500,79.0,12.4,12.421,12.393,12.3942,7113.37,12.406144,-0.002187,0.0058,...,,,,,,,,0.0,-0.001309,-11553.02


In [16]:
# inspect missing values
bnb.isna().sum()

timestamp           0
Count               0
Open                0
High                0
Low                 0
Close               0
Volume              0
VWAP                0
Target              0
open_sub_close      0
high_div_low        0
close_ma8           7
close_ma21         20
close_ma50         49
close_ma200       199
vol_ma8             7
vol_ma21           20
vol_ma50           49
vol_ma200         199
RSI                 1
MACD                0
OBV                 0
dtype: int64

In [17]:
# drop NAs
bnb.dropna(axis=0, inplace=True)
bnb.isna().sum()

timestamp         0
Count             0
Open              0
High              0
Low               0
Close             0
Volume            0
VWAP              0
Target            0
open_sub_close    0
high_div_low      0
close_ma8         0
close_ma21        0
close_ma50        0
close_ma200       0
vol_ma8           0
vol_ma21          0
vol_ma50          0
vol_ma200         0
RSI               0
MACD              0
OBV               0
dtype: int64

In [18]:
bnb.corr()

Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,close_ma21,close_ma50,close_ma200,vol_ma8,vol_ma21,vol_ma50,vol_ma200,RSI,MACD,OBV
timestamp,1.0,0.454724,0.653013,0.652999,0.653034,0.653012,0.109059,0.653016,0.002535,0.00297,...,0.65301,0.653007,0.652986,0.140959,0.153261,0.164915,0.189431,0.012558,0.004155,-0.49941
Count,0.454724,1.0,0.600312,0.601155,0.599297,0.600234,0.50284,0.600233,0.015029,0.03793,...,0.600611,0.600898,0.601463,0.505004,0.490549,0.475029,0.443689,-0.000916,-0.05911,0.130466
Open,0.653013,0.600312,1.0,0.999998,0.999998,0.999998,0.06328,0.999999,-0.001432,0.004484,...,0.999986,0.999963,0.999853,0.081799,0.089024,0.095909,0.110726,0.010516,0.010007,0.236152
High,0.652999,0.601155,0.999998,1.0,0.999994,0.999998,0.063815,0.999998,-0.001433,0.003359,...,0.999985,0.999964,0.999857,0.082411,0.089637,0.09651,0.111297,0.010632,0.009732,0.236234
Low,0.653034,0.599297,0.999998,0.999994,1.0,0.999998,0.062665,0.999999,-0.00144,0.003226,...,0.999982,0.999958,0.999845,0.081195,0.088428,0.095325,0.110177,0.010749,0.010285,0.236062
Close,0.653012,0.600234,0.999998,0.999998,0.999998,1.0,0.063256,0.999999,-0.001435,0.002315,...,0.999984,0.999961,0.999851,0.081835,0.089063,0.095947,0.110759,0.010839,0.010012,0.236154
Volume,0.109059,0.50284,0.06328,0.063815,0.062665,0.063256,1.0,0.063251,0.030766,0.011217,...,0.063433,0.063546,0.063705,0.74646,0.672651,0.613494,0.512325,0.024208,-0.024404,-0.031881
VWAP,0.653016,0.600233,0.999999,0.999998,0.999999,0.999999,0.063251,1.0,-0.001435,0.003217,...,0.999985,0.999962,0.999852,0.081817,0.089046,0.095932,0.110748,0.010717,0.010027,0.23615
Target,0.002535,0.015029,-0.001432,-0.001433,-0.00144,-0.001435,0.030766,-0.001435,1.0,0.001508,...,-0.001396,-0.001389,-0.001406,0.031918,0.029137,0.031965,0.024592,0.00052,-0.003931,0.000494
open_sub_close,0.00297,0.03793,0.004484,0.003359,0.003226,0.002315,0.011217,0.003217,0.001508,1.0,...,0.004415,0.004409,0.004348,-0.016302,-0.017591,-0.017167,-0.014671,-0.148911,-0.002442,-0.000505


In [33]:
def model_training(x, y, split_ratio, scaler, model_list):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)
    # x_train = scaler.fit_transform(x_train)
    # x_test = scaler.transform(x_test)
    for model_name, clf in model_list.items():
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print(model_name)
        print('R^2 Score:', r2_score(y_test, y_pred))
        print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print()

In [34]:
split_ratio = 0.2
scaler = MinMaxScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low', 'close_ma8', 'close_ma21', 'close_ma50', 
                      'close_ma200', 'vol_ma8', 'vol_ma21', 'vol_ma50', 'VWAP'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.0010755296813810267
Mean Absolute Error: 0.002737470157786555
Root Mean Squared Error: 0.004433649881208322

ridge
R^2 Score: 0.0010766092109791847
Mean Absolute Error: 0.002737468608282443
Root Mean Squared Error: 0.004433647485502886

elastic net
R^2 Score: 1.4128145123470937e-05
Mean Absolute Error: 0.002737062589194016
Root Mean Squared Error: 0.004436004730622034



  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [35]:
split_ratio = 0.2
scaler = StandardScaler()
x = bnb.drop(columns=['Target'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.001663186607734568
Mean Absolute Error: 0.0027373552231108487
Root Mean Squared Error: 0.004432345554182639

ridge
R^2 Score: 0.0016627127677654174
Mean Absolute Error: 0.0027373515585025145
Root Mean Squared Error: 0.004432346606043195



  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


elastic net
R^2 Score: 1.4128145123470937e-05
Mean Absolute Error: 0.002737062589194016
Root Mean Squared Error: 0.004436004730622034



In [36]:
split_ratio = 0.2
scaler = StandardScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.0016467182309979345
Mean Absolute Error: 0.0027373020257930214
Root Mean Squared Error: 0.004432382111602146

ridge
R^2 Score: 0.001649086314074899
Mean Absolute Error: 0.0027372984456560997
Root Mean Squared Error: 0.0044323768548180575



  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


elastic net
R^2 Score: 1.4128145123470937e-05
Mean Absolute Error: 0.002737062589194016
Root Mean Squared Error: 0.004436004730622034

