In this notebook, we introduce a 'jump' feature, which categorises intraday trading gains/losses into ordinal categories. The problem becomes a classification problem, as we aim to predict the direction of the coin price.

In [13]:
# imports
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn.metrics import f1_score, accuracy_score, log_loss, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
from pathlib import Path

src_path = Path("..", "src")
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from feature_engineering import add_vwap, add_atr, add_ema, add_dow, add_return, add_jump_categories_3, add_jump_categories_5

In [84]:
df_raw = pd.read_csv("./../input/ETHUSDT_1d_join_final.csv")

df_raw.head()

Unnamed: 0,date,open,high,low,close,volume,base_asset_volume,no_trades,taker_buy_vol,taker_buy_base_asset_vol
0,1/01/2020,129.16,133.05,128.68,130.77,144770.522,18952318.53,75888,71847.93883,9407940.0
1,2/01/2020,130.72,130.78,126.38,127.19,213757.0581,27486853.39,96193,105830.5619,13615060.0
2,3/01/2020,127.19,135.14,125.88,134.35,413055.189,54139288.22,162310,227899.2553,29863550.0
3,4/01/2020,134.37,135.85,132.5,134.2,184276.171,24641135.57,95488,85809.67556,11476960.0
4,5/01/2020,134.2,138.19,134.19,135.37,254120.4534,34593687.6,115106,126786.5206,17260440.0


In [85]:
df = df_raw.copy()
df = add_return(df)

# add jump feature and target variable
df = add_jump_categories_5(df)
print('num small down jumps:', list(df['jump']).count('small_down'))
print('num big down jumps:', list(df['jump']).count('big_down'))
print('num neutral:', list(df['jump']).count('neutral'))
print('num small up jumps:', list(df['jump']).count('small_up'))
print('num big up jumps:', list(df['jump']).count('big_up'))
print(df.shape)

df['next_jump'] = df['jump'].shift(-1)

df.head()

num small down jumps: 228
num big down jumps: 228
num neutral: 510
num small up jumps: 223
num big up jumps: 332
(1521, 12)


Unnamed: 0,date,open,high,low,close,volume,base_asset_volume,no_trades,taker_buy_vol,taker_buy_base_asset_vol,return,jump,next_jump
0,1/01/2020,129.16,133.05,128.68,130.77,144770.522,18952318.53,75888,71847.93883,9407940.0,,neutral,small_down
1,2/01/2020,130.72,130.78,126.38,127.19,213757.0581,27486853.39,96193,105830.5619,13615060.0,-0.020764,small_down,big_up
2,3/01/2020,127.19,135.14,125.88,134.35,413055.189,54139288.22,162310,227899.2553,29863550.0,0.028672,big_up,small_up
3,4/01/2020,134.37,135.85,132.5,134.2,184276.171,24641135.57,95488,85809.67556,11476960.0,0.01816,small_up,small_up
4,5/01/2020,134.2,138.19,134.19,135.37,254120.4534,34593687.6,115106,126786.5206,17260440.0,0.012918,small_up,big_up


## feature engineering

In [70]:
df = add_atr(df)
df = add_ema(df)
df = add_vwap(df)

df = add_dow(df)
df = pd.get_dummies(df, columns=['day_of_week'], prefix='dow', drop_first=True)
df = df.dropna()

# lag features to capture non-stationary nature
lag_factor = 5
cols = ['open', 'high', 'low', 'close', 'volume', 'base_asset_volume', 'no_trades', 'taker_buy_vol', 'taker_buy_base_asset_vol',
        'atr', 'ema', 'VWAP']

for lag in range(1, lag_factor+1):
    for col in cols:
        newcol = np.zeros(df.shape[0]) * np.nan
        newcol[lag:] = df[col].values[:-lag]
        df.insert(len(df.columns), "{0}_{1}".format(col, lag), newcol)

df = df.dropna()

# move the jump and target variable (jump_tmr) to the end
df = pd.get_dummies(df, columns=['jump'], prefix='jump', drop_first=True)
#jump_variables = ['jump_big_down', 'jump_small_down', 'jump_neutral', 'jump_small_up', 'jump_big_up']
df = df[[col for col in df.columns if col not in ['next_jump']] + ['next_jump']]

for col, type in zip(df.columns, df.dtypes):
    print(col, type)

df.head(10)

date datetime64[ns]
open float64
high float64
low float64
close float64
volume float64
base_asset_volume float64
no_trades int64
taker_buy_vol float64
taker_buy_base_asset_vol float64
return float64
atr float64
ema float64
VWAP float64
dow_Monday bool
dow_Saturday bool
dow_Sunday bool
dow_Thursday bool
dow_Tuesday bool
dow_Wednesday bool
open_1 float64
high_1 float64
low_1 float64
close_1 float64
volume_1 float64
base_asset_volume_1 float64
no_trades_1 float64
taker_buy_vol_1 float64
taker_buy_base_asset_vol_1 float64
atr_1 float64
ema_1 float64
VWAP_1 float64
open_2 float64
high_2 float64
low_2 float64
close_2 float64
volume_2 float64
base_asset_volume_2 float64
no_trades_2 float64
taker_buy_vol_2 float64
taker_buy_base_asset_vol_2 float64
atr_2 float64
ema_2 float64
VWAP_2 float64
open_3 float64
high_3 float64
low_3 float64
close_3 float64
volume_3 float64
base_asset_volume_3 float64
no_trades_3 float64
taker_buy_vol_3 float64
taker_buy_base_asset_vol_3 float64
atr_3 float64
ema_3 fl

Unnamed: 0,date,open,high,low,close,volume,base_asset_volume,no_trades,taker_buy_vol,taker_buy_base_asset_vol,...,taker_buy_vol_5,taker_buy_base_asset_vol_5,atr_5,ema_5,VWAP_5,jump_big_up,jump_neutral,jump_small_down,jump_small_up,next_jump
19,2020-01-20,166.79,169.33,161.24,166.87,358092.8841,59292534.72,130615,181828.1952,30119285.66,...,350727.9639,57992191.98,6.619167,149.180743,146.274725,False,False,True,False,small_up
20,2020-01-21,166.86,170.32,164.8,169.49,308007.6353,51615232.57,125609,154669.5631,25916761.62,...,221669.3963,35985561.13,1.158512,151.971928,147.457383,False,False,False,True,neutral
21,2020-01-22,169.48,171.47,166.03,168.07,272240.9029,45903134.16,111017,131183.8062,22137225.77,...,375143.0151,63553850.73,0.987751,155.364209,149.723328,False,True,False,False,big_down
22,2020-01-23,168.07,168.2,159.21,162.81,373414.3499,60971659.77,150875,179121.3193,29244864.76,...,350447.0151,60783310.67,1.111982,158.862034,151.724806,False,False,False,False,small_down
23,2020-01-24,162.85,164.45,155.55,162.54,430013.199,68980724.06,145668,201937.8364,32428806.01,...,307350.6395,51957534.85,1.250142,160.856294,152.969757,False,False,True,False,neutral
24,2020-01-25,162.51,162.79,157.61,160.35,219921.652,35196608.31,95448,114622.4459,18335347.16,...,181828.1952,30119285.66,0.667153,161.847702,153.484055,False,True,False,False,big_up
25,2020-01-26,160.36,168.08,159.41,167.86,251582.5576,41239031.34,116628,135644.6268,22253874.87,...,154669.5631,25916761.62,0.441939,163.118828,153.974143,True,False,False,False,big_up
26,2020-01-27,167.91,172.56,165.22,170.08,365894.8192,61846624.61,157072,183012.1978,30945679.52,...,131183.8062,22137225.77,0.420139,164.199729,154.390075,True,False,False,False,big_up
27,2020-01-28,170.04,176.2,170.03,175.64,473433.8961,81649497.49,178990,249252.6153,42996095.56,...,179121.3193,29244864.76,0.672153,164.041117,154.730294,True,False,False,False,neutral
28,2020-01-29,175.58,178.45,173.33,173.72,317382.9016,55988230.27,145062,151440.5071,26718161.11,...,201937.8364,32428806.01,0.683725,163.402227,154.984995,False,True,False,False,big_up


In [71]:
X = df.loc[:, ~df.columns.isin(['next_jump', 'date'])]
y = df['next_jump']

# scale data, as we will be applying regularisation soon
one_hot_cols = [col for col in X.columns if col.startswith('dow_') or col.startswith('jump_')]
numeric_cols = [col for col in X.select_dtypes(include='number').columns if col not in one_hot_cols]

scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, shuffle=False)

## Baseline models

These "models" serve as baselines - an educated guess of how the price will move given the frequency of up and down jumps. The metrics we get from this are the bare minimum our models need to achieve

Most Common Class - guess everything by the most common class in the training set

In [72]:
num_small_down = list(y_train).count('small_down')
num_big_down = list(y_train).count('big_down')
num_neutral = list(y_train).count('neutral')
num_small_up = list(y_train).count('small_up')
num_big_up = list(y_train).count('big_up')

In [73]:
y_pred = np.full(len(y_test), 'neutral')
f1 = f1_score(y_test, y_pred, average='weighted')
acc = accuracy_score(y_test, y_pred)

print(f1, acc)

0.3822300542875821 0.543046357615894


Proportional Guessing - randomly guess each observation based on the proportion of that class in the training set

In [74]:
items = ['big_down', 'small_down', 'neutral', 'small_up', 'big_up']
n = X_train.shape[0]
probs = [num_big_down / n, num_small_down / n, num_neutral / n, num_small_up / n, num_big_up / n]

def prop_guess():
    return random.choices(items, weights=probs, k=1)[0]

y_pred = [prop_guess() for _ in range(len(y_test))]

f1 = f1_score(y_test, y_pred, average='micro')
acc = accuracy_score(y_test, y_pred)

print(f1, acc)



0.2980132450331126 0.2980132450331126


## Modelling

### logistic regression

In [39]:
cv = RepeatedKFold(n_splits=10, n_repeats=3)

model = LogisticRegressionCV(
    Cs=10,    
    cv=cv,               
    penalty="elasticnet",
    solver="saga",       
    l1_ratios=[0.1, 0.5, 0.9],  
    max_iter=1000,
    scoring='f1_micro'
)

model.fit(X_train, y_train)

In [41]:
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='micro')
acc = accuracy_score(y_test, y_pred)
print(f1, acc)

print(y_pred)

0.5364238410596026 0.5364238410596026
['neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'big_up' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral' 'neutral'
 'neutral' 'neutral' 'neutral' 'neutral'

In [80]:
logistic_regression_preds = {
    'y_test': y_test,
    'y_pred': y_pred
}

results = pd.DataFrame(logistic_regression_preds)

import os 

output_dir = '../output'
output_path = os.path.join(output_dir, 'logistic_regression_preds.csv')

results.to_csv(output_path, index=False)

### xgboost

In [6]:
import xgboost as xgb

xgboost preprocessing

In [76]:
xgbDF = df.copy()

jump_lookup = {
    'big_down':0,
    'small_down':1,
    'neutral':2,
    'small_up':3,
    'big_up':4
}

xgbDF['next_jump'] = xgbDF['next_jump'].map(jump_lookup)

X = xgbDF.drop(['next_jump', 'date'], axis=1).copy()
y = xgbDF['next_jump'].copy()
m = xgb.DMatrix(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False) 

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

Naive XGBoost trained to F1 score

In [62]:
def softmax(x):
    '''Softmax function with x as input vector.'''
    e = np.exp(x)
    return e / np.sum(e)

def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
    '''Loss function.  Computing the gradient and approximated hessian (diagonal).
    Reimplements the `multi:softprob` inside XGBoost.

    '''
    labels = data.get_label()
    kRows = predt.shape[0]
    kClasses = 5

    if data.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = data.get_weight()

    # The prediction is of shape (rows, classes), each element in a row
    # represents a raw prediction (leaf weight, hasn't gone through softmax
    # yet).  In XGBoost 1.0.0, the prediction is transformed by a softmax
    # function, fixed in later versions.
    assert predt.shape == (kRows, kClasses)

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    eps = 1e-6

    # compute the gradient and hessian, slow iterations in Python, only
    # suitable for demo.  Also the one in native XGBoost core is more robust to
    # numeric overflow as we don't do anything to mitigate the `exp` in
    # `softmax` here.
    for r in range(predt.shape[0]):
        target = labels[r]
        p = softmax(predt[r, :])
        for c in range(predt.shape[1]):
            assert target >= 0 or target <= kClasses
            g = p[c] - 1.0 if c == target else p[c]
            g = g * weights[r]
            h = max((2.0 * p[c] * (1.0 - p[c]) * weights[r]).item(), eps)
            grad[r, c] = g
            hess[r, c] = h

    # After 2.1.0, pass the gradient as it is.
    return grad, hess

def f1_weighted_eval(predt: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    # Convert raw logits to predicted class
    y_pred = np.argmax(predt, axis=1)
    
    f1 = f1_score(y_true, y_pred, average='weighted')
    return 'F1_Weighted', f1

model = xgb.train(
    {
        'num_class':5,
        'disable_default_eval_metric':True
    },
    dtrain=m,
    num_boost_round=10,
    obj=softprob_obj,
    custom_metric=f1_weighted_eval,
    evals=[(m, 'train')]
)

[0]	train-F1_Weighted:0.60452
[1]	train-F1_Weighted:0.64018
[2]	train-F1_Weighted:0.68693
[3]	train-F1_Weighted:0.72665
[4]	train-F1_Weighted:0.75057
[5]	train-F1_Weighted:0.77143
[6]	train-F1_Weighted:0.80178
[7]	train-F1_Weighted:0.81213
[8]	train-F1_Weighted:0.83366
[9]	train-F1_Weighted:0.84712


In [83]:
y_pred = model.predict(dtest)

f1 = f1_score(y_test, y_pred, average='weighted')
acc = accuracy_score(y_test, y_pred)

print(f1, acc)

print(confusion_matrix(y_test, y_pred))

xgb_preds = {
    'y_test': y_test,
    'y_pred': y_pred
}

results = pd.DataFrame(xgb_preds)

output_dir = '../output'
output_path = os.path.join(output_dir, 'xgb_preds.csv')

results.to_csv(output_path, index=False)

0.8265553892556669 0.8410596026490066
[[ 4  0  3  0  0]
 [ 0 14  7  1  0]
 [ 0  0 82  0  0]
 [ 0  0  5 23  0]
 [ 0  1  7  0  4]]
