In [None]:
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%matplotlib inline

import tqdm
import h5py
import pickle as pk
import numpy as np
import pandas as pd
import lightgbm as lgb
from matplotlib import pyplot as plt
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn import linear_model
from scipy import stats

from backtester import calc_score, apply_on_test, test_generator, make_submission
import utils

In [None]:
# Only series whose symbol contains this currency are kept, which keeps the dataset small
RELEVANT_CURRENCY = 'BTC'

# Length of a five-minute window in ticks (presumably one tick per 10 seconds -- confirm)
TICKS_IN_FIVE_MINUTES = (5 * 60) // 10
# Forecast horizon, in ticks. Changing this parameter is not recommended
PREDICT_DELAY_TICKS = 30
# Step, in ticks, between the starts of consecutive samples in the training set.
# Increase it to reduce the amount of memory required
GAP = 2

# Used by get_X_y both as the subsampling stride inside a window and as the
# denominator that defines where the tail part of the window begins
DIVIDER = 7

In [None]:
# Primary feature selection: scan the first data file and collect, in
# RELEVANT_COLUMNS, the names of the columns that later cells should load.
# A column name has the form "<exchange>/<symbol>/<side>/<feature>".

RELEVANT_COLUMNS = []
s = set()  # every symbol met during the scan, relevant or not
# The context manager closes the file even if the scan raises part-way through
with h5py.File('data/part1.hdf5', 'r') as f1:
    header = f1["header"][:]
    for exch, symbol, side, candles in utils.candles_loop(f1["body"]):
        s.add(symbol)
        # The currency test does not depend on the feature name, so it is
        # evaluated once per series instead of once per header entry
        if RELEVANT_CURRENCY not in symbol:
            continue
        for name in header:
            if name in ["max_price", "amount"]:
                column = exch + "/" + symbol + "/" + side + "/" + name
                print(column)
                RELEVANT_COLUMNS.append(column)

In [None]:
def load_dataset(fname):
    """Load the relevant columns of one HDF5 data file into a DataFrame.

    Parameters
    ----------
    fname : str
        Path of an HDF5 file that has a "header" entry (feature names) and a
        "body" entry that ``utils.candles_loop`` can iterate over.

    Returns
    -------
    pandas.DataFrame
        One column per "<exchange>/<symbol>/<side>/<feature>" name that is
        listed in RELEVANT_COLUMNS, in the order they are met in the file.

    Raises
    ------
    ValueError
        If the file contains none of the RELEVANT_COLUMNS.
    """
    # Hashed lookup instead of scanning the RELEVANT_COLUMNS list for every candidate
    relevant = set(RELEVANT_COLUMNS)
    loaded = []
    columns = []
    # The context manager closes the file even if reading raises part-way through
    with h5py.File(fname, 'r') as f:
        header = f["header"][:]
        for exch, symbol, side, candles in utils.candles_loop(f["body"]):
            prefix = exch + "/" + symbol + "/" + side + "/"
            # To reduce dataset, select only relevant columns
            for feature_num, name in enumerate(header):
                column = prefix + name
                if column in relevant:
                    loaded.append(candles[:, feature_num])
                    columns.append(column)

        if not loaded:
            # np.column_stack([]) would fail with a message that hides the real cause
            raise ValueError("no relevant columns found in " + repr(fname))

        # Assemble the table while the file is still open, as the original code did
        return pd.DataFrame(np.column_stack(loaded), columns=columns)

In [None]:
# Training data is parts 1 and 2 stacked in that order; validation data is part 3.
# ignore_index=True renumbers the stacked rows from 0: each part comes back from
# load_dataset with its own default 0-based index, so without it the row labels
# of part 1 would be repeated by part 2.
train_dataset = pd.concat(
    (load_dataset('data/part1.hdf5'), load_dataset('data/part2.hdf5')),
    ignore_index=True,
)
val_dataset = load_dataset('data/part3.hdf5')

In [None]:
def get_X_y(loaded):
    """Build sliding-window features and forward-looking targets.

    Each sample covers TICKS_IN_FIVE_MINUTES consecutive rows of ``loaded``;
    consecutive samples start GAP rows apart.

    Parameters
    ----------
    loaded : pandas.DataFrame
        Table of series that contains the columns utils.TARGET_COLUMNS.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``, both float32. A row of X holds, for the row-to-row
        differences and ratios of every column: the window subsampled every
        DIVIDER rows, the tail of the window, and the min/max/mean/sum of the
        whole window and of its tail. The matching entry of y is the relative
        change of the target columns between the last row of the window and
        the row PREDICT_DELAY_TICKS later.
    """
    target = loaded.loc[:, utils.TARGET_COLUMNS].values
    raw = loaded.values

    # Row-to-row change of every column, as a difference and as a ratio
    previous, current = raw[:-1], raw[1:]
    X_diff = current - previous
    X_rel = current / previous
    # Ratios that came out as NaN or infinite are replaced by the neutral value 1
    X_rel[np.isnan(X_rel) | np.isinf(X_rel)] = 1

    poolers = (np.min, np.max, np.mean, np.sum)
    # Offset, from the start of a window, at which its tail part begins
    tail_offset = TICKS_IN_FIVE_MINUTES * (DIVIDER - 1) // DIVIDER

    X = []
    y = []
    for start in tqdm.tnrange(0, len(loaded) - TICKS_IN_FIVE_MINUTES - PREDICT_DELAY_TICKS, GAP):
        # Index of the last row of the window; also the exclusive end of its diff/ratio rows
        last = start + TICKS_IN_FIVE_MINUTES - 1
        tail = start + tail_offset

        diff_window, rel_window = X_diff[start:last], X_rel[start:last]
        diff_tail, rel_tail = X_diff[tail:last], X_rel[tail:last]

        # Raw values first: the subsampled window, then the full tail
        pieces = [
            X_diff[start:last:DIVIDER].flatten(),
            X_rel[start:last:DIVIDER].flatten(),
            diff_tail.flatten(),
            rel_tail.flatten(),
        ]
        # Then the per-column aggregates, grouped by pooling function
        for pool_func in poolers:
            for block in (diff_window, rel_window, diff_tail, rel_tail):
                pieces.append(pool_func(block, axis=0))

        X.append(np.hstack(pieces))
        y.append(target[last + PREDICT_DELAY_TICKS] / target[last] - 1)

    return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

In [None]:
# Build the feature matrix and the targets for the training and validation splits
X_train, y_train = get_X_y(train_dataset)
X_val, y_val = get_X_y(val_dataset)

In [None]:
# Fit one LightGBM regressor per entry of utils.PAIR_NAMES on column i of the
# targets, pickle it as "<pair>.pkl" and print its Spearman rank correlation
# on the validation split.
# tqdm wraps the sequence itself rather than enumerate(...): an enumerate object
# has no length, so the progress bar could not show a total.
for i, pair in enumerate(tqdm.tqdm_notebook(utils.PAIR_NAMES)):
    regr = lgb.LGBMRegressor(learning_rate=0.03, n_estimators=100, num_leaves=15, silent=False)
    regr.fit(X_train, y_train[:, i])
    # Write-only binary mode is sufficient: the file is only dumped to here
    with open(pair + '.pkl', 'wb') as f:
        pk.dump(regr, f)

    preds = regr.predict(X_val)
    print(spearmanr(preds, y_val[:, i]))