In [17]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from roboto.models import *

from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, LSTM, Reshape, Lambda, RepeatVector
from xgboost import XGBClassifier

In [2]:
INSTRUMENT_NAME = 'EU50_EUR'

In [3]:
instrument = Instrument.objects.get(name=INSTRUMENT_NAME)
candles = instrument.candles.values(
    'time',
    'open',
    'close',
)

In [4]:
frames = []

for instrument in Instrument.objects.all():
    columns = ('open', 'close')
    candles = list(instrument.candles.values_list('time', *columns))
    dataframe_columns = (['{}_{}'.format(instrument.name, c) for c in columns])
    tmp_df = pd.DataFrame(
        columns=['time', *dataframe_columns],
        data=candles,
    )
    if not len(frames):
        tmp_df['weekday'] = tmp_df.time.apply(lambda x: x.weekday)
        tmp_df['monthday'] = tmp_df.time.apply(lambda x: x.day)
        tmp_df['month'] = tmp_df.time.apply(lambda x: x.month)
        tmp_df['hour'] = tmp_df.time.apply(lambda x: x.hour)
    tmp_df.set_index(['time'], inplace=True)
    frames.append(tmp_df)
    
data = pd.concat(frames, axis=1, ignore_index=False)

first_full_field_index = None
for index, row in data.iterrows():
    if not row.isna().any():
        first_full_field_index = index
        break
first_full_field_index

data = data[first_full_field_index:]

data = data.fillna(method='ffill')

data.isna().any().any()

False

In [5]:
data.head()

Unnamed: 0_level_0,XAU_EUR_open,XAU_EUR_close,weekday,monthday,month,hour,UK100_GBP_open,UK100_GBP_close,XCU_USD_open,XCU_USD_close,...,US30_USD_open,US30_USD_close,GBP_HKD_open,GBP_HKD_close,USB05Y_USD_open,USB05Y_USD_close,NATGAS_USD_open,NATGAS_USD_close,USD_ZAR_open,USD_ZAR_close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-08-15 07:00:00+00:00,1048.25,1048.299,2.0,15.0,8.0,7.0,7631.4,7616.3,2.63647,2.6173,...,25268.8,25251.2,9.98636,9.98213,113.677,113.663,2.956,2.954,14.26578,14.22086
2018-08-15 08:00:00+00:00,1048.393,1047.406,2.0,15.0,8.0,8.0,7616.7,7605.3,2.61742,2.60537,...,25251.7,25253.8,9.98209,9.97886,113.671,113.665,2.952,2.953,14.22086,14.34148
2018-08-15 09:00:00+00:00,1047.402,1047.24,2.0,15.0,8.0,9.0,7605.0,7578.3,2.60526,2.58997,...,25253.3,25212.2,9.97898,9.98586,113.661,113.692,2.952,2.954,14.34147,14.43672
2018-08-15 10:00:00+00:00,1047.276,1046.92,2.0,15.0,8.0,10.0,7578.8,7535.6,2.59008,2.59794,...,25212.2,25142.2,9.98566,9.97862,113.696,113.739,2.954,2.957,14.43654,14.59488
2018-08-15 11:00:00+00:00,1046.917,1048.398,2.0,15.0,8.0,11.0,7535.8,7529.3,2.59772,2.59447,...,25142.2,25131.0,9.97862,9.97206,113.749,113.764,2.958,2.942,14.5952,14.67718


In [6]:
def transform_categorial(df, categorial_columns):
    ret_df = df.copy()
    for feature in categorial_features:
        le = preprocessing.LabelBinarizer()    
        transformed_data = le.fit_transform(data[feature])
        new_columns = ['{}_{}'.format(feature, c) for c in le.classes_]
        for i in range(len(new_columns)):
            ret_df[new_columns[i]] = transformed_data[:,i]
    return ret_df.drop(categorial_columns, axis=1)

In [7]:
categorial_features = ['weekday', 'monthday', 'month', 'hour']
data = transform_categorial(data, categorial_features)

In [8]:
data.head()

Unnamed: 0_level_0,XAU_EUR_open,XAU_EUR_close,UK100_GBP_open,UK100_GBP_close,XCU_USD_open,XCU_USD_close,DE30_EUR_open,DE30_EUR_close,EUR_ZAR_open,EUR_ZAR_close,...,hour_14.0,hour_15.0,hour_16.0,hour_17.0,hour_18.0,hour_19.0,hour_20.0,hour_21.0,hour_22.0,hour_23.0
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-08-15 07:00:00+00:00,1048.25,1048.299,7631.4,7616.3,2.63647,2.6173,12392.8,12391.3,16.17726,16.11508,...,0,0,0,0,0,0,0,0,0,0
2018-08-15 08:00:00+00:00,1048.393,1047.406,7616.7,7605.3,2.61742,2.60537,12391.8,12383.6,16.11508,16.24266,...,0,0,0,0,0,0,0,0,0,0
2018-08-15 09:00:00+00:00,1047.402,1047.24,7605.0,7578.3,2.60526,2.58997,12383.8,12349.6,16.24264,16.34863,...,0,0,0,0,0,0,0,0,0,0
2018-08-15 10:00:00+00:00,1047.276,1046.92,7578.8,7535.6,2.59008,2.59794,12349.8,12270.6,16.3483,16.51943,...,0,0,0,0,0,0,0,0,0,0
2018-08-15 11:00:00+00:00,1046.917,1048.398,7535.8,7529.3,2.59772,2.59447,12270.2,12227.6,16.5198,16.61449,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def create_df_with_diff(df, difference_values, columns, two_d=False):
    m, f = df.shape
    diff_df = df.copy()
    ret_df = None
    max_diff_value = 0
    for t in difference_values:
        new_columns = ['{}_{}'.format(c, t) for c in columns]
        tmp_df = df.drop(columns, axis=1)
        tmp_df[new_columns] = df[columns].diff(t)
        tmp_df = tmp_df.values
        if not two_d:
            tmp_df = tmp_df.reshape(m, 1, f)
        max_diff_value = max(max_diff_value, t)
        if ret_df is None:
            ret_df = tmp_df
        else:
            ret_df = np.hstack([ret_df, tmp_df])
    return ret_df[max_diff_value:]

In [10]:
import re
diff_columns = [c for c in data.columns if re.search('open|close', c)]

In [11]:
diff_values = [240, 216, 192, 168, 144, 120, 96, 72, 48, 24, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,]
# diff_values = [10,9,8,7,6,5,4,3,2,1,]
max(diff_values)

240

In [23]:
X = create_df_with_diff(data, diff_values, diff_columns, two_d=False)[:-1]

Y = (data['{}_close'.format(INSTRUMENT_NAME)].diff(1)[1:] > 0) * 1
Y = Y[max(diff_values):]
Y.head()

time
2018-08-29 08:00:00+00:00    0
2018-08-29 09:00:00+00:00    0
2018-08-29 10:00:00+00:00    1
2018-08-29 11:00:00+00:00    1
2018-08-29 12:00:00+00:00    0
Name: EU50_EUR_close, dtype: int64

In [31]:
print (X.shape)
print (Y.shape)

(4938, 22, 320)
(4938,)


In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=False, test_size=.2)

In [68]:
def create_model():
    model = Sequential()
    model.add(LSTM(20, input_shape=X.shape[1:]))
    model.add(Dropout(0.1))
    model.add(Flatten(input_shape=X.shape[1:]))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(2, activation='softmax'))
    optimizer = optimizers.Adam(lr=0.0001,)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

model = create_model()

model.fit(
    X_train,
    Y_train,
    epochs=30,
    batch_size=256,
    validation_data=(X_test, Y_test),
    verbose=2,
    shuffle=False,
)

Train on 3950 samples, validate on 988 samples
Epoch 1/30
 - 2s - loss: 0.6704 - acc: 0.6215 - val_loss: 0.6763 - val_acc: 0.6134
Epoch 2/30
 - 1s - loss: 0.6684 - acc: 0.6210 - val_loss: 0.6758 - val_acc: 0.6123
Epoch 3/30
 - 1s - loss: 0.6686 - acc: 0.6233 - val_loss: 0.6752 - val_acc: 0.6083
Epoch 4/30
 - 1s - loss: 0.6662 - acc: 0.6238 - val_loss: 0.6747 - val_acc: 0.6144
Epoch 5/30
 - 1s - loss: 0.6640 - acc: 0.6261 - val_loss: 0.6738 - val_acc: 0.6194
Epoch 6/30
 - 1s - loss: 0.6628 - acc: 0.6306 - val_loss: 0.6728 - val_acc: 0.6204
Epoch 7/30
 - 1s - loss: 0.6578 - acc: 0.6377 - val_loss: 0.6725 - val_acc: 0.6225
Epoch 8/30
 - 1s - loss: 0.6605 - acc: 0.6377 - val_loss: 0.6718 - val_acc: 0.6215
Epoch 9/30
 - 1s - loss: 0.6572 - acc: 0.6344 - val_loss: 0.6712 - val_acc: 0.6235
Epoch 10/30
 - 1s - loss: 0.6558 - acc: 0.6400 - val_loss: 0.6709 - val_acc: 0.6255
Epoch 11/30
 - 1s - loss: 0.6545 - acc: 0.6385 - val_loss: 0.6710 - val_acc: 0.6235
Epoch 12/30
 - 1s - loss: 0.6512 - acc

<tensorflow.python.keras.callbacks.History at 0x7f9f23c95550>

In [49]:
model.save('test.h5')

In [44]:
errors = []
predicted_values = model.predict_classes(X_test)
for predicted_item, y in zip(predicted_values, Y_test):
    if predicted_item != y:
        errors.append(y)

In [None]:
from collections import defaultdict
d = defaultdict(int)
for e in errors:
    d[e] += 1

In [64]:
X_train.shape

(3950, 22, 320)

In [15]:
clf = XGBClassifier()
clf.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [85]:
accuracy_score(clf.predict(X_test), Y_test)

ValueError: Input numpy.ndarray must be 2 dimensional