In [1]:
# Using test data to build/prototype-out a ML model
    # Further restructuring data for ML input layers
        # Variable isolation and selection refinement
    # Reshaping, encoding, normalization
    # Input, layer, and output specification
    # Force keras/tensorflow to use GPU backend

In [2]:
# Imports
import pandas as pd
import numpy as np
import json
import datetime 
import matplotlib.pyplot as plt
%matplotlib inline

# Pymongo import for connection to local client DB
import pymongo
from pymongo import MongoClient

# ML Imports 
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU
from keras.models import load_model
from keras import backend as K

# Preprocessing Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler,OneHotEncoder
from keras.utils import to_categorical 

# Import to check check for GPU availability for tensorflow backend
from tensorflow.python.client import device_lib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Mongodb connection for live feed of data into model
    # Implementation after verification of test data -> model prototype input layer working

In [4]:
# Verify GPU availability for tensorflow backend
print(device_lib.list_local_devices())
print("==============================================")
print(K.tensorflow_backend._get_available_gpus())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14492975823642502260
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9222031934
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10495751105079177724
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]
['/job:localhost/replica:0/task:0/device:GPU:0']


In [5]:
# Read in test data for L2 orderbook state (bids + asks)
    # Read in test data for subsquent L2 orderbook update states (L2 updates to bid + asks)
snapshot_asks_df = pd.read_csv("test_data/snapshot_asks.csv")
snapshot_bids_df = pd.read_csv("test_data/snapshot_bids.csv")
l2update_df = pd.read_csv("test_data/l2update.csv", dtype ={'changes':object})
request_log_df= pd.read_csv("test_data/request_log.csv")

In [6]:
print(request_log_df.head())
print("===============================================")
print(snapshot_asks_df.head())
#snapshot_asks_df = snapshot_asks_df["side","price","size"]
print(snapshot_bids_df.head())
print("===============================================")
print(l2update_df.head())
print(l2update_df.tail())

      Unnamed: 0                       iso         epoch
0    request end  2018-04-17T01:05:03.469Z  1.523927e+09
1  request start  2018-04-17T00:55:03.354Z  1.523927e+09
   side    price       size
0  sell  8042.91  10.656147
1  sell  8042.92   0.001181
2  sell  8042.93   0.001305
3  sell  8042.94   0.001181
4  sell  8042.95   1.265966
  side    price      size
0  buy  8042.90  9.118009
1  buy  8042.83  0.002487
2  buy  8042.50  0.420000
3  buy  8042.00  0.001000
4  buy  8041.33  0.020000
  side    price   size                      time
0  buy  8041.33  0.000  2018-04-17T00:55:04.358Z
1  buy  8041.43  0.020  2018-04-17T00:55:04.375Z
2  buy  7940.12  0.000  2018-04-17T00:55:04.395Z
3  buy  8039.00  0.001  2018-04-17T00:55:04.412Z
4  buy  7972.56  0.000  2018-04-17T00:55:04.413Z
       side    price      size                      time
18088  sell  8078.07  0.000000  2018-04-17T01:05:03.247Z
18089   buy  8039.05  0.000000  2018-04-17T01:05:03.330Z
18090  sell  8040.52  0.005328  2018-04-

In [7]:
# Bid= buy
# Ask = sell

#L2 snapshot structure
    # [side,price,size]
    # 'side' added as part of structure for classification
    
#l2 updates structure
    # [side, price, size, time]

# Note on GDAX API about l2update structure:
    # size of "0" indicates the price level can be removed

In [8]:
#One-hot/categorical encoding test

#Join together buy and sell side of orderbook for encoding:
snapshot_both_df = pd.concat([snapshot_asks_df,snapshot_bids_df], axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
snapshot_both_df.count()

side     28242
price    28242
size     28242
dtype: int64

In [10]:
# snapshot data cat/one-hot encoded:
data_s = snapshot_both_df.values
X_s = data_s[:,1:3]
y_s = data_s[:,0:1]
y_s = np.ravel(y_s)
label_encoder = LabelEncoder()
encoded_y_s = label_encoder.fit_transform(y_s)
onehot_encoder = OneHotEncoder(sparse=False)
encoded_y_s = encoded_y_s.reshape(len(encoded_y_s), 1)
onehot_y_s = onehot_encoder.fit_transform(encoded_y_s)

In [15]:
encoded_y_s,onehot_y_s,X_s

(array([[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]], dtype=int64), array([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]]), array([[8042.91, 10.65614728],
        [8042.92, 0.00118097],
        [8042.93, 0.00130538],
        ...,
        [0.03, 7498.221],
        [0.02, 25955.001],
        [0.01, 153092.98092796]], dtype=object))

In [26]:
# L2 data cat/one-hot encoded:
data_l2 = l2update_df.values
X_l2 = data_l2[:,1:4]
y_l2 = data_l2[:,0:1]
y_l2 = np.ravel(y_l2)
label_encoder = LabelEncoder()
encoded_y_l2 = label_encoder.fit_transform(y_l2)
onehot_encoder = OneHotEncoder(sparse=False)
encoded_y_l2 = encoded_y_l2.reshape(len(encoded_y_l2), 1)
onehot_y_l2 = onehot_encoder.fit_transform(encoded_y_l2)

In [27]:
encoded_y_l2,onehot_y_l2,X_l2

(array([[0],
        [0],
        [0],
        ...,
        [1],
        [0],
        [0]], dtype=int64), array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [1., 0.],
        [1., 0.]]), array([[8041.33, 0.0, '2018-04-17T00:55:04.358Z'],
        [8041.43, 0.02, '2018-04-17T00:55:04.375Z'],
        [7940.12, 0.0, '2018-04-17T00:55:04.395Z'],
        ...,
        [8040.52, 0.00532786, '2018-04-17T01:05:03.431Z'],
        [8039.06, 0.0, '2018-04-17T01:05:03.452Z'],
        [8039.0, 0.00375, '2018-04-17T01:05:03.486Z']], dtype=object))