In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
import random
from sklearn import preprocessing
import tensorflow as tf
import os

dir_path = './data/'
df = pd.read_pickle(dir_path+'df_hourly_poloniex.pickle')
df = df.dropna()

df.head()

  from pandas.core import datetools


Unnamed: 0_level_0,USDT_BTC_high,USDT_BTC_low,USDT_BTC_close,USDT_BTC_open,USDT_BTC_volume,USDT_BTC_quoteVolume,USDT_BTC_weighted_mean,USDT_BTC_pctChange,USDT_ETH_high,USDT_ETH_low,...,BTC_LTC_weighted_mean,BTC_LTC_pctChange,BTC_XRP_high,BTC_XRP_low,BTC_XRP_close,BTC_XRP_open,BTC_XRP_volume,BTC_XRP_quoteVolume,BTC_XRP_weighted_mean,BTC_XRP_pctChange
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-02 12:00:00,432.5,432.5,432.5,432.5,40.041239,0.092581,432.5,2.220446e-16,0.959136,0.959136,...,0.008063,-0.002293,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.033605,2408.822942,1.4e-05,-0.002859
2016-01-02 13:00:00,432.5,432.5,432.5,432.5,0.0,0.0,432.986941,0.001125876,0.959136,0.959136,...,0.00806,-0.000333,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.0,0.0,1.4e-05,0.004704
2016-01-02 14:00:00,437.3635,432.48,433.336667,433.52799,359.269753,0.828819,433.473883,0.00112461,0.959136,0.957,...,0.008073,0.001623,1.4e-05,1.4e-05,1.4e-05,1.4e-05,1.141981,81071.098773,1.4e-05,0.004682
2016-01-02 15:00:00,432.48,432.48,432.48,432.48,60.859598,0.140722,432.48,-0.002292832,0.957,0.957,...,0.008089,0.002002,1.4e-05,1.4e-05,1.4e-05,1.4e-05,2.120423,150622.792769,1.4e-05,-0.000492
2016-01-02 16:00:00,432.48,432.48,432.48,432.48,0.0,0.0,432.35,-0.0003005919,0.957,0.957,...,0.008079,-0.001224,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.491516,35178.793196,1.4e-05,-0.007526


In [2]:
n_pairs = 7 # USDT_BTC, USDT_ETH, USDT_LTC, USDT_XRP, BTC_ETH, BTC_LTC, BTC_XRP
n_channels = 8 # high, low, close, open, volume, quoteVolume, weighted mean, pctChange

In [3]:
scaler_features = preprocessing.MinMaxScaler(feature_range=(0.1, 1))

In [4]:
# function to create train, validation, test data given sequence length
def load_data(df, seq_len, test_size=-1):
    # prepare one-hot labels
    labels = df['USDT_BTC_pctChange'].as_matrix().reshape([-1,1])
    labels = np.concatenate([(labels > 3e-3)*1, ((3e-3 > labels)&(labels > -3e-3))*1, (labels < -3e-3)*1],1)
    labels = labels[seq_len-1:] # so labels and data has same length
    
    feature_set = [x for x in range(56)] #[0,1,2,3,4,6,7]
    
    data_raw = df.as_matrix() # convert to numpy array
    # fit scaler
    data_raw = scaler_features.fit_transform(data_raw[:, feature_set])
    
    # reshape
    data_raw = data_raw.reshape((-1, n_pairs, n_channels))
    
    data = []
    
    # create all possible sequences of length seq_len
    for index in range(len(data_raw) - seq_len + 1): 
        data.append(data_raw[index: index + seq_len, :])
    
    data = np.array(data)
    
    if test_size == -1: # split the old way
        n_train_valid_pairs = 3
        each_train_set_size_pct = 25
        each_valid_set_size_pct = 5

        each_train_set_size = round(each_train_set_size_pct/100*data.shape[0])
        each_valid_set_size = round(each_valid_set_size_pct/100*data.shape[0])

        x_train_sets = []
        y_train_sets = []
        x_valid_sets = []
        y_valid_sets = []
        used = 0

        for i in range(n_train_valid_pairs):
            x_train_sets.append(data[used : used + each_train_set_size,:-1,:]) # cannot see last day, which we aim to predict
            y_train_sets.append(labels[used : used + each_train_set_size, :])
            used += each_train_set_size

            x_valid_sets.append(data[used : used + each_valid_set_size,:-1,:])
            y_valid_sets.append(labels[used : used + each_valid_set_size, :])
            used += each_valid_set_size

        x_test = data[used : , :-1, :]
        y_test = labels[used : , :]

        x_train = np.concatenate(x_train_sets, axis=0)
        y_train = np.concatenate(y_train_sets, axis=0)
        x_valid = np.concatenate(x_valid_sets, axis=0)
        y_valid = np.concatenate(y_valid_sets, axis=0)
    
    else:
        x_test = data[-test_size : , :-1, :]
        y_test = labels[-test_size : , :]
        
        valid_start = data.shape[0] - test_size - int(test_size/2)
        x_valid = data[valid_start:-test_size, :-1, :]
        y_valid = labels[valid_start:-test_size, :]
        
        x_train = data[:valid_start, :-1, :]
        y_train = labels[:valid_start, :]
    
    return [x_train, y_train, x_valid, y_valid, x_test, y_test]

In [5]:
# create train, test data
seq_len = 10 # choose sequence length
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(df, seq_len, test_size=1996)
# y_train = y_train.reshape([-1,1])
# y_valid = y_valid.reshape([-1,1])
# y_test = y_test.reshape([-1,1])
print('x_train.shape = ',x_train.shape)
print('y_train.shape = ', y_train.shape)
print('x_valid.shape = ',x_valid.shape)
print('y_valid.shape = ', y_valid.shape)
print('x_test.shape = ', x_test.shape)
print('y_test.shape = ',y_test.shape)

x_train.shape =  (16981, 9, 7, 8)
y_train.shape =  (16981, 3)
x_valid.shape =  (998, 9, 7, 8)
y_valid.shape =  (998, 3)
x_test.shape =  (1996, 9, 7, 8)
y_test.shape =  (1996, 3)


In [6]:
from sklearn.metrics import f1_score

# baseline

In [7]:
y_pred = np.roll(y_valid,1, axis=0)
print('baseline dev_F1=',f1_score(y_valid[1:], y_pred[1:], average='micro'))  
y_pred = np.roll(y_test,1, axis=0)
y_pred[0] = y_valid[-1] # be careful here
print('baseline test_F1=',f1_score(y_test, y_pred, average='micro'))

baseline dev_F1= 0.39618856569709127
baseline test_F1= 0.3712424849699399


In [8]:
index_in_epoch = 0;
perm_array  = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)

# function to get the next batch
def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array   
    start = index_in_epoch
    index_in_epoch += batch_size
    
    if index_in_epoch > x_train.shape[0]:
        np.random.shuffle(perm_array) # shuffle permutation array
        start = 0 # start next epoch
        index_in_epoch = batch_size
        
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]

x_1000_train, y_1000_train = get_next_batch(1000) # special batch of 1000 records in training set

In [9]:
class vanilla_CNN(object):
    def __init__(self, n_pairs, seq_len, n_channels):
        # parameters
        self.n_steps = seq_len-1
        self.n_pairs = n_pairs
        self.n_channels = n_channels
        self.n_bins = 3
        self.keep_prob = tf.placeholder(tf.float32, [])
        self.batch_size = 100
        self.n_epochs = 0 # 0 means to train indefinitely
        self.train_set_size = x_train.shape[0]
        self.test_set_size = x_test.shape[0]
        self.keep_prob = tf.placeholder(tf.float32, [])
        self.max_gradient_norm = 5
        
        with tf.variable_scope("vanilla_CNN", initializer=tf.contrib.layers.xavier_initializer()):
            self.X = tf.placeholder(tf.float32, [None, self.n_steps, self.n_pairs, self.n_channels])
            self.y = tf.placeholder(tf.float32, [None, self.n_bins])
            
            # Convolutional Layer #1
            conv1 = tf.layers.conv2d(inputs=self.X,
                                     filters=64,
                                     kernel_size=[4, 1],
                                     padding="same",
                                     activation=tf.nn.relu)
            
            # Pooling Layer #1
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 1], strides=[2, 1])
            
            # Convolutional Layer #2
            conv2 = tf.layers.conv2d(inputs=pool1,
                                     filters=32,
                                     kernel_size=[4, 1],
                                     padding="same",
                                     activation=tf.nn.relu)
            
            # Pooling Layer #2
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=[2, 2])
            
            # Convolutional Layer #3
            conv3 = tf.layers.conv2d(inputs=pool2,
                                     filters=8,
                                     kernel_size=[4, 7],
                                     padding="same",
                                     activation=tf.nn.relu)
            # Pooling Layer #3
            pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=[2, 2])
            
            pool3_flat = tf.reshape(pool3, [-1, 8])
            
            fc1 = tf.layers.dense(inputs=pool3_flat, units=512, activation=tf.nn.relu)
            
            dp1 = tf.nn.dropout(fc1, self.keep_prob)
            
            fc2 = tf.layers.dense(inputs=dp1, units=256)
            
            dp2 = tf.nn.dropout(fc2, self.keep_prob)
            
            fc3 = tf.layers.dense(inputs=dp1, units=self.n_bins)
            
            self.final_logits = tf.layers.dense(inputs=fc3, units=self.n_bins)
            
            self.indices = tf.argmax(self.final_logits, axis=-1) # (batchsize, 1)
            self.preds = tf.one_hot(self.indices, depth=self.n_bins)
            
            self.each_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.final_logits, labels=self.y)
            self.loss = tf.reduce_mean(self.each_loss) 

            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.gradient_norm = tf.global_norm(gradients)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            clipped_norm = tf.global_norm(clipped_gradients)
            self.param_norm = tf.global_norm(params)
            self.learning_rate_placeholder = tf.placeholder(tf.float32, [], name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_placeholder) 
            # training_op = optimizer.minimize(loss)
            self.training_op = optimizer.apply_gradients(zip(clipped_gradients, params))

            # initialize parameters
#             sess = tf.Session()
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.saver = tf.train.Saver(max_to_keep=2)
            self.bestmodel_saver = tf.train.Saver(max_to_keep=2)


    def train(self, session, experiment_name, keep_prob_val):
        
        bestmodel_dir = experiment_name+'/best_ckpt'
        bestmodel_ckpt_path = bestmodel_dir+'/best.ckpt'
        best_valid_f1 = None
        # Make bestmodel dir if necessary
        if not os.path.exists(bestmodel_dir):
            os.makedirs(bestmodel_dir)
        
        ckpt = tf.train.get_checkpoint_state(experiment_name)
        v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
        if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
            self.saver.restore(sess, ckpt.model_checkpoint_path)
            iteration = self.global_step.eval(session=sess) # get last global_step
            print("Start from iteration:", iteration)
            lr = 1e-3
        else:
            print('There is not saved parameters. Creating model with fresh parameters.')
            sess.run(tf.global_variables_initializer())
            iteration = 0
            lr = 1e-2 # should fix this...

        old_loss = 1000
        
        while self.n_epochs == 0 or iteration*self.batch_size/self.train_set_size < self.n_epochs:
            iteration = iteration + 1
            x_batch, y_batch = get_next_batch(self.batch_size) # fetch the next training batch 

            # train on this batch
            sess.run(self.training_op, feed_dict={self.X: x_batch, self.y: y_batch, self.learning_rate_placeholder:lr, \
                                                 self.keep_prob: keep_prob_val})

            if iteration % 50 == 0:
                y_1000_train_pred, loss_val, param_norm_val, grad_norm_val = \
                    sess.run([self.preds, self.loss, self.param_norm, self.gradient_norm],\
                            feed_dict={self.X: x_1000_train, self.y:y_1000_train, \
                                        self.learning_rate_placeholder:lr, self.keep_prob: keep_prob_val})
                    
                if loss_val > old_loss * 1.2:
                    lr /= 2
                old_loss = loss_val

                y_valid_pred = sess.run(self.preds, feed_dict={self.X: x_valid, self.keep_prob: keep_prob_val})
                
                valid_f1 = f1_score(y_valid, y_valid_pred, average='micro')
                print('%.2f epochs, iter %d: train_loss = %.9f, param_norm = %.3f, grad_norm = %.3f, train_F1/valid_F1 = %.6f/%.6f' \
                      %(iteration*self.batch_size/self.train_set_size, iteration, loss_val, param_norm_val, grad_norm_val, \
                        f1_score(y_1000_train, y_1000_train_pred, average='micro'), \
                        valid_f1))

                if best_valid_f1 is None or valid_f1 > best_valid_f1:
                    best_valid_f1 = valid_f1
                    print("======New best valid F1. Saving to %s..." % bestmodel_ckpt_path)
                    self.bestmodel_saver.save(sess, bestmodel_ckpt_path, global_step=self.global_step)
                
            if iteration % 100 == 0:
                self.global_step.assign(iteration).eval(session=sess) # set and update(eval) global_step with index, i
                save_path = self.saver.save(sess, "./"+experiment_name+"/model.ckpt", global_step=self.global_step)
                print('Saved parameters to %s' % save_path)

In [10]:
experiment_name="vanilla_CNN"

In [11]:
tf.reset_default_graph()
cnn_model = vanilla_CNN(n_pairs=n_pairs,seq_len=seq_len,n_channels=n_channels)
sess = tf.Session()

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [12]:
cnn_model.train(session=sess, experiment_name=experiment_name, keep_prob_val=0.6)
# IMPORTANT:
# when you think F1 is not going to improve anymore, wait another 10 epochs. 
# if you see any better iteration that has not appeared before, keep waiting.

There is not saved parameters. Creating model with fresh parameters.
0.29 epochs, iter 50: train_loss = 1.045646429, param_norm = 20.831, grad_norm = 0.068, train_F1/valid_F1 = 0.484000/0.186373
0.59 epochs, iter 100: train_loss = 1.045559168, param_norm = 20.834, grad_norm = 0.110, train_F1/valid_F1 = 0.484000/0.194389
Saved parameters to ./vanilla_CNN/model.ckpt-100
0.88 epochs, iter 150: train_loss = 1.047325253, param_norm = 20.811, grad_norm = 0.035, train_F1/valid_F1 = 0.484000/0.186373
1.18 epochs, iter 200: train_loss = 1.054908156, param_norm = 20.957, grad_norm = 0.117, train_F1/valid_F1 = 0.484000/0.186373
Saved parameters to ./vanilla_CNN/model.ckpt-200
1.47 epochs, iter 250: train_loss = 1.030577183, param_norm = 21.124, grad_norm = 0.142, train_F1/valid_F1 = 0.483000/0.342685
1.77 epochs, iter 300: train_loss = 1.047845483, param_norm = 21.196, grad_norm = 0.280, train_F1/valid_F1 = 0.484000/0.377756
Saved parameters to ./vanilla_CNN/model.ckpt-300
2.06 epochs, iter 350: 

15.02 epochs, iter 2550: train_loss = 0.991187394, param_norm = 23.882, grad_norm = 0.346, train_F1/valid_F1 = 0.500000/0.331663
15.31 epochs, iter 2600: train_loss = 0.991090178, param_norm = 23.898, grad_norm = 0.337, train_F1/valid_F1 = 0.476000/0.385772
Saved parameters to ./vanilla_CNN/model.ckpt-2600
15.61 epochs, iter 2650: train_loss = 1.000569105, param_norm = 23.924, grad_norm = 0.371, train_F1/valid_F1 = 0.505000/0.342685
15.90 epochs, iter 2700: train_loss = 0.980285406, param_norm = 24.001, grad_norm = 0.104, train_F1/valid_F1 = 0.521000/0.396794
Saved parameters to ./vanilla_CNN/model.ckpt-2700
16.19 epochs, iter 2750: train_loss = 0.987045169, param_norm = 24.072, grad_norm = 0.408, train_F1/valid_F1 = 0.493000/0.387776
16.49 epochs, iter 2800: train_loss = 0.993848324, param_norm = 24.070, grad_norm = 0.304, train_F1/valid_F1 = 0.490000/0.302605
Saved parameters to ./vanilla_CNN/model.ckpt-2800
16.78 epochs, iter 2850: train_loss = 0.981218159, param_norm = 24.132, grad

30.92 epochs, iter 5250: train_loss = 0.992329955, param_norm = 25.550, grad_norm = 0.200, train_F1/valid_F1 = 0.497000/0.212425
31.21 epochs, iter 5300: train_loss = 1.017305374, param_norm = 25.605, grad_norm = 0.814, train_F1/valid_F1 = 0.509000/0.230461
Saved parameters to ./vanilla_CNN/model.ckpt-5300
31.51 epochs, iter 5350: train_loss = 0.981570423, param_norm = 25.588, grad_norm = 0.230, train_F1/valid_F1 = 0.518000/0.263527
31.80 epochs, iter 5400: train_loss = 0.977464259, param_norm = 25.654, grad_norm = 0.130, train_F1/valid_F1 = 0.519000/0.269539
Saved parameters to ./vanilla_CNN/model.ckpt-5400
32.09 epochs, iter 5450: train_loss = 1.010827065, param_norm = 25.644, grad_norm = 0.469, train_F1/valid_F1 = 0.498000/0.367735
32.39 epochs, iter 5500: train_loss = 0.987664700, param_norm = 25.697, grad_norm = 0.326, train_F1/valid_F1 = 0.520000/0.314629
Saved parameters to ./vanilla_CNN/model.ckpt-5500
32.68 epochs, iter 5550: train_loss = 0.981081426, param_norm = 25.731, grad

KeyboardInterrupt: 

In [13]:
# load best checkpoint (based on dev f1) and evaluate
ckpt = tf.train.get_checkpoint_state(experiment_name+'/best_ckpt')
v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
    cnn_model.saver.restore(sess, ckpt.model_checkpoint_path)
else:
    raise ValueError('What? you dont have a best checkpoint?')

y_1000_train_pred = sess.run(cnn_model.preds, feed_dict={cnn_model.X: x_1000_train, cnn_model.keep_prob: 0.8})
print("train F1:",f1_score(y_1000_train, y_1000_train_pred, average='micro'))
y_valid_pred = sess.run(cnn_model.preds, feed_dict={cnn_model.X: x_valid, cnn_model.keep_prob: 0.8})
print("dev F1:",f1_score(y_valid, y_valid_pred, average='micro'))
y_test_pred = sess.run(cnn_model.preds, feed_dict={cnn_model.X: x_test, cnn_model.keep_prob: 0.8})
print("test F1:",f1_score(y_test, y_test_pred, average='micro'))

INFO:tensorflow:Restoring parameters from vanilla_CNN/best_ckpt/best.ckpt-1600
train F1: 0.514
dev F1: 0.42084168336673344
test F1: 0.38927855711422843


In [24]:
# Code for checking dimension

seq_len = 10
n_steps = seq_len - 1
n_pairs = 7
n_channels = 8
batch_size = 100

x = tf.Variable(tf.random_normal([batch_size, n_steps, n_pairs, n_channels]))

conv1 = tf.layers.conv2d(inputs=x,
                                     filters=128,
                                     kernel_size=[4, 1],
                                     padding="same",
                                     activation=tf.nn.relu) # [100, 9, 7, 128]

pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 1], strides=[2, 1]) # [100, 4, 7, 128]


conv2 = tf.layers.conv2d(inputs=pool1,
                                     filters=64,
                                     kernel_size=[4, 1],
                                     padding="same",
                                     activation=tf.nn.relu) # [100, 4, 7, 64]

pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=[2, 2]) # [100, 2, 3, 64]

conv3 = tf.layers.conv2d(inputs=pool2,
                                     filters=32, 
                                     kernel_size=[4, 7],
                                     padding="same",
                                     activation=tf.nn.relu) # [100, 2, 3, 32]

pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=[2, 2]) # [100, 1, 1, 32]

pool3_flat = tf.reshape(pool3, [-1, 32]) # [100,32]

fc1 = tf.layers.dense(inputs=pool3_flat, units=512, activation=tf.nn.relu)

dropout = tf.nn.dropout(fc1, 0.8)

fc2 = tf.layers.dense(inputs=dropout, units=3) # [100,32]

fc2.shape 

TensorShape([Dimension(100), Dimension(3)])