In [116]:
__author__ = "Yicheng Li"
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
import random
from sklearn import preprocessing
import tensorflow as tf
import os

dir_path = 'CS341-repo/Data/'
df = pd.read_pickle(dir_path+'df_hourly_poloniex.pickle')
df = df.dropna()

df.head()

Unnamed: 0_level_0,USDT_BTC_high,USDT_BTC_low,USDT_BTC_close,USDT_BTC_open,USDT_BTC_volume,USDT_BTC_quoteVolume,USDT_BTC_weighted_mean,USDT_BTC_pctChange,USDT_ETH_high,USDT_ETH_low,...,BTC_LTC_weighted_mean,BTC_LTC_pctChange,BTC_XRP_high,BTC_XRP_low,BTC_XRP_close,BTC_XRP_open,BTC_XRP_volume,BTC_XRP_quoteVolume,BTC_XRP_weighted_mean,BTC_XRP_pctChange
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-02 12:00:00,432.5,432.5,432.5,432.5,40.041239,0.092581,432.5,2.220446e-16,0.959136,0.959136,...,0.008063,-0.002293,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.033605,2408.822942,1.4e-05,-0.002859
2016-01-02 13:00:00,432.5,432.5,432.5,432.5,0.0,0.0,432.986941,0.001125876,0.959136,0.959136,...,0.00806,-0.000333,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.0,0.0,1.4e-05,0.004704
2016-01-02 14:00:00,437.3635,432.48,433.336667,433.52799,359.269753,0.828819,433.473883,0.00112461,0.959136,0.957,...,0.008073,0.001623,1.4e-05,1.4e-05,1.4e-05,1.4e-05,1.141981,81071.098773,1.4e-05,0.004682
2016-01-02 15:00:00,432.48,432.48,432.48,432.48,60.859598,0.140722,432.48,-0.002292832,0.957,0.957,...,0.008089,0.002002,1.4e-05,1.4e-05,1.4e-05,1.4e-05,2.120423,150622.792769,1.4e-05,-0.000492
2016-01-02 16:00:00,432.48,432.48,432.48,432.48,0.0,0.0,432.35,-0.0003005919,0.957,0.957,...,0.008079,-0.001224,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.491516,35178.793196,1.4e-05,-0.007526


In [100]:
df2 = pd.read_pickle(dir_path+'df_hourly_news_sentiment_reach.pickle')
df2 = df2.reset_index()

df2['date'] = df2['datetime']
df2['datetime'] = pd.to_datetime(df2['datetime'],unit='ms')

df2 = df2.set_index('datetime')
df2 = df2.rename(columns={'date':'timestamp'})

In [101]:
df3 = df.join(df2).dropna() # drop nan later so we don't mess up with test set

In [104]:
scaler_features = preprocessing.MinMaxScaler(feature_range=(0.1, 1))

In [105]:
# function to create train, validation, test data given sequence length
def load_data(df, seq_len, feature_set, test_size=-1):
    # prepare one-hot labels
    labels = df['USDT_BTC_pctChange'].as_matrix().reshape([-1,1])
    labels = np.concatenate([(labels > 3e-3)*1, ((3e-3 > labels)&(labels > -3e-3))*1, (labels < -3e-3)*1],1)
    
    data_raw = df.as_matrix() # convert to numpy array
    # fit scaler
    data_raw = scaler_features.fit_transform(data_raw[:, feature_set])
    data = []
    
    # create all possible sequences of length seq_len
    for index in range(len(data_raw) - seq_len + 1): 
        data.append(data_raw[index: index + seq_len, :])
    
    data = np.array(data)
    
    if test_size == -1: # take train and valid sets first
        n_train_valid_pairs = 3
        each_train_set_size_pct = 25
        each_valid_set_size_pct = 5

        each_train_set_size = round(each_train_set_size_pct/100*data.shape[0])
        each_valid_set_size = round(each_valid_set_size_pct/100*data.shape[0])

        x_train_sets = []
        y_train_sets = []
        x_valid_sets = []
        y_valid_sets = []
        used = 0

        for i in range(n_train_valid_pairs):
            x_train_sets.append(data[used : used + each_train_set_size,:-1,:]) # cannot see last day, which we aim to predict
            y_train_sets.append(labels[used + seq_len-1 : used + each_train_set_size + seq_len-1, :])
            used += each_train_set_size

            x_valid_sets.append(data[used : used + each_valid_set_size,:-1,:])
            y_valid_sets.append(labels[used + seq_len-1 : used + each_valid_set_size + seq_len-1, :])
            used += each_valid_set_size

        x_test = data[used : , :-1, :]
        y_test = labels[seq_len-1 + used : , :]

        x_train = np.concatenate(x_train_sets, axis=0)
        y_train = np.concatenate(y_train_sets, axis=0)
        x_valid = np.concatenate(x_valid_sets, axis=0)
        y_valid = np.concatenate(y_valid_sets, axis=0)
    
    else: # take test set first
        labels = labels[seq_len-1:] # so labels and data has same length
        x_test = data[-test_size : , :-1, :]
        y_test = labels[-test_size : , :]
        
        valid_start = data.shape[0] - test_size - int(test_size/2)
        x_valid = data[valid_start:-test_size, :-1, :]
        y_valid = labels[valid_start:-test_size, :]
        
        x_train = data[:valid_start, :-1, :]
        y_train = labels[:valid_start, :]
        
    return [x_train, y_train, x_valid, y_valid, x_test, y_test]
        

In [106]:
# create train, test data
seq_len = 10 # choose sequence length
feature_set = [x for x in range(64)] #[0,1,2,3,4,6,7]
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(df3, seq_len, feature_set, test_size=1996) # test_size = 1996
# y_train = y_train.reshape([-1,1])
# y_valid = y_valid.reshape([-1,1])
# y_test = y_test.reshape([-1,1])
print('x_train.shape = ',x_train.shape)
print('y_train.shape = ', y_train.shape)
print('x_valid.shape = ',x_valid.shape)
print('y_valid.shape = ', y_valid.shape)
print('x_test.shape = ', x_test.shape)
print('y_test.shape = ',y_test.shape)

x_train.shape =  (6711, 9, 64)
y_train.shape =  (6711, 3)
x_valid.shape =  (998, 9, 64)
y_valid.shape =  (998, 3)
x_test.shape =  (1996, 9, 64)
y_test.shape =  (1996, 3)




In [26]:
def f1_score_single(y_true, y_pred):
    TP = y_true.dot(y_pred) # zero or one
    FP = np.sum(y_pred > y_true) # sum over all k classes, zero or one
    FN = np.sum(y_pred < y_true) # sum over all k classes, zero or one
    
    if TP == 0: return 0.
    p = 1. * TP / (TP + FP)
    r = 1. * TP / (TP + FN)
    return 2 * p * r / (p + r)
    
def F1(y_true, y_pred):
    return np.mean([f1_score_single(x, y) for x, y in zip(y_true, y_pred)])

## baseline

In [107]:
y_pred = np.roll(y_valid,1, axis=0)
print('baseline dev_F1=',F1(y_valid[1:], y_pred[1:]))
y_pred = np.roll(y_test,1, axis=0)
y_pred[0] = y_valid[-1] # be careful here
print('baseline test_F1=',F1(y_test, y_pred))

baseline dev_F1= 0.39618856569709127
baseline test_F1= 0.3712424849699399


In [108]:
index_in_epoch = 0;
perm_array  = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)

# function to get the next batch
def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array   
    start = index_in_epoch
    index_in_epoch += batch_size
    
    if index_in_epoch > x_train.shape[0]:
        np.random.shuffle(perm_array) # shuffle permutation array
        start = 0 # start next epoch
        index_in_epoch = batch_size
        
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]

x_1000_train, y_1000_train = get_next_batch(1000) # special batch of 1000 records in training set

In [113]:
class LSTM_Model(object):
    def __init__(self, seq_len):
        # parameters
        self.n_steps = seq_len-1 
        self.n_inputs = x_train.shape[-1]
        self.n_neurons = 100  # cell.state_size
        self.n_bins = 3 # be careful if you want to change this
        self.n_layers = 2
        self.batch_size = 100
        self.n_epochs = 0 # 0 means to train indefinitely
        self.train_set_size = x_train.shape[0]
        self.test_set_size = x_test.shape[0]
        self.keep_prob = tf.placeholder(tf.float32, [])
        self.max_gradient_norm = 5
        with tf.variable_scope("LSTM_Model", initializer=tf.contrib.layers.xavier_initializer()):
            self.X = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
            self.y = tf.placeholder(tf.float32, [None, self.n_bins])

            layers = [tf.contrib.rnn.LSTMCell(num_units=self.n_neurons, \
                                              initializer=tf.contrib.layers.xavier_initializer(), \
                                              activation=tf.nn.elu)
                     for layer in range(self.n_layers)]
            
#             layers = [tf.contrib.rnn.BasicRNNCell(num_units=self.n_neurons, \
#                                               activation=tf.nn.elu)
#                      for layer in range(self.n_layers)]
            
            multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)

            outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, self.X, dtype=tf.float32)
            outputs = tf.nn.dropout(outputs, self.keep_prob) # dropout
            # 'outputs' is a tensor of shape [batch_size, n_steps, n_neurons(cell.state_size)]
        
#             directly output
#             logits = outputs[:,self.n_steps-1,:] # keep only last output of sequence
            
            # ======== attn layer ===================
#             sim_mat = tf.matmul(outputs, tf.transpose(outputs, perm=[0,2,1]))
#             attn_dist = tf.nn.softmax(sim_mat, 2)
#             # (batchsize, n_steps, n_steps)
#             attn_outputs = tf.matmul(attn_dist, outputs)
#             # (batchsize, n_steps, n_bins)
            # ========================================
            
            stacked_outputs = tf.reshape(outputs, [-1, self.n_neurons]) 
            stacked_outputs = tf.layers.dense(stacked_outputs, self.n_bins)
            final_outputs = tf.reshape(stacked_outputs, [-1, self.n_steps, self.n_bins])
            
            self.final_logits = final_outputs[:, -1, :] # last timestep
            # (batchsize, n_bins), this is logits, not probs!!
            self.final_logits = tf.nn.dropout(self.final_logits, self.keep_prob) # dropout
            
            self.indices = tf.argmax(self.final_logits, axis=-1) # (batchsize, 1)
            self.preds = tf.one_hot(self.indices, depth=self.n_bins)
            
            self.each_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.final_logits, labels=self.y)
            self.loss = tf.reduce_mean(self.each_loss) 

            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.gradient_norm = tf.global_norm(gradients)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            clipped_norm = tf.global_norm(clipped_gradients)
            self.param_norm = tf.global_norm(params)
            self.learning_rate_placeholder = tf.placeholder(tf.float32, [], name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_placeholder) 
            # training_op = optimizer.minimize(loss)
            self.training_op = optimizer.apply_gradients(zip(clipped_gradients, params))

            # initialize parameters
#             sess = tf.Session()
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.saver = tf.train.Saver(max_to_keep=2)
            self.bestmodel_saver = tf.train.Saver(max_to_keep=2)
    
    def train(self, session, experiment_name, keep_prob_val):
        
        bestmodel_dir = experiment_name+'/best_ckpt'
        bestmodel_ckpt_path = bestmodel_dir+'/best.ckpt'
        best_valid_f1 = None
        # Make bestmodel dir if necessary
        if not os.path.exists(bestmodel_dir):
            os.makedirs(bestmodel_dir)
        
        ckpt = tf.train.get_checkpoint_state(experiment_name)
        v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
        if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
            self.saver.restore(sess, ckpt.model_checkpoint_path)
            iteration = self.global_step.eval(session=sess) # get last global_step
            print("Start from iteration:", iteration)
            lr = 1e-3
        else:
            print('There is not saved parameters. Creating model with fresh parameters.')
            sess.run(tf.global_variables_initializer())
            iteration = 0
            lr = 1e-2 # should fix this...

        old_loss = 1000
        
        while self.n_epochs == 0 or iteration*self.batch_size/self.train_set_size < self.n_epochs:
            iteration = iteration + 1
            x_batch, y_batch = get_next_batch(self.batch_size) # fetch the next training batch 

            # train on this batch
            sess.run(self.training_op, feed_dict={self.X: x_batch, self.y: y_batch, self.learning_rate_placeholder:lr, \
                                                 self.keep_prob: keep_prob_val})

            if iteration % 50 == 0:
                y_1000_train_pred, loss_val, param_norm_val, grad_norm_val = \
                    sess.run([self.preds, self.loss, self.param_norm, self.gradient_norm],\
                            feed_dict={self.X: x_1000_train, self.y:y_1000_train, \
                                        self.learning_rate_placeholder:lr, self.keep_prob: keep_prob_val})
                    
                if loss_val > old_loss * 1.2:
                    lr /= 2
                old_loss = loss_val

                y_valid_pred = sess.run(self.preds, feed_dict={self.X: x_valid, self.keep_prob: keep_prob_val})
                
                valid_f1 = F1(y_valid, y_valid_pred)
                print('%.2f epochs, iter %d: train_loss = %.9f, param_norm = %.3f, grad_norm = %.3f, train_F1/valid_F1 = %.6f/%.6f' \
                      %(iteration*self.batch_size/self.train_set_size, iteration, loss_val, param_norm_val, grad_norm_val, \
                        F1(y_1000_train, y_1000_train_pred), \
                        valid_f1))

                if best_valid_f1 is None or valid_f1 > best_valid_f1:
                    best_valid_f1 = valid_f1
                    print("======New best valid F1. Saving to %s..." % bestmodel_ckpt_path)
                    self.bestmodel_saver.save(sess, bestmodel_ckpt_path, global_step=self.global_step)
                
            if iteration % 100 == 0:
                self.global_step.assign(iteration).eval(session=sess) # set and update(eval) global_step with index, i
                save_path = self.saver.save(sess, "./"+experiment_name+"/model.ckpt", global_step=self.global_step)
                print('Saved parameters to %s' % save_path)

## run experiment

In [110]:
experiment_name="all_features_with_sentiment_10_seq_100hidden_2layer_keep08"

In [114]:
tf.reset_default_graph()
lstm_model = LSTM_Model(seq_len=seq_len)
sess = tf.Session()

In [117]:
lstm_model.train(session=sess, experiment_name=experiment_name, keep_prob_val=0.8)
# IMPORTANT:
# when you think F1 is not going to improve anymore, wait another 10 epochs. 
# if you see any better iteration that has not appeared before, keep waiting.

There is not saved parameters. Creating model with fresh parameters.
0.75 epochs, iter 50: train_loss = 1.098032475, param_norm = 22.490, grad_norm = 0.206, train_F1/valid_F1 = 0.366000/0.212425
1.49 epochs, iter 100: train_loss = 1.099740028, param_norm = 22.471, grad_norm = 0.187, train_F1/valid_F1 = 0.340000/0.434870
Saved parameters to ./all_features_with_sentiment_10_seq_100hidden_2layer_keep08/model.ckpt-100
2.24 epochs, iter 150: train_loss = 1.092554450, param_norm = 22.824, grad_norm = 0.049, train_F1/valid_F1 = 0.353000/0.198397
2.98 epochs, iter 200: train_loss = 1.124582052, param_norm = 25.347, grad_norm = 0.323, train_F1/valid_F1 = 0.352000/0.418838
Saved parameters to ./all_features_with_sentiment_10_seq_100hidden_2layer_keep08/model.ckpt-200
3.73 epochs, iter 250: train_loss = 1.088899136, param_norm = 29.801, grad_norm = 0.020, train_F1/valid_F1 = 0.381000/0.380762
4.47 epochs, iter 300: train_loss = 1.089793801, param_norm = 31.630, grad_norm = 0.070, train_F1/valid_F

34.27 epochs, iter 2300: train_loss = 1.043236136, param_norm = 77.961, grad_norm = 0.152, train_F1/valid_F1 = 0.462000/0.264529
Saved parameters to ./all_features_with_sentiment_10_seq_100hidden_2layer_keep08/model.ckpt-2300
35.02 epochs, iter 2350: train_loss = 1.034645319, param_norm = 78.472, grad_norm = 0.104, train_F1/valid_F1 = 0.462000/0.267535
35.76 epochs, iter 2400: train_loss = 1.032690644, param_norm = 78.893, grad_norm = 0.076, train_F1/valid_F1 = 0.469000/0.290581
Saved parameters to ./all_features_with_sentiment_10_seq_100hidden_2layer_keep08/model.ckpt-2400
36.51 epochs, iter 2450: train_loss = 1.034169912, param_norm = 79.405, grad_norm = 0.100, train_F1/valid_F1 = 0.458000/0.308617
37.25 epochs, iter 2500: train_loss = 1.039065957, param_norm = 80.125, grad_norm = 0.219, train_F1/valid_F1 = 0.470000/0.302605
Saved parameters to ./all_features_with_sentiment_10_seq_100hidden_2layer_keep08/model.ckpt-2500
38.00 epochs, iter 2550: train_loss = 1.036708474, param_norm = 

KeyboardInterrupt: 

In [125]:
# load best checkpoint (based on dev f1) and evaluate
ckpt = tf.train.get_checkpoint_state(experiment_name+'/best_ckpt')
v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
    lstm_model.saver.restore(sess, ckpt.model_checkpoint_path)
else:
    raise ValueError('What? you dont have a best checkpoint?')

y_1000_train_pred = sess.run(lstm_model.preds, feed_dict={lstm_model.X: x_1000_train, lstm_model.keep_prob: 0.8})
print("train F1:",F1(y_1000_train, y_1000_train_pred))
y_valid_pred = sess.run(lstm_model.preds, feed_dict={lstm_model.X: x_valid, lstm_model.keep_prob: 0.8})
print("dev F1:",F1(y_valid, y_valid_pred))
y_test_pred = sess.run(lstm_model.preds, feed_dict={lstm_model.X: x_test, lstm_model.keep_prob: 0.8})
print("test F1:",F1(y_test, y_test_pred))

INFO:tensorflow:Restoring parameters from all_features_with_sentiment_10_seq_100hidden_2layer_keep08/best_ckpt/best.ckpt-0
train F1: 0.36
dev F1: 0.40080160320641284
test F1: 0.3712424849699399
