In [None]:
# --- Optimisation schedule ---
batchSize = 50
epochs = 20
learningRate = 1e-3

# --- Input geometry ---
# Number of time steps per example; used as the first dimension of the
# model's Input shape.
maxNumIntervals = 250

# --- Network architecture ---
recurrentType = 'LSTM'  # Either 'LSTM' or 'GRU'.
recurrentSize = 128
denseSize = 128

# --- Task ---
regression = False  # False: classify targets into classBins classes.
classBins = 3

# Feature tags select which saved arrays (features/<tag>-<ticker>.npy) to load.
featureTags = ['MFCC'] # ['MFCC', 'Pitches']

# Run label reused for the CSV log, checkpoint and TensorBoard names.
# It is a plain literal, so it has to be kept in sync with the values above by hand.
logName = 'B50-E20-LR1E3-LSTM-R128-D128-Class-3-MFCC'

In [None]:
# Optional cleanup: uncomment to delete artifacts left by previous runs.
# !rm -rf checkpoints/*
# !rm -rf tensorboard/*
# !rm *.log

In [None]:
# Every entry must be a complete ticker specification; each string is used
# verbatim when building the features/ and targets/ file names.
tickers = [
    'seekingalpha-AON-2014-Q2',
    'seekingalpha-MIXT-2017-Q1',
    'seekingalpha-LTBR-2016-Q3',
    'seekingalpha-FARM-2016-Q2',
    'seekingalpha-XOXO-2016-Q1',
    'seekingalpha-UFPI-2015-Q3',
    'seekingalpha-ECYT-2015-Q1',
    'seekingalpha-ARNA-2016-Q2',
    'seekingalpha-JCAP-2016-Q2',
]

In [None]:
import random

# Split the tickers into train / validation / test sets (first 5, next 2,
# and the remainder of the shuffled order).
#
# A private, seeded generator shuffles a *copy* of the list, so:
#   * the split is the same on every fresh run (reproducible results), and
#   * re-running this cell does not reshuffle `tickers` and silently change
#     which tickers land in which set.
splitSeed = 224
shuffledTickers = list(tickers)
random.Random(splitSeed).shuffle(shuffledTickers)

trainTickers = shuffledTickers[:5]
valTickers = shuffledTickers[5:7]
testTickers = shuffledTickers[7:]

In [None]:
from keras.layers.recurrent import LSTM, GRU
from keras.layers import Input, Activation, Flatten, Dense, Masking
from keras.callbacks import CSVLogger, ModelCheckpoint, TensorBoard
from keras import regularizers, optimizers
from keras.models import Model, load_model
import numpy as np

In [None]:
from tensorflow.python.client import device_lib

# Ask TensorFlow for the devices it can see and show the names of those
# whose device_type is 'GPU' (an empty list means none were reported).
localDeviceProtos = device_lib.list_local_devices()
[device.name for device in localDeviceProtos if device.device_type == 'GPU']

In [None]:
def _stackFeatures(tickerList):
    """Load the saved feature arrays for ``tickerList`` as a single array.

    For every tag in ``featureTags`` the per-ticker files
    ``features/<tag>-<ticker>.npy`` are joined along axis 0; the resulting
    per-tag arrays are then joined along axis 2.
    """
    perTag = []
    for tag in featureTags:
        perTicker = [np.load('features/' + tag + '-' + ticker + '.npy') for ticker in tickerList]
        perTag.append(np.concatenate(perTicker, axis = 0))
    return np.concatenate(perTag, axis = 2)

trainFeatures = _stackFeatures(trainTickers)
valFeatures = _stackFeatures(valTickers)
testFeatures = _stackFeatures(testTickers)

# Shape sanity check.
trainFeatures.shape, valFeatures.shape, testFeatures.shape

In [None]:
def _loadIntervals(tickerList):
    """Join ``features/intervals-<ticker>.npy`` for ``tickerList`` along axis 0.

    Zero entries are replaced by 1 in the joined array. These arrays are used
    as divisors when the targets are computed, so this presumably guards
    against division by zero -- confirm against the feature extraction step.
    """
    counts = np.concatenate([np.load('features/intervals-' + ticker + '.npy') for ticker in tickerList], axis = 0)
    counts[counts == 0] = 1
    return counts

trainIntervals = _loadIntervals(trainTickers)
valIntervals = _loadIntervals(valTickers)
testIntervals = _loadIntervals(testTickers)

In [None]:
def _loadTargets(tickerList):
    """Join the saved ``targets/<ticker>.npy`` arrays for ``tickerList`` along axis 0."""
    perTicker = [np.load('targets/' + ticker + '.npy') for ticker in tickerList]
    return np.concatenate(perTicker, axis = 0)

trainTargetsInt = _loadTargets(trainTickers)
valTargetsInt = _loadTargets(valTickers)
testTargetsInt = _loadTargets(testTickers)

In [None]:
# Collapse axis 1 of the raw targets by summation and divide by the interval
# counts (zeros were replaced by 1 when the counts were loaded).
# Presumably this yields a per-example mean over intervals -- TODO confirm.
trainTargets = trainTargetsInt.sum(axis = 1) / trainIntervals
valTargets = valTargetsInt.sum(axis = 1) / valIntervals
testTargets = testTargetsInt.sum(axis = 1) / testIntervals

In [None]:
# Shape sanity check for the three target arrays (train, val, test).
tuple(targets.shape for targets in (trainTargets, valTargets, testTargets))

In [None]:
# Equal-frequency binning of the train targets into classBins classes.
# The bucket stops computed here are reused to bin the val and test targets.
if not regression:
    # Sort examples by target so that each class is a contiguous slice.
    sortIndices = np.argsort(trainTargets)
    trainFeatures, trainTargets = trainFeatures[sortIndices], trainTargets[sortIndices]
    binSize = int(len(trainTargets) / float(classBins))
    buckets = np.zeros((classBins - 1))

    for label in range(classBins - 1):
        start, stop = label * binSize, (label + 1) * binSize
        # The stop between two neighbouring classes is the midpoint of the
        # last target in this bin and the first target in the next one. Both
        # are read before this bin's slice is overwritten with its label.
        buckets[label] = (trainTargets[stop - 1] + trainTargets[stop]) / 2.0
        trainTargets[start:stop] = label

    # The last class takes every remaining example, including the leftovers
    # when the number of examples is not divisible by classBins.
    trainTargets[(classBins - 1) * binSize:] = classBins - 1

In [None]:
# Map the continuous val and test targets onto the class bins whose stops
# (`buckets`) were computed from the train set.
#
# This only applies to classification: `buckets` is defined only when
# `regression` is False, so in regression mode the continuous targets are
# left untouched instead of failing on an undefined name.
if not regression:
    mBin = classBins - 1  # Highest class index (== len(buckets)).

    # np.digitize returns 0 for x < buckets[0], i + 1 for
    # buckets[i] <= x < buckets[i + 1], and len(buckets) for x >= buckets[-1].
    # A target that falls exactly on a bucket stop therefore goes to the upper
    # bin; with strict < / > comparisons on both sides it would match no bin
    # and silently keep the default class 0.
    # Build the binned labels in separate arrays and cast them back to the
    # dtype of the continuous targets so downstream code sees the same dtype.
    finalValTargets = np.digitize(valTargets, buckets).astype(valTargets.dtype)
    finalTestTargets = np.digitize(testTargets, buckets).astype(testTargets.dtype)

    valTargets = finalValTargets
    testTargets = finalTestTargets

In [None]:
# Shuffle each split with a fixed NumPy seed, applying the same permutation
# to a split's features and targets so that the pairs stay aligned.
np.random.seed(224)

def _shuffledOrder(count):
    """Return the indices 0..count-1 as a list, shuffled in place by np.random."""
    order = list(range(count))
    np.random.shuffle(order)
    return order

# The permutations are drawn in train, val, test order from the seeded generator.
trainIndices = _shuffledOrder(len(trainTargets))
valIndices = _shuffledOrder(len(valTargets))
testIndices = _shuffledOrder(len(testTargets))

trainFeatures, trainTargets = trainFeatures[trainIndices], trainTargets[trainIndices]
valFeatures, valTargets = valFeatures[valIndices], valTargets[valIndices]
testFeatures, testTargets = testFeatures[testIndices], testTargets[testIndices]

In [None]:
# Resolve the configured recurrent layer class. An unsupported value is
# rejected here with a clear message; otherwise `Recurrent` would stay
# undefined and the model-building cell would fail with an unrelated NameError.
recurrentLayers = {'GRU': GRU, 'LSTM': LSTM}
if recurrentType not in recurrentLayers:
    raise ValueError('Unsupported recurrentType %r; expected one of %s.'
                     % (recurrentType, sorted(recurrentLayers)))
Recurrent = recurrentLayers[recurrentType]

# A single output unit for regression, one unit per class otherwise.
numBins = 1 if regression else classBins

In [None]:
# Features are indexed as (example, time step, feature); the last axis gives
# the per-step feature width.
featureDim = trainFeatures.shape[2]

# The batch dimension is implicit in the Input shape.
exInput = Input(shape = (maxNumIntervals, featureDim))

# Mask zeroed positions, then apply: recurrent -> dense -> ReLU -> output dense.
out = Masking()(exInput)
for layer in (Recurrent(recurrentSize), Dense(denseSize), Activation('relu'), Dense(numBins)):
    out = layer(out)

# Regression uses the final dense output directly; classification adds a softmax.
output = out if regression else Activation('softmax')(out)

In [None]:
# Wrap the graph in a Model and compile it with Adam at the configured
# learning rate. Only the loss (and the accuracy metric) depend on the task.
model = Model(inputs = exInput, outputs = output)
adam = optimizers.Adam(lr = learningRate)

if regression:
    compileArgs = {'loss': 'mean_squared_error'}
else:
    compileArgs = {'loss': 'sparse_categorical_crossentropy', 'metrics': ['accuracy']}

model.compile(optimizer = adam, **compileArgs)
model.summary()

In [None]:
import h5py

# All artifacts of this run share one name derived from logName.
fullName = 'InterFile-' + logName

# Per-epoch CSV log, checkpointing with save_best_only enabled, and TensorBoard logs.
logger = CSVLogger(fullName + '.log')
checkpointer = ModelCheckpoint(
    filepath = 'checkpoints/weights' + fullName + '.hdf5',
    save_best_only = True,
    verbose = 1,
)
tensorboarder = TensorBoard(log_dir = 'tensorboard/logs' + fullName)
trainingCallbacks = [logger, checkpointer, tensorboarder]

# To resume from saved weights instead of the freshly built model, uncomment
# the line below. NOTE(review): this path differs from the checkpoint path
# written above -- adjust it before use.
# model = load_model('checkpoints/weights.hdf5')
model.fit(
    trainFeatures, trainTargets,
    batch_size = batchSize,
    epochs = epochs,
    validation_data = (valFeatures, valTargets),
    callbacks = trainingCallbacks,
)

In [None]:
# Spot-check the model on the first 30 validation examples. Regression keeps
# the raw model output; classification takes the arg-max class along axis 1.
rawPredictions = model.predict(valFeatures[:30])
predict = rawPredictions if regression else np.argmax(rawPredictions, axis = 1)
predict

In [None]:
# Validation targets for the same first 30 examples that were passed to
# model.predict, for a side-by-side comparison with the predictions.
valTargets[:30]

In [None]:
# Count how many of the first 30 validation predictions match their labels.
# Exact-match counting is only done for classification; regression yields None.
correct = None if regression else np.sum(valTargets[:30] == predict)
correct

In [None]:
# Display the complete validation target array as left by the preceding cells.
valTargets