In [None]:
# Notebook parameters; the cells below read these names.

# Dataset identifier, spliced into every input and output file name.
ticker = 'seekingalpha-JCAP-2016-Q2'

# Raters whose label files are loaded. Leave raterB as None to run with a
# single rater.
raterA, raterB = 'grid', None

# Length of one target interval, in seconds.
interval = 0.1
# Number of columns in the target matrix (intervals kept per split segment).
maxNumIntervals = 250

# Span handed to the forward-backward EWMA smoother.
smoothFactor = 30000
# Number of label samples by which the labels are shifted.
shiftAmount = 300

In [None]:
import numpy as np
# Second, finer interval in seconds; presumably the sampling period of the
# raw labels -- TODO confirm.
lInterval = 0.001
# Target interval expressed in milliseconds.
intervalMS = interval * 1000
# Ratio of the fine interval to the target interval.
intRatio = lInterval / interval
# True when a second rater has been configured.
hasRaterB = not (raterB is None)

In [None]:
# Read the label array of the first rater from data/labels/.
raterAFileName = ''.join(['data/labels/', raterA, '-', ticker, '.train.npy'])
raterALabels = np.load(raterAFileName)

# The second rater is optional; nothing is loaded when it is not configured.
if hasRaterB:
    raterBFileName = ''.join(['data/labels/', raterB, '-', ticker, '.train.npy'])
    raterBLabels = np.load(raterBFileName)

In [None]:
# Shift the labels earlier by shiftAmount samples (milliseconds, per the
# original note): append shiftAmount mean-valued samples at the end, then
# drop the same number from the front, so the length stays the same.
A = np.pad(raterALabels, (0, shiftAmount), mode = 'mean')
A = A[shiftAmount:]
if hasRaterB:
    B = np.pad(raterBLabels, (0, shiftAmount), mode = 'mean')
    B = B[shiftAmount:]
else:
    # Empty placeholder so that len(B) still works without a second rater.
    B = []
len(A), len(B)

In [None]:
# Standardize the labels: subtract the mean, then divide by the standard
# deviation of the centered values.
A = A - A.mean()
A = A / A.std()

# Same treatment for the optional second rater.
if hasRaterB:
    B = B - B.mean()
    B = B / B.std()

In [None]:
import pandas

def EWMA(x, span = None, **ewmOptions):
    """Exponentially weighted moving average of a 1-D sequence.

    Stand-in for ``pandas.stats.moments.ewma``, which has been removed from
    pandas; ``Series.ewm(...).mean()`` is the supported equivalent.

    Parameters
    ----------
    x : 1-D array-like of numbers.
    span : span of the exponential window, forwarded to ``Series.ewm``.
    **ewmOptions : any further ``Series.ewm`` keyword (e.g. ``com``,
        ``halflife``, ``adjust``).

    Returns
    -------
    numpy.ndarray with the same length as ``x``.
    """
    return pandas.Series(x).ewm(span = span, **ewmOptions).mean().values

# For smoother labels.
def smoothWithEWMA(x, span = 15):
    """Smooth ``x`` with a forward-backward EWMA.

    The EWMA is run once over ``x`` and once over ``x`` reversed; the
    reversed pass is flipped back and the two passes are averaged element by
    element.

    Parameters
    ----------
    x : 1-D sequence of numbers that supports ``x[::-1]``.
    span : EWMA span used for both passes (default 15).

    Returns
    -------
    numpy.ndarray with the same length as ``x``.
    """
    forward = EWMA(x, span = span)
    # Run on the reversed input, then flip back so both passes are aligned.
    backward = EWMA(x[::-1], span = span)[::-1]
    return np.mean(np.vstack((forward, backward)), axis = 0)

In [None]:
import warnings
# pandas emits a FutureWarning for the EWMA function; keep it out of the output.
warnings.simplefilter('ignore', FutureWarning)

# Plot the labels of each rater before any smoothing.
from matplotlib import pyplot as plt
plt.title(''.join(['Raw Rater Agreement (SF = ', str(smoothFactor),
                   ', SH = ', str(shiftAmount), ')']))
plt.xlabel('Time (milliseconds)')
plt.ylabel('Normalized Confidence Score')
if hasRaterB:
    # Dashed green line: element-wise average of the two raters.
    plt.plot((A + B) / 2, 'g--')
    plt.plot(B)
plt.plot(A)

# Write the figure to disk for poster use (two-rater runs only), then show it.
if hasRaterB:
    plt.savefig('raters/' + '-'.join([ticker, raterA, raterB]) + '-raw.png', dpi = 200)
plt.show()

In [None]:
# Replace each rater's labels with their forward-backward EWMA, using
# smoothFactor as the span.
if hasRaterB:
    B = smoothWithEWMA(B, smoothFactor)
A = smoothWithEWMA(A, smoothFactor)

# Plot the smoothed labels of each rater.
plt.title(''.join(['Rater Agreement (SF = ', str(smoothFactor),
                   ', SH = ', str(shiftAmount), ')']))
plt.xlabel('Time (milliseconds)')
plt.ylabel('Normalized Confidence Score')
if hasRaterB:
    # Dashed green line: element-wise average of the two raters.
    plt.plot((A + B) / 2, 'g--')
    plt.plot(B)
plt.plot(A)

# Write the figure to disk for poster use (two-rater runs only), then show it.
if hasRaterB:
    plt.savefig('raters/' + '-'.join([ticker, raterA, raterB]) + '-smooth.png', dpi = 200)
plt.show()

In [None]:
# Number of label samples averaged into one target interval.
# round() before int() guards against float error: 1 / intRatio can land just
# below a whole number, and plain truncation would then be off by one.
condense = int(round(1 / intRatio))
# Pad labels to a multiple of the condensation interval with mean-valued
# samples. The pad width is always between 1 and condense, so an already
# aligned array gains one whole extra chunk; the next cell drops the last
# condensed value again.
A = np.pad(A, (0, condense - len(A) % condense), mode = 'mean')
if hasRaterB: B = np.pad(B, (0, condense - len(B) % condense), mode = 'mean')

In [None]:
# Average every run of `condense` consecutive samples into a single value.
# The last averaged value is then dropped; the original note attributes this
# to the MFCC extraction.
A = A.reshape(-1, condense).mean(axis = 1)[:-1]
if hasRaterB:
    B = B.reshape(-1, condense).mean(axis = 1)[:-1]
len(A), len(B)

In [None]:
# Inter-rater reliability: mean absolute difference and root-mean-square
# difference between the two raters. Both are None with a single rater.
if hasRaterB:
    IRRAbsLoss = np.abs(A - B).mean()
    IRRSqLoss = np.sqrt(((A - B) ** 2).mean())
else:
    IRRAbsLoss = IRRSqLoss = None
IRRAbsLoss, IRRSqLoss

In [None]:
# Targets are the average of both raters, or rater A alone when there is no
# second rater.
rawTargets = (A + B) / 2 if hasRaterB else A

In [None]:
# Cut the target sequence at the stored split indices and pack each piece
# into one row of a fixed-width matrix.
splitIndices = np.load('scratch/splits-' + ticker + '.npy')
targetsList = np.split(rawTargets, splitIndices)
# One row per piece (len(splitIndices) + 1 of them), zero-filled so that
# short pieces are right-padded with zeros.
targets = np.zeros((len(splitIndices) + 1, maxNumIntervals))
for i, sentTargets in enumerate(targetsList):
    # Pieces longer than maxNumIntervals are truncated to fit the row.
    intervals = min(len(sentTargets), maxNumIntervals)
    targets[i, :intervals] = sentTargets[:intervals]

In [None]:
# Sanity check: display the (rows, columns) shape of the target matrix.
np.shape(targets)

In [None]:
# Persist the target matrix under targets/; np.save adds the '.npy' extension.
np.save(file = 'targets/' + ticker, arr = targets)