In [None]:
# Base name shared by the input files ('data/transcripts/<ticker>.txt',
# 'data/audio/<ticker>.wav') and by every array this notebook saves.
ticker = 'seekingalpha-JCAP-2016-Q2'
# Length of one feature interval; raw frames are averaged down to this step.
interval = 0.1 # In seconds.
# Second dimension of the padded per-sentence arrays. Sentences spanning more
# intervals than this are truncated when the arrays are filled.
maxNumIntervals = 250

In [None]:
import math
import gensim
import numpy as np
from aubio import pitch
import scipy.io.wavfile as wav
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import ssc

In [None]:
# Extraction settings.
# The feature interval expressed in milliseconds.
intervalMS = interval * 1000
# Presumably the unit of the transcript timestamps: transcript times are
# multiplied by intRatio below to obtain interval indices. TODO confirm.
tInterval = 0.01 # In seconds.
# Conversion factor from tInterval-sized steps to interval-sized steps.
intRatio = tInterval / interval

In [None]:
import itertools
def iSplit(iterable, splitters):
    """Split `iterable` into lists of consecutive non-splitter items.

    An item counts as a splitter when `item in splitters` is true. Splitter
    items are dropped, and runs of adjacent splitters produce no empty lists.

    Note that if `splitters` is a string, `in` performs a substring test.
    """
    chunks = []
    for isSplitter, run in itertools.groupby(iterable, lambda item: item in splitters):
        if isSplitter:
            continue
        chunks.append(list(run))
    return chunks

# Read the whole transcript into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open('data/transcripts/' + ticker + '.txt') as tFile:
    tString = tFile.read() # To be chunked.

# One entry per line; the last two entries of the split are discarded.
transcript = tString.split('\n')[:-2]

# Group the lines into sentences, using '<EOS>' lines as separators.
# NOTE(review): ('<EOS>') is a plain string, not a one-element tuple, so the
# membership test inside iSplit is a substring test. Blank lines (and any line
# that is a substring of '<EOS>') therefore also act as separators. Kept as-is
# because the input files may rely on blank lines being skipped - confirm
# before changing this to ('<EOS>',).
transcript = iSplit(transcript, ('<EOS>'))

# Helper function below.
def SLToTuple(splitSent):
    """Parse the raw entries of one sentence into tuples.

    For each entry, the first two ':'-separated fields are converted to int
    and the second space-separated token is kept as a string, giving
    (int, int, str). Later cells add the two integers together, so they are
    presumably a start offset and a duration - confirm against the transcript
    format.
    """
    parsed = []
    for word in splitSent:
        fields = word.split(':')
        parsed.append((int(fields[0]), int(fields[1]), word.split(' ')[1]))
    return parsed
# Replace every sentence's raw entries with their parsed tuples.
transcript = list(map(SLToTuple, transcript))

In [None]:
# Sum of the two integer fields of each sentence's final entry; used below as
# the position where the sentence ends.
sentenceEnds = [sentence[-1][0] + sentence[-1][1] for sentence in transcript]

# (span from the first entry's first field to the sentence end, sentence) pairs.
intervals = [(end - sentence[0][0], sentence) for end, sentence in zip(sentenceEnds, transcript)]

# Split points between consecutive sentences; the last sentence needs none.
# Coarse indices are scaled by intRatio and rounded up, fine ones scaled by 10.
intIndices = [int(math.ceil(end * intRatio)) for end in sentenceEnds[:-1]]
intIndicesFine = [end * 10 for end in sentenceEnds[:-1]]
# maxNumIntervals = int(math.ceil(max(intervals)[0] * interval))
numSentences = len(transcript)

In [None]:
# Load the recording that belongs to this ticker: sample rate plus raw samples.
audioPath = 'data/audio/' + ticker + '.wav'
rate, signal = wav.read(audioPath)
signal.shape

In [None]:
# Number of samples consumed per pitch estimate. 400 is a magic number here to
# make things work out nicely (at a 16 kHz sample rate it would equal the
# 0.025 s step used for the MFCCs - presumably the reason; TODO confirm).
hopSize = 400

# How many pitch estimates fall into one feature interval (truncated).
pitchesPerInterval = int(interval / (float(hopSize) / rate))
pitchComp = pitch('default', 4096, hopSize, rate)
numPitchOutputs = len(signal) // hopSize
rawPitches = np.zeros((numPitchOutputs))

# Iteratively compute all pitches, one hop-sized slice of the signal at a time.
for i in range(numPitchOutputs):
    start, end = i * hopSize, (i + 1) * hopSize
    # The aubio pitch object returns a one-element vector; take the scalar
    # explicitly instead of relying on NumPy's implicit array-to-scalar
    # conversion when assigning into rawPitches[i].
    rawPitches[i] = pitchComp(signal[start:end].astype(np.float32))[0]

In [None]:
# Drop the trailing estimates that do not fill a whole interval, average each
# group of pitchesPerInterval estimates, and keep the result as a column.
usablePitches = (len(rawPitches) // pitchesPerInterval) * pitchesPerInterval
pitchGroups = rawPitches[:usablePitches].reshape(-1, pitchesPerInterval)
rawPitches = pitchGroups.mean(axis = 1)[:, np.newaxis]
rawPitches.shape

In [None]:
# Frame step passed to python_speech_features, in seconds.
winStep = 0.025
# Number of raw frames that are averaged into one feature interval.
squish = int(interval / winStep)

def _averageFrames(frames, group):
    """Average every `group` consecutive rows of a 2-D feature matrix.

    Trailing rows that do not fill a complete group are dropped. The feature
    width is read from `frames` itself, so the helper does not depend on the
    number of coefficients the extractor was configured with.

    Returns an array of shape (len(frames) // group, frames.shape[1]).
    """
    usable = len(frames) - len(frames) % group
    grouped = np.reshape(frames[:usable], (-1, group, frames.shape[1]))
    return np.mean(grouped, axis = 1)

# Frame-level features. The deltas are taken on the un-averaged frames, so
# they must be computed before any of the averaging below.
rawMFCC = mfcc(signal, rate, winstep = winStep)
rawMFCCDelta = delta(rawMFCC, 2) # 2 from internet.
rawMFCCDD = delta(rawMFCCDelta, 2) # 2 from internet.
rawSSC = ssc(signal, rate, winstep = winStep)

# Truncate trailing readings to the interval and average every squish frames.
rawMFCC = _averageFrames(rawMFCC, squish)
rawMFCCDelta = _averageFrames(rawMFCCDelta, squish)
rawMFCCDD = _averageFrames(rawMFCCDD, squish)
rawSSC = _averageFrames(rawSSC, squish)

# Diagnostic check of shapes.
rawMFCC.shape, rawMFCCDelta.shape, rawMFCCDD.shape, rawSSC.shape

In [None]:
# Per-sentence count of filled intervals, written while the arrays are filled.
numIntervals = np.zeros(numSentences, dtype = int)

# Zero-initialised (sentence, interval, feature) arrays; each one takes its
# feature width from the matching raw matrix.
paddedShape = (numSentences, maxNumIntervals)
MFCC = np.zeros(paddedShape + (rawMFCC.shape[1],))
Pitches = np.zeros(paddedShape + (rawPitches.shape[1],))
MFCCDelta = np.zeros(paddedShape + (rawMFCCDelta.shape[1],))
MFCCDD = np.zeros(paddedShape + (rawMFCCDD.shape[1],))
SSC = np.zeros(paddedShape + (rawSSC.shape[1],))

In [None]:
# Cut every raw feature matrix at the sentence boundaries. Each call yields
# len(intIndices) + 1 pieces, one per sentence.
MFCCList = np.split(rawMFCC, intIndices)
PitchesList = np.split(rawPitches, intIndices)
MFCCDeltaList = np.split(rawMFCCDelta, intIndices)
MFCCDDList = np.split(rawMFCCDD, intIndices)
SSCList = np.split(rawSSC, intIndices)

# Chunk MFCC/SSC by sentence.
for i, (sentMFCC, sentPitches, sentMFCCDelta, sentMFCCDD, sentSSC) in enumerate(
        zip(MFCCList, PitchesList, MFCCDeltaList, MFCCDDList, SSCList)):
    # Number of intervals kept for this sentence (long sentences are cut off).
    # Deliberately not called `intervals`, so the list of that name built in
    # an earlier cell is no longer overwritten by an int.
    sentIntervals = min(sentMFCC.shape[0], maxNumIntervals)

    # Pad to avoid spooky off-by-one bugs.
    missing = sentIntervals - len(sentPitches)
    if missing > 0: # TODO: Find a better way to do this.
        # np.pad refuses to extend an empty axis with mode='mean', so an empty
        # pitch chunk is zero-padded instead of raising.
        padMode = 'mean' if len(sentPitches) > 0 else 'constant'
        sentPitches = np.pad(sentPitches, ((0, missing), (0, 0)), mode = padMode)

    # Zeros here will be masked by an RNN layer.
    MFCC[i, :sentIntervals, :] = sentMFCC[:sentIntervals, :]
    Pitches[i, :sentIntervals, :] = sentPitches[:sentIntervals, :]
    MFCCDelta[i, :sentIntervals, :] = sentMFCCDelta[:sentIntervals, :]
    MFCCDD[i, :sentIntervals, :] = sentMFCCDD[:sentIntervals, :]
    SSC[i, :sentIntervals, :] = sentSSC[:sentIntervals, :]

    # Really a by-product.
    numIntervals[i] = sentIntervals

In [None]:
# Diagnostic: shapes of the padded arrays and lengths of the split lists.
(
    MFCC.shape,
    MFCCDelta.shape,
    MFCCDD.shape,
    SSC.shape,
    Pitches.shape,
    numIntervals.shape,
    len(intIndices),
    len(intIndicesFine),
)

In [None]:
# Persist each padded feature array under its own prefix plus the ticker.
for prefix, featureArray in (
    ('features/MFCC-', MFCC),
    ('features/Pitches-', Pitches),
    ('features/MFCCDelta-', MFCCDelta),
    ('features/MFCCDD-', MFCCDD),
    ('features/SSC-', SSC),
):
    np.save(prefix + ticker, featureArray)

In [None]:
# np.save('features/words-' + ticker, wordVecs)
# Per-sentence interval counts go next to the features.
np.save('features/intervals-' + ticker, numIntervals)

# Coarse and fine sentence split points go to the scratch directory.
for prefix, splitPoints in (
    ('scratch/splits-', intIndices),
    ('scratch/splits-fine-', intIndicesFine),
):
    np.save(prefix + ticker, np.array(splitPoints))