In [142]:
from lib.csv_to_array import convert as pre_process

# George Frederick Handel (robot)

```
[0, 0, Header, format, nTracks, division]
[Track, 0, Start_track]
[Track, Time, Note_off_c, Channel, Note, Velocity]
```
Check out http://www.fourmilab.ch/webtools/midicsv/ for more info on this record format.

In [143]:
# Path of the midicsv export to train on.
gigue = './csv/handel_hwv-433_5_gigue_(c)yamada.mid.csv'
# gigue = './csv/total.csv'

# Parse the CSV into a list of rows (one list of string fields per record).
ga = pre_process(gigue)

# Show one sample row. Written in call form so the cell parses on Python 3 as
# well as Python 2 (the printed text is the same on both).
print("i.e: \n" + str(ga[32]))

i.e: 
['1', ' 190', ' Note_on_c', ' 0', ' 68', ' 0']


In [144]:
# Drop the time field (position 1) from every row for now.
# The removed values are collected in `whatevs`. Note that this mutates the
# rows of `ga` in place, so re-running the cell removes one more field.
whatevs = []
for row in ga:
    whatevs.append(row.pop(1))

In [145]:
# Right-pad every row with None so all rows are as long as the longest one.
s_len = max(len(row) for row in ga)
p_ga = [row + [None] * (s_len - len(row)) for row in ga]

# p_ga = [[g[3]] for g in p_ga]
# s_len = 1

elems = ['track', 'action', 'channel', 'pitch', 'velocity', 'hurh?']

# Number of rows (events) in the data.
d_l = len(p_ga)

# v: the data transposed, one list of values per column.
# s: the set of distinct values seen in each column (its vocabulary).
v = [[row[col] for row in p_ga] for col in range(s_len)]
s = [set(column) for column in v]

In [146]:
# Pad each column's vocabulary with None up to the size of the largest one,
# so that every column has a vocabulary list of the same length m_v.
m_v = max(map(len, s))

p_s = []
for vocab in s:
    padded = list(vocab)
    padded.extend([None] * (m_v - len(vocab)))
    p_s.append(padded)

In [147]:
def ix_to_val(y, f):
    """Return the value stored at index `y` of the vocabulary list `f`."""
    return f[y]


def val_to_ix(y, f):
    """Return the index of the first occurrence of value `y` in the list `f`."""
    return f.index(y)

In [148]:
import numpy as np

# Pure numerics: replace every value by its index in the (padded) vocabulary
# of its column. Mapping indices back to values is left for later.
encoded_columns = []
for col_no, column in enumerate(v):
    vocab = p_s[col_no]
    encoded_columns.append([val_to_ix(value, vocab) for value in column])
train_data = np.asarray(encoded_columns)

Let's include crossover on the hidden levels - the last ones.
And we're going to ignore the time element until we can implement a 'sequence generation' network.

Open Questions:
 - does normalizing the length of each vocabulary make it easier to use pure numpy arrays?
    - I kinda think numpy arrays in lists vis-à-vis list comprehensions is not such a bad thing
 - what about sharing?
    - my feeling is that each element will have len(elements) hidden weight matrices, and we will be applying them against the element's hidden layer -- do we need hidden layers, plural?
      my instinct is: no - we will be defining the hidden layer as:
      tanh(dot(in-wgts, in-vals) + dot(last-self-weights, last-self) + dot(last-other-weights, last-other) ... etc etc. This fits the analogy - we don't really need to prefer any previous values, but we DO want to be sensitive to temporal patterns in every dimension. Let's do this.

In [149]:
# Goal: use nothing but numpy arrays for the model.
# The difficulty is that the columns have vocabularies of different sizes, so
# the input vectors and weight matrices would naturally differ in length.
# The vocabularies are therefore padded with None up to a common length m_v;
# the value->index lookup only returns the first match, so the extra None
# entries are never selected.
# TODO: mask the Wxh / Why weights of the padded entries to zero.

# hyperparameters
hidden_size = 100     # size of the hidden layer of each network
seq_length = 25       # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters: one set of weights per column (network); the leading axis
# of every array indexes the network
Wxh = 0.01 * np.random.randn(s_len, hidden_size, m_v)                  # input to hidden
Whh = 0.01 * np.random.randn(s_len, s_len, hidden_size, hidden_size)   # hidden to hidden; Whh[j, i] feeds network i's previous state into network j
Why = 0.01 * np.random.randn(s_len, m_v, hidden_size, 1)               # hidden to output
bh = np.zeros((s_len, hidden_size, 1))                                 # hidden bias
by = np.zeros((s_len, m_v, 1))                                         # output bias

In [150]:
# State for the main training loop.
n = 0  # iteration counter
p = 0  # current position in the event data (shared by every column)

# Adagrad memory: one accumulator per parameter array, same shape, all zeros.
mWxh, mWhh, mWhy, mbh, mby = (
    np.zeros_like(weights) for weights in (Wxh, Whh, Why, bh, by)
)

# One smoothed loss per network, initialised to the loss of a uniform guess
# over m_v entries accumulated across seq_length steps.
initial_loss = -np.log(1.0 / m_v) * seq_length
smooth_loss = np.zeros(s_len)
smooth_loss.fill(initial_loss)

In [None]:
# Main training loop: truncated back-propagation through time over chunks of
# seq_length events, followed by an Adagrad update. There is one network per
# column of train_data, and every network's hidden state is fed by the previous
# hidden state of every network through Whh[j, i].
# The loop has no exit condition; interrupt the kernel to stop training.
while True:
    # The position is absolute and shared by every column.
    if p + seq_length + 1 >= d_l or n == 0:  # first iteration, or ran past the end of the data
        hprev = np.zeros((s_len, hidden_size, 1))  # reset the recurrent memory
        p = 0  # start again from the very beginning

    # Chunk of indices fed to the networks, and the same chunk shifted by one
    # step, which provides the targets.
    inputs = train_data[:, p:p+seq_length]
    targets = train_data[:, p+1:p+seq_length+1]
    n_nets, n_steps = inputs.shape

    xs = np.zeros((n_nets, n_steps, m_v, 1))  # one-hot inputs
    # hs has one extra trailing slot: hs[:, -1] stores the state that precedes
    # step 0, so `t - 1` resolves to it when t == 0 and no real step ever
    # overwrites it (the backward pass needs it again at t == 0).
    hs = np.zeros((n_nets, n_steps + 1, hidden_size, 1))
    ys = np.zeros((n_nets, n_steps, m_v, 1))  # unnormalised log probabilities
    ps = np.zeros((n_nets, n_steps, m_v, 1))  # softmax probabilities

    hs[:, -1] = hprev
    loss = np.zeros(n_nets)  # each network accumulates its own loss

    # ---- forward pass ----
    for t in range(n_steps):
        for j in range(n_nets):
            xs[j, t, inputs[j, t]] = 1
            # recurrent input: previous hidden state of every network i
            recurrent = sum(np.dot(Whh[j, i], hs[i, t-1]) for i in range(n_nets))
            hs[j, t] = np.tanh(np.dot(Wxh[j], xs[j, t]) + recurrent + bh[j])
            ys[j, t] = np.dot(Why[j, ..., 0], hs[j, t]) + by[j]
            # softmax, shifted by the maximum so np.exp cannot overflow
            exp_y = np.exp(ys[j, t] - np.max(ys[j, t]))
            ps[j, t] = exp_y / np.sum(exp_y)
            # cross-entropy loss of the target entry
            loss[j] += -np.log(ps[j, t, targets[j, t], 0])

    # Carry the final hidden state over to the next chunk.
    hprev = hs[:, n_steps - 1].copy()

    # ---- backward pass ----
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)

    # Gradient arriving at each network's hidden state from the step after it.
    dhnext = np.zeros((n_nets, hidden_size, 1))

    dy = np.copy(ps)
    dh = np.zeros_like(hs)

    for t in reversed(range(n_steps)):
        # Gradient that this step sends back to the hidden states at t - 1.
        dhprev = np.zeros_like(dhnext)
        for j in range(n_nets):
            dy[j, t, targets[j, t]] -= 1  # gradient of softmax + cross-entropy
            dWhy[j, ..., 0] += np.dot(dy[j, t], hs[j, t].T)
            dby[j] += dy[j, t]
            dh[j, t] = np.dot(Why[j, ..., 0].T, dy[j, t]) + dhnext[j]
            dhraw = (1 - hs[j, t] * hs[j, t]) * dh[j, t]  # back through tanh
            dbh[j] += dhraw
            dWxh[j] += np.dot(dhraw, xs[j, t].T)
            for i in range(n_nets):
                dWhh[j, i] += np.dot(dhraw, hs[i, t-1].T)
                # h_i(t-1) entered h_j(t) through Whh[j, i], so the gradient
                # returns through its transpose, summed over every receiver j.
                dhprev[i] += np.dot(Whh[j, i].T, dhraw)
        dhnext = dhprev

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients

    # Exponential moving average of each network's loss.
    smooth_loss = np.asarray(smooth_loss) * 0.999 + loss * 0.001

    if n % 10000 == 0:
        print('iter %d: ' % (n) + str(['loss %f,' % (sl) for sl in smooth_loss]))

    # ---- Adagrad update ----
    # mem accumulates the squared gradients; each step is scaled by 1/sqrt(mem).
    # All operations are element-wise, so whole arrays are updated at once.
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length
    n += 1

In [None]:
# generator - basically a forward-only run of the training loop's networks
def sample(h, seed_ix, n):
    """Sample a sequence of vocabulary indices from the coupled networks.

    Uses the module-level parameters Wxh, Whh, Why, bh and by, whose leading
    axis indexes the network (one network per data column). At every step each
    network j combines its own one-hot input with the previous hidden state of
    every network i (through Whh[j, i]), and the index drawn from its softmax
    output becomes its input for the next step.

    Parameters
    ----------
    h : array of shape (n_networks, hidden_size, 1)
        Initial hidden state of every network. It is copied, not modified.
    seed_ix : sequence of int, one entry per network
        Vocabulary index fed to each network at the first step.
    n : int
        Number of steps to generate.

    Returns
    -------
    list
        `n` entries; each entry is a list with the index sampled for every
        network at that step.

    Raises
    ------
    ValueError
        If `seed_ix` does not hold exactly one index per network.

    Note: padded vocabulary positions are not masked, so they can be drawn
    whenever the networks assign them non-zero probability.
    """
    n_nets, vocab = by.shape[0], by.shape[1]
    seed_ix = np.asarray(seed_ix, dtype=int).ravel()
    if seed_ix.shape[0] != n_nets:
        raise ValueError('seed_ix must hold one index per network: expected %d, got %d'
                         % (n_nets, seed_ix.shape[0]))

    h = np.array(h, dtype=float)  # private copy; the caller's state is untouched
    x = np.zeros((n_nets, vocab, 1))
    x[np.arange(n_nets), seed_ix] = 1  # one-hot seed for every network

    ixes = []
    for _ in range(n):
        # Every new state depends on the previous state of all networks, so
        # compute them all before replacing h.
        h_new = np.empty_like(h)
        for j in range(n_nets):
            recurrent = sum(np.dot(Whh[j, i], h[i]) for i in range(n_nets))
            h_new[j] = np.tanh(np.dot(Wxh[j], x[j]) + recurrent + bh[j])
        h = h_new

        x = np.zeros((n_nets, vocab, 1))
        step = []
        for j in range(n_nets):
            y = np.dot(Why[j, ..., 0], h[j]) + by[j]
            exp_y = np.exp(y - np.max(y))  # shifted softmax, cannot overflow
            probs = exp_y / np.sum(exp_y)
            # p= sets the probability of each candidate index
            ix = int(np.random.choice(vocab, p=probs.ravel()))
            x[j, ix] = 1
            step.append(ix)
        ixes.append(step)
    return ixes

In [None]:
# Seed the sampler with the first time step of the last training chunk.
# `inputs` is laid out as columns x time, so `inputs[:, 0]` holds one index
# per column, whereas `inputs[0]` would be a whole time row of a single column.
sample_ix = sample(hprev, inputs[:, 0], 3500)
# NOTE(review): `note_for_ind` is not defined in any cell visible here -
# confirm where it comes from and that it accepts what `sample` returns.
sample_notes = [note_for_ind(i)[1] for i in sample_ix]
print(sample_notes)