In [1]:
import numpy as np
from scipy.io import loadmat
import time
data = loadmat('data.mat')['data']
#data

In [2]:
data.dtype

dtype([('testData', 'O'), ('trainData', 'O'), ('validData', 'O'), ('vocab', 'O')])

In [3]:
trainData = data['trainData'][0][0]
validData = data['validData'][0][0]
testData = data['testData'][0][0]
print(len(trainData), len(validData), len(testData))

4 4 4


In [4]:
trainData.shape, validData.shape, testData.shape

((4, 372550), (4, 46568), (4, 46568))

In [5]:
#setting for later processes
vocab = data['vocab'][0][0][0]
print(vocab[:5])
train_input = trainData[0:3,:] -1
train_target = trainData[3,:].reshape((1,-1)) -1
print(train_input[:,0], train_target[:,0])
valid_input = validData[0:3,:] -1
valid_target = validData[3,:].reshape((1,-1)) -1
print(valid_input[:,0], valid_target[:,0])
test_input = testData[0:3,:] -1
test_target = testData[3,:].reshape((1,-1)) -1
print(test_input[:,0], test_target[:,0])

[array(['all'], dtype='<U3') array(['set'], dtype='<U3')
 array(['just'], dtype='<U4') array(['show'], dtype='<U4')
 array(['being'], dtype='<U5')]
[27 25 89] [143]
[169 136 189] [142]
[182  89 186] [143]


Octave starts from 1, while python starts from 0!!!

In [6]:
#INITIALIZE WEIGHTS AND BIASES.
vocab_size = len(vocab) # vocab size
batchsize = 100  # Mini-batch size.
learning_rate = 0.1  # Learning rate; default = 0.1.
momentum = 0.9  # Momentum; default = 0.9.
numhid1 = 50    # Dimensionality of embedding space; default = 50.
numhid2 = 200   # Number of units in hidden layer; default = 200.
init_wt = 0.01  # Standard deviation of the normal distribution, 
                # which is sampled to get the initial weights; default = 0.01
numwords = train_input.shape[0]
np.random.seed(32)
word_embedding_weights = init_wt * np.random.randn(vocab_size, numhid1)
embed_to_hid_weights = init_wt * np.random.randn(numhid1*numwords, numhid2)
hid_to_output_weights = init_wt * np.random.randn(numhid2, vocab_size)
hid_bias = np.zeros((numhid2, 1))
output_bias = np.zeros((vocab_size, 1))

word_embedding_weights_delta = np.zeros((vocab_size, numhid1))
word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))
embed_to_hid_weights_delta = np.zeros((numhid1*numwords, numhid2))
hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))
hid_bias_delta = np.zeros((numhid2, 1))
output_bias_delta = np.zeros((vocab_size, 1))
expansion_matrix = np.eye(vocab_size) # matrix which outputs 1 if i = j else 0
count = 0
tiny = np.exp(-30)


In [7]:
def fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias):
    '''
    This method forward propagates through a neural network.
    Inputs:
        input_batch: The input data as a matrix of size numwords X batchsize where,
                     numwords is the number of words, batchsize is the number of data points.
                     So, if input_batch(i, j) = k then the ith word in data point j is word
                     index k of the vocabulary.

        word_embedding_weights: Word embedding as a matrix of size
                                vocab_size X numhid1, where vocab_size is the size of the vocabulary
                                numhid1 is the dimensionality of the embedding space.

        embed_to_hid_weights: Weights between the word embedding layer and hidden
                              layer as a matrix of soze numhid1*numwords X numhid2, numhid2 is the
                              number of hidden units.

        hid_to_output_weights: Weights between the hidden layer and output softmax
                               unit as a matrix of size numhid2 X vocab_size

        hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1.

        output_bias: Bias of the output layer as a matrix of size vocab_size X 1.

    Outputs:
        embedding_layer_state: State of units in the embedding layer as a matrix of
                               size numhid1*numwords X batchsize

        hidden_layer_state: State of units in the hidden layer as a matrix of size
                            numhid2 X batchsize

        output_layer_state: State of units in the output layer as a matrix of size
                            vocab_size X batchsize
    '''

    numwords, batchsize = input_batch.shape
    vocab_size, numhid1 = word_embedding_weights.shape
    numhid2 = embed_to_hid_weights.shape[1]

    ## COMPUTE STATE OF WORD EMBEDDING LAYER.
    # Look up the inputs word indices in the word_embedding_weights matrix.
    #embedding_layer_state = reshape(word_embedding_weights(reshape(input_batch, 1, []),:).T, numhid1 * numwords, [])
    embedding_layer_state = word_embedding_weights[input_batch.reshape((1,-1)), :].T.reshape((numhid1 * numwords, -1))
    
    ## COMPUTE STATE OF HIDDEN LAYER.
    # Compute inputs to hidden units.
    inputs_to_hidden_units = np.dot(embed_to_hid_weights.T, embedding_layer_state) + hid_bias

    # Apply logistic activation function.
    # FILL IN CODE. Replace the line below by one of the options.
    #hidden_layer_state = zeros(numhid2, batchsize);
    # Options
    # (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units));
    # (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units));
    hidden_layer_state = 1. / (1 + np.exp(-inputs_to_hidden_units))
    # (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units));

    ## COMPUTE STATE OF OUTPUT LAYER.
    # Compute inputs to softmax.
    # FILL IN CODE. Replace the line below by one of the options.
    #inputs_to_softmax = zeros(vocab_size, batchsize);
    # Options
    inputs_to_softmax = np.dot(hid_to_output_weights.T, hidden_layer_state) + output_bias
    # (b) inputs_to_softmax = hid_to_output_weights' * hidden_layer_state +  repmat(output_bias, batchsize, 1);
    # (c) inputs_to_softmax = hidden_layer_state * hid_to_output_weights' +  repmat(output_bias, 1, batchsize);
    # (d) inputs_to_softmax = hid_to_output_weights * hidden_layer_state +  repmat(output_bias, batchsize, 1);

    # Subtract maximum. 
    # Remember that adding or subtracting the same constant from each input to a
    # softmax unit does not affect the outputs. Here we are subtracting maximum to
    # make all inputs <= 0. This prevents overflows when computing their
    # exponents.
    inputs_to_softmax = inputs_to_softmax - np.max(inputs_to_softmax, axis = 0)

    # Compute exp.
    output_layer_state = np.exp(inputs_to_softmax)

    # Normalize to get probability distribution.
    output_layer_state = output_layer_state / np.sum(output_layer_state, axis = 0)

    
    return embedding_layer_state, hidden_layer_state, output_layer_state

In [8]:
def train(epochs, train_input, train_target, word_embedding_weights, embed_to_hid_weights,
          hid_to_output_weights, hid_bias, output_bias, valid_input, valid_target, vocab,
          word_embedding_weights_delta, embed_to_hid_weights_delta, hid_to_output_weights_delta,
          hid_bias_delta, output_bias_delta,
          vocab_size = 250, 
          batchsize = 100, learning_rate = 0.1, numhid1 = 50, numhid2 = 200,
          count = 0, momentum = 0.9, tiny = np.exp(-30)):
    
    expansion_matrix = np.eye(vocab_size)
    numwords = train_input.shape[0]
    start_time = time.time()
    for epoch in range(epochs):
        print('Epoch %d\n' % epoch)
        this_chunk_CE = 0
        trainset_CE = 0
        # LOOP OVER MINI-BATCHES.
        for m in range(len(train_target[0])//batchsize + 1):
            #print('Batch%d' % m)
            if m < len(train_target[0])//batchsize:
                input_batch = train_input[:, m*batchsize:(m+1)*batchsize]
                target_batch = train_target[:, m*batchsize:(m+1)*batchsize]
            else:
                input_batch = train_input[:, m*batchsize:]
                target_batch = train_target[:, m*batchsize:]

            # FORWARD PROPAGATE.
            # Compute the state of each layer in the network given the input batch
            # and all weights and biases
            embedding_layer_state, hidden_layer_state, output_layer_state = fprop(input_batch, 
                                                                                    word_embedding_weights, 
                                                                                    embed_to_hid_weights, 
                                                                                    hid_to_output_weights, 
                                                                                    hid_bias, output_bias)

            # COMPUTE DERIVATIVE.
            # Expand the target to a sparse 1-of-K vector.
            #print(target_batch.ravel().max())
            expanded_target_batch = expansion_matrix[:, target_batch.ravel()]
            # Compute derivative of cross-entropy loss function.
            error_deriv = output_layer_state - expanded_target_batch
            #print(error_deriv.shape, output_layer_state.shape, expanded_target_batch.shape)

            # MEASURE LOSS FUNCTION.
            cross_entropy = -np.sum(np.sum(expanded_target_batch * np.log(output_layer_state + tiny))) / batchsize
            count += 1
            this_chunk_CE = this_chunk_CE + (cross_entropy - this_chunk_CE) / count
            trainset_CE = trainset_CE + (cross_entropy - trainset_CE) / m
            if m % 1000 == 0:
                print('\rBatch %d Train CE %.3f' % (m, this_chunk_CE))
    

            # BACK PROPAGATE.
            ## OUTPUT LAYER.
            hid_to_output_weights_gradient =  hidden_layer_state.dot(error_deriv.T)
            output_bias_gradient = np.sum(error_deriv, axis = 1, keepdims = True)
            back_propagated_deriv_1 = (hid_to_output_weights.dot(error_deriv)) * hidden_layer_state * (1 - hidden_layer_state)

            ## HIDDEN LAYER.
            # FILL IN CODE. Replace the line below by one of the options.
            #embed_to_hid_weights_gradient = zeros(numhid1 * numwords, numhid2)
            # Options:
            # (a) embed_to_hid_weights_gradient = back_propagated_deriv_1' * embedding_layer_state;
            embed_to_hid_weights_gradient = embedding_layer_state.dot(back_propagated_deriv_1.T)
            # (c) embed_to_hid_weights_gradient = back_propagated_deriv_1;
            # (d) embed_to_hid_weights_gradient = embedding_layer_state;

            # FILL IN CODE. Replace the line below by one of the options.
            #hid_bias_gradient = zeros(numhid2, 1)
            # Options
            hid_bias_gradient = np.sum(back_propagated_deriv_1, axis = 1, keepdims = True)
            # (b) hid_bias_gradient = sum(back_propagated_deriv_1, 1);
            # (c) hid_bias_gradient = back_propagated_deriv_1;
            # (d) hid_bias_gradient = back_propagated_deriv_1';

            # FILL IN CODE. Replace the line below by one of the options.
            #back_propagated_deriv_2 = zeros(numhid2, batchsize);
            # Options
            back_propagated_deriv_2 = embed_to_hid_weights.dot(back_propagated_deriv_1)
            # (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights;
            # (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights;
            # (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights';

            word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))
            ## EMBEDDING LAYER.
            for w in range(numwords):
                word_embedding_weights_gradient = (word_embedding_weights_gradient + 
                                                   (expansion_matrix[:, input_batch[w, :].ravel()].dot(
                                                       back_propagated_deriv_2[w * numhid1 : (w+1) * numhid1, :].T)))
    
            # UPDATE WEIGHTS AND BIASES.
            word_embedding_weights_delta = (momentum * word_embedding_weights_delta + 
                                            word_embedding_weights_gradient / batchsize)
            word_embedding_weights -= learning_rate * word_embedding_weights_delta

            embed_to_hid_weights_delta = (momentum * embed_to_hid_weights_delta + 
                                          embed_to_hid_weights_gradient / batchsize)
            embed_to_hid_weights -= learning_rate * embed_to_hid_weights_delta

            hid_to_output_weights_delta = (momentum * hid_to_output_weights_delta + 
                                           hid_to_output_weights_gradient / batchsize)
            hid_to_output_weights -= learning_rate * hid_to_output_weights_delta

            hid_bias_delta = (momentum * hid_bias_delta + hid_bias_gradient / batchsize)
            hid_bias -= learning_rate * hid_bias_delta

            output_bias_delta = (momentum * output_bias_delta + output_bias_gradient / batchsize)
            output_bias -= learning_rate * output_bias_delta

            
        print('\rAverage Training CE %.3f\n' % trainset_CE)
        
        print('Finished 1 epoch Training.\n')
        
        # VALIDATE.
        print('\rRunning validation ...')
        embedding_layer_state, hidden_layer_state, output_layer_state = fprop(valid_input, 
                                                                              word_embedding_weights, 
                                                                              embed_to_hid_weights,
                                                                              hid_to_output_weights, 
                                                                              hid_bias, output_bias)
        datasetsize = len(valid_input[0]);
        expanded_valid_target = expansion_matrix[:, valid_target.ravel()]
        CE = -np.sum(np.sum(expanded_valid_target * np.log(output_layer_state + tiny))) / datasetsize
        print(' Validation CE %.3f\n' % CE)
    print('Final Training CE %.3f\n', trainset_CE)
    
    model = dict()
    model['word_embedding_weights'] = word_embedding_weights
    model['embed_to_hid_weights'] = embed_to_hid_weights
    model['hid_to_output_weights'] = hid_to_output_weights
    model['hid_bias'] = hid_bias
    model['output_bias'] = output_bias
    model['vocab'] = vocab
    
    end_time = time.time()
    
    print('Training took %.2f seconds' % (end_time - start_time))
    
    return model

In [16]:
model = train(10, train_input, train_target, word_embedding_weights, embed_to_hid_weights,
          hid_to_output_weights, hid_bias, output_bias, valid_input, valid_target, vocab,
          word_embedding_weights_delta, embed_to_hid_weights_delta, hid_to_output_weights_delta,
          hid_bias_delta, output_bias_delta,
          vocab_size = 250, 
          batchsize = 100, learning_rate = 0.1, numhid1 = 50, numhid2 = 200,
          count = 0, momentum = 0.9, tiny = np.exp(-30))

Epoch 0

Batch 0 Train CE 3.291




Batch 1000 Train CE 3.150
Batch 2000 Train CE 3.107
Batch 3000 Train CE 3.083
Average Training CE nan

Finished 1 epoch Training.

Running validation ...
 Validation CE 3.017

Epoch 1

Batch 0 Train CE 0.001
Batch 1000 Train CE 0.635
Batch 2000 Train CE 1.045
Batch 3000 Train CE 1.332
Average Training CE nan

Finished 1 epoch Training.

Running validation ...
 Validation CE 2.977

Epoch 2

Batch 0 Train CE 0.000
Batch 1000 Train CE 0.350
Batch 2000 Train CE 0.625
Batch 3000 Train CE 0.847
Average Training CE nan

Finished 1 epoch Training.

Running validation ...
 Validation CE 2.955

Epoch 3

Batch 0 Train CE 0.000
Batch 1000 Train CE 0.241
Batch 2000 Train CE 0.444
Batch 3000 Train CE 0.619
Average Training CE nan

Finished 1 epoch Training.

Running validation ...
 Validation CE 2.937

Epoch 4

Batch 0 Train CE 0.000
Batch 1000 Train CE 0.183
Batch 2000 Train CE 0.344
Batch 3000 Train CE 0.487
Average Training CE nan

Finished 1 epoch Training.

Running validation ...
 Validation CE

In [17]:
#run test
def test(model, test_input, test_target):
    word_embedding_weights = model['word_embedding_weights']
    embed_to_hid_weights = model['embed_to_hid_weights']
    hid_to_output_weights = model['hid_to_output_weights']
    hid_bias = model['hid_bias']
    output_bias = model['output_bias']
    
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(test_input, 
                                                                          word_embedding_weights, 
                                                                          embed_to_hid_weights,
                                                                          hid_to_output_weights, 
                                                                          hid_bias, output_bias)
    datasetsize = len(test_input[0])
    expanded_test_target = expansion_matrix[:, test_target.ravel()]
    CE = -np.sum(np.sum(expanded_test_target * np.log(output_layer_state + tiny))) / datasetsize
    print(1, '\rFinal Test CE %.3f\n', CE)
    return CE

In [18]:
test(model, test_input, test_target)



1 Final Test CE %.3f
 2.8896407304431304


2.8896407304431304