In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def gen_sequence(words, embedding_size):
    """Generate a sequence of one-hot vectors (i.e. a sequence of "words").

    Row ``i`` of the result is all zeros except for a 1 in column ``words[i]``.

    Args:
        words: sequence of integer word indices; must support ``len``.
        embedding_size: number of columns, i.e. the length of each one-hot vector.

    Returns:
        np.ndarray of shape ``(len(words), embedding_size)``.
    """
    one_hot = np.zeros((len(words), embedding_size))
    for row, word in enumerate(words):
        one_hot[row, word] = 1
    return one_hot

def generate_paragraph(vocab_size, n_sentences, max_words_per_sent, seed=None):
    """Generate ``n_sentences`` random sentences with a variable number of words.

    Each sentence gets a random length between 1 and ``max_words_per_sent``
    (inclusive), and each word is a random index into a vocabulary of size
    ``vocab_size``, encoded by ``gen_sequence`` as a one-hot row.

    Args:
        vocab_size: size of the vocabulary; also the one-hot vector length.
        n_sentences: number of sentences to generate.
        max_words_per_sent: maximum number of words per sentence; needs to be
            at least 1 whenever a sentence is actually generated.
        seed: optional seed. When given, a private ``np.random.RandomState(seed)``
            is used so the paragraph is reproducible. When ``None`` (default),
            the global ``np.random`` state is used, exactly as before.

    Returns:
        A list of ``n_sentences`` arrays, one per sentence, as returned by
        ``gen_sequence(words, vocab_size)``.
    """
    # Both the np.random module and a RandomState instance expose randint,
    # so the rest of the function does not care which one it received.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sentences = []
    for _ in range(n_sentences):
        # Draw the sentence length first, then the words, one sentence at a
        # time, so the random stream is consumed in the same order as before.
        nwords = rng.randint(1, max_words_per_sent + 1)
        words = rng.randint(vocab_size, size=nwords)
        sentences.append(gen_sequence(words, vocab_size))
    return sentences
        
def print_sent_sizes(paragraph):
    """Print the length of every sentence in ``paragraph`` as a single list."""
    sizes = list(map(len, paragraph))
    print(sizes)

In [9]:
# generate 5 paragraphs: paragraph k (k = 1..5) has k+1 sentences,
# each with at most int(1/k*vocab_size) words
vocab_size = 30
paragraphs = [
    generate_paragraph(
        vocab_size,
        n_sentences=k + 1,
        max_words_per_sent=int(1 / k * vocab_size),
    )
    for k in range(1, 6)
]
# keep the individual names bound as well
p1, p2, p3, p4, p5 = paragraphs

# no seed is set in this cell, so the printed sizes may differ between runs
for p in paragraphs:
    print_sent_sizes(p)

[28, 25]
[15, 10, 4]
[7, 6, 3, 2]
[2, 3, 4, 6, 5]
[2, 2, 1, 5, 5, 4]


From this point onwards, we will treat these paragraphs as a single "batch".
The goal is to read the sentences from each paragraph one-at-a-time in parallel.
This is somewhat of a difficult task because there are both a variable number of sentences and a variable number of words, which makes actions like "bucketing" and "padding" difficult and non-intuitive.

I am practicing this under the assumption that every paragraph corresponds to a different example and can thus be treated independently.

In [10]:
def get_max_length(array, size_op):
    """Return the largest ``size_op(value)`` over every value in ``array``.

    Applied to ``paragraphs`` (which contains lists) for the maximum paragraph
    length, and to each paragraph (which contains np.arrays) for the maximum
    sentence length.

    Args:
        array: iterable of values to measure.
        size_op: callable returning the size of a single value.

    Returns:
        The maximum size found, never less than 0; 0 when ``array`` is empty.
    """
    sizes = [size_op(item) for item in array]
    # Start the comparison from 0 so an empty ``array`` yields 0.
    return max([0] + sizes)

In [11]:
# sanity check: the largest paragraph is generated with n_sentences=5+1, so this should be 6
maximum_paragraph_length = get_max_length(paragraphs, len)
print("maximum_paragraph_length = {}".format(maximum_paragraph_length))

maximum_paragraph_length = 6


In [12]:
def np_length(arr):
    """Return the size of the first axis of ``arr``; used as the size op for np.arrays."""
    shape = arr.shape
    return shape[0]
def max_sent_in_par(paragraph):
    """Return the length of the longest sentence in one paragraph.

    This is slightly recursive: it is itself passed as the ``size_op`` of an
    outer ``get_max_length`` call over all paragraphs, so each paragraph's
    local maximum is compared against the "global" maximum there.
    """
    longest_sentence = get_max_length(paragraph, size_op=np_length)
    return longest_sentence

# sanity check: should equal the largest sentence size printed for the paragraphs
# (28 in the run recorded here; the value depends on the random draw)
maximum_sentence_length = get_max_length(paragraphs, max_sent_in_par)
print("maximum_sentence_length = {}".format(maximum_sentence_length))

maximum_sentence_length = 28


In [14]:
# our goal is now to make a tensor holding every paragraph of the batch (5 here),
#   with each paragraph padded to maximum_paragraph_length sentences (6),
#   each sentence padded to maximum_sentence_length words (28 in the recorded run),
#   and each word using an embedding size of vocab_size
#   (change this for whether you're using one-hot or word embeddings here)
# i.e. for the recorded run we have a tensor of size 5 X 6 X 28 X 30
batch_size = len(paragraphs)  # derived from the data instead of hard-coded
paragraph_tensor = np.zeros((batch_size, maximum_paragraph_length, maximum_sentence_length, vocab_size))

# copy every sentence into its slot; sentence and word positions that a
# paragraph does not use keep their initial zeros and act as padding
for par_idx, paragraph in enumerate(paragraphs):
    for sent_idx, sentence in enumerate(paragraph):
        paragraph_tensor[par_idx, sent_idx, :sentence.shape[0], :] = sentence

# this will serve as a single batch that will be read in by tensorflow

In [None]:
# now let's define a computational graph that will read in every tensor batch and process them with an rnn