# Preprocess the Text

In [1]:
# retrieve the text
def get_docs():
    """Fetch the arXiv abstracts corpus and return it as a list of lines.

    The file is obtained through ``keras.utils.get_file``, which hands back a
    local path that is then read in full.

    Returns:
        list[str]: every line of the file, as produced by ``readlines()``
        (line terminators are kept).
    """
    # Imported inside the function, like the other third-party imports in
    # this notebook.
    from keras.utils import get_file
    print('\nFetching the text...')
    url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
    path = get_file('arxiv_abstracts.txt', origin=url)
    # Decode explicitly: without `encoding`, open() uses the platform locale,
    # so the same file could decode differently (or fail) on another machine.
    # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
    with open(path, encoding='utf-8') as file_:
        return file_.readlines()

In [2]:
# preprocess the text
def preprocess(string):
    """Split text into sentences of lowercase, purely alphabetic tokens.

    Args:
        string: raw text to tokenize.

    Returns:
        list[list[str]]: one inner list per sentence returned by
        ``sent_tokenize``. Each inner list holds the lowercased tokens from
        ``nltk.word_tokenize`` for which ``str.isalpha()`` is true; a
        sentence without any such token yields an empty list.
    """
    import nltk
    from nltk.tokenize import sent_tokenize

    tokenized = []
    for sentence in sent_tokenize(string):
        kept = []
        for token in nltk.word_tokenize(sentence):
            # Numbers, punctuation and mixed tokens such as "x1" are dropped.
            if token.isalpha():
                kept.append(token.lower())
        tokenized.append(kept)
    return tokenized

In [3]:
# Tokenize each line of the corpus and pool every sentence into one flat list.
sentences = []
for doc in get_docs():
    sentences += preprocess(string=doc)  # list += list extends in place
print("Retrieved " + str(len(sentences)) + " sentences.")

Using TensorFlow backend.



Fetching the text...
Retrieved 48168 sentences.


# Convert Text to Input

In [4]:
def flatten(orig):
    """Concatenate the items of every element of ``orig`` into one new list.

    Exactly one level of nesting is removed; deeper nesting is left as is.

    Args:
        orig: iterable whose elements are themselves iterable.

    Returns:
        list: the items of each element, in order. ``orig`` is not modified.
    """
    return [item for inner in orig for item in inner]
    
# Parallel tree reduce: flatten chunks on a thread pool until at most n_threads remain
def flatten_parallel(orig, n_threads=100):
    """Flatten a list of lists by one level, using a pool of worker threads.

    While more than ``n_threads`` elements remain, the list is cut into
    chunks of ``max(2, length // n_threads)`` consecutive elements and every
    chunk is passed to ``flatten`` on the pool, which merges the chunk's
    elements into a single list. Whatever remains is flattened on the
    calling thread.

    Args:
        orig: list whose elements are lists (it is measured with ``len`` and
            sliced).
        n_threads: size of the thread pool, and the element count at or
            below which no further parallel pass is made.

    Returns:
        list: the same result as ``flatten(orig)``.

    Raises:
        ValueError: if ``n_threads`` is smaller than 1.
    """
    from multiprocessing.dummy import Pool as ThreadPool

    if n_threads < 1:
        # Same error the pool constructor raises for an invalid size.
        raise ValueError("Number of processes must be at least 1")

    length = len(orig)
    if length <= n_threads:
        # Nothing to split up, so do not start a pool at all.
        return flatten(orig)

    pool = ThreadPool(n_threads)
    try:
        while length > n_threads:
            # At least two elements per chunk, so every pass shrinks the list.
            step = max(2, length // n_threads)
            # Slicing clamps at the end of the list; the last chunk may be short.
            chunks = [orig[start:start + step] for start in range(0, length, step)]
            orig = pool.map(flatten, chunks)
            length = len(orig)
    finally:
        # Release the worker threads even if a chunk raised.
        pool.close()
        pool.join()
    return flatten(orig)

In [5]:
def text_to_input(sentences, method='last_word', overlap=10, overlap_len=40):
    """Turn tokenized text into (context, last-word) training pairs.

    Args:
        sentences: list of token lists.
        method: how samples are formed.
            'last_word' - every non-empty sentence is one sample.
            'n_overlap' - all sentences are joined into one word stream
            (via ``flatten_parallel``), which is cut into windows of up to
            ``overlap_len`` words, a new window starting every ``overlap``
            words.
        overlap: distance in words between the starts of two consecutive
            windows ('n_overlap' only).
        overlap_len: maximum number of words in a window ('n_overlap' only).

    Returns:
        tuple ``(x, y, samples)``: ``x[i]`` is sample ``i`` without its last
        word, ``y[i]`` is that last word, and ``samples`` is the list of
        samples the pairs were built from (the sentences for 'last_word',
        the windows for 'n_overlap').

    Raises:
        ValueError: if ``method`` is not one of the two supported names.
    """
    if method == 'last_word':
        # An empty token list has no last word; skip it instead of failing
        # on s[-1] below.
        samples = [s for s in sentences if s]
    elif method == 'n_overlap':
        all_words = flatten_parallel(sentences)
        # Slicing clamps at the end of the stream, so the final windows can
        # be shorter than overlap_len.
        samples = [all_words[start:start + overlap_len]
                   for start in range(0, len(all_words), overlap)]
    else:
        raise ValueError(
            "unknown method %r; expected 'last_word' or 'n_overlap'" % (method,))

    x = [s[:-1] for s in samples]
    y = [s[-1] for s in samples]
    return x, y, samples

In [6]:
# Build overlapping-window training pairs from the tokenized sentences.
# NOTE: this rebinds `sentences` from the tokenized sentences to the list of
# windows returned by text_to_input, so re-running this cell on its own feeds
# the windows back in as input.
x, y, sentences = text_to_input(sentences, method='n_overlap')

# Embed Words as Vectors

In [7]:
def embed_words(sentences, debug=True):
    """Train a gensim Word2Vec model on the given token lists.

    Args:
        sentences: iterable of token lists, passed straight to ``Word2Vec``.
        debug: when true, print a message before and after training.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.

    NOTE(review): ``size`` and ``iter`` look like the pre-4.0 gensim keyword
    names - confirm against the installed gensim version.
    """
    import gensim

    def log(message):
        # Progress output is opt-out through `debug`.
        if debug:
            print(message)

    word2vec_options = dict(
        size=100,     # vector dimension
        min_count=1,  # min num times a word must appear in sentences to count
        window=5,     # num words around a word that affect its vector
        iter=100,
    )

    log("Creating embeddings...")
    embed_model = gensim.models.Word2Vec(sentences, **word2vec_options)
    log("Embedding model created.")
    return embed_model

def get_embedding_layer(embed_model):
    """Create a Keras ``Embedding`` layer initialised from the word vectors.

    Args:
        embed_model: object exposing the vector matrix as ``wv.vectors``
            with a two-element ``shape`` (rows, columns).

    Returns:
        An ``Embedding`` layer whose ``input_dim`` is the number of rows,
        whose ``output_dim`` is the number of columns, and whose initial
        weights are that matrix.
    """
    from keras.layers.embeddings import Embedding

    pretrained = embed_model.wv.vectors
    n_words, n_dims = pretrained.shape
    return Embedding(input_dim=n_words, output_dim=n_dims, weights=[pretrained])

def word2index(embed_model, word):
    """Return the integer index stored for ``word`` in the model vocabulary.

    The entry is looked up in ``embed_model.wv.vocab`` and its ``index``
    attribute is returned; a missing word propagates the lookup error.
    """
    vocab_entry = embed_model.wv.vocab[word]
    return vocab_entry.index

def index2word(embed_model, index):
    """Return the word stored at position ``index`` of the model vocabulary.

    This reads ``embed_model.wv.index2word`` and is the inverse lookup of
    ``word2index``.
    """
    words_by_index = embed_model.wv.index2word
    return words_by_index[index]

# Construct Model

In [10]:
def model(embed_model):
    """Build and compile the network that predicts a word index from a context.

    Layers, in the order they are added:
      1. the embedding layer returned by ``get_embedding_layer(embed_model)``;
      2. an LSTM with as many units as the embedding has dimensions;
      3. a Dense layer with one unit per vocabulary word;
      4. a softmax activation.

    The network is compiled with the Adam optimizer and sparse categorical
    cross-entropy loss.

    Args:
        embed_model: object exposing the word-vector matrix as
            ``wv.vectors`` with shape (vocabulary size, embedding size).

    Returns:
        The compiled Keras ``Sequential`` model.

    Note:
        The training cell of this notebook rebinds the global name ``model``
        to the fitted network, which hides this function until its cell is
        run again.
    """
    from keras.layers.recurrent import LSTM
    from keras.layers import Dense, Activation
    from keras.models import Sequential

    vocab_size, embedding_size = embed_model.wv.vectors.shape

    # Named `network` so the local does not shadow this function's own name.
    network = Sequential()
    network.add(get_embedding_layer(embed_model))
    network.add(LSTM(units=embedding_size, input_shape=(None,)))
    network.add(Dense(units=vocab_size))
    network.add(Activation('softmax'))
    network.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    return network

# Fit and Predict Words

In [11]:
def fit(model, x_train, y_train, batch_size=8192, epochs=500, verbose=1):
    """Train ``model`` in place and return it.

    Args:
        model: object with a Keras-style ``fit`` method.
        x_train: training inputs, forwarded unchanged.
        y_train: training targets, forwarded unchanged.
        batch_size: samples per gradient update (default 8192).
        epochs: number of passes over the training data (default 500).
        verbose: verbosity flag forwarded to ``model.fit`` (default 1).

    Returns:
        The same ``model`` object, after ``model.fit`` has run.
    """
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
    )
    return model
    
def predict(model, x_test):
    """Run ``model.predict`` on ``x_test``, print the result and return it.

    Args:
        model: object with a Keras-style ``predict`` method.
        x_test: inputs forwarded unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(x_test)`` returned, so callers can use the
        predictions instead of only seeing them printed.
    """
    predictions = model.predict(x_test)
    print(predictions)
    return predictions

In [12]:
# Train the Word2Vec embeddings on `sentences`, which the text_to_input cell
# has rebound to the overlapping windows.
embed_model = embed_words(sentences)

Creating embeddings...
Embedding model created.


In [13]:
import numpy as np
# Replace every word by its vocabulary index: one integer array per context
# window in x, and a single array of target indices in y.
# NOTE: x and y are rebound from words to indices, so this cell cannot be
# re-run without first re-running the text_to_input cell.
x = [
    np.array([word2index(embed_model, token) for token in window])
    for window in x
]
y = np.array([word2index(embed_model, target) for target in y])

In [14]:
# Keep only full-length samples (temp fix). Windows cut short at the end of
# the word stream give context arrays with fewer than 39 entries
# (overlap_len - 1), which cannot be stacked into one 2-D array.
# y is filtered with the same mask so every target stays paired with its
# context row; filtering x alone leaves the two with different lengths.
full_window = np.array([a.shape[0] == 39 for a in x], dtype=bool)
x = np.stack([a for a, keep in zip(x, full_window) if keep], axis=0)
y = y[full_window]

In [None]:
import numpy as np
# Build the network and train it on the first 100000 (context, target) pairs.
# NOTE: this rebinds the global name `model` from the builder function to the
# fitted network returned by fit(); re-running this cell would then call the
# fitted network instead of the builder, so re-run the `def model` cell first.
model = fit(model(embed_model), x[:100000], y[:100000])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500