In [4]:
# One-time setup: download NLTK's "punkt" tokenizer package, which the
# sentence/word tokenizers called in `preprocess` presumably depend on.
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pop_p\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# Preprocess the Text

In [42]:
# retrieve the text
def get_docs():
    """Fetch the arXiv abstracts corpus and return it as a list of lines.

    The file is obtained through ``keras.utils.get_file`` (which hands back a
    local path for the given URL) and then read in full.

    Returns:
        list[str]: every line of the file, each still carrying its trailing
        newline (the result of ``readlines()``).
    """
    from keras.utils import get_file
    print('\nFetching the text...')
    url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
    path = get_file('arxiv_abstracts.txt', origin=url)
    # Decode explicitly as UTF-8. Without `encoding`, open() uses the platform
    # locale (e.g. cp1252 on Windows), so the same file could decode differently
    # -- or fail -- depending on the machine. Undecodable bytes are replaced
    # rather than raising, keeping the load best-effort.
    with open(path, encoding='utf-8', errors='replace') as file_:
        return file_.readlines()

In [43]:
# preprocess the text
def preprocess(string):
    """Turn a block of text into a list of tokenized sentences.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    split into tokens with ``nltk.word_tokenize``. Only tokens for which
    ``str.isalpha()`` is true are kept, and those are lower-cased.

    Args:
        string: the raw text to process.

    Returns:
        list[list[str]]: one list of lower-cased alphabetic tokens per
        sentence. A sentence with no alphabetic tokens yields an empty list.
    """
    import nltk
    from nltk.tokenize import sent_tokenize

    tokenized_sentences = []
    for sentence in sent_tokenize(string):
        kept_tokens = []
        for token in nltk.word_tokenize(sentence):
            if token.isalpha():
                kept_tokens.append(token.lower())
        tokenized_sentences.append(kept_tokens)
    return tokenized_sentences

In [79]:
# Accumulate the tokenized sentences of every document into one flat list.
sentences = []
for doc in get_docs():
    sentences += preprocess(string=doc)
print("Retrieved " + str(len(sentences)) + " sentences.")


Fetching the text...
Retrieved 48168 sentences.


# Convert Text to Input

In [45]:
def flatten(orig):
    """Concatenate the elements of `orig` into a single flat list.

    Only one level of nesting is removed.

    Args:
        orig: an iterable whose items are themselves iterables.

    Returns:
        list: the items of each inner iterable, in order.
    """
    # No numpy needed here: the previous function-local `import numpy` was
    # unused and only made this pure-Python helper depend on numpy.
    return [item for inner in orig for item in inner]
    
# Chunked reduce on a thread pool: groups of sub-lists are flattened in
# rounds until at most `n_threads` lists remain, then those are flattened.
def flatten_parallel(orig, n_threads=100):
    """Flatten a list of lists, merging chunks of sub-lists on a thread pool.

    While more than `n_threads` sub-lists remain, they are grouped into
    consecutive chunks of at least two and each chunk is merged with
    `flatten` via ``pool.map``. The remaining lists are then flattened
    directly. The result is the same as ``flatten(orig)``.

    Args:
        orig: a list (it is sliced and measured with ``len``) of lists.
        n_threads: size of the thread pool; also the threshold below which no
            further parallel rounds are run.

    Returns:
        list: all inner items in their original order.

    Raises:
        ValueError: raised by the pool constructor if `n_threads` is less
            than 1.
    """
    from multiprocessing.dummy import Pool as ThreadPool

    pool = ThreadPool(n_threads)
    try:
        length = len(orig)
        while length > n_threads:
            # At least 2 per chunk so every round strictly shrinks the list.
            step = max(2, length // n_threads)
            # Slicing clamps at the end of the list, so no explicit min() is needed.
            chunks = [orig[start:start + step] for start in range(0, length, step)]
            orig = pool.map(flatten, chunks)
            length = len(orig)
    finally:
        # Always release the workers, even if a map() call raised, and wait
        # for them to exit so no threads are left behind.
        pool.close()
        pool.join()
    return flatten(orig)

In [46]:
def text_to_input(sentences, method='last_word', overlap=10, overlap_len=40):
    """Split token sequences into (input words, target word) training pairs.

    Args:
        sentences: list of token lists.
        method: ``'last_word'`` uses each sentence as one sample (all words but
            the last are the input, the last word is the target).
            ``'n_overlap'`` concatenates all sentences into one word stream and
            cuts it into windows of up to `overlap_len` words, a new window
            starting every `overlap` words; the last word of each window is
            the target.
        overlap: distance in words between consecutive window starts
            (``'n_overlap'`` only).
        overlap_len: maximum window length in words (``'n_overlap'`` only).
            Windows near the end of the stream are shorter.

    Returns:
        tuple: ``(x, y, sentences)`` where ``x[i]`` is the list of input words,
        ``y[i]`` the target word, and ``sentences[i]`` the full sequence the
        pair was taken from (the windows, for ``'n_overlap'``).

    Raises:
        ValueError: if `method` is not one of the two supported names, or if
            `overlap` / `overlap_len` is not positive in ``'n_overlap'`` mode.
    """
    if method == 'last_word':
        # An empty sentence has no last word to predict; drop it instead of
        # failing on s[-1].
        sentences = [s for s in sentences if s]
    elif method == 'n_overlap':
        if overlap < 1 or overlap_len < 1:
            raise ValueError("overlap and overlap_len must both be >= 1")
        all_words = flatten_parallel(sentences)
        # Slicing clamps at the end of the stream, so trailing windows are
        # simply shorter than overlap_len.
        sentences = [all_words[start:start + overlap_len]
                     for start in range(0, len(all_words), overlap)]
    else:
        # Previously an unknown method fell through to an UnboundLocalError.
        raise ValueError(
            "unknown method %r; expected 'last_word' or 'n_overlap'" % (method,))

    x = [s[:-1] for s in sentences]
    y = [s[-1] for s in sentences]
    return x, y, sentences

In [80]:
# NOTE: this rebinds `sentences` to the overlapping windows returned by
# text_to_input, so the cell is not idempotent -- re-running it would window
# the windows. Re-run the cell that loads `sentences` first.
x, y, sentences = text_to_input(sentences, method='n_overlap')

# Embed Words as Vectors

In [48]:
def embed_words(sentences, debug=True):
    """Build a gensim Word2Vec model from `sentences`.

    Args:
        sentences: the token sequences handed to ``gensim.models.Word2Vec``.
        debug: when true, print a message before and after the model is built.

    Returns:
        The ``gensim.models.Word2Vec`` instance.

    Note:
        ``size`` and ``iter`` are the gensim 3.x keyword names; gensim 4
        renamed them (to ``vector_size`` / ``epochs``), so this cell assumes a
        3.x install.
    """
    import gensim

    def _log(message):
        if debug:
            print(message)

    word2vec_options = {
        'size': 100,     # vector dimension
        'min_count': 1,  # min num times a word must occur in sentences to count
        'window': 5,     # num words around a word that affect its vector
        'iter': 100,
    }

    _log("Creating embeddings...")
    trained = gensim.models.Word2Vec(sentences, **word2vec_options)
    _log("Embedding model created.")
    return trained

def get_embedding_layer(embed_model):
    """Create a Keras ``Embedding`` layer initialised from `embed_model`.

    The layer's input/output dimensions are taken from the shape of
    ``embed_model.wv.vectors`` (rows = vocabulary size, columns = vector
    size), and that same matrix is passed as the layer's initial weights.

    Args:
        embed_model: object exposing the embedding matrix as ``wv.vectors``.

    Returns:
        The configured ``Embedding`` layer.
    """
    from keras.layers.embeddings import Embedding

    pretrained = embed_model.wv.vectors
    n_words, n_dims = pretrained.shape
    layer_options = {
        'input_dim': n_words,
        'output_dim': n_dims,
        'weights': [pretrained],
    }
    return Embedding(**layer_options)

def word2index(embed_model, word):
    """Look up the integer index stored for `word` in ``embed_model.wv.vocab``.

    Raises whatever the vocab mapping raises for a missing key (``KeyError``
    for a plain dict-like vocab).
    """
    vocab_entry = embed_model.wv.vocab[word]
    return vocab_entry.index

def index2word(embed_model, index):
    """Return the word stored at position `index` of ``embed_model.wv.index2word``."""
    words_by_index = embed_model.wv.index2word
    return words_by_index[index]

# Construct Model

In [49]:
def get_model(embed_model):
    """Assemble and compile the next-word prediction network.

    Layer stack, in order: the embedding layer built by
    `get_embedding_layer`, an LSTM with as many units as the embedding
    dimension, a Dense layer with one unit per vocabulary word, and a softmax
    activation. The model is compiled with the Adam optimizer and sparse
    categorical cross-entropy, i.e. targets are integer word indices.

    Args:
        embed_model: object exposing the embedding matrix as ``wv.vectors``;
            its shape supplies the vocabulary and embedding sizes.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    from keras.layers.recurrent import LSTM
    from keras.layers.embeddings import Embedding  # imported but not referenced directly here
    from keras.layers import Dense, Activation
    from keras.models import Sequential

    n_words, n_dims = embed_model.wv.vectors.shape

    net = Sequential()
    for layer in (
        get_embedding_layer(embed_model),
        LSTM(units=n_dims, input_shape=(None,)),
        Dense(units=n_words),
        Activation('softmax'),
    ):
        net.add(layer)

    net.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return net

# Fit and Predict Words

In [84]:
def fit(model, x_train, y_train):
    """Train `model` on ``(x_train, y_train)`` and hand the same model back.

    Training uses a batch size of 128 for 15 epochs with per-epoch progress
    output (``verbose=1``).

    Args:
        model: object with a Keras-style ``fit`` method.
        x_train: training inputs, passed through unchanged.
        y_train: training targets, passed through unchanged.

    Returns:
        The `model` argument itself (the return value of ``model.fit`` is
        discarded).
    """
    training_options = {'batch_size': 128, 'epochs': 15, 'verbose': 1}
    model.fit(x_train, y_train, **training_options)
    return model
    
def predict(model, embed_model, seed_word='model', length=30):
    """Generate a word sequence by repeatedly sampling the model's next word.

    Starting from `seed_word`, the whole sequence generated so far is fed to
    the model, the next word index is drawn at random from the predicted
    distribution, and the process repeats until `length` words exist.

    Args:
        model: trained network whose ``predict`` returns, per input sequence,
            one probability per vocabulary index.
        embed_model: embedding model used to map words to indices and back.
        seed_word: first word of the sequence; must be in the vocabulary.
        length: total number of words to return, including the seed.

    Returns:
        list[str]: the generated words, `seed_word` first.
    """
    # Local import: the function must not depend on a later notebook cell
    # having imported numpy as a global.
    import numpy as np

    indices = [word2index(embed_model, seed_word)]
    vocab_size = embed_model.wv.vectors.shape[0]
    while len(indices) < length:
        # Feed the sequence as ONE sample of shape (1, len). A flat list of
        # indices is instead taken as a batch of one-token samples, in which
        # case row 0 is always the prediction for the seed word alone.
        batch = np.array([indices])
        # `predict` already yields the softmax probabilities; `predict_proba`
        # is gone from recent Keras releases.
        probs = np.asarray(model.predict(batch, verbose=0)[0], dtype=np.float64)
        # float32 softmax output may miss summing to 1 by more than
        # np.random.choice tolerates, so renormalise in float64.
        probs = probs / probs.sum()
        indices.append(int(np.random.choice(vocab_size, p=probs)))
    return [index2word(embed_model, i) for i in indices]

In [51]:
# Build the Word2Vec embedding model from the current `sentences`; the two
# progress messages below are printed because `debug` defaults to True.
embed_model = embed_words(sentences)

Creating embeddings...
Embedding model created.


In [81]:
# run this code temporarily to fix input formatting!!
import numpy as np
# Keep only full-length windows: text_to_input's default overlap_len of 40
# words minus the held-out target word leaves 39 input words. The same mask
# is applied to x and y so they stay aligned; the number of short trailing
# windows is 3 or 4 depending on the corpus length, so slicing a fixed 4
# targets off the end could leave x and y with different sample counts.
full_window = [len(words) == 39 for words in x]
x = np.stack([np.array([word2index(embed_model, w) for w in words])
              for words, keep in zip(x, full_window) if keep], axis=0)
y = np.array([word2index(embed_model, w)
              for w, keep in zip(y, full_window) if keep])

In [85]:
# Build a fresh network from the embedding model and train it on the
# index-encoded inputs `x` and targets `y` prepared in the formatting cell.
model = fit(get_model(embed_model), x, y)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [78]:
# Generate a 30-word sequence starting from the default seed word 'model'.
# Sampling is random and no seed is set, so the output differs between runs.
predict(model, embed_model)

['model',
 'from',
 'will',
 'bootstrap',
 'theoretical',
 'modern',
 'filters',
 'unreliable',
 'than',
 'which',
 'hmm',
 'invariance',
 'prominent',
 'samples',
 'presents',
 'extracted',
 'certain',
 'aims',
 'straightforward',
 'indicating',
 'accuracy',
 'applied',
 'local',
 'learn',
 'suffer',
 'deep',
 'noisy',
 'of',
 'randomized',
 'martens']