# Load

In [0]:
import pandas as pd

# Read the tokenized StackExchange export (the untokenized variant is published
# next to it as 'https://alexip-ml.s3.amazonaws.com/stackexchange_812k.csv.gz'),
# then shuffle all rows once with a fixed seed and renumber the index.
df = (
    pd.read_csv(
        'https://alexip-ml.s3.amazonaws.com/stackexchange_812k.tokenized.csv.gz',
        compression='gzip',
    )
    .sample(frac=1, random_state=8)
    .reset_index(drop=True)
)


# Select Title

In [0]:
# Keep only the rows whose category is 'title'.
corpus = df[df.category.isin(['title'])].copy()
# Drop very short and very long entries: keep rows with more than 10 and fewer
# than 500 tokens, then renumber the index (reset_index already returns a new frame).
corpus = corpus[(corpus.n_tokens > 10) & (corpus.n_tokens < 500)].reset_index(drop = True)

# Training on the full corpus takes too long, so work on a sample of at most
# 10,000 rows. The seed makes the sample (and therefore the vocabulary and
# every downstream result) identical on each run; the cap avoids a ValueError
# when fewer than 10,000 rows survive the filter.
red_corpus = corpus.sample(n = min(10000, len(corpus)), random_state = 8)

# Tokenize Input Sequence (X) Using Keras

In [3]:
# Pin the runtime to TensorFlow 2.x. `%tensorflow_version` is a line magic
# provided by Google Colab (it is not part of stock IPython/Jupyter).
# NOTE(review): presumably it has to run before TensorFlow is first imported
# in the session -- confirm against the Colab documentation.
%tensorflow_version 2.x
import tensorflow
# Show the active TensorFlow version so the selected release is visible in the output.
print(tensorflow.__version__)

TensorFlow 2.x selected.
2.1.0


In [46]:
from keras.preprocessing.text import Tokenizer

# Training texts: the `tokens` column of the reduced corpus.
texts = red_corpus.tokens

# Fit a fresh tokenizer so that it learns a word -> integer index mapping
# from those texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# One slot more than the number of known words: the extra entry accounts for
# index 0, which pad_sequences uses as its default fill value.
vocab_size = 1 + len(tokenizer.word_index)
print("vocabulary size: %d" % vocab_size)

vocabulary size: 6829


In [47]:
# For every text, encode it to token indexes and emit each prefix of length
# 2, 3, ..., len(text). The last element of a prefix later serves as the label
# and the elements before it as the predictors. Texts that encode to fewer
# than two tokens contribute nothing.
input_sequences = [
    encoded[:end]
    for encoded in (tokenizer.texts_to_sequences([text])[0] for text in texts)
    for end in range(2, len(encoded) + 1)
]

print('Total Sequences: %d' % len(input_sequences))


Total Sequences: 122917


# Build Fixed-Length Sequences of Token Indices

In [48]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Bring every prefix to the length of the longest one by padding on the left
# ('pre'), so that all sequences stack into a single 2-D array.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
print('Max Sequence Length: %d' % max_sequence_len)


Max Sequence Length: 36


# Split the Sequence Into Predictors and Labels

In [0]:
import keras.utils as ku 

# X (predictors): every column except the last one of each padded sequence.
predictors = input_sequences[:, :-1]
# y (label): the last column, one-hot encoded over the whole vocabulary.
label = ku.to_categorical(input_sequences[:, -1], num_classes=vocab_size)


# Define a Model

In [67]:
# Import from the public tf.keras namespace; `tensorflow.python.*` is a
# private module path that is not part of TensorFlow's supported API.
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Next-word classifier: token indexes -> 10-d embeddings -> two stacked LSTMs
# (with dropout in between) -> softmax over the vocabulary.
model = Sequential()
# The input is one token shorter than the padded sequences because the last
# token of each sequence is used as the label.
model.add(Embedding(vocab_size, 10, input_length=max_sequence_len-1))
# return_sequences=True so the second LSTM receives the full sequence of states.
model.add(LSTM(150, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# `use_multiprocessing` only concerns generator / Sequence inputs; the data
# here are in-memory arrays, so the flag is dropped (newer Keras versions
# reject it outright).
model.fit(x=predictors, y=label, batch_size=100, epochs=1, verbose=1)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Train on 122917 samples
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 35, 10)            68290     
_________________________________________________________________
lstm_10 (LSTM)               (None, 35, 150)           96600     
_________________________________________________________________
dropout_5 (Dropout)          (None, 35, 150)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               100400    
_________________________________________________________________
dense_5 (Dense)              (None, 6829)              689729    
Total params: 955,019
Trainable params: 955,019
Non-trainable params: 0
_________________________________________________________________
None


# Text Generation

In [0]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    On every step the running text is encoded with the notebook-level
    ``tokenizer``, left-padded (or truncated) to ``max_sequence_len - 1``
    token indexes and passed to ``model``. The index with the highest score
    is mapped back to its word, which is appended after a single space.

    Args:
        seed_text: Text to start generating from.
        next_words: Number of prediction steps to run.
        max_sequence_len: Length of the padded training sequences; the model
            input is one token shorter than this.
        model: Model whose ``predict`` returns one score per vocabulary
            index for each input row.

    Returns:
        ``seed_text`` followed by the generated words. If a predicted index
        has no entry in the tokenizer's vocabulary, an empty word is appended
        for that step (so only the separating space is added).
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary on every generation step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes() no longer exists in recent TensorFlow
        # releases; taking the argmax over predict() gives the same class
        # index for a softmax output layer and works on old and new versions.
        scores = model.predict(token_list, verbose=0)
        predicted = int(scores.argmax(axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [52]:
# Peek at ten randomly chosen training texts to pick a realistic seed phrase.
texts.sample(n=10)

24093    is it valid to use t - test when using ratio o...
27883    proving that the estimators of coefficients an...
16982    how does xgboost python differentiate between ...
9605      limit of c . d . f . of poisson goes to infinity
23540    r prop . test - chi - squared approximation ma...
16381    expectation of a variable inside the cumulativ...
9685     approach to scale the size of investment with ...
23266    evaluating unbiased errors on the test set whe...
20150    what would be an appropriate y axis when plott...
18104    summarize multiple distributions into one , sp...
Name: tokens, dtype: object

In [0]:
# Continue a seed phrase with ten predicted words using the trained model.
generate_text(
    seed_text="summarize multiple distributions",
    next_words=10,
    max_sequence_len=max_sequence_len,
    model=model,
)

In [0]:
# Draw up to 10,000 rows from the filtered corpus and keep their `text` column
# as a plain Python list. The fixed seed makes the subset reproducible across
# runs; the cap avoids a ValueError if the corpus has fewer than 10,000 rows.
subset_text = corpus.sample(n=min(10000, len(corpus)), random_state=8).text.to_list()