In [91]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, GRU
from keras.layers import Embedding

In [20]:
np.random.seed(123456)

In [9]:
# turn a doc into clean tokens
def clean_doc(doc):
    doc = doc.replace('\n\n', '\n').replace('\n\n', '\n').replace('\n\n', '\n')    
    tokens = doc.split() # split into tokens by white space    
    table = str.maketrans('', '', string.punctuation) # remove punctuation from each token
    tokens = [w.translate(table) for w in tokens]    
    tokens = [word for word in tokens if word.isalpha()] # remove remaining tokens that are not alphabetic    
    tokens = [word.lower() for word in tokens] # make lower case
    return tokens

In [10]:
open('the_agile_samurai.txt', encoding="utf8").read()
print(doc[:40])

Agile in a Nutshell

What would it take 


In [138]:
# clean document
tokens = clean_doc(doc)
print(tokens[:20])
print("Total Tokens: " + str(len(tokens)))
print("Unique Tokens: " + str(len(set(tokens))))

['agile', 'in', 'a', 'nutshell', 'what', 'would', 'it', 'take', 'to', 'deliver', 'something', 'of', 'value', 'each', 'and', 'every', 'week', 'the', 'question', 'we']
Total Tokens: 43146
Unique Tokens: 4007


In [15]:
SEQUENCE_LENGTH = 20
# organize into sequences of tokens
length = SEQUENCE_LENGTH + 1 # the one here is the next token which is the label in our case
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
    
print('Total Sequences: %d' % len(sequences))

Total Sequences: 43125


In [17]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

4008


In [139]:
# separate into input and output
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
Y = to_categorical(y, num_classes=vocab_size)
print(X.shape)
print(Y.shape)
print(X[0])
print(Y[0])

(43125, 20)
(43125, 4008)
[  24   12    6 1211   16   73    9   72    2  139   56    5  138  173
    3  114  225    1  446   13]
[0. 0. 0. ... 0. 0. 0.]


In [89]:
# define model
EMBEDDING_SIZE = 50

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=SEQUENCE_LENGTH))
model.add(GRU(128, return_sequences=True))
model.add(GRU(128))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


# compile model
model.compile(loss='categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])

# fit model
model.fit(X, Y, batch_size= 256, epochs=100, verbose=1)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 20, 50)            200400    
_________________________________________________________________
gru_7 (GRU)                  (None, 20, 128)           68736     
_________________________________________________________________
gru_8 (GRU)                  (None, 128)               98688     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_20 (Dense)             (None, 4008)              517032    
Total params: 901,368
Trainable params: 901,368
Non-trainable params: 0
_________________________________________________________________
None

Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fad48296c50>

In [90]:
model.save('Sample7.h5')

In [143]:
word2index = tokenizer.word_index
index2word = dict((c, w) for w, c in word2index.items())

In [144]:
# generate a sequence from a language model
def generate_seq(seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=SEQUENCE_LENGTH, truncating='pre')

        prediction = model.predict_classes(encoded, verbose=0)
        
        # map predicted word index to word        
        out_word = index2word[prediction[0]]
      
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [145]:
# select a seed text
seed_text = "agile team should be fully aware that the delivery is not"
print(seed_text)

agile team should be fully aware that the delivery is not


In [146]:
# generate new text
generated = generate_seq(seed_text, 50)
print(generated)

about them as possible by feature set decision what do this is truly an agile coach and rockstar project manager all rolled up in one agile coaches can be very helpful in getting new teams going they can start software are always kept the faint of fourteen cookies glass of


In [149]:
import csv
def saveDictionary(dict, fileName) :
    f = open(fileName, "w", newline='')
    w = csv.writer(f)
    for key, val in dict.items():
        w.writerow([key, val])
    f.close()

In [150]:
saveDictionary(word2index, "word2index.csv")
saveDictionary(index2word, "index2word.csv")