In [30]:
import string
import numpy as np
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, GRU
from keras.layers import Embedding

In [13]:
np.random.seed(12345)

In [14]:
# turn a doc into clean tokens
def clean_doc(doc):
    doc = doc.replace('\n\n', '\n').replace('\n\n', '\n').replace('\n\n', '\n')    
    tokens = doc.split() # split into tokens by white space    
    table = str.maketrans('', '', string.punctuation) # remove punctuation from each token
    tokens = [w.translate(table) for w in tokens]    
    tokens = [word for word in tokens if word.isalpha()] # remove remaining tokens that are not alphabetic    
    tokens = [word.lower() for word in tokens] # make lower case
    return tokens

In [16]:
doc = open('treasure-island.txt', encoding="utf8").read()
print(doc[:40])

﻿The Old Sea-dog at the Admiral Benbow




In [17]:
# clean document
tokens = clean_doc(doc)
print(tokens[:20])
print("Total Tokens: " + str(len(tokens)))
print("Unique Tokens: " + str(len(set(tokens))))

['old', 'seadog', 'at', 'the', 'admiral', 'benbow', 'squire', 'trelawney', 'dr', 'livesey', 'and', 'the', 'rest', 'of', 'these', 'gentlemen', 'having', 'asked', 'me', 'to']
Total Tokens: 65486
Unique Tokens: 6371


In [18]:
SEQUENCE_LENGTH = 50
# organize into sequences of tokens
length = SEQUENCE_LENGTH + 1 # the one here is the next token which is the label in our case
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
    
print('Total Sequences: %d' % len(sequences))

Total Sequences: 65435


In [19]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

6372


In [23]:
# separate into input and output
sequences = np.array(sequences)

In [24]:
X, y = sequences[:,:-1], sequences[:,-1]

In [25]:
Y = to_categorical(y, num_classes=vocab_size)

print(X.shape)
print(Y.shape)
print(X[0])
print(Y[0])

(65435, 50)
(65435, 6372)
[  76 6371   21    1  402  442   94  319  209  190    2    1  251    4
  126  490  566  220   23    6 3064   57    1  259 6369   67  208  119
   38    1  745    6    1  198  744  188   82   20    1 1341    4    1
  119    2   10  103 1143   40   47   75]
[0. 0. 0. ... 0. 0. 0.]


In [34]:
# define model
EMBEDDING_SIZE = 50

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=SEQUENCE_LENGTH))
model.add(GRU(128)) #, return_sequences=True
#model.add(GRU(128))
model.add(Dense(2048, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


# compile model
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.01), metrics=['accuracy'])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 50)            318600    
_________________________________________________________________
gru_7 (GRU)                  (None, 128)               68736     
_________________________________________________________________
dense_9 (Dense)              (None, 2048)              264192    
_________________________________________________________________
dropout_5 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 6372)              13056228  
Total params: 13,707,756
Trainable params: 13,707,756
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
# fit model
model.fit(X, Y, batch_size= 512, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f6b46cd3c50>

In [36]:
# fit model
model.fit(X, Y, batch_size= 512, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6b1c152048>

In [37]:
model.save('Sample7.h5')

In [38]:
word2index = tokenizer.word_index
index2word = dict((c, w) for w, c in word2index.items()) # index2word is available directly on tokenizer on new keras versions

In [39]:
# generate a sequence from a language model
def generate_seq(seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=SEQUENCE_LENGTH, truncating='pre')

        prediction = model.predict_classes(encoded, verbose=0)
        
        # map predicted word index to word        
        out_word = index2word[prediction[0]]
      
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [46]:
# select a seed text
seed_text = "It was a dark night near the scary shore and the ships were resting as giant monsters we had to move slowly and take care of each other as there was lots of injuries the wind has been in our favour"
print(len(seed_text.split()))

41


In [48]:
# generate new text
generated = generate_seq(seed_text, 200)
print(generated)

we began to scramble out of a bribe want none of your own sir you have the last good as for that lot and their council mark me theyre outright fools and cowards ill save your lifeif so do i believe this crew youll believe and i must know where is one can youll up and i dont like an echo than he sailed with of it was said i would go back to captain kidds anchorage ran from the twopeaked hill upon the captain and we could fight for the ship but i was determined to go down the squire and dr livesey were seated on either a harbourbar my mother pulled it up with impatience and there lay before us the last things in the centre after one lad went to the echoes and one of the cocks with his hat having fallen against a big seaman obrien carried his cutlass did you may suppose that he took it a deal more rum and i dare say true i had gone up to execution dock by thunder so do and it why its here englands men will you ring that bell mr dance must have shown for course


In [49]:
import csv
def saveDictionary(dict, fileName) :
    f = open(fileName, "w", newline='')
    w = csv.writer(f)
    for key, val in dict.items():
        w.writerow([key, val])
    f.close()

In [50]:
saveDictionary(word2index, "word2index.csv")
saveDictionary(index2word, "index2word.csv")