# Create A New Model

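This notebook creates a new model with a fixed architecture and trains it for `num_epochs` epochs. At the end of each epoch listed in `checkpoints` it saves the model to `<filename>-cp-<epoch>.h5`, and after every epoch it prints a sample of generated text.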

In [20]:
import string
import numpy as np
from numpy import array
import random
from random import randint
from pickle import load
from pickle import dump

import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
from keras.utils import to_categorical
from keras.models import load_model

import sys
import io, getopt, ast
from pathlib import Path

In [21]:
# ===================================================================
# parameters

# Raw text corpus that the notebook cleans and tokenizes
raw_corpus = "../datasets/harry-potter-1.txt"
# File with the pre-separated lines of 51 words
# NOTE(review): not referenced by any cell visible here - confirm it is still needed
dataset_path = "./harry-potter.txt"

# Model persistence settings; `filename` also prefixes the lines file,
# the pickled tokenizer and every checkpoint written during training
load_existing_model = False
filename = "generated-model"
load_path = "./" + filename + ".h5"
save_path = "./"

# Training schedule
num_epochs = 5
batch_size = 256
# 1-based epoch numbers after which a checkpoint is saved (here: every epoch)
checkpoints = [epoch for epoch in range(1, num_epochs + 1)]
# Number of words produced each time sample text is generated
words_to_generate = 60

# Words per training sample: input window size and number of predicted words
input_size = 50
output_size = 1


In [None]:
# =================================================
# Dataset Acquisition

# loads doc into memory
def load_doc(filename):
    """Return the entire contents of the text file at ``filename``.

    The file is opened read-only in text mode with the platform's default
    encoding.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text as a single string.
    """
    # the with-block closes the file even if reading raises
    with open(filename, 'r') as file:
        return file.read()

# turns a doc into clean tokens
def clean_doc(doc):
    """Normalise raw text into a list of lower-case alphabetic word tokens.

    Hyphens and double hyphens become word separators, every character in
    ``string.punctuation`` is stripped from each word, and any word that is
    not purely alphabetic afterwards (numbers, mixed alphanumerics, empty
    strings) is discarded.

    Args:
        doc: the raw text to clean.

    Returns:
        The cleaned tokens in their original order.
    """
    # translation table that deletes every ASCII punctuation character
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    # lower-case first, then treat '--' and '-' as spaces between words
    normalized = doc.lower().replace('--', ' ').replace('-', ' ')
    # split on whitespace and strip punctuation lazily, word by word
    stripped_words = (
        raw_word.translate(punctuation_stripper) for raw_word in normalized.split()
    )
    # keep only what is still a purely alphabetic word
    return [candidate for candidate in stripped_words if candidate.isalpha()]

# read the raw corpus from disk and reduce it to clean word tokens
doc = load_doc(raw_corpus)
tokens = clean_doc(doc)

# sanity check: a short preview plus the total and distinct token counts
preview = tokens[:20]
total_count = len(tokens)
unique_count = len(set(tokens))
print(preview)
print('Total Tokens: %d' % total_count)
print('Unique Tokens: %d' % unique_count)

In [None]:
# =================================================
# Dataset Preparation and Preservation

# organize into overlapping windows of `length` consecutive tokens; each
# window becomes one space-joined line (input_size input words followed by
# output_size target word(s))
length = input_size + output_size
# `end` is the exclusive end index of a window, so the range must run up to
# and including len(tokens); otherwise the window that finishes on the very
# last token is silently dropped
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

# save lines of text to file, one entry per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, joined by newline characters.

    An existing file is overwritten. No trailing newline is written, so
    splitting the file content on newlines returns exactly ``lines``.

    Args:
        lines: iterable of strings to write.
        filename: path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # the with-block flushes and closes the file even if writing raises
    with open(filename, 'w') as file:
        file.write(data)

# persist the generated sequences, one per line, under the model's file prefix
lines_path = filename + "-lines.txt"
save_doc(sequences, lines_path)

In [None]:
# =================================================
# Tokenize Lines, Vocab Size Determination

# load doc into memory
def load_doc(filename):
    """Return the entire contents of the text file at ``filename``.

    The file is opened read-only in text mode with the platform's default
    encoding.

    NOTE(review): this repeats the load_doc already defined in the
    "Dataset Acquisition" cell; consider keeping a single definition.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text as a single string.
    """
    # the with-block closes the file even if reading raises
    with open(filename, 'r') as file:
        return file.read()

# load the saved sequences back, one sequence per list entry
doc = load_doc(filename + "-lines.txt")
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer(filters=string.punctuation)
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# save the tokenizer; the with-block guarantees the pickle file is flushed
# and closed instead of leaving the handle open
with open(filename + '-tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# separate into input and output: every column but the last is the model
# input, the last column is the word to predict
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode the target word over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [None]:
# =================================================
# Model Creation

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout

# define model: embedding -> two stacked LSTMs -> dense head over the vocabulary
model = Sequential()
# NOTE(review): the embedding dimension reuses input_size (the window length
# constant); confirm this coupling is intentional
model.add(Embedding(vocab_size, input_size, input_length=seq_length))
# return_sequences=True so the second LSTM receives an output per time step
model.add(LSTM(96, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(96))
model.add(Dropout(0.2))
model.add(Dense(96, activation='relu'))
model.add(Dropout(0.2))
# one softmax probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model.summary()
# compile model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# ===================================================================
# Load the Dataset with the lines of text

def load_doc(filename):
    """Return the entire contents of the text file at ``filename``.

    The file is opened read-only in text mode with the platform's default
    encoding.

    NOTE(review): this repeats the load_doc already defined in earlier
    cells; consider keeping a single definition.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text as a single string.
    """
    # the with-block closes the file even if reading raises
    with open(filename, 'r') as file:
        return file.read()

# read the saved sequences file and split it back into one sequence per entry
lines_path = filename + "-lines.txt"
doc = load_doc(lines_path)
lines = doc.split('\n')

In [None]:
# ===================================================================
# Load the saved tokenizer and use it to prepare the training sequences

# load the tokenizer
# NOTE: unpickling can execute arbitrary code, so only load a tokenizer file
# that this notebook (or another trusted source) produced
with open(filename + '-tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# integer encode the lines with the loaded tokenizer
sequences = tokenizer.texts_to_sequences(lines)

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Size: %d" % vocab_size)

# separate into input and output: every column but the last is the model
# input, the last column is the word to predict
sequences = array(sequences)

X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode the target word over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [None]:
# ===================================================================
# Define the function that generates text from the model

def generate_text():
    """Generate ``words_to_generate`` words from the model, seeded by a random line.

    Reads the module-level ``lines``, ``tokenizer``, ``model``,
    ``seq_length`` and ``words_to_generate``. Each predicted word is appended
    to the running seed text so that it feeds the next prediction.

    Returns:
        The generated words joined by single spaces; the seed text itself is
        not part of the result.
    """
    # invert the vocabulary once instead of scanning word_index for every
    # generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = list()
    # select a seed text; randint includes both endpoints, so the upper bound
    # must be the last valid index or an IndexError can occur
    seed_text = lines[randint(0, len(lines) - 1)]

    for _ in range(words_to_generate):
        # encode the seed text
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # keep only the most recent seq_length words (pad if there are fewer)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # pick the most probable next word; argmax over predict() replaces
        # predict_classes(), which newer Keras releases no longer provide
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # map predicted word index to word ('' when the index has no word)
        out_word = index_to_word.get(predicted_index, '')

        # append to input
        seed_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# ===================================================================
# Define the Callback Function

def on_epoch_end(epoch, _):
    """Checkpoint the model when due, then print freshly generated text.

    Args:
        epoch: epoch index supplied by the callback; it is compared as
            ``epoch + 1`` against the 1-based entries of ``checkpoints``.
        _: second callback argument, unused.
    """
    finished_epoch = epoch + 1
    # save a checkpoint only for epochs listed in `checkpoints`
    if finished_epoch in checkpoints:
        print("Checkpointing the model...")
        model.save("%s-cp-%d.h5" % (filename, finished_epoch))
    # sample text after every epoch
    print("Generating Text...")
    print(generate_text())
    
# Wrap on_epoch_end in a LambdaCallback so it can be passed to model.fit through `callbacks`
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# ===================================================================
# Fit Model

# Train for num_epochs epochs; print_callback runs on_epoch_end after each one
model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=[print_callback],
)