Here is a code to walk through the details of designing and using our preplexity-based Long-Short Term Memory systems.

In [3]:
#importing keras layers

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import tensorflow as tf
import keras.backend as K

Using TensorFlow backend.


In [4]:
#keras works on tensorflow
import tensorflow as tf
import keras
tf.VERSION,keras.__version__


('1.10.0', '2.2.4')

In [5]:
# importing nltk

import nltk

# Making a toy language model

First we preprocess the data

In [6]:
# source text: this is gonna be our corpus
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

# integer encode text: each word gets a unique integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# create word -> word sequences: each word is linked to the its follower
sequences = list()
for i in range(1, len(encoded)):
	sequence = encoded[i-1:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# split into X and y elements
sequences = array(sequences)
X, y = sequences[:,0],sequences[:,1]
print(X.shape, y.shape)

# one hot encode outputs: each word's id becomes a position in an otherwise empty vector
y = to_categorical(y, num_classes=vocab_size)


Vocabulary Size: 22
Total Sequences: 24
(24,) (24,)


Then we write the model: here a 50 cells LSTM connected to a dense output layer

In [7]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None


Then we try to define or compute perplexity

In [8]:
#a way of defining perplexity
def perplexity(y_true, y_pred):
    crossentropy =  K.categorical_crossentropy(y_true, y_pred)
    return K.exp(crossentropy)

In [9]:
# a better way to define perplexity
def perplexity2(y_true, y_pred):
    cross_entropy = K.categorical_crossentropy(y_true, y_pred) 
    perplexity = K.pow(2.0, cross_entropy) 
    return perplexity

Then we train the model on the training part of our corpus, for 500 epochs

In [10]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',perplexity])
# fit network
model.fit(X, y, epochs=500, verbose=2)
# check perplexity slowly falling

Epoch 1/500
 - 1s - loss: 3.0909 - acc: 0.0417 - perplexity: 21.9976
Epoch 2/500
 - 0s - loss: 3.0902 - acc: 0.0833 - perplexity: 21.9807
Epoch 3/500
 - 0s - loss: 3.0895 - acc: 0.0833 - perplexity: 21.9662
Epoch 4/500
 - 0s - loss: 3.0887 - acc: 0.1667 - perplexity: 21.9493
Epoch 5/500
 - 0s - loss: 3.0880 - acc: 0.1667 - perplexity: 21.9335
Epoch 6/500
 - 0s - loss: 3.0873 - acc: 0.2083 - perplexity: 21.9174
Epoch 7/500
 - 0s - loss: 3.0865 - acc: 0.2083 - perplexity: 21.9005
Epoch 8/500
 - 0s - loss: 3.0857 - acc: 0.2083 - perplexity: 21.8835
Epoch 9/500
 - 0s - loss: 3.0849 - acc: 0.2083 - perplexity: 21.8665
Epoch 10/500
 - 0s - loss: 3.0841 - acc: 0.2083 - perplexity: 21.8494
Epoch 11/500
 - 0s - loss: 3.0833 - acc: 0.2083 - perplexity: 21.8321
Epoch 12/500
 - 0s - loss: 3.0825 - acc: 0.2083 - perplexity: 21.8147
Epoch 13/500
 - 0s - loss: 3.0817 - acc: 0.2083 - perplexity: 21.7970
Epoch 14/500
 - 0s - loss: 3.0808 - acc: 0.2083 - perplexity: 21.7791
Epoch 15/500
 - 0s - loss: 3.

<keras.callbacks.History at 0x1a2967acc0>

Finally we evaluate the model:
we have it generate a new sequence of words, or we test its perplexity on unseen data from the same corpus

In [5]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = array(encoded)
		# predict a word in the vocabulary
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [11]:
# evaluate through generating a sentence
print(generate_seq(model, tokenizer, 'to', 10))

to fetch a pail of water jack and jill came tumbling


In [12]:
#We can also evaluate the model as its perplexity in front of a sentence

data = """ Jack and Jill went up the hill to fetch a pail of water Jack fell down and broke his crown and Jill came tumbling after\n"""
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# determine the vocabulary size
#vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
	sequence = encoded[i-1:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
# split into X and y elements
sequences = array(sequences)
X1, y1 = sequences[:,0],sequences[:,1]
print(X1.shape, y1.shape)
# one hot encode outputs
y1 = to_categorical(y1, num_classes=vocab_size)

model.evaluate(X1,y1, verbose=0)

Vocabulary Size: 22
Total Sequences: 24
(24,) (24,)


[0.23141591250896454, 0.875, 1.3358511924743652]

Another way of preprocessing the data is to break it per line

# per-line model : we can break the input

In [1]:
#this time, the input is divided in segments (lines)
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

In [8]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# create line-based sequences
sequences = list()
for line in data.split('\n'):
#for line in data_sents[:100]:
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

Total Sequences: 21
Max Sequence Length: 7


In [9]:
#we can define the training split

trainX = X[:19]
trainY = y[:19]

print(X.shape, y.shape, trainX.shape, trainY.shape)

(21, 6) (21, 22) (19, 6) (19, 22)


In [400]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))  #<<<< BEWARE THE INPUT LENGTH
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 251, 10)           160       
_________________________________________________________________
lstm_15 (LSTM)               (None, 50)                12200     
_________________________________________________________________
dense_15 (Dense)             (None, 16)                816       
Total params: 13,176
Trainable params: 13,176
Non-trainable params: 0
_________________________________________________________________
None


In [420]:
# compile the network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',perplexity2])
# fit the network
model.fit(trainX, trainY, epochs=500, verbose=2)


Epoch 1/500
 - 5s - loss: 2.5639 - acc: 0.3450 - perplexity2: 40.2948
Epoch 2/500
 - 5s - loss: 2.1329 - acc: 0.3710 - perplexity2: 15.0291
Epoch 3/500
 - 5s - loss: 1.9412 - acc: 0.3900 - perplexity2: 10.8643
Epoch 4/500
 - 5s - loss: 2.0486 - acc: 0.3520 - perplexity2: 9.4152
Epoch 5/500
 - 5s - loss: 1.8381 - acc: 0.4000 - perplexity2: 6.4953
Epoch 6/500
 - 5s - loss: 1.7575 - acc: 0.4080 - perplexity2: 5.6783
Epoch 7/500
 - 5s - loss: 1.7128 - acc: 0.4190 - perplexity2: 5.3828
Epoch 8/500
 - 5s - loss: 1.6835 - acc: 0.4260 - perplexity2: 5.2291
Epoch 9/500
 - 5s - loss: 1.6521 - acc: 0.4340 - perplexity2: 5.0297
Epoch 10/500
 - 5s - loss: 1.6241 - acc: 0.4410 - perplexity2: 4.9049
Epoch 11/500
 - 5s - loss: 1.6005 - acc: 0.4380 - perplexity2: 4.7620
Epoch 12/500
 - 5s - loss: 1.5788 - acc: 0.4550 - perplexity2: 4.6052
Epoch 13/500
 - 5s - loss: 1.5574 - acc: 0.4600 - perplexity2: 4.5749
Epoch 14/500
 - 5s - loss: 1.5430 - acc: 0.4580 - perplexity2: 4.5508
Epoch 15/500
 - 5s - loss:

KeyboardInterrupt: 

In [None]:
#so this is the perplexity of the model on the data it observed
model.evaluate(trainX, trainY)

In [406]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [None]:
#show the 'quality' of the model
generate_seq(model, tokenizer, max_length-1, 'and', 4)

In [421]:
#or, again, we define a new sequence to evaluate
data = """ Jill and Jack went up the hill\n"""

In [422]:
# eval sequences
sequences = list()
for line in data.split('\n'):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
#max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X1, y1 = sequences[:,:-1],sequences[:,-1]
y1 = to_categorical(y1, num_classes=vocab_size)

model.evaluate(X1,y1, verbose=0)

Total Sequences: 11
Max Sequence Length: 252


[3.765824794769287, 0.1818181872367859, 47.031124114990234]

In [24]:
#this is how perplexed the model is on the whole data (train and test)
model.evaluate(X,y, verbose=0)

[0.19470392167568207, 1.0, 1.1529335975646973]

Finally there is one last possibility

# Two in one out : just another possibility

In [57]:


# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

# source text
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# retrieve vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)


# encode 2 words -> 1 word
sequences = list()
for i in range(2, len(encoded)):
	sequence = encoded[i-2:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))


# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)


# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)


# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X, y, epochs=500, verbose=2)

# evaluate model
print(generate_seq(model, tokenizer, max_length-1, 'Jack and', 5))
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))
print(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))
print(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))

Vocabulary Size: 22
Total Sequences: 23
Max Sequence Length: 3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_5 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
 - 2s - loss: 3.0912 - acc: 0.0435
Epoch 2/500
 - 0s - loss: 3.0906 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 3.0899 - acc: 0.1304
Epoch 4/500
 - 0s - loss: 3.0890 - acc: 0.1304
Epoch 5/500
 - 0s - loss: 3.0881 - acc: 0.1304
Epoch 6/500
 - 0s - loss: 3.0872 - acc: 0.1304
Epoch 7/500
 - 0s - loss: 3.0864 - acc: 0.1304
E

In [136]:
# Data
data = ["Two little dicky birds",
        "Sat on a wall,",
        "One called Peter,",
        "One called Paul.",
        "Fly away, Peter,",
        "Fly away, Paul!",
        "Come back, Peter,",
        "Come back, Paul."]

In [137]:
# tokenize data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
vocab = tokenizer.word_index
seqs = tokenizer.texts_to_sequences(data)

In [141]:
#prepare sentence

import numpy as np
def prepare_sentence(seq, maxlen):
    # Pads seq and slides windows
    x = []
    y = []
    for i, w in enumerate(seq):
        x_padded = pad_sequences([seq[:i]],
                                 maxlen=maxlen - 1,
                                 padding='pre')[0]  # Pads before each sequence
        x.append(x_padded)
        y.append(w)
    return x, y

# Pad sequences and slide windows
maxlen = max([len(seq) for seq in seqs])
x = []
y = []
for seq in seqs:
    x_windows, y_windows = prepare_sentence(seq, maxlen)
    x += x_windows
    y += y_windows
x = np.array(x)
y = np.array(y) - 1  # The word <PAD> does not constitute a class
y = np.eye(len(vocab))[y]  # One hot encoding

In [150]:
# Define model
model2 = Sequential()
model2.add(Embedding(input_dim=len(vocab) + 1,  # vocabulary size. Adding an
                                               # extra element for <PAD> word
                    output_dim=5,  # size of embeddings
                    input_length=maxlen - 1))  # length of the padded sequences
model2.add(LSTM(10))
model2.add(Dense(len(vocab), activation='softmax'))
model2.compile('rmsprop', 'categorical_crossentropy')

# Train network
model2.fit(x, y, epochs=1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x12696e780>

In [151]:
# Compute probability of occurence of a sentence
sentence = "come back,"
tok = tokenizer.texts_to_sequences([sentence])[0]
x_test, y_test = prepare_sentence(tok, maxlen)
x_test = np.array(x_test)
y_test = np.array(y_test) - 1  # The word <PAD> does not constitute a class
p_pred = model2.predict(x_test)  # array of conditional probabilities
vocab_inv = {v: k for k, v in vocab.items()}

# Compute product
# Efficient version: np.exp(np.sum(np.log(np.diag(p_pred[:, y_test]))))
log_p_sentence = 0
for i, prob in enumerate(p_pred):
    word = vocab_inv[y_test[i]+1]  # Index 0 from vocab is reserved to <PAD>
    history = ' '.join([vocab_inv[w] for w in x_test[i, :] if w != 0])
    prob_word = prob[y_test[i]]
    log_p_sentence += np.log(prob_word)
    print('P(w={}|h={})={}'.format(word, history, prob_word))
print('Prob. sentence: {}'.format(np.exp(log_p_sentence)))

P(w=come|h=)=0.2466132789850235
P(w=back|h=come)=0.9139593839645386
Prob. sentence: 0.22539452715216315


# Plato 

We can apply this weaponry on real data. In this case a piece of Plato

In [32]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [33]:
# load document
in_filename = 'republic.txt' # this should be a translation of choice of the Republic
doc = load_doc(in_filename)
print(doc[:200])



BOOK I

Socrates - GLAUCON 

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess; and also because
I wanted to see in what manner th


In [34]:
#tokenize cleanly
import string

# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [35]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['book', 'i', 'socrates', 'glaucon', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid', 'us', 'wait', 'f

In [123]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [36]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

Total Sequences: 118318


In [77]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
print(len(lines))

118318


In [78]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)


In [80]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [81]:
# separate into input and output
sequences = array(sequences)
sequences.shape

(118318, 51)

In [None]:
#create x and y
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [135]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            800       
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 16)                1616      
Total params: 153,316
Trainable params: 153,316
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
 11648/118318 [=>............................] - ETA: 1:41 - loss: 2.3975 - acc: 0.4469

In [46]:
# save the model to file
from pickle import dump
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [47]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [49]:
# load the model
from keras.models import load_model
model = load_model('model.h5')

In [50]:
#generate a babbling Plato

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

they remain in the upper world but this must not be allowed they must be made to descend again among the prisoners in the den and partake of their labours and honours whether they are worth having or not but is not this unjust he said ought we to give them

to be the founders of the interdicted are the same of the soul and the other of the soul which is the most miserable of the soul and the other of the soul which is the most miserable of the soul and the other of the soul and the other


# Translation corpus

Finally, we can use it on our corpus!

In [283]:
#german 'native' text

deu0 = open("train/epuds.de.pos").read() # here we take the Parts of Speech sequences, as is custom
pos = set(deu0.split())
print(len(pos))
pos

15


{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'VERB',
 'X'}

In [None]:
#checking the trans corp#
import timeit

deu = open("train/epuds.de.pos").read()
tokens = deu.split()
print(len(tokens))
print(tokens[:10])

# organize into sequences of tokens
length = 50 + 1
sequences = list()

for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'train/deupos_sequences.txt'
save_doc(sequences, out_filename)

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [234]:
# load
in_filename = 'train/deupos_sequences.txt'#'republic_sequences.txt'#
doc = load_doc(in_filename)
lines = doc.split('\n')
print(len(lines))

print(len(doc))
lines = doc.split("\n")
print(len(lines))
print(lines[10])

lines = doc.split("\n")

start_time = timeit.default_timer()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
deu_sequences = tokenizer.texts_to_sequences(lines[:40000])
elapsed = timeit.default_timer() - start_time
print(elapsed)
print(elapsed/60)
print(len(deu_sequences))

vocab_size = len(tokenizer.word_index) + 1
#vocab_size = len(poss) + 1 #(why+1?)
print(vocab_size)

deu_sequences = array(deu_sequences)
print(deu_sequences.shape)

#smaller?


9062886
2224223867
9062886
DET NOUN DET PROPN PROPN AUX PRON PRON PUNCT PROPN PROPN NOUN PUNCT ADV ADV VERB PUNCT PRON VERB PRON ADV ADV ADV ADV ADV PUNCT SCONJ PRON ADP NUM NOUN CONJ ADJ ADV DET PROPN PROPN VERB PUNCT PRON VERB PRON PUNCT PROPN NOUN PUNCT PRON NOUN PUNCT PRON PRON
298.66911351100134
4.977818558516689
40000
16
(40000, 51)


In [None]:
#creating x and y
sequences = deu_sequences[:30000]
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [235]:
# define model: this is a deeper one than usual. Feel free to make it shallo by 
# commenting out lines
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 50)            800       
_________________________________________________________________
lstm_13 (LSTM)               (None, 50, 100)           60400     
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_13 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_14 (Dense)             (None, 16)                1616      
Total params: 153,316
Trainable params: 153,316
Non-trainable params: 0
_________________________________________________________________
None


In [327]:
# compile model
#model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', perplexity2])
# fit model
model.fit(X, y, batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [328]:
#evaluate
#I take some remaining sequence
# split into input and output elements
for i in range(30000,30020):
        eva_sequences = array(deu_sequences[i:i+1])
        X1, y1 = eva_sequences[:,:-1],eva_sequences[:,-1]
        y1 = to_categorical(y1, num_classes=vocab_size)

        eva = model.evaluate(X1,y1, verbose=0)
        
        print(eva, len(eva_sequences[0]), len(deu_lines[i]))
        if eva[2]>100: print(deu_lines[i])

[8.335750579833984, 0.0, 323.0806579589844] 51 83
PRON NOUN VERB ADV DET NOUN PUNCT PRON ADP DET NOUN DET ADV ADJ NOUN VERB AUX PUNCT
[0.03127264976501465, 1.0, 1.0219131708145142] 51 51
[0.0796106830239296, 1.0, 1.0567328929901123] 51 119
[4.687814712524414, 0.0, 25.773466110229492] 51 60
[5.0307586207054555e-05, 1.0, 1.0000349283218384] 51 87
[0.15373742580413818, 1.0, 1.1124476194381714] 51 70
[14.637717247009277, 0.0, 25491.298828125] 51 46
DET NOUN ADP DET NOUN VERB ADJ CONJ NOUN PUNCT
[16.11809539794922, 0.0, 71126.296875] 51 121
PRON VERB PRON ADP NOUN ADP DET NUM NOUN DET NOUN PUNCT NOUN PUNCT NOUN CONJ NOUN PUNCT PRON PRON ADP DET NOUN VERB PUNCT
[1.1081022024154663, 0.0, 2.155618906021118] 51 63
[16.11809539794922, 0.0, 71126.296875] 51 37
ADV VERB PRON PRON PART ADV ADV PUNCT
[7.126777648925781, 0.0, 139.75709533691406] 51 49
PRON VERB PUNCT SCONJ DET ADV ADJ NOUN VERB PUNCT
[13.434011459350586, 0.0, 11067.267578125] 51 82
ADP DET NOUN DET NOUN PUNCT CONJ ADV ADP PRON ADP DE

In [269]:


# load
in_filename = 'train/eng2deu_pos_sequences.txt'#'republic_sequences.txt'#
doc = load_doc(in_filename)
lines = doc.split('\n')
print(len(lines))

print(len(doc))
lines = doc.split("\n")
print("number of lines: ",len(lines))
print(lines[10])

lines = doc.split("\n")

start_time = timeit.default_timer()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
eng2deu_sequences = tokenizer.texts_to_sequences(lines[:40000])
elapsed = timeit.default_timer() - start_time
print(elapsed)

print(len(eng2deu_sequences))

vocab_size = len(tokenizer.word_index) + 1
#vocab_size = len(poss) + 1 #(why+1?)
print(vocab_size)

eng2deu_sequences = array(eng2deu_sequences)
eng2deu_sequences.shape

#smaller?
eng2deu_sequences = eng2deu_sequences[:30000]



3677632
900822859
number of lines:  3677632
VERB PUNCT PRON ADP PRON PROPN NOUN PART VERB PUNCT PRON AUX PRON NOUN ADP ADJ NOUN ADP PRON ADP NOUN PRON NOUN DET NOUN VERB PUNCT ADP ADJ NOUN VERB PRON PROPN ADP DET ADP DET NOUN PRON ADJ NOUN ADJ NOUN CONJ VERB PRON ADP PRON NOUN ADP ADJ
85.28726735099917
40000
16


In [293]:
#individual
for i in range(1200,2220):
        eva_sequences = array(eng2deu_sequences[i:i+1])
        X1, y1 = eva_sequences[:,:-1],eva_sequences[:,-1]
        y1 = to_categorical(y1, num_classes=vocab_size)

        eva = model.evaluate(X1,y1, verbose=0)
        
        print(eva, len(eva_sequences[0]), len(deu_lines[i]))
        if eva[2]>3: print(deu_lines[i])

[0.007377952802926302, 1.0, 1.0051270723342896] 51 70
[0.0988958328962326, 1.0, 1.0709534883499146] 51 75
[0.0001467574038542807, 1.0, 1.0001016855239868] 51 28
[0.007961110211908817, 1.0, 1.0055334568023682] 51 53
[0.02812901884317398, 1.0, 1.0196888446807861] 51 58
[0.04729606211185455, 1.0, 1.0333263874053955] 51 115
[0.029552677646279335, 1.0, 1.0206955671310425] 51 77
[0.004812656901776791, 1.0, 1.0033414363861084] 51 63
[0.05605776607990265, 1.0, 1.039621114730835] 51 80
[0.05483783781528473, 1.0, 1.0387423038482666] 51 33
[1.392953634262085, 0.0, 2.626157760620117] 51 81
[4.768382950715022e-06, 1.0, 1.0000033378601074] 51 156
[0.011362489312887192, 1.0, 1.0079070329666138] 51 147
[0.0011420808732509613, 1.0, 1.0007919073104858] 51 170
[0.008442174643278122, 1.0, 1.0058687925338745] 51 206
[0.03188585117459297, 1.0, 1.0223476886749268] 51 200
[0.04800765588879585, 1.0, 1.0338362455368042] 51 148
[0.015820838510990143, 1.0, 1.0110265016555786] 51 81
[2.837221290974412e-05, 1.0, 1.

# Breaking the lines

In [599]:
#german pos sequence
data = open("train/epuds.de.pos").read()

In [600]:
#we need a common max lenght for both English and German corpora. 
#Emipirically, I will fix it to 252, the longest sequence of both corpora.

max_length = 252 #max([len(seq) for seq in sequences])

In [601]:
# create line-based sequences

sequences = list()
for line in data.split('\n'):
#for line in data_sents[:100]:
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)


Total Sequences: 8635158
Max Sequence Length: 252


In [804]:
#we set a training part. Very small here (I am toying)
trainX = X[:6000]
trainY = y[:6000]

print(X.shape, y.shape, trainX.shape, trainY.shape)

(8635158, 251) (8635158, 16) (6000, 251) (6000, 16)


In [849]:
# define model
from keras.layers import Dropout

model = Sequential()
#model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(Embedding(vocab_size, 30, input_length=max_length-1))
#model.add(LSTM(100, return_sequences=True))
#model.add(Dropout(.2))
model.add(LSTM(20)) #100
model.add(Dense(10, activation='relu')) #100
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 251, 10)           160       
_________________________________________________________________
lstm_52 (LSTM)               (None, 20)                2480      
_________________________________________________________________
dense_60 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_61 (Dense)             (None, 16)                176       
Total params: 3,026
Trainable params: 3,026
Non-trainable params: 0
_________________________________________________________________
None


In [850]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',perplexity2])

In [852]:
# fit network
model.fit(trainX, trainY, epochs=20, verbose=2,validation_split=.8) #overfitting happens fast!
# for the record: 
#1000X, first iteration: 11-16s
#2000X, first iteration: 25-28ss
#4000X, first iteration: 52s
#6000X, 70s

Train on 1199 samples, validate on 4801 samples
Epoch 1/20
 - 11s - loss: 2.6981 - acc: 0.1334 - perplexity2: 6.5978 - val_loss: 2.6014 - val_acc: 0.1250 - val_perplexity2: 6.5653
Epoch 2/20
 - 10s - loss: 2.5589 - acc: 0.1334 - perplexity2: 6.3211 - val_loss: 2.5214 - val_acc: 0.1250 - val_perplexity2: 6.1824
Epoch 3/20
 - 10s - loss: 2.5034 - acc: 0.1735 - perplexity2: 6.2316 - val_loss: 2.4831 - val_acc: 0.1795 - val_perplexity2: 6.0623
Epoch 4/20
 - 10s - loss: 2.4719 - acc: 0.1610 - perplexity2: 6.1001 - val_loss: 2.4567 - val_acc: 0.1795 - val_perplexity2: 6.0278
Epoch 5/20
 - 10s - loss: 2.4493 - acc: 0.1610 - perplexity2: 5.9478 - val_loss: 2.4345 - val_acc: 0.1829 - val_perplexity2: 5.9847
Epoch 6/20
 - 10s - loss: 2.4244 - acc: 0.2110 - perplexity2: 5.9565 - val_loss: 2.4057 - val_acc: 0.2258 - val_perplexity2: 5.8901
Epoch 7/20
 - 10s - loss: 2.3969 - acc: 0.2319 - perplexity2: 5.9580 - val_loss: 2.3676 - val_acc: 0.2456 - val_perplexity2: 5.7939
Epoch 8/20
 - 10s - loss: 2.

<keras.callbacks.History at 0x1b054f6f98>

In [853]:
#a new piece of data
data = """PROPN VERB ADJ DET NOUN VERB PUNCT PROPN VERB PRON PRON PUNCT""".lower() # a specific, German sequence from the 'test'

In [854]:
# evaluating this specific sequence
sequences = list()
for line in data.split('\n'):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
#max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X1, y1 = sequences[:,:-1],sequences[:,-1]
y1 = to_categorical(y1, num_classes=vocab_size)

model.evaluate(X1,y1, verbose=0)


Total Sequences: 11
Max Sequence Length: 252


[2.3281173706054688, 0.1818181872367859, 6.2405290603637695]

In [855]:
#evaluating the model on an equally small slice of the corpus
model.evaluate(X[6000:12000],y[6000:12000], verbose=0) # should be similar to the scale of the original perplexity

[2.1413686943054198, 0.30083333333333334, 5.670599614461263]

In [812]:
#english model
dataE = open("train/epuds.en.diff.pos").read()

In [877]:
#create the sequences
import random

sequences = list()
for line in dataE.split('\n'):
#for line in data_sents[:100]:
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
#max_length = max([len(seq) for seq in sequences]) > we keep the original length

sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
#random.shuffle(sequences)
sequences = array(sequences)
XE, yE = sequences[:,:-1],sequences[:,-1]
yE = to_categorical(yE, num_classes=vocab_size)

Total Sequences: 5727538
Max Sequence Length: 252


In [878]:
#train slice

trainXE = XE[:6000]
trainYE = yE[:6000]

print(XE.shape, yE.shape, trainXE.shape, trainYE.shape)

(5727538, 251) (5727538, 16) (6000, 251) (6000, 16)


In [902]:
# define model
modelE = Sequential()
modelE.add(Embedding(vocab_size, 50, input_length=max_length-1))
#model.add(Embedding(vocab_size, 50, input_length=max_length-1))
modelE.add(LSTM(100, return_sequences=True))
modelE.add(Dropout(.2))
modelE.add(LSTM(100))
modelE.add(Dense(100, activation='relu'))
modelE.add(Dense(vocab_size, activation='softmax'))
print(modelE.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_40 (Embedding)     (None, 251, 50)           800       
_________________________________________________________________
lstm_60 (LSTM)               (None, 251, 100)          60400     
_________________________________________________________________
dropout_8 (Dropout)          (None, 251, 100)          0         
_________________________________________________________________
lstm_61 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_72 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_73 (Dense)             (None, 16)                1616      
Total params: 153,316
Trainable params: 153,316
Non-trainable params: 0
_________________________________________________________________
None

In [903]:
# compile network
modelE.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',perplexity2])

In [904]:
# fit network
modelE.fit(trainXE, trainYE, epochs=2, verbose=2, validation_split=.8)
#for the record: 2000X takes 26ss first iteration

Train on 1199 samples, validate on 4801 samples
Epoch 1/2
 - 31s - loss: 2.4666 - acc: 0.1985 - perplexity2: 6.2917 - val_loss: 2.4056 - val_acc: 0.1189 - val_perplexity2: 6.0951
Epoch 2/2
 - 23s - loss: 2.3023 - acc: 0.1935 - perplexity2: 5.8183 - val_loss: 2.3880 - val_acc: 0.2304 - val_perplexity2: 6.2839


<keras.callbacks.History at 0x1afd5af518>

In [899]:
#as above
data = """PROPN PUNCT PROPN PUNCT PROPN PROPN CONJ PROPN PROPN AUX VERB PRON NOUN VERB DET NOUN ADP ADJ NOUN PRON AUX VERB ADP ADP ADV PUNCT""".lower()

In [900]:
#evaluate the data
sequences = list()
for line in data.split('\n'):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
#max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X1, y1 = sequences[:,:-1],sequences[:,-1]
y1 = to_categorical(y1, num_classes=vocab_size)

modelE.evaluate(X1,y1, verbose=0)

Total Sequences: 25
Max Sequence Length: 252


[3.1158721446990967, 0.1599999964237213, 14.805424690246582]

In [901]:
#evaluate on other english sequences
modelE.evaluate(XE[6000:12000],yE[6000:12000], verbose=0) #perplexity is a bit higher (overfitting already kicked in) but comparable

[2.4531072902679445, 0.21366666666666667, 8.294150867462157]

In [885]:
#how does the german model perform?
model.evaluate(XE[2000:4000],yE[2000:4000], verbose=0) #the German model is very surprised by the english sentence (and this is good)

[2.4642726554870604, 0.2245, 8.295025939941405]

In [823]:
#data
deutra = open("train/epuds.en-de.de.pos").read() #this is German translation

In [824]:
#prepare the data 
sequences = list()
for line in deutra.split('\n'):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
Xdeutra, ydeutra = sequences[:,:-1],sequences[:,-1]
ydeutra = to_categorical(ydeutra, num_classes=vocab_size)

Total Sequences: 3533328
Max Sequence Length: 252


In [886]:
#How does German perform on the German translations? (moving small scale)
model.evaluate(Xdeutra[:2000],ydeutra[:2000], verbose=0) #German models seems a bit surprised by the translation

[2.1503540115356445, 0.3135, 5.839153644561768]

In [887]:
#What about English?
modelE.evaluate(Xdeutra[:2000],ydeutra[:2000], verbose=0) #English is very surprised! (good)

[2.705368900299072, 0.1495, 8.670412788391113]

In [888]:
#For a comparison, this is the surprisal of English on English:
modelE.evaluate(XE[4000:6000],yE[4000:6000], verbose=0)

[2.364716844558716, 0.22, 6.203449783325195]

In [889]:
#Same with German
model.evaluate(X[4000:6000],y[4000:6000], verbose=0)

[2.1052982006073, 0.303, 5.621415363311767]

and so forth

In [890]:
#In "conclusion"
#1. English on general English
print(modelE.evaluate(XE[6000:8000],yE[6000:8000], verbose=0)[2])
#2. English on to-be-translated English
#3. German on general German
print(model.evaluate(X[6000:8000],y[6000:8000], verbose=0)[2])
#4. German on German translation
print(model.evaluate(Xdeutra[:3000],ydeutra[:3000], verbose=0)[2])


[2.4003945388793944, 0.2135, 6.539171417236328]
[2.1339319400787353, 0.303, 5.613580635070801]
[2.140509120941162, 0.31033333333333335, 5.8668851509094235]


In [891]:
#German on English
print(model.evaluate(XE[6000:8000],yE[6000:8000], verbose=0))

[2.513754270553589, 0.226, 8.65403955078125]


it can be a good idea to save the models

In [831]:
#let's save these guys
from keras.models import load_model

#model.save('German_deep_25.h5')  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model
modelE.save('English_deep_25.h5') 
# returns a compiled model
# identical to the previous one
#model = load_model('my_model.h5')

we can look at the single sentences

In [637]:
#qualitative: English and German perplexity, sentence by sentence!
for i in range(100): 
    xprv,yprv = Xdeutra[i:i+1],ydeutra[i:i+1]
    engper = modelE.evaluate(xprv,yprv,verbose=0)[2]
    gerper = model.evaluate(xprv,yprv,verbose=0)[2]
    print(engper)
    print(gerper)
    if gerper>engper: print("shining through?")
    print("\n")

4.629480838775635
1.8778960704803467


4.333113670349121
2.245802164077759


12.823036193847656
13.32221794128418
shining through?


2.006741762161255
2.4519245624542236
shining through?


4.848020076751709
2.7908830642700195


14.375545501708984
3.5838027000427246


5.641058921813965
3.1994900703430176


15.491340637207031
4.042491912841797


7.334090232849121
4.221426486968994


1.638213872909546
3.4203832149505615
shining through?


7.282899379730225
1.5249924659729004


7.253878116607666
2.966615915298462


5.923389434814453
7.25738525390625
shining through?


9.17809772491455
6.407848834991455


16.424650192260742
6.8134989738464355


3.1725151538848877
3.2996888160705566
shining through?


10.633072853088379
33.40980529785156
shining through?


1.2891801595687866
5.816845417022705
shining through?


5.377060413360596
1.453563928604126


7.2516703605651855
1.8786709308624268


482.2633056640625
2.143958568572998


18.308876037597656
4.985576152801514


7.321451187133789
5.77915763

In [838]:
#So! what does english think of this english Machine translation of Kafka?
modelE.evaluate(Xmet[:4000],ymet[:4000], verbose=0) #English test: English is quite surprised

[2.9305297107696533, 0.2565, 112.8181590499878]

In [839]:
#what does German think?
model.evaluate(Xmet[:4000],ymet[:4000], verbose=0) #German test

[2.847840805053711, 0.24175, 43.55324765014648]

In [673]:
#German is less surprised than English!? 

In [674]:
#let's try the human trans

In [840]:
#human translation
mymeta = deutra = open("metamor.rtf").read()
#sentok
mymetasen = nltk.sent_tokenize(mymeta)
#pos
mymetpos = [nltk.pos_tag(nltk.wordpunct_tokenize(sen), tagset='universal') for sen in mymetasen]
#only pos
mymetpos = [[tup[1] for tup in sen] for sen in mymetpos]
#no punct (to keep the original format)
newseq = []
for seq in mymetpos:
    prv=[]
    for el in seq:
        if el == '.': prv.append('PUNCT')
        elif el == 'PRT': prv.append('PART')
        else: prv.append(el)
    newseq.append(prv)
#then again
mymetpos = [" ".join(seq) for seq in newseq]

print(mymetpos[-1])

CONJ PRON VERB ADP DET NOUN ADP PRON ADJ NOUN CONJ ADJ NOUN DET ADP DET NOUN ADP PRON NOUN PRON NOUN NOUN PART PRON NOUN ADJ CONJ VERB PRON ADJ NOUN VERB


In [841]:
# check

#max_length = 252 #has to be the same as the English model. Convoluted, I know!
#mah?

sequences = list()
for line in mymetpos:#data.split("\n"):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
#max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
Xmethum, ymethum = sequences[:,:-1],sequences[:,-1]
ymethum = to_categorical(ymethum, num_classes=vocab_size) # i have NO IDEA what i am doing here!!

Total Sequences: 24550
Max Sequence Length: 252


In [842]:
#test size
n=6000

In [843]:
#double eval for english model
modelE.evaluate(Xmethum[:n],ymethum[:n], verbose=0), modelE.evaluate(Xmet[:n],ymet[:n], verbose=0) #English test 

([3.1389377161661782, 0.2395, 126.3743952738444],
 [2.9279761555989583, 0.25883333333333336, 110.43650953928629])

In [844]:
#double eval for german model
model.evaluate(Xmethum[:n],ymethum[:n], verbose=0), model.evaluate(Xmet[:n],ymet[:n], verbose=0) #German test

([3.106140266418457, 0.21966666666666668, 40.23620025634766],
 [2.8224224751790365, 0.2425, 42.021267707824705])

In [None]:
## German is less surprised about the translations. Is this shining through a lot? Is it not shining through at all? 