LSTM: Train an LSTM to mimic Russell’s style and thoughts

In [38]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils


# Preprocessing

## i create a corpus

In [39]:
# load ascii text and covert to lowercase
book1 = "./Data/The Problems of Philosophy.txt"
book2 = "./Data/The Analysis of Mind.txt"
book3 = "./Data/Mysticism and Logic and Other Essays.txt"
book4 = "./Data/Our Knowledge of the External World as a Field for Scientific Method in Philosophy.txt"

file1 = open(book1)
file2 = open(book2)
file3 = open(book3)
file4 = open(book4)

text1 = file1.read()
text2 = file2.read()
text3 = file3.read()
text4 = file4.read()

file1.close()
file2.close()
file3.close()
file4.close()

text1 = text1.lower()
text2 = text2.lower()
text3 = text3.lower()
text4 = text4.lower()

In [40]:
# Concatenate them to a corpus
corpus = text1 + text2 + text3 + text4
len(corpus)

1590944

## ii character-level embedding

### replace punctuations with space

In [41]:
# have a look at the elements in the original corpus

chars = sorted(list(set(corpus)))
print(chars)
n_vocab = len(chars)
print(n_vocab)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '\xa0', '§', '·', 'â', 'æ', 'è', 'é', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'œ', 'ŭ', 'α', 'β', 'γ', 'η', 'θ', 'ι', 'κ', 'λ', 'ν', 'ο', 'π', 'ρ', 'σ', 'τ', 'φ', 'ὴ', 'ή', 'ί', 'ὸ', 'ό', '′', '″']
98


In [42]:
# remove some basic punctuations
import string
cor = corpus
for i in string.punctuation:
    cor_pro = cor.replace(i, ' ')
    cor = cor_pro
corpus = cor

In [43]:
# have a look at the elements in the corpus after removing some basic punctuations

chars = sorted(list(set(corpus)))
print(chars)
n_vocab = len(chars)
print(n_vocab)

['\n', ' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '§', '·', 'â', 'æ', 'è', 'é', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'œ', 'ŭ', 'α', 'β', 'γ', 'η', 'θ', 'ι', 'κ', 'λ', 'ν', 'ο', 'π', 'ρ', 'σ', 'τ', 'φ', 'ὴ', 'ή', 'ί', 'ὸ', 'ό', '′', '″']
75


In [44]:
# replace more punctuations
other_punc = str('\n' + ' ' + '\xa0' + '§' + '·' + '′' + '″')
cor = corpus
for i in other_punc:
    cor_pro = cor.replace(i, ' ')
    cor = cor_pro
corpus = cor

In [45]:
# have a look at the elements in the corpus after removing other punctuations

chars = sorted(list(set(corpus)))
print(chars)
n_vocab = len(chars)
print(n_vocab)

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'â', 'æ', 'è', 'é', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'œ', 'ŭ', 'α', 'β', 'γ', 'η', 'θ', 'ι', 'κ', 'λ', 'ν', 'ο', 'π', 'ρ', 'σ', 'τ', 'φ', 'ὴ', 'ή', 'ί', 'ὸ', 'ό']
69


In [46]:
# remove special characters and have a look
cor = corpus
for i in chars[37:]:
    cor_pro = cor.replace(i, '')
    cor = cor_pro
corpus = cor


In [47]:
# have a look at the elements in the corpus after removing special characters

chars = sorted(list(set(corpus)))
print(chars)
n_vocab = len(chars)
print(n_vocab)

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
37


In [48]:
n_chars = len(corpus)
print(n_chars)

1590778


In [49]:
f = open('./Data/corpus.txt', 'wb')
pickle.dump(corpus, f)
f.close()

In [50]:
corpus



### normalize

In [12]:
vocab_nor = list()
for i in chars:
    nor = str((ord(i) - ord(' '))/(ord('z') - ord(' ')))
    vocab_nor.append(nor)
print(vocab_nor)

['0.0', '0.17777777777777778', '0.18888888888888888', '0.2', '0.2111111111111111', '0.2222222222222222', '0.23333333333333334', '0.24444444444444444', '0.25555555555555554', '0.26666666666666666', '0.2777777777777778', '0.7222222222222222', '0.7333333333333333', '0.7444444444444445', '0.7555555555555555', '0.7666666666666667', '0.7777777777777778', '0.7888888888888889', '0.8', '0.8111111111111111', '0.8222222222222222', '0.8333333333333334', '0.8444444444444444', '0.8555555555555555', '0.8666666666666667', '0.8777777777777778', '0.8888888888888888', '0.9', '0.9111111111111111', '0.9222222222222223', '0.9333333333333333', '0.9444444444444444', '0.9555555555555556', '0.9666666666666667', '0.9777777777777777', '0.9888888888888889', '1.0']


In [13]:
vocab_dict = dict(zip(chars, vocab_nor))
print(vocab_dict)

{' ': '0.0', '0': '0.17777777777777778', '1': '0.18888888888888888', '2': '0.2', '3': '0.2111111111111111', '4': '0.2222222222222222', '5': '0.23333333333333334', '6': '0.24444444444444444', '7': '0.25555555555555554', '8': '0.26666666666666666', '9': '0.2777777777777778', 'a': '0.7222222222222222', 'b': '0.7333333333333333', 'c': '0.7444444444444445', 'd': '0.7555555555555555', 'e': '0.7666666666666667', 'f': '0.7777777777777778', 'g': '0.7888888888888889', 'h': '0.8', 'i': '0.8111111111111111', 'j': '0.8222222222222222', 'k': '0.8333333333333334', 'l': '0.8444444444444444', 'm': '0.8555555555555555', 'n': '0.8666666666666667', 'o': '0.8777777777777778', 'p': '0.8888888888888888', 'q': '0.9', 'r': '0.9111111111111111', 's': '0.9222222222222223', 't': '0.9333333333333333', 'u': '0.9444444444444444', 'v': '0.9555555555555556', 'w': '0.9666666666666667', 'x': '0.9777777777777777', 'y': '0.9888888888888889', 'z': '1.0'}


In [14]:
corpus_rescaled = []
for i in corpus:
    nor = vocab_dict[i]
    corpus_rescaled.append(nor)

# print(corpus_rescaled)
print(len(corpus_rescaled))

1590778


## iii

In [15]:
W = 100

## iv

In [16]:
vocab_ascii = list()
for i in chars:
    vocab_int = str(ord(i))
    vocab_ascii.append(vocab_int)

In [17]:
vocab_ascii_dict = dict(zip(chars, vocab_ascii))

In [18]:
# prepare the dataset of input to output pairs encoded as integers
S = 1
dataX = []
dataY = []
for i in range(0, n_chars - W, S):
    seq_in = corpus[i : i + W]
    seq_out = corpus[i + W]
    dataX.append([vocab_dict[char] for char in seq_in])
    dataY.append(vocab_ascii_dict[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  1590678


## v one-hot encoding on the output

In [19]:
y = np_utils.to_categorical(dataY)

In [20]:
y.shape

(1590678, 123)

## vi a single hidden layer

In [21]:
dataX

[['0.8888888888888888',
  '0.9111111111111111',
  '0.7666666666666667',
  '0.7777777777777778',
  '0.7222222222222222',
  '0.7444444444444445',
  '0.7666666666666667',
  '0.0',
  '0.0',
  '0.8111111111111111',
  '0.8666666666666667',
  '0.0',
  '0.9333333333333333',
  '0.8',
  '0.7666666666666667',
  '0.0',
  '0.7777777777777778',
  '0.8777777777777778',
  '0.8444444444444444',
  '0.8444444444444444',
  '0.8777777777777778',
  '0.9666666666666667',
  '0.8111111111111111',
  '0.8666666666666667',
  '0.7888888888888889',
  '0.0',
  '0.8888888888888888',
  '0.7222222222222222',
  '0.7888888888888889',
  '0.7666666666666667',
  '0.9222222222222223',
  '0.0',
  '0.8111111111111111',
  '0.0',
  '0.8',
  '0.7222222222222222',
  '0.9555555555555556',
  '0.7666666666666667',
  '0.0',
  '0.7444444444444445',
  '0.8777777777777778',
  '0.8666666666666667',
  '0.7777777777777778',
  '0.8111111111111111',
  '0.8666666666666667',
  '0.7666666666666667',
  '0.7555555555555555',
  '0.0',
  '0.85555555

In [22]:
len(dataX)

1590678

In [23]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, W, 1))


In [24]:
X

array([[['0.8888888888888888'],
        ['0.9111111111111111'],
        ['0.7666666666666667'],
        ...,
        ['0.9888888888888889'],
        ['0.0'],
        ['0.8111111111111111']],

       [['0.9111111111111111'],
        ['0.7666666666666667'],
        ['0.7777777777777778'],
        ...,
        ['0.0'],
        ['0.8111111111111111'],
        ['0.8666666666666667']],

       [['0.7666666666666667'],
        ['0.7777777777777778'],
        ['0.7222222222222222'],
        ...,
        ['0.8111111111111111'],
        ['0.8666666666666667'],
        ['0.0']],

       ...,

       [['0.8888888888888888'],
        ['0.7222222222222222'],
        ['0.9222222222222223'],
        ...,
        ['0.7666666666666667'],
        ['0.9111111111111111'],
        ['0.8111111111111111']],

       [['0.7222222222222222'],
        ['0.9222222222222223'],
        ['0.9333333333333333'],
        ...,
        ['0.9111111111111111'],
        ['0.8111111111111111'],
        ['0.9333333333333333']]

In [25]:
X.shape

(1590678, 100, 1)

In [26]:
y.shape[1]

123

In [35]:
import pickle

In [None]:
f = open('y.txt', 'wb')
pickle.dump(y, f)
f.close()

In [36]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [27]:
# define the LSTM model

model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.


## vii refine the model

In [28]:
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


## viii do not use a test dataset

## ix epochs

In [51]:
n_epoch = 1

## x checkpoint

In [52]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]


In [53]:
# fit the model
model.fit(X, y, epochs=n_epoch, batch_size=100, callbacks=callbacks_list)

Epoch 1/1

Epoch 00001: loss improved from inf to 2.54867, saving model to weights-improvement-01-2.5487.hdf5


<keras.callbacks.History at 0xe3a92beb8>

## xi prediction

### pre-process the testX

In [54]:
# load ascii text and covert to lowercase
testX = "./Data/test text.txt"

file_testX = open(testX)

text_testX = file_testX.read()

file_testX.close()

text_testX = text_testX.lower()

In [55]:
# have a look at the elements in the original corpus

chars_testX = sorted(list(set(text_testX)))
print(chars_testX)
n_vocab_testX = len(chars_testX)
print(n_vocab_testX)

[' ', ',', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']
27


In [56]:
# remove some basic punctuations
import string
txt = text_testX
for i in string.punctuation:
    txt_pro = txt.replace(i, ' ')
    txt = txt_pro
text_testX = txt

In [57]:
# have a look at the elements in the corpus after removing some basic punctuations

chars_testX = sorted(list(set(text_testX)))
print(chars_testX)
n_vocab_testX = len(chars_testX)
print(n_vocab_testX)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']
25


In [83]:
# normalize
text_testX_rescaled = []
for i in text_testX:
    nor = vocab_dict[i]
    text_testX_rescaled.append(nor)

print(len(text_testX_rescaled))

154


In [85]:
corpus_rescaled[:5]

['0.8888888888888888',
 '0.9111111111111111',
 '0.7666666666666667',
 '0.7777777777777778',
 '0.7222222222222222']

In [86]:
text_testX_rescaled[:5]

['0.9333333333333333',
 '0.8',
 '0.7666666666666667',
 '0.9111111111111111',
 '0.7666666666666667']

### generate the text

In [58]:
# load the network weights
filename = "weights-improvement-01-2.5487.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [62]:
ascii_vocab_dict = dict(zip(vocab_ascii, chars))
print(ascii_vocab_dict)

{'32': ' ', '48': '0', '49': '1', '50': '2', '51': '3', '52': '4', '53': '5', '54': '6', '55': '7', '56': '8', '57': '9', '97': 'a', '98': 'b', '99': 'c', '100': 'd', '101': 'e', '102': 'f', '103': 'g', '104': 'h', '105': 'i', '106': 'j', '107': 'k', '108': 'l', '109': 'm', '110': 'n', '111': 'o', '112': 'p', '113': 'q', '114': 'r', '115': 's', '116': 't', '117': 'u', '118': 'v', '119': 'w', '120': 'x', '121': 'y', '122': 'z'}


In [149]:
print("comencement:")
print("\"",pattern, "\"")

comencement:
" t as they would physical phenomena  this school of psychologists tends not to emphasize the object   "


In [152]:
y_pred = []
# generate characters
for i in range(1000):
    pattern_rescaled = [vocab_dict[value] for value in pattern]
    x = numpy.reshape(pattern_rescaled, (1, len(pattern_rescaled), 1))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = ascii_vocab_dict[str(index)]
    #seq_in = [int_to_char[value] for value in pattern]
    y_pred.append(result)
    pattern = pattern + result
    pattern = pattern[1:len(pattern)]


In [159]:
text_testX

'there are those who take mental phenomena naively  just as they would physical phenomena  this school of psychologists tends not to emphasize the object  '

In [158]:
y_pred_str

'and the soete  the sare th the soete  the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in the sare  in