Reference: https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

Preparation

In [0]:
!pip install -U -q PyDrive
from googleapiclient.discovery import build

import os, pickle
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user and create the PyDrive client.
# This only needs to be done once per notebook session.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials for PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Drive v3 API client from googleapiclient; not referenced by the cells shown here.
drive_service = build('drive', 'v3')


[?25l[K    1% |▎                               | 10kB 19.4MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.8MB/s eta 0:00:01[K    3% |█                               | 30kB 2.6MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.8MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.2MB/s eta 0:00:01[K    9% |███                             | 92kB 3.5MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.8MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 4.0MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.5MB/s eta 0:00:01[

In [0]:
# Ask Drive for every file whose title contains '.txt' and print its title and id.
txt_query = {'q': "title contains '.txt'"}
listed = drive.ListFile(txt_query).GetList()
for entry in listed:
    print('title {}, id {}'.format(entry['title'], entry['id']))

title train_en.txt, id 1YXiG1W5JR4USoEflTUkmGi339ckeB7Pi
title train_de.txt, id 1vDTy7hFwv2ftQfgAi9dgHGocSlxNc9Ad


Get character sequences

In [0]:
def load_doc(filename):
  """Read an entire text file and return its contents as one string.

  Args:
    filename: path of the text file to read (opened in text mode with the
      platform default encoding).

  Returns:
    The full file contents as a ``str``.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r') as file:
    return file.read()

In [0]:
from google.colab import files
# Upload the training text files through Colab's upload helper.
# `uploaded` holds the return value of files.upload(); later cells shown here
# read the files from disk by name instead of using it.
uploaded = files.upload()


Saving train_de.txt to train_de.txt
Saving train_en.txt to train_en.txt


In [0]:

# Load the English training text into memory as a single string.
raw_text = load_doc('train_en.txt')
# (Avoid printing raw_text in full; slice it, e.g. raw_text[:200], to inspect.)


In [0]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
# so the text becomes one long line of space-separated tokens.
tokens = raw_text.split()
raw_text = ' '.join(tokens)

In [0]:
# Slide a window over the text one character at a time. Each sample holds
# `length` context characters followed by the single character to predict,
# i.e. length + 1 characters in total.
length = 10
sequences = [
	raw_text[start:start + length + 1]
	for start in range(len(raw_text) - length)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 18591412


In [0]:

# save sequences to file, one sequence per line
def save_doc(lines, filename):
	"""Write ``lines`` to ``filename``, separated by newlines.

	Args:
		lines: iterable of strings; each becomes one line of the file.
		filename: path of the file to create or overwrite.

	No trailing newline is written, so splitting the file on '\n' yields the
	original items back (provided no item contains a newline itself).

	Raises:
		OSError: if the file cannot be opened or written.
	"""
	data = '\n'.join(lines)
	# The context manager flushes and closes the handle even if write() raises.
	with open(filename, 'w') as file:
		file.write(data)


In [0]:
# Persist the character sequences, one per line, so later cells can reload
# them from disk.
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

In [0]:
# Reload the saved sequences from disk.
# NOTE: this rebinds raw_text to the contents of the sequence file (sequences
# joined by '\n'), replacing the original corpus text held under that name.
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')

In [0]:

# Vocabulary: every distinct character in raw_text, in sorted order, mapped to
# its position. raw_text is whatever the preceding cell loaded, so any
# separator characters it contains are part of the vocabulary as well.
chars = sorted(set(raw_text))
mapping = {char: index for index, char in enumerate(chars)}


In [0]:
# Integer-encode every line: each character is replaced by its index in
# `mapping`, giving one list of ints per line.
sequences = [[mapping[char] for char in line] for line in lines]

In [0]:

# Vocabulary size = number of distinct characters in the mapping; used below as
# the one-hot width and the size of the softmax output layer.
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 139


In [0]:
import numpy

# Stack the encoded sequences into a 2-D integer array, then separate the
# context columns (all but the last) from the target column (the last).
sequences = numpy.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [0]:
import numpy
import keras.utils as ks

# One-hot encode every context row of X.
# NOTE(review): this materialises one float one-hot vector per character of
# every sequence; with the sequence count printed earlier in this notebook that
# is likely far more than Colab's RAM. Consider training on a subset or
# feeding batches from a generator (confirm before a full run).
# NOTE: this cell rebinds X and y, so it must be run exactly once after the
# X/y split cell; re-running it would one-hot encode already encoded data.
sequences = [ks.to_categorical(x, num_classes=vocab_size) for x in X]
print("first done")
# keras.utils has no `array` function; stacking the per-row matrices into a
# single (samples, timesteps, vocab_size) array is numpy's job.
X = numpy.array(sequences)
print("second done")

# One-hot encode the target characters.
y = ks.to_categorical(y, num_classes=vocab_size)
print("third done")


In [0]:
# These names were never imported anywhere in the notebook, so the cell raised
# NameError on a fresh kernel.
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Single LSTM layer over (timesteps, vocab_size) one-hot input, followed by a
# softmax over the vocabulary to predict the next character.
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None").
model.summary()

In [0]:

# Compile the model: categorical cross-entropy matches the one-hot targets in y
# and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit on the full one-hot data set for 100 epochs.
model.fit(X, y, epochs=100, verbose=2)

In [0]:
# Save the trained model to disk so the generation cell can load it back.
model.save('model.h5')

In [0]:

# `dump` was never imported (only the `pickle` module is), so import it here.
from pickle import dump

# Persist the character-to-index mapping next to the model; the context manager
# guarantees the file is flushed and closed.
with open('mapping.pkl', 'wb') as mapping_file:
	dump(mapping, mapping_file)

Load model and generate text

In [0]:
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
	"""Extend ``seed_text`` by ``n_chars`` characters predicted by ``model``.

	Args:
		model: object exposing ``predict(x, verbose=0)``; it is called with a
			float array of shape ``(1, seq_length, len(mapping))`` and is
			assumed to return one row of per-class scores.
		mapping: dict from character to integer class index.
		seq_length: number of context characters fed to the model.
		seed_text: starting text; every character must be a key of ``mapping``.
		n_chars: how many characters to generate.

	Returns:
		``seed_text`` followed by the generated characters.

	Raises:
		KeyError: if the text contains a character missing from ``mapping``.
	"""
	import numpy as np

	vocab_size = len(mapping)
	# Reverse lookup built once instead of scanning the mapping on every step.
	index_to_char = {index: char for char, index in mapping.items()}
	in_text = seed_text
	# generate a fixed number of characters
	for _ in range(n_chars):
		# encode the characters as integers
		encoded = [mapping[char] for char in in_text]
		# keep only the last seq_length items and left-pad with 0 when the text
		# is shorter (same as pad_sequences with pre-padding / pre-truncating)
		window = encoded[max(len(encoded) - seq_length, 0):]
		window = [0] * (seq_length - len(window)) + window
		# one hot encode straight into the (1, seq_length, vocab) batch shape;
		# the old reshape(1, shape[0], shape[1]) fails on a 3-D one-hot array
		one_hot = np.zeros((1, seq_length, vocab_size), dtype='float32')
		one_hot[0, np.arange(seq_length), window] = 1.0
		# predict character: argmax over class scores replaces predict_classes,
		# which is not available on current Keras models
		scores = model.predict(one_hot, verbose=0)
		yhat = int(np.argmax(scores, axis=-1).ravel()[0])
		# reverse map integer to character and append it; the original appended
		# the loop variable `char`, which is wrong when no index matches
		in_text += index_to_char.get(yhat, '')
	return in_text

# load the model
model = load_model('model.h5')
# load the mapping; the context manager closes the file handle afterwards.
# NOTE: pickle can execute arbitrary code, so only load a mapping.pkl that this
# notebook produced itself.
with open('mapping.pkl', 'rb') as mapping_file:
	mapping = load(mapping_file)

# Generate 20 characters from each 10-character seed, in this order:
#   1. start of rhyme
#   2. mid-line
#   3. not in original
for seed_text in ('Sing a son', 'king was i', 'hello worl'):
	print(generate_seq(model, mapping, 10, seed_text, 20))