In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import binary_crossentropy, categorical_crossentropy

In [None]:
from keras.optimizers import SGD
from keras.metrics import top_k_categorical_accuracy
import keras.backend as k
import numpy as np
import sys, os, string, random

In [None]:
# Vocabulary: every character in string.printable, plus lookup tables
# mapping character -> index and index -> character.
characters = string.printable
char_indices = {ch: pos for pos, ch in enumerate(characters)}
indices_char = {pos: ch for pos, ch in enumerate(characters)}

In [None]:
# Number of distinct symbols in the vocabulary (width of one one-hot slot).
INPUT_VOCAB_SIZE = len(characters)
# Number of character positions encoded per line.
LINE_SIZE = 80
# Number of lines per generated batch (passed to input_generator by train).
BATCH_SIZE = 200
# Number of batches drawn from the generator per training epoch.
STEP_PER_EPOCH = 5000
# Number of training epochs.
EPOCHS = 4

In [None]:
def encode_one_hot(line):
  """Encode a line of text as a flattened per-position one-hot matrix.

  Every one of the LINE_SIZE character positions owns an
  INPUT_VOCAB_SIZE-wide one-hot slot. Characters that are not in
  `characters` are encoded as a space, lines shorter than LINE_SIZE are
  right-padded with spaces, and lines longer than LINE_SIZE are truncated.

  Args:
    line: the text to encode.

  Returns:
    A numpy array of shape (1, LINE_SIZE * INPUT_VOCAB_SIZE); it can be
    turned back into text with decode_one_hot.
  """
  # Work on a (line, position, symbol) grid so each character lands in the
  # slot for its own position; it is flattened only on return.
  x = np.zeros((1, LINE_SIZE, INPUT_VOCAB_SIZE))
  sp_idx = char_indices[' ']
  clipped = line[:LINE_SIZE]  # never write past the last position
  for i, c in enumerate(clipped):
    x[0, i, char_indices.get(c, sp_idx)] = 1
  # Pad the unused tail of the line with spaces.
  x[0, len(clipped):, sp_idx] = 1
  return x.reshape([1, LINE_SIZE * INPUT_VOCAB_SIZE])

In [None]:
def decode_one_hot(y):
  """Turn a flattened one-hot (or score) vector back into a string.

  `y` is reshaped to (1, LINE_SIZE, INPUT_VOCAB_SIZE); for each position the
  highest-valued entry selects the character via `indices_char`.

  Returns:
    The decoded string, one character per position.
  """
  grid = y.reshape([1, LINE_SIZE, INPUT_VOCAB_SIZE])
  return ''.join(indices_char[np.argmax(slot)] for slot in grid[0])

In [None]:
def input_generator(nsamples):
  """Yield an endless stream of (input, target) training batches.

  Each batch holds `nsamples` random lines of LINE_SIZE printable
  characters. The target for a line is the same text with letters
  lower-cased, every non-letter replaced by a space, and letters that have
  no letter neighbour on either side blanked out as well.

  Args:
    nsamples: number of lines per batch.

  Yields:
    A tuple (data_in, data_out) of arrays, each of shape
    (nsamples, LINE_SIZE * INPUT_VOCAB_SIZE), produced by encode_one_hot.
  """
  def generate_line():
    """Return one random input line and its expected normalised output."""
    inline = []; outline = []
    # A line is LINE_SIZE characters long, independent of the batch size.
    for _ in range(LINE_SIZE):
      c = random.choice(characters)
      expected = c.lower() if c in string.ascii_letters else ' '
      inline.append(c); outline.append(expected)
    for i in range(LINE_SIZE):
      if outline[i] == ' ': continue
      # The line edges count as blanks, so a letter is dropped exactly when
      # it has no letter next to it.
      prev_blank = i == 0 or outline[i-1] == ' '
      next_blank = i == LINE_SIZE - 1 or outline[i+1] == ' '
      if prev_blank and next_blank:
        outline[i] = ' '
    return ''.join(inline), ''.join(outline)

  while True:
    data_in = np.zeros((nsamples, LINE_SIZE * INPUT_VOCAB_SIZE))
    data_out = np.zeros((nsamples, LINE_SIZE* INPUT_VOCAB_SIZE))
    for i in range(nsamples):
      input_data, expected = generate_line()
      data_in[i] = encode_one_hot(input_data)[0]
      data_out[i] = encode_one_hot(expected)[0]
    yield data_in, data_out

In [None]:
def train(model):
  """Compile `model` and fit it on randomly generated batches.

  The model is compiled with binary cross-entropy and the Adam optimizer,
  then trained for EPOCHS epochs of STEP_PER_EPOCH batches, each batch
  holding BATCH_SIZE lines from input_generator. A second, independent
  generator supplies 10 validation batches per epoch.

  Args:
    model: an uncompiled Keras model whose input and output width is
      LINE_SIZE * INPUT_VOCAB_SIZE.
  """
  # The output is a wide multi-label vector, so per-unit binary accuracy is
  # the meaningful metric; plain 'accuracy' resolves to categorical accuracy
  # over the whole flattened vector.
  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['binary_accuracy'])
  print("compiled")

  # Generators are lazy: batches are only built once fit() pulls from them.
  input_gen = input_generator(BATCH_SIZE)
  validation_gen = input_generator(BATCH_SIZE)
  print("data is generated")

  # `workers` is left at its default; newer Keras releases no longer accept it.
  model.fit(input_gen,
            epochs = EPOCHS,
            steps_per_epoch = STEP_PER_EPOCH,
            validation_data = validation_gen,
            validation_steps = 10)

In [None]:
def build_model():
  """Build a single-layer network.

  One sigmoid Dense layer maps the flattened one-hot line
  (LINE_SIZE * INPUT_VOCAB_SIZE values) to an output of the same width.

  Returns:
    The uncompiled Sequential model.
  """
  flat_width = LINE_SIZE * INPUT_VOCAB_SIZE
  return Sequential([
      Dense(flat_width, input_shape=(flat_width,), activation='sigmoid'),
  ])

In [None]:
def build_deep_model():
  """Build a three-layer sigmoid network.

  The flattened one-hot line (LINE_SIZE * INPUT_VOCAB_SIZE values) passes
  through Dense layers of 80 and 800 units before being expanded back to
  the input width.

  Returns:
    The uncompiled Sequential model.
  """
  flat_width = LINE_SIZE*INPUT_VOCAB_SIZE
  stack = [
      Dense(80, input_shape=(flat_width,), activation='sigmoid'),
      Dense(800, activation='sigmoid'),
      Dense(flat_width, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Instantiate the three-layer network; build_model() is the single-layer alternative.
model = build_deep_model()

In [None]:
# Print the layer shapes and parameter counts of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 80)                640080    
                                                                 
 dense_4 (Dense)             (None, 800)               64800     
                                                                 
 dense_5 (Dense)             (None, 8000)              6408000   
                                                                 
Total params: 7112880 (27.13 MB)
Trainable params: 7112880 (27.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Compile and fit the model on generated data (long-running cell).
train(model)

compiled
data is generated
Epoch 1/4
 317/5000 [>.............................] - ETA: 28:16 - loss: 0.0087 - accuracy: 0.0000e+00

In [None]:
# Download a plain-text Project Gutenberg book to use as real-world sample input.
!wget https://www.gutenberg.org/cache/epub/46144/pg46144.txt

--2024-03-27 20:10:25--  https://www.gutenberg.org/cache/epub/46144/pg46144.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 125798 (123K) [text/plain]
Saving to: ‘pg46144.txt’


2024-03-27 20:10:27 (233 KB/s) - ‘pg46144.txt’ saved [125798/125798]



In [None]:
# Keep only the first lines of the book (head's default count) as a small sample file.
!head pg46144.txt > pg46144_head.txt

In [None]:
# Run the model on every non-blank line of the sample file and print the
# decoded prediction next to the original line.
with open('pg46144_head.txt') as f:
  for line in f:
    if line.isspace(): continue
    batch = encode_one_hot(line)
    preds = model.predict(batch)
    # decode_one_hot is the decoder defined in this notebook; the previously
    # used name `decode_values` is not defined anywhere and fails on a fresh
    # kernel.
    normal = decode_one_hot(preds)
    print(normal, line)

 ﻿The Project Gutenberg eBook of Six Cups of Coffee

 This ebook is for the use of anyone anywhere in the United States and

 most other parts of the world at no cost and with almost no restrictions

 whatsoever. You may copy it, give it away or re-use it under the terms

 of the Project Gutenberg License included with this ebook or online

 at www.gutenberg.org. If you are not located in the United States,

 you will have to check the laws of the country where you are located

 before using this eBoo