# **Large Corpus Text Generation**

In [1]:
import sys
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential 
from tensorflow.keras.callbacks import ModelCheckpoint 
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM

## **Load a large corpus of text**

In [2]:
r = requests.get("https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt")

In [3]:
raw_txt = r.text

In [4]:
chars = sorted(list(set(raw_txt)))

In [5]:
print("Corpus {}".format(len(raw_txt)))
print("Categoris {}".format(len(chars)))

Corpus 3258246
Categoris 87


## **Create mappings**

In [6]:
ix_to_char = {ix:char for ix, char in enumerate(chars)}
char_to_ix = {char:ix for ix, char in enumerate(chars)}

## **Create training data**

In [7]:
maxlen = 10
X_data = []
y_data = []

for i in range(0, len(raw_txt) - maxlen, 1):
  in_seq = raw_txt[i: i + maxlen]
  out_seq = raw_txt[i + maxlen]
  X_data.append([char_to_ix[char] for char in in_seq])
  y_data.append([char_to_ix[out_seq]])

nb_chars = len(X_data)
print('Number of sequence: ', int(len(X_data)/maxlen))

Number of sequence:  325823


In [8]:
# scale and transform the data 
X = np.reshape(X_data, (nb_chars, maxlen, 1))
n_vocab = len(chars)
X = X/float(n_vocab)

In [9]:
X[:10]

array([[[0.98850575],
        [0.95402299],
        [0.96551724],
        [0.04597701],
        [0.56321839],
        [0.65517241],
        [0.73563218],
        [0.73563218],
        [0.10344828],
        [0.02298851]],

       [[0.95402299],
        [0.96551724],
        [0.04597701],
        [0.56321839],
        [0.65517241],
        [0.73563218],
        [0.73563218],
        [0.10344828],
        [0.02298851],
        [0.48275862]],

       [[0.96551724],
        [0.04597701],
        [0.56321839],
        [0.65517241],
        [0.73563218],
        [0.73563218],
        [0.10344828],
        [0.02298851],
        [0.48275862],
        [0.8045977 ]],

       [[0.04597701],
        [0.56321839],
        [0.65517241],
        [0.73563218],
        [0.73563218],
        [0.10344828],
        [0.02298851],
        [0.48275862],
        [0.8045977 ],
        [0.70114943]],

       [[0.56321839],
        [0.65517241],
        [0.73563218],
        [0.73563218],
        [0.10344828],
  

In [10]:
X.shape

(3258236, 10, 1)

## **Treat categorical columns**

In [11]:
y = tf.keras.utils.to_categorical(y_data)

In [12]:
print("The shape of X_training data : ", X.shape)
print("The shape of y_training_data : ", y.shape)

The shape of X_training data :  (3258236, 10, 1)
The shape of y_training_data :  (3258236, 86)


## **Define Model**

In [13]:
Model = tf.keras.Sequential([
                             
              tf.keras.layers.LSTM(800, input_shape=(len(X[1]), 1), return_sequences=True),
              tf.keras.layers.Dropout(0.2),
              tf.keras.layers.LSTM(800, return_sequences=True),
              tf.keras.layers.Dropout(0.2),
              tf.keras.layers.LSTM(800),
              tf.keras.layers.Dropout(0.2),
              tf.keras.layers.Dense(len(y[1]), activation='softmax')

])






In [14]:
Model.compile(loss = 'categorical_crossentropy', optimizer='adam')

## **Creating a checkpoint**

In [15]:
filepath = "model_weights_saved.hdf5"

In [16]:
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [17]:
model_callbacks = [checkpoint]

## **Defining a custom callback**

In [18]:
epoch_number = 0
filename = 'predictions.txt'
file = open(filename, 'w')
file.truncate()
file.close()

class CustomCallback(tf.keras.callbacks.Callback):

  def on_epoch_end(self, epoch, logs=None):
    global epoch_number
    epoch_number = epoch_number + 1

    filename = 'predictions.txt'
    file = open(filename, 'a')
    seed = "looking fo"

    pattern = []
    for i in seed:
      value = char_to_ix[i]
      pattern.append(value)

    file.seek(0)
    file.write("\n\n Epoch Number : {}\n\n".format(epoch_number))
    for i in range(100):
      X = np.reshape(pattern, (1, len(pattern), 1))
      X = X/float(n_vocab)
      int_prediction = Model.predict(X, verbose=0)
      index = np.argmax(int_prediction)
      prediction = ix_to_char[index]
      sys.stdout.write(prediction)
      file.write(prediction)
      pattern.append(index)
      pattern = pattern[1:len(pattern)]

    file.close()


## **Training**

In [None]:
Model.fit(X, y, batch_size=2000, epochs=1, callbacks= [CustomCallback(), model_callbacks])

Epoch 1/10
  50/1630 [..............................] - ETA: 12:17:59 - loss: 3.5335

### **For rerun from a known checkpoint use following code**

In [None]:
try:
  Model.load_weights(filepath)

except Exception as error:
  print("Error loading in model: {}".format(error))


In [None]:
Model.fit(X, y, batch_size=2000, epochs=2, callbacks= [CustomCallback(), model_callbacks])