## Character-level language model.

In [1]:
# https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# read file
f = open('text.txt',mode='r',encoding='utf-8')

In [3]:
import re

def text_cleaner(text):
   
    # lower case text
    newString = text.lower()
    newString = re.sub(r"'s\b","",newString)
    # remove punctuations
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    long_words=[]
    # remove short word
    for i in newString.split():
        if len(i)>=3:                  
            long_words.append(i)
    return (" ".join(long_words)).strip()

# preprocess the text
text = f.read()
data_new = text_cleaner(text)

In [5]:
def create_seq(text):
    length = 30
    sequences = list()
    for i in range(length, len(text)):
        # select sequence of tokens
        seq = text[i-length:i+1]
        # store
        sequences.append(seq)
    print('Total Sequences: %d' % len(sequences))
    return sequences

# create sequences   
sequences = create_seq(data_new)

Total Sequences: 7052


In [6]:
# create a character mapping index
chars = sorted(list(set(data_new)))
mapping = dict((c, i) for i, c in enumerate(chars))

def encode_seq(seq):
    sequences = list()
    for line in seq:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences

# encode the sequences
sequences = encode_seq(sequences)

In [7]:
from sklearn.model_selection import train_test_split

# vocabulary size
vocab = len(mapping)
sequences = np.array(sequences)
# create X and y
X, y = sequences[:,:-1], sequences[:,-1]
# one hot encode y
y = to_categorical(y, num_classes=vocab)
# create train and validation sets
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

print('Train shape:', X_tr.shape, 'Val shape:', X_val.shape)

Train shape: (6346, 30) Val shape: (706, 30)


In [8]:
# define model
model = Sequential()
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
print(model.summary())

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model
model.fit(X_tr, y_tr, epochs=100, verbose=2, validation_data=(X_val, y_val))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            1350      
_________________________________________________________________
gru (GRU)                    (None, 150)               90900     
_________________________________________________________________
dense (Dense)                (None, 27)                4077      
Total params: 96,327
Trainable params: 96,327
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
199/199 - 8s - loss: 2.7641 - acc: 0.2042 - val_loss: 2.4358 - val_acc: 0.2748
Epoch 2/100
199/199 - 6s - loss: 2.3085 - acc: 0.3158 - val_loss: 2.2548 - val_acc: 0.3173
Epoch 3/100
199/199 - 7s - loss: 2.1736 - acc: 0.3531 - val_loss: 2.1594 - val_acc: 0.3484
Epoch 4/100
199/199 - 6s - loss: 2.0701 - acc: 0.3815 - val_loss: 2.0899 - val_acc: 0.3839
Epoch 5/100
1

Epoch 82/100
199/199 - 6s - loss: 0.3009 - acc: 0.9042 - val_loss: 3.2285 - val_acc: 0.4745
Epoch 83/100
199/199 - 6s - loss: 0.3067 - acc: 0.9015 - val_loss: 3.2522 - val_acc: 0.4589
Epoch 84/100
199/199 - 6s - loss: 0.2994 - acc: 0.8973 - val_loss: 3.2699 - val_acc: 0.4688
Epoch 85/100
199/199 - 6s - loss: 0.3075 - acc: 0.8980 - val_loss: 3.2888 - val_acc: 0.4660
Epoch 86/100
199/199 - 6s - loss: 0.2964 - acc: 0.9026 - val_loss: 3.2816 - val_acc: 0.4759
Epoch 87/100
199/199 - 5s - loss: 0.2991 - acc: 0.9006 - val_loss: 3.3169 - val_acc: 0.4632
Epoch 88/100
199/199 - 6s - loss: 0.2949 - acc: 0.9017 - val_loss: 3.3677 - val_acc: 0.4618
Epoch 89/100
199/199 - 6s - loss: 0.2965 - acc: 0.8969 - val_loss: 3.3484 - val_acc: 0.4688
Epoch 90/100
199/199 - 6s - loss: 0.2850 - acc: 0.9075 - val_loss: 3.4219 - val_acc: 0.4745
Epoch 91/100
199/199 - 6s - loss: 0.2817 - acc: 0.9099 - val_loss: 3.3936 - val_acc: 0.4802
Epoch 92/100
199/199 - 6s - loss: 0.2785 - acc: 0.9086 - val_loss: 3.4504 - val_

<tensorflow.python.keras.callbacks.History at 0x1c2e4875848>

In [9]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict character
        yhat = model.predict_classes(encoded, verbose=0)
        # reverse map integer to character
        out_char = ''
        for char, index in mapping.items():
            if index == yhat:
                out_char = char
                break
        # append to input
        in_text += char
    return in_text

In [17]:
generate_seq(model,mapping,30,'what the',100)



'what the most barbalowe for the support this declaration with firm refused his assent laws the most wholesom'

In [15]:
model.save('cha_lm')

INFO:tensorflow:Assets written to: cha_lm\assets


In [16]:
# load model
from tensorflow import keras
model = keras.models.load_model('cha_lm')