In [1]:
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, LeakyReLU, Dropout, BatchNormalization, Conv2D, MaxPool2D, Flatten, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
import random

Using TensorFlow backend.


In [3]:
# Artifact locations, one group per language, in the order (model, corpus, tokenizer).
ENGLISH_MODEL_PATH, ENGLISH_TEXT_PATH, ENGLISH_TOKENIZER_PATH = (
    'models/english-20-model.h5',
    'data/english/republic_sequences.txt',
    'tokenizers/english-20-tokenizer.pkl',
)

CHINESE_MODEL_PATH, CHINESE_TEXT_PATH, CHINESE_TOKENIZER_PATH = (
    'models/chinese-20-model.h5',
    'data/chinese/cleaned-chinese-education.csv',
    'tokenizers/chinese-20-tokenizer.pkl',
)

MALAY_MODEL_PATH, MALAY_TEXT_PATH, MALAY_TOKENIZER_PATH = (
    'models/malay-20-model.h5',
    'data/malay/cleaned-malay-emotions.csv',
    'tokenizers/malay-20-tokenizer.pkl',
)

MALAYSIAN_MODEL_PATH, MALAYSIAN_TEXT_PATH, MALAYSIAN_TOKENIZER_PATH = (
    'models/malaysian-20-model.h5',
    'data/malaysian/cleaned-tweet.csv',
    'tokenizers/malaysian-20-tokenizer.pkl',
)

# Prefix for the tokenizer / weights / model files this notebook writes.
description = 'english-20'

In [3]:
# load english
# Use the path declared in the config cell rather than repeating the literal,
# so the corpus location only has to be changed in one place.
in_filename = ENGLISH_TEXT_PATH
# Explicit encoding: the decoded text must not depend on the platform default.
with open(in_filename, encoding='utf-8') as f:
    doc = f.read()
lines = doc.split('\n')
# Keep the first 4000 lines, each cut down to its first 20 space-separated tokens.
lines = [' '.join(l.split(' ')[:20]) for l in lines[:4000]]

In [7]:
# load chinese/malay/malaysian
df = pd.read_csv(CHINESE_TEXT_PATH)
lines = list(df['text'])
# Sample with a dedicated, seeded generator so the training subset (and the
# vocabulary fitted on it) is the same on every run. The size is capped because
# random.sample raises ValueError when asked for more items than are available.
lines = random.Random(42).sample(lines, min(4000, len(lines)))
print(len(lines))
print(lines[:5])

# Optional step, kept disabled: keep only lines with at least 20 tokens and
# cut each one down to its last 20 tokens, then verify the result.
# lines = list(filter(lambda x: len(x.split(' ')) >= 20, lines))
# lines = [' '.join(l.split(' ')[-20:]) for l in lines]
# for l in lines:
#     assert len(l.split(' ')) == 20, str(len(l.split(' ')))

4000
['该 榜 前 十 位 为 湖 南 大 学 四 川 大 学 吉 林 大 学 重 庆', '这 张 许 可 证 在 留 学 签 证 有 效 期 间 内 都 有 效 如 果', '两 岸 期 盼 和 平 统 一 的 心 情 一 样 的 和 谐 相 同 的 迫', '北 京 成 招 网 上 报 名 正 在 进 行 考 生 报 考 时 要 注 意', '最 先 开 始 的 提 前 批 次 录 取 时 间 月 日 日 参 加 该 批']


In [8]:
# integer encode sequences of words
# Fit a fresh tokenizer on the currently loaded `lines`. The last expression
# displays how many entries the tokenizer's word -> index mapping holds.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
len(tokenizer.word_index)

2239

In [4]:
# load the tokenizer
# NOTE: unpickling can execute arbitrary code - only load tokenizer files that
# were produced by this project.
# The context manager closes the file handle even if unpickling fails.
with open(ENGLISH_TOKENIZER_PATH, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [92]:
# save the tokenizer
# The context manager guarantees the pickle is flushed and the handle closed
# before any later cell tries to read the file back.
with open(description + '-tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [8]:
def get_sequence_of_tokens(corpus, tokenizer):
    """Expand every line of ``corpus`` into its growing token prefixes.

    Each line is encoded on its own with ``tokenizer.texts_to_sequences``. For
    an encoded line ``[t0, t1, ..., tn]`` the prefixes ``[t0, t1]``,
    ``[t0, t1, t2]``, ... up to the full line are emitted in that order, so a
    line that encodes to fewer than two tokens contributes nothing.

    Returns:
        ``(prefixes, total_words)`` where ``prefixes`` is the flat list of all
        prefixes in corpus order and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    encoded_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    prefixes = [
        encoded[:end]
        for encoded in encoded_lines
        for end in range(2, len(encoded) + 1)
    ]
    return prefixes, len(tokenizer.word_index) + 1

In [9]:
# Build the token-prefix sequences from the loaded lines, together with the
# vocabulary size (number of known words + 1).
X, vocab_size = get_sequence_of_tokens(lines, tokenizer)

In [10]:
# Spot-check one of the generated prefix sequences.
X[3]

[235, 232, 827, 2, 1]

In [11]:
def generate_padded_sequences(input_sequences, total_words):
    """Pad the sequences to a common length and split off the final token as label.

    All sequences are padded at the front (``padding='pre'``) to the length of
    the longest one, which is also printed. The last column becomes the label,
    converted with ``to_categorical`` using ``total_words`` classes; the
    remaining columns are the predictors.

    Returns:
        ``(predictors, label, max_sequence_len)``.
    """
    longest = max(len(seq) for seq in input_sequences)
    print(longest)
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # Everything but the last column feeds the model; the last column is the target.
    features = padded[:, :-1]
    targets = to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest

# NOTE: this rebinds X. Going in, it is the list of prefix sequences; coming
# out, it is the padded predictor array. Re-run the cell that builds the
# sequences before re-running this one.
X, y, seq_length = generate_padded_sequences(X, vocab_size)

20


In [12]:
# Number of predictor columns per sample. generate_padded_sequences already
# split the last column off as the label, so this is seq_length - 1.
# (The manual input/output split that used to be commented out here did the
# same slicing and one-hot encoding as that function.)
X.shape[1]

19

In [5]:
# Restore a previously saved model from the English model path.
# NOTE: the model-definition cell also assigns `model`, so whichever of the two
# cells runs last decides which model the later cells use.
model = load_model(ENGLISH_MODEL_PATH)

In [15]:
# define model
# Next-word classifier: embedding -> two stacked LSTMs -> dense head -> softmax
# over the vocabulary. Layers are listed in the order they are applied.
model = Sequential([
    # Input is the padded context, i.e. every column except the label.
    Embedding(vocab_size, 512, input_length=seq_length - 1),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(128),

    Dense(4096),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),

    Dense(2048),
    LeakyReLU(alpha=0.1),

    Dense(vocab_size, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 19, 512)           1146880   
_________________________________________________________________
lstm_1 (LSTM)                (None, 19, 128)           328192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 4096)              528384    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 4096)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 4096)              16384     
__________

In [16]:
# compile model
# categorical_crossentropy matches the one-hot labels that
# generate_padded_sequences produces with to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [15]:
# Stop once the monitored loss has failed to improve for 3 consecutive epochs
# and restore the best weights seen.
# NOTE(review): both callbacks monitor the *training* loss ('loss'), so neither
# guards against overfitting - confirm this is intended.
es = EarlyStopping(monitor='loss',
                  patience=3,
                  verbose=1,
                  mode='min',
                  restore_best_weights=True)
# Write the weights (not the full model) to '<description>-best_Weights.h5'
# whenever the monitored loss improves.
mc = ModelCheckpoint(description + '-best_Weights.h5',
                     monitor='loss',
                    verbose=1,
                    save_best_only=True,
                    save_weights_only=True)
# Optional warm start from saved weights. MALAYSIAN_WEIGHTS_PATH is not defined
# in the configuration cell, so define it before enabling this line.
# model.load_weights(MALAYSIAN_WEIGHTS_PATH)

In [18]:
# fit model
# No validation data is passed here, so the es / mc callbacks can only react to
# the training loss.
batch_size=1024
epochs=10

# Last expression of the cell: the value returned by fit is shown as the output.
model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: loss improved from 0.22080 to 0.20791, saving model to english-20-best_Weights.h5
Epoch 2/10

Epoch 00002: loss improved from 0.20791 to 0.20610, saving model to english-20-best_Weights.h5
Epoch 3/10

Epoch 00003: loss improved from 0.20610 to 0.20465, saving model to english-20-best_Weights.h5
Epoch 4/10

Epoch 00004: loss improved from 0.20465 to 0.20361, saving model to english-20-best_Weights.h5
Epoch 5/10

Epoch 00005: loss improved from 0.20361 to 0.20359, saving model to english-20-best_Weights.h5
Epoch 6/10

Epoch 00006: loss improved from 0.20359 to 0.20184, saving model to english-20-best_Weights.h5
Epoch 7/10

Epoch 00007: loss did not improve from 0.20184
Epoch 8/10

Epoch 00008: loss improved from 0.20184 to 0.20028, saving model to english-20-best_Weights.h5
Epoch 9/10

Epoch 00009: loss did not improve from 0.20028
Epoch 10/10

Epoch 00010: loss did not improve from 0.20028


<keras.callbacks.History at 0x7f1aa0bd7048>

In [49]:
# save the model to file
# Two artifacts are written, both named from `description` and both relative to
# the current working directory: the weights alone, then the full model.
# NOTE(review): the *_MODEL_PATH constants point under 'models/', so these
# files presumably have to be moved there before load_model finds them - confirm.
weights_name = description + '-weights.h5'
model.save_weights(weights_name)

model_name = description + '-model.h5'
model.save(model_name)

In [6]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words, top_k=3):
    """Return the most probable next words for ``seed_text``.

    Args:
        model: object whose ``predict`` returns, for a batch of encoded inputs,
            one row of scores per input, indexed by word index.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of tokens fed to the model. Longer seeds keep only
            their trailing ``seq_length`` tokens; shorter ones are padded.
        seed_text: text to continue.
        n_words: kept for backward compatibility and not used - only one
            next-word prediction is made.
        top_k: number of candidates to return (default 3).

    Returns:
        A list of ``{'word': ..., 'probability': ...}`` dicts ordered from most
        to least probable. Candidate indices with no entry in
        ``tokenizer.word_index`` (presumably only the padding index 0) are
        omitted, so the list can be shorter than ``top_k``.
    """
    # encode the text as integers
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # pad / truncate to the fixed input length, dropping the oldest tokens first
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # scores for the single input row, one entry per word index
    probabilities = model.predict(encoded)[0]
    top_candidates = sorted(
        enumerate(probabilities), key=lambda pair: pair[1], reverse=True
    )[:top_k]
    print(top_candidates)
    # Invert word -> index once instead of scanning the whole vocabulary for
    # every candidate.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return [
        {'word': index_to_word[index], 'probability': probability}
        for index, probability in top_candidates
        if index in index_to_word
    ]

In [25]:
# 19 is the model's input length (seq_length - 1 in the model definition).
# The trailing 1 is n_words, which generate_seq currently ignores.
generated = generate_seq(model, tokenizer, 19, 'my father and grandfather', 1)
print(generated)

[(20, 0.9998729), (4, 8.585863e-05), (204, 4.0247425e-05)]
[{'word': 'for', 'probability': 0.9998729}, {'word': 'of', 'probability': 8.585863e-05}, {'word': 'whose', 'probability': 4.0247425e-05}]
