In [None]:
import os
os.chdir('drive/My Drive/MLProject_20202')

In [None]:
from keras import models

In [None]:
import json

with open('Data/dict_10.txt', 'r') as in_file:
    vocab = json.load(in_file)

with open('Data/reverse_dict_10.txt', 'r') as in_file:
    reverse_vocab = json.load(in_file)

In [None]:
import pandas as pd
import numpy as np
from keras.layers.merge import add
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                        Activation, Flatten, Reshape, concatenate, \
                        Dropout, BatchNormalization, Bidirectional
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping

In [None]:
#define length for feature and label
feature_max_length = 512
label_max_length = 8

In [None]:
df = pd.read_json('Data/data_text_embedded.json')
df.head()

Unnamed: 0,embedded_text,embedded_title
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 141, 99, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 205, 206, 207, 208, 209, 210, 211, 212, 21...","[1, 268, 213, 210, 211, 212, 2, 0, 0, 0, 0, 0,..."
2,"[1, 4, 442, 3, 443, 213, 248, 444, 445, 446, 2...","[1, 312, 490, 491, 9047, 2, 0, 0, 0, 0, 0, 0, ..."
3,"[1, 4, 572, 573, 267, 574, 219, 575, 576, 318,...","[1, 272, 103, 573, 572, 2, 0, 0, 0, 0, 0, 0, 0..."
4,"[1, 4, 796, 797, 798, 799, 800, 445, 801, 241,...","[1, 935, 871, 287, 2, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
def format_length(arr, length, vocab):
    while len(arr) < length:
        arr.append(vocab['<pad>'])
    newArr = arr[:(length-1)]
    if arr[length-1] == vocab['<pad>']:
        newArr.append(vocab['<pad>'])
    else:
        newArr.append(vocab['<end>'])
    return newArr

In [None]:
df.embedded_text = df.embedded_text.apply(lambda x: format_length(x, feature_max_length, vocab))
df.embedded_title = df.embedded_title.apply(lambda x: format_length(x, label_max_length, vocab))

In [None]:
df.embedded_title

0                 [1, 141, 99, 2, 0, 0, 0, 0]
1          [1, 268, 213, 210, 211, 212, 2, 0]
2           [1, 312, 490, 491, 9047, 2, 0, 0]
3            [1, 272, 103, 573, 572, 2, 0, 0]
4              [1, 935, 871, 287, 2, 0, 0, 0]
                         ...                 
17875        [1, 272, 1456, 3869, 2, 0, 0, 0]
17876          [1, 3018, 5600, 2, 0, 0, 0, 0]
17877    [1, 239, 231, 879, 142, 539, 231, 2]
17878          [1, 1994, 1897, 2, 0, 0, 0, 0]
17879          [1, 414, 425, 104, 2, 0, 0, 0]
Name: embedded_title, Length: 17880, dtype: object

In [None]:
# data generation for model
def data_generator(label, feature, max_length_title, batch_size, vocab_size):
    X1, X2, y = [], [], []
    n = 0
    # length of label/feature should be equal
    while 1:
        for i in range(len(label)):
            n += 1
            input = feature[i]
            output = label[i]
            for j in range(1,max_length_title):
                if output[j] == vocab['<end>']:
                    break
                else:
                    in_output = output[:j]
                    out_output = output[j]
                    in_output = pad_sequences([in_output], maxlen=max_length_title)[0]
                    out_output = to_categorical([out_output], num_classes=vocab_size)[0]
                    X1.append(input)
                    X2.append(in_output)
                    y.append(out_output)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                X1, X2, y = [], [], []
                n = 0

In [None]:
X = np.array(np.array(df['embedded_text']).tolist())
y = np.array(np.array(df['embedded_title']).tolist())

In [None]:
X

array([[   1,    4,    5, ...,    0,    0,    0],
       [   1,  205,  206, ...,  188,  406,    2],
       [   1,    4,  442, ...,    0,    0,    0],
       ...,
       [   1,    4,  872, ...,    0,    0,    0],
       [   1, 6299, 1641, ...,    0,    0,    0],
       [   1,  205, 3644, ...,    0,    0,    0]])

In [None]:
vocab_size = len(vocab)
embedding_dim = 256

inputs1 = Input(shape=(feature_max_length,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(512, activation='relu')(fe1)
inputs2 = Input(shape=(label_max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)
# se4 = Dropout(0.5)(se3)
se5 = Bidirectional(LSTM(256))(se3)
decoder1 = add([fe2, se5])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs = [inputs1, inputs2], outputs = outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 8, 256)       3280384     input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 8, 256)       0           embedding[0][0]                  
______________________________________________________________________________________________

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'MeanSquaredError'])

In [None]:
# model.optimizer.lr = 0.0001
model.optimizer.lr = 0.001
epochs = 100
batch_size = 32
steps = len(y)//batch_size
checkpointer = ModelCheckpoint(filepath=os.path.join('/content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5'), \
                               monitor='val_loss', save_best_only=True, verbose=1)
es = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
from sklearn.model_selection import KFold

In [None]:
num_fold = 5

kf = KFold(n_splits=num_fold, shuffle=True)
kf.get_n_splits(X)

5

In [None]:
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    train_generator = data_generator(y_train, X_train, label_max_length, batch_size, vocab_size)
    test_generator = data_generator(y_test, X_test, label_max_length, batch_size, vocab_size)
    model.fit(train_generator, epochs=epochs//num_fold, steps_per_epoch=len(X_train)//batch_size, \
              validation_data=test_generator, validation_steps=len(X_test)//batch_size,\
              callbacks=[checkpointer, es])
    print()

Epoch 1/20

Epoch 00001: val_loss improved from inf to 8.95610, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 2/20

Epoch 00002: val_loss improved from 8.95610 to 8.50765, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 3/20

Epoch 00003: val_loss improved from 8.50765 to 8.10883, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 4/20

Epoch 00004: val_loss improved from 8.10883 to 7.75558, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 5/20

Epoch 00005: val_loss improved from 7.75558 to 7.44585, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 6/20

Epoch 00006: val_loss improved from 7.44585 to 7.17868, saving model to /content/drive/My Drive/MLProject_20202/Model/LSTM/model_bilstm_512_8_all.h5
Epoch 7/20

Epoch 00007: val_loss impr