In [1]:
import tensorflow.keras as keras
from tensorflow.python.keras.utils import to_categorical
import numpy as np
import os, sys
from data_helper import read_data, sents2sequences
from model import summary_model
from model_helper import plot_attention_weights
from logger import get_logger

In [2]:
# Root directory of the project checkout; data files are resolved relative to it.
# NOTE(review): the project-local imports in the first cell run before this cell
# extends sys.path, so on a fresh kernel they presumably only resolve when the
# notebook is started from inside the project directory - confirm.
project_path = '/home/swayam/Desktop/tf2_project/'
# Register the directory only once so re-running the cell does not add duplicates.
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

In [3]:
# Directory under which the 'logs' folder for this run lives.
base_dir = '/home/swayam/Desktop/tf2_project/'
# Logger named "model.train"; the second argument is the log directory handed to get_logger.
logger = get_logger(
    "model.train",
    os.path.join(base_dir, 'logs'),
)

model.train | INFO | Started log /home/swayam/Desktop/tf2_project/logs/model.train


In [4]:
# Model hyper-parameters passed to summary_model further down.
batch_size = 64
hidden_size = 96
# Pad lengths used for the input text and the output summary sequences.
ip_timesteps = 20
op_timesteps = 20

In [5]:
def get_data(train_size, random_seed=100):
    """Load the text/summary pairs and split them into shuffled train and test parts.

    The inputs are read with ``read_data`` from ``data/text.txt`` and
    ``data/summary.txt`` under ``project_path``. Every summary is wrapped in
    ``sos`` / ``eos`` marker words before the split.

    :param train_size: number of shuffled pairs assigned to the training part;
        the remaining pairs form the test part
    :param random_seed: seed given to ``np.random.seed`` (this reseeds numpy's
        global random state) so the shuffle is repeatable
    :return: tuple ``(train inputs, train summaries, test inputs, test summaries)``,
        each a list
    """
    source_sents = read_data(os.path.join(project_path, 'data', 'text.txt'))
    summary_sents = read_data(os.path.join(project_path, 'data', 'summary.txt'))

    # Add the start/end marker words to every summary; a trailing '.' is
    # stripped and re-appended after the end marker.
    # NOTE(review): the first branch has no space in front of 'eos', so unless
    # the summary has a space before its final '.', the last word is fused with
    # 'eos'. Left untouched because changing it would alter the tokenizer
    # vocabulary - confirm against the data and the training code.
    marked_summaries = []
    for sent in summary_sents:
        if sent.endswith('.'):
            marked_summaries.append('sos ' + sent[:-1] + 'eos .')
        else:
            marked_summaries.append('sos ' + sent + ' eos .')

    # Shuffle the pair positions deterministically.
    np.random.seed(random_seed)
    order = np.arange(len(source_sents))
    np.random.shuffle(order)

    train_order, test_order = order[:train_size], order[train_size:]

    def pick(sentences, positions):
        # Gather the sentences found at the given positions, in that order.
        return [sentences[pos] for pos in positions]

    return (
        pick(source_sents, train_order),
        pick(marked_summaries, train_order),
        pick(source_sents, test_order),
        pick(marked_summaries, test_order),
    )


In [6]:
def preprocess_data(ip_tokenizer, op_tokenizer, ip_text, op_text, in_timesteps, op_timesteps):
    """Convert input and output sentences into padded sequences of word indices.

    :param ip_tokenizer: tokenizer applied to the input sentences
    :param op_tokenizer: tokenizer applied to the output sentences
    :param ip_text: input sentences
    :param op_text: output sentences
    :param in_timesteps: pad length for the input sequences
    :param op_timesteps: pad length for the output sequences
    :return: tuple ``(ip_seq, op_seq)`` as returned by ``sents2sequences``
    """
    # Use the in_timesteps argument for the input pad length. The previous code
    # ignored it and read the module-level ip_timesteps instead, so callers could
    # not actually control the input length.
    ip_seq = sents2sequences(ip_tokenizer, ip_text, reverse=False, padding_type='pre', pad_length=in_timesteps)
    op_seq = sents2sequences(op_tokenizer, op_text, pad_length=op_timesteps)

    return ip_seq, op_seq

In [7]:
def infer_nmt(encoder_model, decoder_model, test_ip_seq, ip_vsize, op_vsize, max_steps=20):
    """
    Greedy step-by-step decoding of one input sequence.

    The input is one-hot encoded and run through the encoder once. The decoder
    is then fed the 'sos' word and, at every step, the word it predicted on the
    previous step, until it predicts index 0 or ``max_steps`` steps have run.

    Relies on the module-level ``op_tokenizer`` and ``op_index2word``, which
    must be defined before this function is called.

    :param encoder_model: keras.Model
    :param decoder_model: keras.Model
    :param test_ip_seq: sequence of word ids
    :param ip_vsize: int, number of classes used to one-hot encode the input
    :param op_vsize: int, number of classes used to one-hot encode the output
    :param max_steps: int, upper bound on the number of decoder steps
        (defaults to 20, the value that was previously hard-coded)
    :return: tuple ``(op_text, attention_weights)``; ``op_text`` holds the
        predicted words, each followed by a single space, and
        ``attention_weights`` is a list of ``(word index, attention)`` tuples,
        one per emitted word
    """

    # NOTE(review): op_vsize is passed as the third positional argument of
    # sents2sequences; elsewhere that helper is only called with keyword
    # arguments, so confirm which parameter this value lands on.
    test_op_seq = sents2sequences(op_tokenizer, ['sos'], op_vsize)
    test_ip_onehot_seq = to_categorical(test_ip_seq, num_classes=ip_vsize)
    test_op_onehot_seq = np.expand_dims(to_categorical(test_op_seq, num_classes=op_vsize), 1)

    enc_outs, enc_last_state = encoder_model.predict(test_ip_onehot_seq)
    dec_state = enc_last_state
    attention_weights = []
    op_words = []
    for _ in range(max_steps):

        dec_out, attention, dec_state = decoder_model.predict([enc_outs, dec_state, test_op_onehot_seq])
        dec_ind = np.argmax(dec_out, axis=-1)[0, 0]

        # Index 0 ends decoding; any other word, including the 'eos' marker,
        # is emitted and fed back in.
        if dec_ind == 0:
            break
        word = op_index2word[dec_ind]

        # The word just predicted becomes the decoder input of the next step.
        test_op_seq = sents2sequences(op_tokenizer, [word], op_vsize)
        test_op_onehot_seq = np.expand_dims(to_categorical(test_op_seq, num_classes=op_vsize), 1)

        attention_weights.append((dec_ind, attention))
        op_words.append(word)

    # Every word keeps a trailing space, matching the original output format.
    op_text = ''.join(word + ' ' for word in op_words)
    return op_text, attention_weights

In [8]:
if __name__ == '__main__':

    # Debug mode works on a smaller training split.
    debug = True

    train_size = 8000 if debug else 100000
    filename = ''

    tr_ip_text, tr_op_text, ts_ip_text, ts_op_text = get_data(train_size=train_size)

    # Tokenizers are fitted on the training split only; words not seen there
    # map to the 'UNK' token.
    ip_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
    ip_tokenizer.fit_on_texts(tr_ip_text)

    op_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
    op_tokenizer.fit_on_texts(tr_op_text)

    # Word-index sequences for the training split.
    ip_seq, op_seq = preprocess_data(ip_tokenizer, op_tokenizer, tr_ip_text, tr_op_text, ip_timesteps, op_timesteps)

    # Vocabulary sizes: the largest word index plus one.
    ip_vsize = max(ip_tokenizer.index_word.keys()) + 1
    op_vsize = max(op_tokenizer.index_word.keys()) + 1

    # Build the full model together with the encoder/decoder used for inference.
    # NOTE(review): the vocabulary sizes depend on train_size and the shuffle
    # seed, so the saved weights presumably only fit when both match the
    # training run - confirm.
    full_model, infer_enc_model, infer_dec_model = summary_model(
        hidden_size=hidden_size, batch_size=batch_size,
        ip_timesteps=ip_timesteps, op_timesteps=op_timesteps,
        ip_vsize=ip_vsize, op_vsize=op_vsize)

    # Load the weights from the project directory. The path is derived from
    # project_path instead of repeating the absolute directory a second time;
    # it resolves to the same file as before.
    full_model.load_weights(os.path.join(project_path, 'summarizer.h5'))

    # Reverse lookups from word index to word.
    ip_index2word = {index: word for word, index in ip_tokenizer.word_index.items()}
    op_index2word = {index: word for word, index in op_tokenizer.word_index.items()}

    # Inference on a single example taken from the training split.
    test_ip = tr_ip_text[110]
    logger.info('input_text: {}'.format(test_ip))
    # NOTE(review): preprocess_data passes padding_type='pre' for input
    # sequences, whereas this call relies on the default of sents2sequences -
    # confirm both paths pad the same way.
    test_ip_seq = sents2sequences(ip_tokenizer, [test_ip], pad_length=ip_timesteps)
    test_op, attn_weights = infer_nmt(
        encoder_model=infer_enc_model, decoder_model=infer_dec_model,
        test_ip_seq=test_ip_seq, ip_vsize=ip_vsize, op_vsize=op_vsize)
    logger.info('output_summary: {}'.format(test_op))

model.train | INFO | input_text: Our primary reason for buying this product is that it came in a variety pack.  We make most of our own baby food, but we were going on vacation and that was going to be difficult.  Since packing space was limited, we had this shipped to our destination.  It was the only brand we could find to order in a variety pack.  The kids liked all the flavors and the consistency seemed good.

model.train | INFO | output_summary: earth's best variety pack eos 
