In [None]:
import tensorflow.keras as keras
from tensorflow.python.keras.utils import to_categorical
import numpy as np
import os, sys
from data_helper import read_data, sents2sequences
from model import summary_model
from model_helper import plot_attention_weights
from logger import get_logger

In [None]:
# Treat the notebook's working directory as the project root and make sure
# the modules that live there can be imported.
project_path = os.getcwd()
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

In [None]:
# Base directory for run artifacts: the current working directory.
base_dir = os.getcwd()
# Logger shared by every cell below; get_logger is handed the 'logs' folder
# under base_dir.
logger = get_logger(
    "model.train",
    os.path.join(base_dir, 'logs'),
)

In [None]:
# Hyperparameters read by the model-building and training cells below.
batch_size = 64      # samples per training batch
hidden_size = 96     # passed to summary_model as hidden_size
ip_timesteps = 20    # pad length of the input (text) sequences
op_timesteps = 20    # pad length of the output (summary) sequences

In [None]:
def get_data(train_size, random_seed=100):
    """Load the text/summary corpus and split it into shuffled train/test sets.

    Reads ``data/text.txt`` and ``data/summary.txt`` under the module-level
    ``project_path`` with ``read_data``, wraps every summary in ``sos`` /
    ``eos`` markers, shuffles the sentence indices and splits them.

    Args:
        train_size: Number of shuffled sentence pairs placed in the training
            split; all remaining pairs form the test split.
        random_seed: Seed handed to ``np.random.seed`` before shuffling.
            Note that this reseeds NumPy's global random state.

    Returns:
        Tuple ``(tr_ip_text, tr_op_text, ts_ip_text, ts_op_text)`` of lists:
        training inputs, training summaries, test inputs, test summaries.
        Inputs and summaries stay aligned by position.
    """
    ip_text = read_data(os.path.join(project_path, 'data', 'text.txt'))
    op_text = read_data(os.path.join(project_path, 'data', 'summary.txt'))
    logger.info('Length of text: {}'.format(len(ip_text)))

    # Wrap each summary as "sos <words> eos .". A trailing period (and any
    # whitespace before it) is removed first, and the separating space is
    # always inserted, so 'eos' never fuses with the last word
    # ("word." used to become "wordeos ."). Summaries already written as
    # "word ." produce exactly the same string as before.
    op_text = [
        'sos ' + (sent[:-1].rstrip() if sent.endswith('.') else sent) + ' eos .'
        for sent in op_text
    ]

    # Deterministic shuffle of the sentence indices.
    np.random.seed(random_seed)
    inds = np.arange(len(ip_text))
    np.random.shuffle(inds)

    train_inds = inds[:train_size]
    test_inds = inds[train_size:]

    tr_ip_text = [ip_text[ti] for ti in train_inds]
    tr_op_text = [op_text[ti] for ti in train_inds]

    ts_ip_text = [ip_text[ti] for ti in test_inds]
    ts_op_text = [op_text[ti] for ti in test_inds]

    return tr_ip_text, tr_op_text, ts_ip_text, ts_op_text

In [None]:
def preprocess_data(ip_tokenizer, op_tokenizer, ip_text, op_text, in_timesteps, op_timesteps):
    """Convert raw sentences into padded word-index sequences.

    Args:
        ip_tokenizer: Tokenizer passed to ``sents2sequences`` for the inputs.
        op_tokenizer: Tokenizer passed to ``sents2sequences`` for the outputs.
        ip_text: Input sentences.
        op_text: Output (summary) sentences.
        in_timesteps: Pad length for the input sequences.
        op_timesteps: Pad length for the output sequences.

    Returns:
        Tuple ``(ip_seq, op_seq)`` exactly as returned by ``sents2sequences``.
        Their ``.shape`` and maximum value are logged, so they are expected
        to be array-like.
    """
    # Use the in_timesteps argument here; the previous code ignored it and
    # silently read the module-level ip_timesteps instead.
    ip_seq = sents2sequences(ip_tokenizer, ip_text, reverse=False, padding_type='pre', pad_length=in_timesteps)
    op_seq = sents2sequences(op_tokenizer, op_text, pad_length=op_timesteps)

    # "Vocabulary size" is reported as the largest index present plus one.
    logger.info('Vocabulary size (Input): {}'.format(np.max(ip_seq)+1))
    logger.info('Vocabulary size (Output): {}'.format(np.max(op_seq)+1))
    logger.debug('IP text shape: {}'.format(ip_seq.shape))
    logger.debug('OP text shape: {}'.format(op_seq.shape))

    return ip_seq, op_seq

In [None]:
def train(full_model, ip_seq, op_seq, batch_size, n_epochs=10):
    """Train ``full_model`` on full-size mini-batches and log the epoch loss.

    Each batch of index sequences is one-hot encoded on the fly. The model is
    fed the input batch plus the output batch without its last timestep, and
    is trained to predict the output batch without its first timestep.

    Relies on the module-level names ``ip_vsize`` and ``op_vsize`` (one-hot
    depths) and ``logger``; they must be defined before this is called.

    Args:
        full_model: Model exposing ``train_on_batch`` and ``evaluate``.
        ip_seq: Input index sequences, indexed as ``[sample, timestep]``.
        op_seq: Output index sequences, indexed as ``[sample, timestep]``.
        batch_size: Number of samples per batch.
        n_epochs: Number of passes over the data.
    """
    n_samples = ip_seq.shape[0]

    for ep in range(n_epochs):
        losses = []
        # Only full batches are used; a trailing partial batch is skipped
        # (presumably because the model is built for a fixed batch_size -
        # TODO confirm). The "+ 1" keeps the final full batch when n_samples
        # is an exact multiple of batch_size; without it that batch was
        # silently dropped every epoch.
        for bi in range(0, n_samples - batch_size + 1, batch_size):

            ip_onehot_seq = to_categorical(ip_seq[bi:bi + batch_size, :], num_classes=ip_vsize)
            op_onehot_seq = to_categorical(op_seq[bi:bi + batch_size, :], num_classes=op_vsize)

            # Second model input drops the last timestep; the target is the
            # same sequence shifted left by one timestep.
            decoder_inputs = op_onehot_seq[:, :-1, :]
            decoder_targets = op_onehot_seq[:, 1:, :]

            full_model.train_on_batch([ip_onehot_seq, decoder_inputs], decoder_targets)

            # Loss is measured on the same batch after the weight update.
            # NOTE(review): if the model is compiled with extra metrics,
            # evaluate() returns a list and the epoch mean below mixes them.
            batch_loss = full_model.evaluate([ip_onehot_seq, decoder_inputs], decoder_targets,
                                             batch_size=batch_size, verbose=0)

            losses.append(batch_loss)

        logger.info("Loss in epoch {}: {}".format(ep + 1, np.mean(losses)))

In [None]:
if __name__ == '__main__':

    # Debug mode uses a smaller training subset and fewer epochs.
    debug = True
    train_size = 8000 if debug else 100000
    filename = ''

    tr_ip_text, tr_op_text, ts_ip_text, ts_op_text = get_data(train_size=train_size)

    # One tokenizer per side, each fitted only on its own training sentences;
    # out-of-vocabulary words map to the 'UNK' token.
    ip_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
    ip_tokenizer.fit_on_texts(tr_ip_text)

    op_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
    op_tokenizer.fit_on_texts(tr_op_text)

    # Padded word-index sequences for the training split.
    ip_seq, op_seq = preprocess_data(
        ip_tokenizer, op_tokenizer,
        tr_ip_text, tr_op_text,
        ip_timesteps, op_timesteps,
    )

    # Vocabulary sizes: largest word index known to each tokenizer, plus one.
    # train() reads these two names from module scope.
    ip_vsize = max(ip_tokenizer.index_word) + 1
    op_vsize = max(op_tokenizer.index_word) + 1

    # Build the training model together with the two inference models.
    full_model, infer_enc_model, infer_dec_model = summary_model(
        hidden_size=hidden_size,
        batch_size=batch_size,
        ip_timesteps=ip_timesteps,
        op_timesteps=op_timesteps,
        ip_vsize=ip_vsize,
        op_vsize=op_vsize,
    )

    n_epochs = 100 if debug else 1000
    train(full_model, ip_seq, op_seq, batch_size, n_epochs)

    # Persist the trained model in the current working directory.
    full_model.save('summarizer.h5')