<a href="https://colab.research.google.com/github/uvaizm/AIWorks/blob/master/Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.4MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 14.9MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

In [0]:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 16:06:36 2018

@author: muvaiz
"""

from nltk.tokenize import word_tokenize
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

train_article_path ="train.article.txt"
train_title_path ="train.title.txt"
valid_article_path ="valid.article.filter.txt"
valid_title_path ="valid.title.filter.txt"

#to clean sentence
def clean_str(sentence):
    sentence=re.sub("[#.]+","#",sentence)
    return sentence

#read each line from given file and will pass it to clean_str function for cleaning sentence. "toy" flag will be used for optimization
def get_text_list(data_path,toy):
    with open(data_path,'r',encoding="utf-8") as f:
        if not toy:
            return([clean_str(x.strip()) for x in f.readlines()])
        else:
            return([clean_str(x.strip()) for x in f.readlines()][:50000])
            
            
#to build a dictionary containing each unique word from the training data and respective value
#returns formed word vs number dictionary, inverse of it, max article length,max summary length
def build_dict(step,toy=False):
    if step=="train":
        train_article_list=get_text_list(train_article_path,toy)
        train_title_list=get_text_list(train_title_path,toy)
        
        train_list=train_article_list+train_title_list
        
        words=list()
        
        for sentence in train_list:
            for word in word_tokenize(sentence):
                words.append(word)
                
        word_counter=collections.Counter(words).most_common()
        word_dict=dict()
        word_dict["<padding>"]=0
        word_dict["<unk>"]=1
        word_dict["<s>"]=2
        word_dict["</s>"]=3
        
        for word,_ in word_counter:
            word_dict[word]=len(word_dict)
            
        with open("word_dict.pickle","wb") as f:
            pickle.dump(word_dict,f)
            
    elif step=="valid":
        with open("word_dict.pickle","rb") as f:
            word_dict=pickle.load(f)
            
    reversed_dict = dict(zip(word_dict.values(),word_dict.keys()))
    
    article_max_len=50
    summary_max_len=15
    
    return word_dict,reversed_dict,article_max_len,summary_max_len

def build_dataset(step,word_dict,article_max_len,summary_max_len,toy=False):
    if step=="train":
        article_list=get_text_list(train_article_path,toy)
        title_list=get_text_list(train_title_path,toy)
    elif step=="valid":
        article_list=get_text_list(valid_article_path,toy)
    else:
        raise NotImplementedError
    
    #sample output of sentence_with_word_lists_per_sent : [["I","am","a","boy"],["I","like","rain"]]
    sentences_with_word_lists_per_sent=[word_tokenize(sentence) for sentence in article_list]
    
    #sample output of sentence_with_word_lists_per_sent : [["101","21","343","711"],["100","234","1526"]]
    sentences_with_word_vector_lists_per_sent=[[word_dict.get(word,word_dict["<unk>"]) for word in sentence] for sentence in sentences_with_word_lists_per_sent]
                                                              
    truncated_word_vector_lists_per_sent=[sentence[:article_max_len] for sentence in sentences_with_word_vector_lists_per_sent]
    output_word_vector_lists=[sentence + (article_max_len - len(sentence)) * [word_dict["<padding>"]] for sentence in truncated_word_vector_lists_per_sent]
                                                              
    if step=="valid":
        return output_word_vector_lists
    else:
        title_with_word_lists_per_sent = [word_tokenize(sentence) for sentence in title_list]
        title_with_word_vector_lists_per_sent=[[word_dict.get(word,word_dict["<unk>"]) for word in sentence] for sentence in title_with_word_lists_per_sent]
        output_title_vector_lists=[sentence[:(summary_max_len-1)]for sentence in title_with_word_vector_lists_per_sent]
        return(output_word_vector_lists,output_title_vector_lists)
        
def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)

    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
            
def get_init_embedding(reversed_dict,embedding_size):
    glove_file="glove.6B.50d.txt"
    word2vec_file=get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file,word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)
    
    word_vec_list = list()
    
    for _,word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec=np.zeros([embedding_size],dtype=np.float32)
            
        word_vec_list.append(word_vec)
        
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)
    
    return(np.array(word_vec_list))





In [0]:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 18:36:30 2018

@author: muvaiz
"""

import tensorflow as tf
from tensorflow.contrib import rnn
#from utils import get_init_embedding


class Model(object):
    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and args.glove:
                
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size),dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

In [26]:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 23:47:16 2018

@author: muvaiz
"""

import time
start = time.perf_counter()
import tensorflow as tf
import argparse
import pickle
import os
import nltk
from nltk.tokenize import word_tokenize
#from model import Model
#from utils import build_dict, build_dataset, batch_iter

# Uncomment next 2 lines to suppress error and Tensorflow info verbosity. Or change logging levels
# tf.logging.set_verbosity(tf.logging.FATAL)
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

nltk.download('punkt')

class Simulate_Arguments():
  
  def __init__(self):
    self.num_hidden=150
    self.num_layers=2
    self.beam_width=10
    self.glove=True
    self.embedding_size=300
    
    self.learning_rate=1e-3
    self.batch_size=64
    self.num_epochs=10
    self.keep_prob=0.8
    
    self.toy=True
    self.with_model=False

'''def add_arguments(parser):
    parser.add_argument("--num_hidden", type=int, default=150, help="Network size.")
    parser.add_argument("--num_layers", type=int, default=2, help="Network depth.")
    parser.add_argument("--beam_width", type=int, default=10, help="Beam width for beam search decoder.")
    parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
    parser.add_argument("--embedding_size", type=int, default=300, help="Word embedding size.")

    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
    parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")

    parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data")

    parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model")

'''

#parser = argparse.ArgumentParser()
#add_arguments(parser)
#args = parser.parse_args()
args=Simulate_Arguments()

with open("args.pickle", "wb") as f:
    pickle.dump(args, f)

if not os.path.exists("saved_model_1"):
    os.mkdir("saved_model_1")
else:
    if args.with_model:
        old_model_checkpoint_path = open('saved_model/checkpoint', 'r')
        old_model_checkpoint_path = "".join(["saved_model/",old_model_checkpoint_path.read().splitlines()[0].split('"')[1] ])


print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", args.toy)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, args.toy)


with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if 'old_model_checkpoint_path' in globals():
        print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
        saver.restore(sess, old_model_checkpoint_path )

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
        batch_decoder_input = list(map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
        batch_decoder_len = list(map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
        batch_decoder_output = list(map(lambda x: list(x) + [word_dict["</s>"]], batch_y))

        batch_decoder_input = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
        batch_decoder_output = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))

        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, "./saved_model/model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Building dictionary...
Loading training dataset...
Loading Glove vectors...


ValueError: ignored

In [38]:
isinstance(get_init_embedding(reversed_dict,300), (np.ndarray, np.generic))

Loading Glove vectors...


True

In [25]:
print(tf.__version__)

1.12.0


In [36]:
(get_init_embedding(reversed_dict,300)).astype(tf.object.as_numpy_dtype)

Loading Glove vectors...


AttributeError: ignored

In [31]:
print(get_init_embedding(reversed_dict,300).shape)

Loading Glove vectors...
(17212,)


In [34]:
print(get_init_embedding(reversed_dict,300)[100])

Loading Glove vectors...
[ 0.40545   0.43805   0.36237   0.25683   0.38254   0.68255  -0.97853
  0.12741  -0.46129  -0.54809  -0.35384  -0.56697  -0.65756   0.50184
  0.53248  -0.77956  -0.089944 -0.37572  -1.1097   -0.30734  -0.022657
  0.11632   0.67704  -0.051499 -0.59719  -1.02      0.24289  -0.60216
 -0.35183  -0.54053   3.9844    0.41521   0.040419  0.26909   1.1193
  0.52924   0.37308   0.28924  -0.14714   0.23566  -0.72709   0.053276
  0.45373   0.20374  -0.13384   0.015313 -0.22037  -0.15662  -0.30289
 -0.77536 ]


In [0]:
print(reversed_dict.shape)