In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
!ls "gdrive/My Drive/tweet_generator"

code  summary


In [6]:
!pip install gensim
!pip install wget
  
import nltk
nltk.download('punkt')

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=b462e81ec195e673b0883c65ba0818bb3d6edee4c89ac97c3db9c834081e7c1c
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.tokenize import word_tokenize
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

# Root folder (a relative path into the mounted Google Drive) that holds the
# datasets, GloVe pickle, saved models, word dictionaries and results.
default_path = "gdrive/My Drive/tweet_generator/summary/"

# Training pairs: one news article per line, with the matching tweet on the
# same line number of the "title" file.
train_article_path = default_path + "sumdata/train/train_news_sports.txt"
train_title_path   = default_path + "sumdata/train/train_tweets_sports.txt"
# Held-out pairs used by the inference cell.
# NOTE(review): these files are read from "sumdata/train/" (test_* names) --
# confirm that this is the intended location for the validation split.
valid_article_path = default_path + "sumdata/train/test_news_sports.txt"
valid_title_path   = default_path + "sumdata/train/test_tweets_sports.txt"



def clean_str(sentence):
    """Collapse every run of '#' and/or '.' characters into a single '#'.

    Args:
        sentence: Text to normalise.

    Returns:
        The text with each maximal run of '#' / '.' replaced by one '#'.
    """
    marker_run = re.compile("[#.]+")
    return marker_run.sub("#", sentence)


def get_text_list(data_path, toy):
    """Read a UTF-8 text file and return its lines, stripped and cleaned.

    Args:
        data_path: Path of the file to read; each line is one example.
        toy: When truthy, only the first 50 cleaned lines are returned.

    Returns:
        A new list holding ``clean_str(line.strip())`` for each line read.
    """
    with open(data_path, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    cleaned = [clean_str(line.strip()) for line in raw_lines]
    return cleaned[:50] if toy else cleaned


def build_dict(step, word_dict_folder_path, toy=False):
    """Build (``step="train"``) or reload (``step="valid"``) the vocabulary.

    For ``"train"`` the training articles and titles are tokenised, tokens are
    numbered by descending frequency after the four reserved tokens, and the
    mapping is pickled to ``word_dict_folder_path + "word_dict.pickle"``.
    For ``"valid"`` that pickle is loaded back.

    Args:
        step: Either ``"train"`` or ``"valid"``.
        word_dict_folder_path: Folder prefix the pickle lives in. It is
            concatenated as-is, so it must end with a path separator.
        toy: Forwarded to ``get_text_list`` (truthy -> first 50 lines only).

    Returns:
        Tuple ``(word_dict, reversed_dict, article_max_len, summary_max_len)``
        where ``word_dict`` maps token -> id, ``reversed_dict`` maps id ->
        token, and the two lengths are the fixed values 50 and 15.

    Raises:
        NotImplementedError: If ``step`` is neither ``"train"`` nor ``"valid"``
            (matches the behaviour of ``build_dataset``).
    """
    # Fail early and explicitly; previously an unknown step fell through both
    # branches and crashed later with an UnboundLocalError on ``word_dict``.
    if step not in ("train", "valid"):
        raise NotImplementedError(
            "build_dict only supports step='train' or step='valid', got {!r}".format(step))

    dict_path = word_dict_folder_path + "word_dict.pickle"

    if step == "train":
        train_article_list = get_text_list(train_article_path, toy)
        train_title_list = get_text_list(train_title_path, toy)

        word_counter = collections.Counter(
            word
            for sentence in train_article_list + train_title_list
            for word in word_tokenize(sentence)
        )

        # Ids 0-3 are reserved; the rest of the model relies on these values.
        word_dict = {"<padding>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
        for word, _ in word_counter.most_common():
            # Skip a corpus token that equals a reserved token: re-assigning
            # it would move the reserved id and make the next word share an
            # index with it.
            if word not in word_dict:
                word_dict[word] = len(word_dict)

        with open(dict_path, "wb") as f:
            pickle.dump(word_dict, f)
    else:
        # The pickle is produced by the "train" branch above; only load
        # dictionaries that this notebook wrote itself.
        with open(dict_path, "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = {index: word for word, index in word_dict.items()}

    article_max_len = 50
    summary_max_len = 15

    return word_dict, reversed_dict, article_max_len, summary_max_len


def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    """Turn the text files for ``step`` into lists of token-id sequences.

    Args:
        step: ``"train"`` (articles and titles) or ``"valid"`` (articles only).
        word_dict: Mapping token -> id; must contain ``"<unk>"`` and
            ``"<padding>"``.
        article_max_len: Articles are truncated and right-padded to this length.
        summary_max_len: Titles are truncated to ``summary_max_len - 1`` ids
            and are NOT padded here.
        toy: Forwarded to ``get_text_list`` (truthy -> first 50 lines only).

    Returns:
        ``x`` for ``"valid"``; ``(x, y)`` for ``"train"``.

    Raises:
        NotImplementedError: For any other ``step``.
    """
    is_train = step == "train"
    if is_train:
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    def to_ids(text):
        # Unknown tokens fall back to the <unk> id.
        return [word_dict.get(token, word_dict["<unk>"]) for token in word_tokenize(text)]

    def fit_article(ids):
        # Clip to the fixed width, then right-pad with the <padding> id.
        clipped = ids[:article_max_len]
        return clipped + (article_max_len - len(clipped)) * [word_dict["<padding>"]]

    x = [fit_article(to_ids(article)) for article in article_list]
    if not is_train:
        return x

    y = [to_ids(title)[:(summary_max_len - 1)] for title in title_list]
    return x, y


def _to_batch_array(rows):
    """Convert ``rows`` to an ndarray, using an object array for ragged rows."""
    try:
        return np.array(rows)
    except ValueError:
        # Recent NumPy releases refuse to build an array from rows of unequal
        # length unless dtype=object is requested explicitly.  The training
        # targets are truncated but not padded, so they are ragged.
        return np.array(rows, dtype=object)


def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield aligned ``(inputs, outputs)`` mini-batches for several epochs.

    The data is walked in order (no shuffling); the last batch of each epoch
    may be smaller than ``batch_size``.

    Args:
        inputs: Sequence of input rows.
        outputs: Sequence of target rows, aligned with ``inputs``. Rows may
            have different lengths.
        batch_size: Maximum number of rows per batch.
        num_epochs: Number of full passes over the data.

    Yields:
        Tuples ``(inputs[start:end], outputs[start:end])`` of ndarray slices.
    """
    inputs = _to_batch_array(inputs)
    outputs = _to_batch_array(outputs)

    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]


def get_init_embedding(reversed_dict, embedding_size):
    """Build the initial embedding matrix from the pickled GloVe vectors.

    Rows are ordered by token id.  Tokens for which the lookup raises
    ``KeyError`` get an all-zero float32 row; rows 2 and 3 (``<s>`` and
    ``</s>``) are overwritten with samples from a standard normal.

    Args:
        reversed_dict: Mapping token id -> token.
        embedding_size: Length of each embedding vector.

    Returns:
        ``np.ndarray`` built from one vector per entry of ``reversed_dict``.
    """
    print("Loading Glove vectors...")

    # NOTE(review): unpickling executes arbitrary code -- only load a pickle
    # that was produced by a trusted source.
    glove_pickle_path = default_path + "glove/model_glove_300.pkl"
    with open(glove_pickle_path, 'rb') as handle:
        word_vectors = pickle.load(handle)

    def lookup(token):
        # presumably ``word_vectors`` is a gensim KeyedVectors-like object
        # exposing ``word_vec`` -- confirm against how the pickle was created.
        try:
            return word_vectors.word_vec(token)
        except KeyError:
            return np.zeros([embedding_size], dtype=np.float32)

    entries_by_id = sorted(reversed_dict.items())
    word_vec_list = [lookup(token) for _, token in entries_by_id]

    # Assign random vector to <s>, </s> token
    for special_id in (2, 3):
        word_vec_list[special_id] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list)

In [0]:
# with open(d)

In [0]:
# with open(default_path+"sumdata/train/test_one.txt","w") as inp:
#   inp.write("Bengaluru The Congress in Karnataka has termed the Supreme Court order on the political crisis in the state as bad judgment which seemed to protect the defectors and encourage horse trading In a series of tweets Karnataka Pradesh Congress Committee KPCC president Dinesh Gundu Rao termed it an extraordinary order On Wednesday the Supreme Court directed that the 15 rebel Congress and JD S MLAs ought not to be compelled to take part in the proceedings of the state Assembly where Chief Minister HD Kumaraswamy is slated to face the floor test on July 18 Supreme Court order seems perfectly coordinated to help the rebel MLAs to violate the whip It has set a wrong precedent as the value of the Whip as per 10th schedule of the Constitution is now redundant An extraordinary order indeed Rao tweeted The SupremeCourt verdict is now encroaching upon the rights of the Legislature This is a bad judgement which seems to protect the defectors and encourages horse trading and also violating the doctrine of separation of powers KarnatakaPoliticalCrisis Dinesh Gundu Rao ದ ನ ಶ ಗ ಡ ರ ವ dineshgrao July 17 2019 The SupremeCourt verdict is now encroaching upon the rights of the Legislature This is a bad judgement which seems to protect the defectors and encourages horse trading and also violating the doctrine of separation of powers he said in another tweet A bench headed by Chief Justice Ranjan Gogoi also said the speaker was free to decide on the resignations of the rebel MLAs within the time frame decided by him The apex court was hearing the plea of 15 rebel Congress JD S MLAs seeking direction for the speaker to accept their resignations from the assembly Noting that Congress disqualification petition with the speaker against party MLAs is as per section 2 1a of the anti defection law Rao tweeted It s not for violating the whip but for indulging in anti party activities to join hands with BJP to topple our govt and voluntarily giving up membership The Congress has moved 
disqualification petition against 13 MLAs including independent R Shankar who had merged his Karnataka Pragnyavantha Janatha Party KPJP with it The other Congress MLAs include Pratap Gowda Patil BC Patil Shivram Hebbar S T Somashekar Byrati Basavaraj Anand Singh Roshan Baig Munirathna K Sudhakar and MTB Nagaraj Disqualification petition had been moved against Ramesh Jarkiholi and Mahesh Kumatalli earlier itself The JD S too on its part had moved disqualification petition against its 3 MLAs Gopalaiah A H Vishwanath and Narayana Gowda but the speaker had said it was not in proper format and those who submitted were not party MLAs or senior leaders As many as 16 MLAs 13 from the Congress and three from the JD S have resigned while two independent MLAs S Shankar and H Nagesh have withdrawn their support to the coalition government keeping it on the edge Get the best of News18 delivered to your inbox subscribe to News18 Daybreak Follow News18 com on Twitter Instagram Facebook Telegram TikTok and on YouTube and stay in the know with what s happening in the world around you in real time")

In [0]:
# with open(default_path+"sumdata/train/test_title_one.txt","w") as inp:
#   inp.write("The Court hearing the plea of 15 rebel Cong JD S MLAs seeking direction for the speaker to accept their resignations said the Speaker was free to decide on the resignations of the rebel MLAs within the time frame decided by him https t co znxBaVUFQG")

In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn


class Model(object):
    """Attention-based sequence-to-sequence graph (TensorFlow 1.x ``tf.contrib``).

    The encoder is a stacked bidirectional LSTM over the embedded article; the
    decoder is a single LSTM cell wrapped with Bahdanau attention over the
    encoder outputs.  With ``forward_only=False`` the graph exposes ``loss``
    and ``update`` for teacher-forced training; with ``forward_only=True`` it
    exposes ``prediction`` produced by beam search.

    Args:
        reversed_dict: Mapping token id -> token.  Its length is the
            vocabulary size; it is also passed to ``get_init_embedding`` when
            GloVe initialisation is requested.
        article_max_len: Width of the ``X`` placeholder.
        summary_max_len: Width of the decoder placeholders and the maximum
            number of beam-search decoding steps.
        args: Object providing ``embedding_size``, ``num_hidden``,
            ``num_layers``, ``learning_rate``, ``beam_width``, ``glove`` and,
            when training, ``keep_prob``.
        forward_only: ``True`` builds the inference (beam search) graph,
            ``False`` builds the training graph.
    """

    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        # Dropout is only active while training; inference keeps everything.
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            # The RNN ops below are used with time_major=True, hence the
            # transpose of the first two axes after the lookup.
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            # Pass the configured keep probability explicitly: DropoutWrapper
            # defaults every keep probability to 1.0, so wrapping the cell
            # without arguments applied no dropout at all and
            # ``self.keep_prob`` was never used.
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # NOTE(review): index 0 looks like it selects the first stacked
            # layer's final state -- confirm this (rather than the top layer)
            # is the intended decoder initial state.
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            # Twice the encoder width so the concatenated fw/bw state fits.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                # Zero-pad the time axis up to summary_max_len so the logits
                # line up with the fixed-width decoder_target placeholder.
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                # Beam search needs every encoder tensor repeated beam_width
                # times along the batch axis.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                # Token ids 2 and 3 are <s> and </s> in the word dictionary.
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out positions beyond each target's real length.
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

## Model specification

In [0]:
model_name = "sports_model"

In [0]:
# Per-model output folders, all keyed by ``model_name`` under ``default_path``.
# Each ends with "/" because later cells build file paths by concatenation.
model_folder_path = default_path + "saved_models/" + model_name + "/"
word_dict_folder_path = default_path + "word_dicts/" + model_name + "/"
results_folder_path =  default_path + "results/" + model_name + "/"

## Train

In [0]:
import time
start = time.perf_counter()
import tensorflow as tf
import argparse
import pickle
import os

In [43]:
# Create the checkpoint folder for this model on first use; nothing is
# printed when it already exists.
if not os.path.exists(model_folder_path):
    os.makedirs(model_folder_path)
    print(model_name + " model directory created")

sports_model model directory created


In [44]:
# Create the folder that will hold word_dict.pickle for this model on first
# use; nothing is printed when it already exists.
if not os.path.exists(word_dict_folder_path):
    os.makedirs(word_dict_folder_path)
    print(model_name + " word_dict directory created")

sports_model word_dict directory created


In [45]:
class args:
    """Plain namespace holding the training hyper-parameters and run options."""
    pass

# Model size / decoding.
args.num_hidden = 150
args.num_layers = 2
args.beam_width = 10
# Real booleans instead of the leftover argparse action name "store_true"
# (which only worked because any non-empty string is truthy).
args.glove = True
args.embedding_size = 300

# Optimisation.
args.learning_rate = 1e-3
args.batch_size = 64
args.num_epochs = 50
args.keep_prob = 0.8

# Truthy -> train on the first 50 examples only.
args.toy = False

# Resume from the latest checkpoint in model_folder_path when one exists.
args.with_model = True

args.continue_with_prev_model = False


if args.with_model:
    checkpoint_index_path = model_folder_path + 'checkpoint'
    if os.path.exists(checkpoint_index_path):
        # The checkpoint name is taken from the first quoted string on the
        # first line of the index file.  Use a context manager so the file
        # handle is closed (it used to be left open).
        with open(checkpoint_index_path, 'r') as checkpoint_index:
            latest_checkpoint_name = checkpoint_index.read().splitlines()[0].split('"')[1]
        old_model_checkpoint_path = "".join([model_folder_path, latest_checkpoint_name])
        args.continue_with_prev_model = True
        print("old checkpoint restored")


print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", word_dict_folder_path, args.toy)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, args.toy)

tf.reset_default_graph()

with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
    if args.continue_with_prev_model:
        print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
        saver.restore(sess, old_model_checkpoint_path )

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        # Real lengths = number of non-padding (non-zero) ids per row.
        batch_x_len = [len([tok for tok in row if tok != 0]) for row in batch_x]
        # Decoder input is <s> + title; decoder target is title + </s>.
        batch_decoder_input = [[word_dict["<s>"]] + list(title) for title in batch_y]
        batch_decoder_len = [len([tok for tok in row if tok != 0]) for row in batch_decoder_input]
        batch_decoder_output = [list(title) + [word_dict["</s>"]] for title in batch_y]

        # Right-pad both decoder sequences to the fixed placeholder width.
        batch_decoder_input = [
            row + (summary_max_len - len(row)) * [word_dict["<padding>"]] for row in batch_decoder_input]
        batch_decoder_output = [
            row + (summary_max_len - len(row)) * [word_dict["<padding>"]] for row in batch_decoder_output]

        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        # Save a checkpoint at every epoch boundary and report elapsed time
        # since ``start`` (set in the imports cell).
        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, model_folder_path + "model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")

Building dictionary...
Loading training dataset...
Loading Glove vectors...

Iteration starts.
Number of batches per epoch : 157
 Epoch 1: Model is saved. Elapsed: 00:02:05.33 

 Epoch 2: Model is saved. Elapsed: 00:03:16.41 

 Epoch 3: Model is saved. Elapsed: 00:04:26.08 

 Epoch 4: Model is saved. Elapsed: 00:05:36.00 

 Epoch 5: Model is saved. Elapsed: 00:06:46.62 

 Epoch 6: Model is saved. Elapsed: 00:07:56.41 

step 1000: loss = 64.16128540039062
 Epoch 7: Model is saved. Elapsed: 00:09:07.18 

 Epoch 8: Model is saved. Elapsed: 00:10:17.10 

 Epoch 9: Model is saved. Elapsed: 00:11:26.93 

 Epoch 10: Model is saved. Elapsed: 00:12:36.54 

 Epoch 11: Model is saved. Elapsed: 00:13:46.73 

 Epoch 12: Model is saved. Elapsed: 00:14:57.66 

step 2000: loss = 40.052677154541016
 Epoch 13: Model is saved. Elapsed: 00:16:07.59 

 Epoch 14: Model is saved. Elapsed: 00:17:17.43 

 Epoch 15: Model is saved. Elapsed: 00:18:27.55 

 Epoch 16: Model is saved. Elapsed: 00:19:37.78 

 Epoch 

## Test

In [46]:
# Create the folder that result.txt is written to on first use; nothing is
# printed when it already exists.
if not os.path.exists(results_folder_path):
    os.makedirs(results_folder_path)
    print(model_name + " result directory created")

sports_model result directory created


In [47]:
import tensorflow as tf
import pickle


tf.reset_default_graph()

class args:
    """Plain namespace holding the options the inference graph is built with."""
    pass

args.num_hidden = 150
args.num_layers = 2
args.beam_width = 10
# Real booleans instead of the leftover argparse action name "store_true"
# (which only worked because any non-empty string is truthy).
args.glove = True
args.embedding_size = 300

args.learning_rate = 1e-3
args.batch_size = 64
args.num_epochs = 10
args.keep_prob = 0.8

args.toy = False

args.with_model = True


print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", word_dict_folder_path, args.toy)
print("Loading validation dataset...")
valid_x = build_dataset("valid", word_dict, article_max_len, summary_max_len, args.toy)
valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
print("Loading article and reference...")
article = get_text_list(valid_article_path, args.toy)
reference = get_text_list(valid_title_path, args.toy)

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(model_folder_path)
    # Fail with a clear message instead of an AttributeError on None when the
    # model folder holds no checkpoint yet.
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise FileNotFoundError("No checkpoint found in " + model_folder_path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Targets are not needed for inference; pass a dummy list of equal length.
    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

    print("Writing summaries to 'result.txt'...")
    summary_array = []
    for batch_x, _ in batches:
        # Real lengths = number of non-padding (non-zero) ids per row.
        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

        valid_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
        }

        prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
        # Index 0 on the second axis picks the first beam for every example.
        prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]
        for line in prediction_output:
            summary = list()
            for word in line:
                if word == "</s>":
                    break
                # Each word is kept only the first time it appears.
                if word not in summary:
                    summary.append(word)
            summary_array.append(" ".join(summary))

    # Write the file once, after all batches: it used to be rewritten in full
    # on every batch.  UTF-8 matches how the input files are read.
    with open(results_folder_path + "result.txt", "w", encoding="utf-8") as f:
        for line in summary_array:
            print(line, file=f)

    print('Summaries have been generated')

Loading dictionary...
Loading validation dataset...
Loading article and reference...
Loading saved model...
INFO:tensorflow:Restoring parameters from gdrive/My Drive/tweet_generator/summary/saved_models/sports_model/model.ckpt-7850
Writing summaries to 'result.txt'...
Summaries have been generated
