In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import re
from random import randint
import datetime
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import yaml
import spacy
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

config = yaml.load(open("config.yml"), Loader=yaml.FullLoader)

[nltk_data] Downloading package stopwords to /home/sunit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sunit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sunit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    print("Invalid device or cannot modify virtual devices once initialized.")
    print(physical_devices)
    
print(tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [6]:
def read_dataset(input_dir):
    data = pd.read_csv(input_dir + '/news_summary_more.csv', encoding='utf-8')
    ingested_data = pd.DataFrame()
    ingested_data['passage'] = data['text']
    ingested_data['summary'] = data['headlines']
    print(ingested_data.head())
    return ingested_data

In [22]:
def text_strip(column, add_start_end_tokens=False):
    for row in column:
        row = unicode_to_ascii(row.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        row = re.sub(r"([?.!,¿])", r" \1 ", row)
        row = re.sub(r'[" "]+', " ", row)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        row = re.sub(r"[^a-zA-Z?.!,¿]+", " ", row)

        row = row.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        if add_start_end_tokens:
            row = '<start> ' + row + ' <end>'

        # # ORDER OF REGEX IS VERY VERY IMPORTANT!!!!!!
        # row = re.sub("(\\t)", ' ', str(row)).lower()  # remove escape charecters
        # row = re.sub("(\\r)", ' ', str(row)).lower()
        # row = re.sub("(\\n)", ' ', str(row)).lower()
        #
        # row = re.sub("(__+)", ' ', str(row)).lower()  # remove _ if it occurs more than one time consecutively
        # row = re.sub("(--+)", ' ', str(row)).lower()  # remove - if it occurs more than one time consecutively
        # row = re.sub("(~~+)", ' ', str(row)).lower()  # remove ~ if it occurs more than one time consecutively
        # row = re.sub("(\+\++)", ' ', str(row)).lower()  # remove + if it occurs more than one time consecutively
        # row = re.sub("(\.\.+)", ' ', str(row)).lower()  # remove . if it occurs more than one time consecutively
        #
        # row = re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', str(row)).lower()  # remove <>()|&©ø"',;?~*!
        #
        # row = re.sub("(mailto:)", ' ', str(row)).lower()  # remove mailto:
        # row = re.sub(r"(\\x9\d)", ' ', str(row)).lower()  # remove \x9* in text
        # row = re.sub("([iI][nN][cC]\d+)", 'INC_NUM', str(row)).lower()  # replace INC nums to INC_NUM
        # row = re.sub("([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM', str(row)).lower()  # replace CM# and CHG# to CM_NUM
        #
        # row = re.sub("(\.\s+)", ' ', str(row)).lower()  # remove full stop at end of words(not between)
        # row = re.sub("(\-\s+)", ' ', str(row)).lower()  # remove - at end of words(not between)
        # row = re.sub("(\:\s+)", ' ', str(row)).lower()  # remove : at end of words(not between)
        #
        # row = re.sub("(\s+.\s+)", ' ', str(row)).lower()  # remove any single charecters hanging between 2 spaces
        #
        # # Replace any url as such https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net
        # try:
        #     url = re.search(r'((https*:\/*)([^\/\s]+))(.[^\s]+)', str(row))
        #     repl_url = url.group(3)
        #     row = re.sub(r'((https*:\/*)([^\/\s]+))(.[^\s]+)', repl_url, str(row))
        # except:
        #     pass  # there might be emails with no url in them
        #
        # row = re.sub("(\s+)", ' ', str(row)).lower()  # remove multiple spaces

        # # Should always be last
        # row = re.sub("(\s+.\s+)", ' ', str(row)).lower()  # remove any single charecters hanging between 2 spaces

        yield row

In [23]:
def visualize_data(data, passage_word_limit, summary_word_limit):
    passage_count, summary_count = [], []
    for passage in data['passage']:
        passage_count.append(len(passage.split()))
    for summary in data['summary']:
        summary_count.append(len(summary.split()))

    counts_df = pd.DataFrame()
    counts_df['passage_count'] = passage_count
    counts_df['summary_count'] = summary_count
    counts_df.hist(bins=10)
    plt.show()

    cnt = 0
    for i in data['passage']:
        if len(i.split()) <= passage_word_limit:
            cnt = cnt + 1
    print(cnt / len(data['passage']))

    cnt = 0
    for i in data['summary']:
        if len(i.split()) <= summary_word_limit:
            cnt = cnt + 1
    print(cnt / len(data['summary']))


# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                 if unicodedata.category(c) != 'Mn')

In [24]:
def preprocess_data(config):
    input_dir = config['input_dir']
    ingested_data_df = read_dataset(input_dir)
    cleaned_passages = text_strip(ingested_data_df['passage'], add_start_end_tokens=False)
    cleaned_summaries = text_strip(ingested_data_df['summary'], add_start_end_tokens=True)

    # nlp = spacy.load('en', disable='parser')
    nlp = spacy.load('en_core_web_sm')
    t = time()
    processed_passages = [str(doc) for doc in nlp.pipe(cleaned_passages, batch_size=5000)]
    processed_summaries = [str(doc) for doc in nlp.pipe(cleaned_summaries, batch_size=5000)]
    print("Preprocessing time : {}".format(round((time() - t) / 60, 2)))

    processed_data = pd.DataFrame()
    processed_data['passage'] = processed_passages
    processed_data['summary'] = processed_summaries

    visualize_data(processed_data, config['max_passage_len'], config['max_summary_len'])

    short_passage = []
    short_summary = []

    for i in range(len(processed_passages)):
        if len(processed_passages[i].split()) <= config['max_passage_len'] and \
                len(processed_summaries[i].split()) <= config['max_summary_len']:
            short_passage.append(processed_passages[i])
            short_summary.append(processed_summaries[i])
    postprocessed_data = pd.DataFrame({'passage': short_passage, 'summary': short_summary})
    print(postprocessed_data.head())

    return postprocessed_data

In [25]:
postprocessed_data = preprocess_data(config)
postprocessed_data.to_csv("data/postprocessed_data.csv")
postprocessed_data.head()

                                             passage  \
0  Saurav Kant, an alumnus of upGrad and IIIT-B's...   
1  Kunal Shah's credit card bill payment platform...   
2  New Zealand defeated India by 8 wickets in the...   
3  With Aegon Life iTerm Insurance plan, customer...   
4  Speaking about the sexual harassment allegatio...   

                                             summary  
0  upGrad learner switches to career in ML & Al w...  
1  Delhi techie wins free food from Swiggy for on...  
2  New Zealand end Rohit Sharma-led India's 12-ma...  
3  Aegon life iTerm insurance plan helps customer...  
4  Have known Hirani for yrs, what if MeToo claim...  


KeyboardInterrupt: 

In [4]:
postprocessed_data = pd.read_csv("data/postprocessed_data.csv", encoding='utf-8')

In [5]:
postprocessed_data['word_count_passage'] = postprocessed_data['passage'].apply(lambda x: len(str(x).split()))
var = postprocessed_data['word_count_passage'].values
var = np.sort(var, axis=None)
for i in range(0, 100, 10):
    print("{} percentile value is {}".format(i, var[int(len(var)*float((i)/100))]))
print("100 percentile value is ", var[-1])

0 percentile value is 1
10 percentile value is 60
20 percentile value is 62
30 percentile value is 63
40 percentile value is 64
50 percentile value is 65
60 percentile value is 66
70 percentile value is 67
80 percentile value is 68
90 percentile value is 69
100 percentile value is  72


In [6]:
var = postprocessed_data['word_count_passage'].values
var = np.sort(var, axis=None)
for i in range(90, 100):
    print("{} percentile value is {}".format(i, var[int(len(var)*(float(i)/100))]))
print("100 percentile value is ", var[-1])   

90 percentile value is 69
91 percentile value is 69
92 percentile value is 70
93 percentile value is 70
94 percentile value is 70
95 percentile value is 70
96 percentile value is 71
97 percentile value is 71
98 percentile value is 71
99 percentile value is 72
100 percentile value is  72


In [7]:
X_train, X_val, y_train, y_val = train_test_split(postprocessed_data["passage"], postprocessed_data["summary"], test_size=0.3, random_state=69, shuffle=True)

In [8]:
embeddings = {}
# f = open('data/embeddings/glove.6B.300d.txt')
f = open('data/embeddings/glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings))

#Calculating Percentage of words from train text present in Word2vec model
words_source_train = []
for i in X_train :
  words_source_train.extend(i.split(' '))
## Find the total number of words in the Train data of Essays.
print("all the words in the corpus", len(words_source_train))
## Find the unique words in this set of words
words_source_train = set(words_source_train)
print("the unique words in the corpus", len(words_source_train))
## Find the words present in both Glove Vectors as well as our corpus.
inter_words = set(embeddings.keys()).intersection(words_source_train)
print("The number of words that are present in both glove vectors and our corpus are {} which \
is nearly {}% ".format(len(inter_words), np.round((float(len(inter_words))/len(words_source_train))
*100)))
words_corpus_source_train = {}
words_glove = set(embeddings.keys())
for i in words_source_train:
  if i in words_glove:
    words_corpus_source_train[i] = embeddings[i]
print("word 2 vec length", len(words_corpus_source_train))

Loaded 400000 word vectors.
all the words in the corpus 4263044
the unique words in the corpus 66136
The number of words that are present in both glove vectors and our corpus are 51582 which is nearly 78.0% 
word 2 vec length 51582


In [9]:
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(X_train))

# word_index = x_tokenizer.word_index
# embedding_matrix = np.zeros((len(word_index) + 1, 300))  # ...since we are using 300 dim glove vecs
# for word, i in word_index.items():
#     embedding_vector = embeddings.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

X_train = x_tokenizer.texts_to_sequences(X_train)
X_val = x_tokenizer.texts_to_sequences(X_val)

X_train = pad_sequences(X_train, maxlen=config['max_passage_len'], padding='post')
X_val = pad_sequences(X_val, maxlen=config['max_passage_len'], padding='post')

X_VOCAB_SIZE = len(x_tokenizer.word_index) + 1

In [10]:
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))

y_train = y_tokenizer.texts_to_sequences(y_train)
y_val = y_tokenizer.texts_to_sequences(y_val)

y_train = pad_sequences(y_train, maxlen=config['max_summary_len'], padding='post')
y_val = pad_sequences(y_val, maxlen=config['max_summary_len'], padding='post')

Y_VOCAB_SIZE = len(y_tokenizer.word_index) + 1

In [11]:
import keras
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional, Concatenate, TimeDistributed, merge
from attention import AttentionLayer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
from keras.regularizers import l2
import warnings
import logging
logging.basicConfig(level=logging.INFO)
from keras import Input, Model

from keras import backend as K
K.clear_session() 

In [12]:
latent_dim = config['latent_dim']

In [13]:
# ENCODER
encoder_inputs = Input(shape=(config['max_passage_len'],))
encoder_embeddings = Embedding(X_VOCAB_SIZE, latent_dim, trainable=True)(encoder_inputs)

encoder_lstm1 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, kernel_regularizer=l2(0.02), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
encoder_output1, state_h1_f, state_h1_b, state_c1_f, state_c1_b = encoder_lstm1(encoder_embeddings)
state_h1 = Concatenate()([state_h1_f, state_h1_b])
state_c1 = Concatenate()([state_c1_f, state_c1_b])


encoder_lstm2 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, kernel_regularizer=l2(0.02), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
encoder_output2, state_h2_f, state_h2_b, state_c2_f, state_c2_b = encoder_lstm2(encoder_output1)
state_h2 = Concatenate()([state_h2_f, state_h2_b])
state_c2 = Concatenate()([state_c2_f, state_c2_b])


encoder_lstm3 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, kernel_regularizer=l2(0.02), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)), merge_mode='sum')
encoder_output, state_h3_f, state_h3_b, state_c3_f, state_c3_b = encoder_lstm3(encoder_output2)
print(encoder_output.shape, state_h3_f.shape, state_h3_b.shape, state_c3_f.shape, state_c3_b.shape)

encoder_states = [state_h3_f, state_c3_f, state_h3_b, state_c3_b]
encoder_states = K.sum(encoder_states, axis=0)

encoder_hidden_state = Concatenate(axis=0)([state_h3_f, state_h3_b])
encoder_cell_state = Concatenate(axis=0)([state_c3_f, state_c3_b])

print(encoder_states.shape, encoder_hidden_state.shape, encoder_cell_state.shape)

(None, 72, 300) (None, 300) (None, 300) (None, 300) (None, 300)
(None, 300) (None, 300) (None, 300)


In [14]:
# DECODER
decoder_inputs = Input(shape=(None,))
decoder_embeddings = Embedding(Y_VOCAB_SIZE, latent_dim, trainable=True)(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, kernel_regularizer=l2(0.02), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01))
decoder_output, decoder_hidden_state, decoder_cell_state = decoder_lstm(decoder_embeddings, initial_state=[encoder_hidden_state, encoder_cell_state])
print(decoder_output.shape, decoder_hidden_state.shape, decoder_cell_state.shape)

attention_layer = AttentionLayer(name="attention_layer")
attn_output, attn_state = attention_layer([encoder_output, decoder_output])
print(attn_output.shape)

decoder_concat_output = Concatenate(axis=0, name='decoder_concat_layer')([decoder_output, attn_output])
print(decoder_concat_output.shape)

# query_encoding = tf.keras.layers.GlobalAveragePooling1D()(encoder_output)
# query_value_attention = attn_output_pooled
# decoder_concat_output = Concatenate(axis=0)([query_encoding, query_value_attention])
# print(decoder_concat_output.shape)

(None, None, 300) (None, 300) (None, 300)
(None, None, 300)
(None, None, 300)


In [15]:
# DENSE
decoder_dense = TimeDistributed(Dense(Y_VOCAB_SIZE, activation='softmax'))
decoder_attn_output = decoder_dense(decoder_concat_output)
print(decoder_attn_output.shape)

(None, None, 28418)


In [16]:
# MODEL
model = Model([encoder_inputs, decoder_inputs], decoder_attn_output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 72)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 72, 300)      19839900    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 72, 600), (N 1442400     embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 72, 600), (N 2162400     bidirectional[0][0]              
______________________________________________________________________________________________

In [17]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

In [18]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [19]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
mcp_save = ModelCheckpoint('models/{epoch:03d}-{accuracy:03f}-{val_accuracy:03f}.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')

history = model.fit(x=[X_train, y_train[:,:-1]], y=y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:,1:],
                           validation_data=([X_val, y_val[:,:-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]),
                           epochs=4,
                           batch_size=8,
                           callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                           verbose=1)

Epoch 1/4


CancelledError:  [_Derived_]RecvAsync is cancelled.
	 [[{{node Adam/Adam/group_deps/NoOp/_117}}]]
	 [[gradient_tape/model/embedding/embedding_lookup/Reshape/_114]] [Op:__inference_train_function_20953]

Function call stack:
train_function


In [None]:
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

In [None]:
# print("\n Evaluating Model ... \n")
# predicted = model.predict_classes(X_val)
# print(metrics.classification_report(y_val, predicted))
# print("\n")
# logger = logging.getLogger("logger")
# result = compute_metrics(y_val, predicted)
# for key in (result.keys()):
#     logger.info("  %s = %s", key, str(result[key]))

In [None]:
print(attn_output.shape)