In [None]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import math
import re
import json
from matplotlib import pyplot as plt
from nltk import sent_tokenize
from gensim.models import Word2Vec
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Attention
from keras.layers import Embedding
import tensorflow as tf
import pydot
#from attention_decoder import AttentionDecoder

Run Instructions: 
 1. Make sure the json folder (containing all the .json files) and the document "oa-ccby-40k-ids.csv" are in the same folder at the same level as this jupyter notebook
 2. Run from top to bottom!

In [22]:
# generator for loading in and getting the vocabulary set for each level
CORPUS_SIZE = 750
CORPUS_SIZE_WITH_TESTING = 1000
def data_generator_overall_vocab(papers_ids):
    json_file_ids = pd.read_csv(papers_ids)
    file_ids = np.array(json_file_ids)
    X_batch, y_batch = [],[]
    X_vocab = []
    X_sentences = []
    index = 0
    while True:
        X_batch, y_batch = [], []
        s = "json/" + str(file_ids[index][0]) + ".json"
        data = json.load(open(s))
        sentences = []
        # grabs all the sentences in the body text
        for key, value in data.items():
            if key == "body_text":
                for v in value:
                    sentences.append(v["sentence"])
        sentences = np.array(sentences)
        #preprocessing
        # makes everything lowercase and removes punctuation
        for i in range(0, sentences.size):
            sentences[i] = sentences[i].lower()
            sentences[i] = re.sub(r'[^\w\s]', '', sentences[i])
        X_sentences = np.array([sentences[i] for i in range(0, sentences.size)])
        # transform sentences into a list of words
        X_batch = np.array([np.array(sentences[i].split(" ")) for i in range(0, sentences.size)])
        try:
            # get the abstract data
            y_sentences = sent_tokenize(data["abstract"])
            for i in range(0, len(y_sentences)):
                # preprocessing
                y_sentences[i] = y_sentences[i].lower()
                y_sentences[i] = re.sub(r'[^\w\s]', '', y_sentences[i])
            # splits each sentence  into a list of words
            y_batch = np.array([np.array(sentence.split(" ")) for sentence in y_sentences])
        except:
            yield
        # gets all the words used
        X_corpus = []
        for l in X_batch:
            for t in l:
                X_corpus.append(t)
        index = index + 1
        # np.unique(X_corpus) returns the vocabulary set in this document
        yield X_batch, y_batch, X_sentences, np.unique(X_corpus)

In [29]:
vocab_generator = data_generator_overall_vocab("oa-ccby-40k-ids.csv")
# X_train- the data in the form of np.array(np.array(np.array)) where the layers are document-sentence-word from out to in
# y_train - the data is in the same format as X_train
# documents - the data is in the form of np.array(np.array()) where the layers are document-sentence
# sentence_dump - a list of all the sentences in the corpus
X_train, y_train, documents, sentence_dump = np.empty(CORPUS_SIZE, dtype = object), np.empty(CORPUS_SIZE, dtype = object), np.empty(CORPUS_SIZE, dtype = object), []
X_test, y_test, documents_test, sentence_dump_test = np.empty(CORPUS_SIZE_WITH_TESTING - CORPUS_SIZE, dtype = object), np.empty(CORPUS_SIZE_WITH_TESTING - CORPUS_SIZE, dtype = object), np.empty(CORPUS_SIZE_WITH_TESTING - CORPUS_SIZE, dtype = object), []

# iterates through the given document
# params: index- document id
# start_v: the current set of vocabulary before processing this document
#
def iterate(index, start_v):
    x_b, y_b, x_s, v = next(vocab_generator)
    X_train[index] = x_b
    y_train[index] = y_b
    documents[index] = x_s
    for s in x_s:
        sentence_dump.append(s)
    return np.union1d(start_v, v)

def iterate_test(index):
    x_b, y_b, x_s, v = next(vocab_generator)
    X_test[index] = x_b
    y_test[index] = y_b
    documents_test[index] = x_s
    for s in x_s:
        sentence_dump_test.append(s)
    pass
# goes through all the data in the corpus
x_1, y_1, x_s1, vocab = next(vocab_generator)
X_train[0] = x_1
y_train[0] = y_1
documents[0] = x_s1
num_fails = 0
for i in range(1,CORPUS_SIZE):
    try:
        vocab = iterate(i, vocab)
    except:
        continue
for i in range(0, CORPUS_SIZE_WITH_TESTING - CORPUS_SIZE):
    try:
        iterate(i,vocab)
    except:
        continue
# after running this cell, X_train, y_train, documents, sentence

In [30]:
# gets rid of the null training documents
index = 0
for X in X_train:
    X = np.array(X)
    if X.size <= 1:
        X_train = np.delete(X_train, i)
    else:
        index += 1
REFINED_CORPUS_SIZE = X_train.size

In [5]:
# adds zeros of embedding size to the vocab words not in the current model
def fill_in_blanks(vocab, word2vec_model):
    for v in vocab:
        try:
            word2vec_model.wv[v]
        except:
            word2vec_model.wv[v] = np.zeros(200)
    return word2vec_model


TF-IDF approach to weighting the words

In [6]:
# TF-IDF Manual Approach For Attention Vector
# gets the overall counts of all the documents in the corpus
def word_count_dict(vocab, data):
    count_dict = {}
    for v in vocab:
        count_dict[v] = 0
    index = 0
    for X in data:
        X = np.array(X)
        if X.size <= 1:
            continue
        for arr in X:
            arr = np.array(arr)
            for token in arr:
                try:
                    count_dict[token] = count_dict[token] + 1
                except:
                    continue
        index += 1
    return count_dict

In [7]:
# gets the frequency of all terms in the selected paper
def term_frequency(counter, data, index):
    term_dict = {}
    total_count = 0
    if data[index] == None:
        return term_dict
    for arr in data[index]:
        arr = np.array(arr)
        for token in arr:
            total_count = total_count + 1
            
    for c in counter:
        term_dict[c] = counter[c] / total_count
    
    return term_dict, total_count

In [8]:
# gets the log inverse document appearance of a all tokens in the document
def inverse_term_frequency(counter, data):
    inverse_dict = {}
    for c in counter.keys():
        inDoc = 0
        for doc in data:
            if doc == None:
                continue
            if any(c in x for x in np.array(doc)):
                inDoc += 1
            # to smooth the data, if it does not occur, say it occurred once
        if inDoc == 0:
            inverse_dict[c] == math.log(REFINED_CORPUS_SIZE/1)
        else:
            inverse_dict[c] = math.log(REFINED_CORPUS_SIZE / inDoc)

    return inverse_dict

In [9]:
# gets the overall tf_idf weights for the words in the vocabulary
def tf_idf(vocab, data, index):
    tf_idf = {}
    counter = word_count_dict(vocab, data)
    term_freq = term_frequency(counter, data, index)
    inverse_term_freq = inverse_term_frequency(counter, data)
    for term in vocab:
        tf_idf[term] = term_freq[term] * inverse_term_freq[term]
    return td_idf

In [10]:
# MergeSort for getting top 10,000 words, as I was getting a truth value error that I could not find a fix for in the built-in functions
def merge(l, r):
    n = len(l) + len(r)
    A = l
    for key, value in r.items():
        A[key] = value 
    keys_A = list(A.keys())
    keys_r = list(r.keys())
    keys_l = list(l.keys())
    j = 0
    k = 0
    for i in range(0, n):
        if (j > len(l)):
            keys_A[i] = keys_r[k]
            A[keys_A[i]] = r[keys_r[k]]
            k += 1
        elif (k > len(l)):
            keys_A[i] = keys_l[j]
            A[keys_A[i]] = l[keys_l[j]]
            j += 1
        elif (l[keys_l[j]] <= r[keys_r[k]]):
            keys_A[i] = keys_l[j]
            A[keys_A[i]] = l[keys_l[j]]
            j += 1
        else:
            keys_A[i] = keys_r[k]
            A[keys_A[i]] = r[keys_r[k]]
            k += 1
    return A
            
# trims the vocab down to the top ten thousand words
# uses a MergeSort Algorithm
def trimVocab(counter):
    if (len(counter) == 1):
        return counter
    right_side = dict(list(counter.items())[len(counter)//2:])
    left_side = dict(list(counter.items())[:len(counter)//2])
    left_side = trimVocab(left_side)
    right_side = trimVocab(right_side)
    
    counter = merge(left_side, right_side)
    ten_thousand_most_common = dict(list(counter.items())[:10000])
    return ten_thousand_most_common

In [11]:
# the counter dictionary (not a Counter object, wasn't working as well in other functions)
counter = word_count_dict(vocab,X_train)

In [12]:
# alternative method that I realized worked after implementing MergeSort
# quicker than my implementation, so I switched it over
res = dict(list(sorted(counter.items(), key = lambda x: x[1], reverse = True))[:10000])
res_k = list(res.keys())
trimmed_vocab = list(res.keys())
trimmed_vocab.append("UNK")
trimmed_vocab = np.array(trimmed_vocab)

In [13]:
# counts the number of words in a given Document
def docCount(data, index):
    total_count = 0
    if data[index] == None:
        return 0
    for arr in data[index]:
        arr = np.array(arr)
        for token in arr:
            total_count += 1
    
    return total_count

In [14]:
# maps the word to index and vice versa, for converting words to a numerical categorical value
def mapWordToIndex(vocab):
    w_t_i = {}
    i_t_w = {}
    w_t_i["UNK"] = 0
    i_t_w[0] = "UNK"
    index = 1
    for v in vocab:
        w_t_i[v] = index
        i_t_w[index] = v
        index += 1
        
    return w_t_i, i_t_w

# maps the word and indices from mapWordToIndex to the Word2Vec Embeddings
def mapToEmbedding(i_t_w, word2vec, vocab_size):
    i_t_e = {}
    w_t_e = {}
    for i, w in i_t_w.items():
        if w == "UNK":
            i_t_e[i] = np.zeros(200)
            w_t_e[w] = np.zeros(200)
        else:
            i_t_e[i] = word2vec.wv[w]
            w_t_e[w] = word2vec.wv[w]
    return i_t_e, w_t_e

# creates an embedding matrix of size (vocab_size, embedding_size)
# needed to put in the embedding_intializer parameter in the keras Embedding Layer for the RNN
def createEmbeddingMatrix(vocab_size, embedding_size, i_t_e):
    embedding_matrix = np.zeros((vocab_size + 1, embedding_size))
    for i, e in i_t_e.items():
        embedding_matrix[i] = e
    return embedding_matrix    

In [15]:
# splits all the sentences into a simple 2d array to process in word2vec
split_sentences = np.array([sentence.split(" ") for sentence in sentence_dump])
word2vec_model = Word2Vec(sentences = split_sentences, sg = 1, window = 5, size = 200, min_count = 1)
word2vec_model = fill_in_blanks(vocab, word2vec_model)

prepared


In [16]:
# use word2vec as a model input
w_t_i, i_t_w = mapWordToIndex(trimmed_vocab)
i_t_e, w_t_e = mapToEmbedding(i_t_w, word2vec_model, trimmed_vocab.size)
embedding_matrix = createEmbeddingMatrix(trimmed_vocab.size, 200, i_t_e)
#word2vec_model = Word2Vec(sentences = sentences_dump, sg = 1, window = 5, size = 200, min_count = 1)
embedding_matrix.shape

(10002, 200)

## RNN
### Pointer Generator Model

In [17]:
# helper function for RNN, where most of the action happens
# for each individual document
def add_new_word(word_data, cur_sum, inputs_src, pos):
    vocab_size = trimmed_vocab.size + 1
    # Average abstract length is 150-250 words in length, so I thought 200 words would be a good length.
    sum_txt_length = pos
    # source side for Hidden Layer W
    
    # overloaded my application memory even with one epoch
    src_embedding = Embedding(vocab_size, 200, embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix), trainable=False)(inputs_src)
    src_hidden_layer = LSTM(200)(src_embedding)
    #sum side for Hidden Layer U
    
    # did not use pre-trained word embeddings as this is supposed to take into account the already used words in the summary 
    inputs_cur_sum = Input(shape=(sum_txt_length,))
    cur_sum_embedding = Embedding(vocab_size, 200)(inputs_cur_sum)
    cur_sum_hidden_layer = LSTM(200)(cur_sum_embedding)
    #decoder side for Hidden Layer V
    attention_result = Attention()([src_hidden_layer, cur_sum_hidden_layer])
    decoder = tf.concat([attention_result, cur_sum_hidden_layer], 1)
    decoded = Dense(vocab_size, activation='softmax')(decoder)
    
    return decoded

### Encoder-Decoder with Attention

In [18]:
# helper function for the RNN, though most of the work is done here
def add_new_word_simple(word_data, cur_sum, inputs_src, pos):
    vocab_size = trimmed_vocab.size + 1
    # Average abstract length is 150-250 words in length, so I thought 200 words would be a good length.
    sum_txt_length = pos
    
    # source side for Hidden Layer W
    src_embedding = Embedding(vocab_size, 200, embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix), trainable=False)(inputs_src)
    src_hidden_layer = LSTM(200)(src_embedding)
    decoded = Dense(vocab_size, activation='sigmoid')(src_hidden_layer)
    #attention layer
    # the distribution is the TF-IDF for this document
    
#     attention_dist = tf_idf(trimmed_vocab, X_train, 0)
#    attention_result = Attention()([decoded, ])
    simple_model = Model(inputs=inputs_src, outputs = decoded)

    simple_model.compile(optimizer='adam', loss='categorical_crossentropy')
    return decoded

### Function that creates the RNN

In [31]:
# creaes the RNN, compiles it, and returns the model
def document_summarize(sum_length, article_choice, simple):
    model = Model()
    inputs_src = Input(shape=(7000,))
    cur_sum = Input(shape=(None,))
    output_sum = cur_sum
    if simple:
        output_sum = tf.concat([output_sum,add_new_word_simple(X_train, output_sum, inputs_src, i)], axis = 1)
    else:
        output_sum = add_new_word(X_train, output_sum, inputs_src, i)
    if simple:
        model = Model(inputs=inputs_src, outputs = output_sum)
    else:
        sum_len = Input(shape=(sum_length,))
        model = Model(inputs=[inputs_src,cur_sum], outputs = output_sum)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

### The next cell throws an error as the models do not compile properly

In [20]:
# Pointer Generator
model = document_summarize(20, 0, False)
simple_model = document_summarize(20,0,True)

ValueError: Graph disconnected: cannot obtain value for tensor Tensor("input_3:0", shape=(None, 999), dtype=float32) at layer "embedding_1". The following previous layers were accessed without issue: []

### Generates the batch data for the step in the epoch of training the RNN

In [None]:
def generateStepper(X_train, y_train, vocab, simple):
    index = 0
    # will pass the data as indices
    w_t_i, i_t_w = mapWordToIndex(vocab)
    while True:
        while (X_train[index] == None) or (y_train[index] == None):
            index+=1
        X_start = X_train[index]
        X_batch = []
        src_txt_length = docCount(X_train, index)
        req_length = 0
        # creates the X_batch data
        for sents in X_start:
            sents = list(sents)
            if req_length >= 7000:
                break
            for token in sents:
                if req_length >= 7000:
                    break
                req_length += 1
                try:
                    new_input = w_t_i[token]
                except:
                    new_input = w_t_i["UNK"]
                X_batch.append(new_input)
        while req_length < 7000:
            req_length += 1
            X_batch.append(0)
        # creates the y_true value
        y_src = y_train[index]
        y_batch = []
        index = 0
        for y_sent in y_src:
            y_sent = list(y_sent)
            if index >= 20:
                break
            for token in y_sent:
                if index >= 20:
                    break
                index += 1
                try:
                    y_batch.append(w_t_i[token])
                except:
                    y_batch.append(0)
        index += 1
        # as of now: the output is also the input to the pointer generator
        # I know that is wrong. It should be an array that start with a special initializer token 
        # and after each step it should add the newly generated value
        if simple:
            yield np.array(X_batch), np.array(y_batch)
        else:
            yield [np.array(X_batch), np.array(y_batch)], np.array(y_batch)
        X_start = []

In [None]:
simple_model.summary()

In [None]:
model.summary()

In [None]:
# GOAL TF-IDF to re-weight the embeddings between steps/epochs
document_data_generator = generateStepper(X_train, y_train, trimmed_vocab, True)
simple_model.fit(document_data_generator, steps_per_epoch = 1, epochs = 1)

In [None]:
# Needed Steps (Unfinished)
# Actually translating to words
#l, y = next(document_data_generator)
#model.predict(l[0])
# post-processing step
# combination - get the top 200 from both and find where they overlap and use overlap. Might need to find larger sets
#2. linear scaling - multiply together after smoothing tfidfs

Sources:
1. https://machinelearningmastery.com/gentle-introduction-text-summarization/
2. https://stackoverflow.com/questions/28373282/how-to-read-a-json-dictionary-type-file-with-pandas
3. https://towardsdatascience.com/recurrent-neural-networks-rnns-3f06d7653a85
4. http://www.abigailsee.com/2017/04/16/taming-rnns-for-better-summarization.html
5. https://www.scribbr.com/apa-style/apa-abstract/
6. https://machinelearningmastery.com/data-preparation-variable-length-input-sequences-sequence-prediction/