To-Do: 
- Switch to 100d pretrained embeddings (reduce parameters) 
- Decoder layers to incorporate 4 hidden layers
- Optimization: Adam, RMSProp? What's the difference? 
- Incorporate translation of decoder at the very end 
- How to incorporate ROUGE metric into accuracy 
- Split into train/validation/test such that we are consistent across the board when testing different models 
- research: how to save the models and keep them such that their training history + loss can be evaluated? 

In [1]:
# Import necessary packages 

import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import collections

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# useful references: 
# https://github.com/PacktPublishing/Hands-On-Natural-Language-Processing-with-Python/blob/master/Chapter08/02_example.ipynb

## I. PreProcessing data 

In [2]:
sepdata = pd.read_csv('wikihowSep.csv')[['headline','text']]

In [3]:
sepdata.head()
# We only care about 'headline' and 'text' 

Unnamed: 0,headline,text
0,\nSell yourself first.,"Before doing anything else, stop and sum up y..."
1,\nRead the classics before 1600.,Reading the classics is the very first thing ...
2,\nJoin online artist communities.,Depending on what scale you intend to sell yo...
3,\nMake yourself public.,Get yourself out there as best as you can by ...
4,\nBlog about your artwork.,"Given the hundreds of free blogging websites,..."


In [4]:
sepdata_v1 = sepdata.dropna(axis=0).reset_index(drop=True) 
# Get rid of any NA rows 

In [5]:
len(sepdata_v1)

1387290

## Clean up of text

In [6]:
import re #for regex search purposes          
from nltk.corpus import stopwords #stopwords that are provided to us via nltk 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import math

In [7]:
# List of contractions that we will map to 

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [8]:
# Changes here: 
# Do not get rid of stopwords 
# Do not get rid of short words. 

def text_cleaner(text):
    # Step 0: Convert to string in case a float or int is found.
    newString = str(text)
    # Step 1: Lower case the text 
    newString = newString.lower()
    # Step 2: Get rid of commas
    newString = re.sub(r'\([^)]*\)', "", newString)
    # Step 3: Get rid of quotations 
    #newString = re.sub('"',"", newString)
    # Step 4: get rid of contractions with our contraction mapping 
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])  
    # Step 5: get rid of the \n stuff 
    newString = re.sub(r"'s\n","",newString)
    # Step 6: anything that is a number, get rid of it 
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = newString.split()
    remaining = []
    for i in tokens: 
        if len(i)>=1: 
            remaining.append(i)
    # Step 7: Tokenize everything first and keep the words that are not stop words 
    # Also keep only words that are greater than or equal to 3 characters long 
    #tokens = [w for w in newString.split() if not w in stop_words]
    #long_words=[]
    #for i in tokens:
    #    if len(i)>=3: #removing short words
    #        long_words.append(i)   
    return (" ".join(remaining)).strip()

In [9]:
sepdata_v1['text'][0:5].apply(text_cleaner)[0]

'before doing anything else stop and sum up yourself as an artist now think about how to translate that to an online profile be it the few words twitter allows you or an entire page of indulgence that your own website would allow you bring out the most salient features of your creativity your experience your passion and your reasons for painting make it clear to readers why you are an artist who loves art produces high quality art and is a true champion of art if you are not great with words find a friend who can help you with this really important aspect of selling online the establishment of your credibility and reliability'

In [10]:
# Changes here: 
# keep all words (previously, I got rid of words that were only 1 character long) 

def headline_cleaner(text):
    # Step 0: Convert to string in case a float or int is found.
    newString = str(text)
    # Step 1: remove quotations 
    newString = re.sub('"','', newString)
    # Step 2: look up contractions 
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")]) 
    # Step 3: Get rid of the \n stuff 
    newString = re.sub(r"'s\n","",newString)
    # Step 4: Get rid of numbers or anything not in the alphabet
    newString = re.sub("[^a-zA-Z]", " ", newString)
    # Step 5: Lower case 
    newString = newString.lower()
    tokens=newString.split()
    # Step 6: keep words that are greater than 1 character long 
    remaining=[]
    for i in tokens:
        if len(i)>=1:                                 
            remaining.append(i) 
    return (" ".join(remaining)).strip()

In [11]:
sepdata_v1['headline'][0:5].apply(headline_cleaner)[0]

'sell yourself first'

In [12]:
cleaned_data = sepdata_v1['text'].apply(text_cleaner) 

In [13]:
cleaned_y = sepdata_v1['headline'].apply(headline_cleaner)

In [14]:
clean_data = pd.concat([cleaned_data, cleaned_y], axis=1)
clean_data.columns = ['text','headline']

In [15]:
clean_data.head()

Unnamed: 0,text,headline
0,before doing anything else stop and sum up you...,sell yourself first
1,reading the classics is the very first thing y...,read the classics before
2,depending on what scale you intend to sell you...,join online artist communities
3,get yourself out there as best as you can by a...,make yourself public
4,given the hundreds of free blogging websites y...,blog about your artwork


In [26]:
#####
# Functions to build encoder and decoder embeddings later 
##### 

# Building top vocab 
def count_words(words_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in words_dict:
                words_dict[word] = 1
            else:
                words_dict[word] += 1

def convert_text_to_ids(text, word2int_dict, eos=False): 
    output = []
    for item in text:
        item2int=[]
        for word in item.split(): 
            if word in word2int_dict: 
                item2int.append(word2int_dict[word]) 
            else: 
                item2int.append(word2int_dict[TOKEN_UNK])
        if eos: 
            item2int.append(word2int_dict[TOKEN_EOS])
        output.append(item2int)
    return output

TOKEN_GO = '<GO>'
TOKEN_EOS = '<EOS>'
TOKEN_PAD = '<PAD>'
TOKEN_UNK = '<UNK>'
# These are special tokens 

In [21]:
# Load pretrained embeddings and build word vector matrix 

# Vivian: originally using 300D but too many parameters...

def build_word_vector_matrix(vector_file):
    embedding_index = {}
    f = open(vector_file)
    for line in f: 
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32') 
        embedding_index[word] = coefs
    f.close() 
    return embedding_index

# Replace the path here to point to the glove.6B.50d.txt vectors file on your system
embeddings_index = build_word_vector_matrix('glove.6B.100d.txt')

In [66]:
def text_to_seq(texts, MAX_NB_WORDS, EMBEDDING_DIM, isSummary): 
    # -- Build word count dictionary: 
    word_counts_dict = {}
    count_words(word_counts_dict, texts) 
    print("Total words in Vocabulary:", len(word_counts_dict))
    
    # -- sort and return top MAX_NB_WORDS 
    sorted_x = sorted(word_counts_dict.items(), key=lambda kv: kv[1], reverse=True)
    sorted_dict_test = collections.OrderedDict(sorted_x)
    d = collections.Counter(sorted_dict_test)
    word_dict = dict(d.most_common(VOCAB_SIZE))
    
    # -- Build word to int dictionary 
    word2int = {} 
    value = 0
    for word, count in word_dict.items():
        if word in embeddings_index:
            word2int[word] = value
            value += 1
    special_codes = [TOKEN_UNK,TOKEN_PAD,TOKEN_EOS,TOKEN_GO]
    for code in special_codes:
        word2int[code] = len(word2int)
        
    # -- Build int to word dictionary 
    int2word = {}
    for word, value in word2int.items():
        int2word[value] = word
    
    # -- Build word_emb_matrix 
    word_emb_matrix = np.zeros((len(word2int), EMBEDDING_DIM), dtype=np.float32) 
    for word, i in word2int.items():
        if word in embeddings_index:
            # put in the embedding for the word 
            word_emb_matrix[i] = embeddings_index[word]
        else:
            # if word is not found, put in a random embedding 
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            word_emb_matrix[i] = new_embedding
    print("Length of word embeddings: ", len(word_emb_matrix))
    
    # -- Finally, convert doc to the entire sequence 
    if isSummary==True: 
        doc_as_ids = convert_text_to_ids(texts, word2int) #summaries do not get the EOS tag at the end 
    else: 
        doc_as_ids = convert_text_to_ids(texts, word2int, eos=True) 
    return (doc_as_ids, word_emb_matrix, word2int)

In [70]:
VOCAB_SIZE = 40000
embedding_dim = 300 

y_data, decoder_emb, y_word_index = text_to_seq(clean_data['headline'], VOCAB_SIZE, embedding_dim, isSummary = True)
X_data, encoder_emb, x_word_index = text_to_seq(clean_data['text'], VOCAB_SIZE, embedding_dim, isSummary = False)

Total words in Vocabulary: 68630
Length of word embeddings:  39475
Total words in Vocabulary: 156212
Length of word embeddings:  39896


In [71]:
encoder_emb.shape

(39896, 300)

In [72]:
len(x_word_index)

39896

In [73]:
decoder_emb.shape

(39475, 300)

In [74]:
len(y_word_index)

39475

In [75]:
# Now to deal with padding 

# Arbitrary lengths decided by exploratory work...can change this later as we see fit 
max_text_length = 100
max_summary_length = 30

def pad_sequences(textids_seq, summaryids_seq, x_word_index, y_word_index, max_text_length, max_summary_length): 
    padded_text = []
    for item in textids_seq: 
        itemlen = len(item) 
        if itemlen<max_text_length: 
            # sequence is shorter than max_text_length 
            padded_text.append(item + [x_word_index[TOKEN_PAD]]*(max_text_length - itemlen))
            #padded_text.append(np.array(item + [word2int[TOKEN_PAD]]*(max_text_length - itemlen)))
        else: 
            # sequence is longer than max_text_length 
            #sublist = item[:(max_text_length-1)]
            padded_text.append(item[:(max_text_length-1)] + [x_word_index[TOKEN_EOS]])
            #padded_text.append(np.array(item[:(max_text_length-1)] + [word2int[TOKEN_EOS]]))
    padded_summary = []
    for item in summaryids_seq: 
        itemlen = len(item)
        if itemlen<max_summary_length: 
            # sequence is shorter than max_summary_length 
            padded_summary.append(item + [y_word_index[TOKEN_PAD]]*(max_summary_length - itemlen))
            #padded_summary.append(np.array(item + [word2int[TOKEN_PAD]]*(max_summary_length - itemlen)))
        else: 
            # sequence is longer than max_summary_length 
            # also recall from previous: no EOS tag for summaries 
            padded_summary.append(item[:(max_summary_length)])
            #padded_summary.append(np.array(item[:(max_summary_length)]))
    return(np.array(padded_text), np.array(padded_summary))

X_data_padded, y_data_padded = pad_sequences(X_data, y_data,x_word_index, y_word_index, max_text_length, max_summary_length)




In [76]:
X_data_padded[0]
#print(len(X_data_padded[0])) #!00 long

array([   69,   208,   308,   394,   363,     4,  3760,    37,    86,
          21,    30,  3013,   213,   153,    41,    65,     1,  6484,
          12,     1,    30,   226,  1467,    15,    10,     0,   151,
         425,  2265,   697,     2,     9,    30,   613,   336,     6,
       19728,    12,     5,   155,   415,   127,   197,     2,   292,
          39,     0,    79, 39892,  1486,     6,     5,  4593,     5,
         318,     5,  3911,     4,     5,  1195,    11,  1847,    26,
          10,   408,     1,  2596,   405,     2,    14,    30,  3013,
          97,  3635,  1287,  4137,   203,   641,  1287,     4,     8,
           3,   996, 11312,     6,  1287,    13,     2,    14,    20,
         206,    18,   425,    73,     3,   241,    97,    16,    52,
       39894])

In [77]:
x_word_index[TOKEN_EOS] #note the EOS tag at the end 

39894

In [78]:
y_data_padded[0]

array([  976,    60,    93, 39472, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472])

In [79]:
y_data_padded[1]

array([  150,     0,  9609,    79, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472, 39472,
       39472, 39472, 39472])

In [80]:
y_word_index[TOKEN_PAD]

39472

In [102]:
y_data_padded.shape

(1387290, 30)

In [103]:
y_data_padded

array([[  976,    60,    93, ..., 39472, 39472, 39472],
       [  150,     0,  9609, ..., 39472, 39472, 39472],
       [  461,   217,  3287, ..., 39472, 39472, 39472],
       ...,
       [   45,     8,     0, ..., 39472, 39472, 39472],
       [   45,     8,     0, ..., 39472, 39472, 39472],
       [   45,     8, 39471, ..., 39472, 39472, 39472]])

## II. Build Model 

In [None]:
# Note to self: Most of these, trying to stay as close to Vivek's model naming so we can compare 

In [81]:
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, TimeDistributed, Bidirectional, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from attention_keras.layers.attention import AttentionLayer

from tensorflow.keras import backend as K 
K.clear_session() 
hidden_units = 200 #Paper mentions 600 hidden units, but we can change this 

In [82]:
len(x_word_index) # given that I have the tokens in all of my word indexes, i think I can just take len()

39896

In [83]:
enc_embedding_layer = Embedding(len(x_word_index),
                            embedding_dim,
                            weights=[encoder_emb],
                            input_length=max_text_length,
                            trainable=False,
                            name='EncoderEmbeddingLayer')

In [84]:
dec_embedding_layer = Embedding(len(y_word_index),
                            embedding_dim,
                            weights=[decoder_emb],
                            input_length=max_summary_length,
                            trainable=False,
                            name='DecoderEmbeddingLayer')

In [85]:
# Encoder 

# Encoder input 
# 2D (sequence_length, None), where sequence length is the MAX_LEN unified by padding in preprocessing
encoder_inputs = Input(shape=(max_text_length,), name="EncoderInput") 
enc_emb = enc_embedding_layer(encoder_inputs) 


#LSTM 1 
encoder_lstm1 = LSTM(hidden_units,return_sequences=True,return_state=True, name='EncLSTM1') 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) 

#LSTM 2 
encoder_lstm2 = LSTM(hidden_units,return_sequences=True,return_state=True, name='EncLSTM2') 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

#LSTM 3 
encoder_lstm3=LSTM(hidden_units, return_state=True, return_sequences=True, name='EncLSTM3') 
encoder_output3, state_h3, state_c3= encoder_lstm3(encoder_output2) 

#LSTM 4 
encoder_lstm4=LSTM(hidden_units, return_state=True, return_sequences=True, name='EncLSTM4') 
encoder_outputs, state_h, state_c= encoder_lstm4(encoder_output3)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [86]:
# Decoder 

decoder_inputs = Input(shape=(None,), name = 'DecoderInput') 
#dec_emb_layer = Embedding(y_voc_size, latent_dim,trainable=True) 
dec_emb = dec_embedding_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, name='DecLSTM1') 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
decoder_dense = TimeDistributed(Dense(len(y_word_index)+1, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input)

In [87]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       [(None, 100)]        0                                            
__________________________________________________________________________________________________
EncoderEmbeddingLayer (Embeddin (None, 100, 300)     11968800    EncoderInput[0][0]               
__________________________________________________________________________________________________
EncLSTM1 (LSTM)                 [(None, 100, 200), ( 400800      EncoderEmbeddingLayer[0][0]      
__________________________________________________________________________________________________
EncLSTM2 (LSTM)                 [(None, 100, 200), ( 320800      EncLSTM1[0][0]                   
______________________________________________________________________________________________

In [89]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [108]:
model.fit([X_data_padded[0:2000],y_data_padded[0:2000]], 
                  y_data_padded[0:2000],
                  epochs=1,
                  batch_size=50)


# history=model.fit([x_tr,y_tr[:,:-1]], y_tr.reshape(y_tr.shape[0],y_tr.shape[1], 1)[:,1:] ,epochs=50,callbacks=[es],batch_size=512, validation_data=([x_val,y_val[:,:-1]], y_val.reshape(y_val.shape[0],y_val.shape[1], 1)[:,1:]))


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tensorflow.python.keras.callbacks.History at 0x16af13048>