In [1]:
import numpy as np
import pandas as pd
#import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os


from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, SpatialDropout1D
from keras.models import Model
from keras.layers.normalization import BatchNormalization

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, constraints

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
MAX_SENT_LENGTH = 60
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2


def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


In [3]:
data_train = pd.read_csv('/Users/venkatakrishnamohansunkara/Desktop/DM/classification_data.csv', sep=',')
print(data_train.shape)

(17575, 7)


In [4]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,Content,sentiment_compound,sentiment_neu,sentiment_neg,sentiment_pos,Category
0,0,GANDHINAGAR: The state government's vaunted 'K...,0.8795,0.939,0.012,0.05,unrest
1,1,"LUCKNOW: A committed RSS pracharak since 1977,...",-0.15,0.852,0.07,0.078,unrest
2,2,"CHENNAI: As news of the violence spread, the a...",-0.9971,0.747,0.22,0.032,unrest
3,3,BHIWANDI: Three boys have been rescued from th...,-0.9919,0.844,0.119,0.037,unrest
4,4,CHANDIGARH: As chief of internal security and ...,0.8983,0.861,0.061,0.078,unrest


In [5]:
data_train.dropna(inplace=True)

In [6]:
data_train['cat_id'] = 0
for i,row in data_train.iterrows():
    if row['Category'] == 'unrest':
        data_train.set_value(i,'cat_id',0)
    else:
        data_train.set_value(i,'cat_id',1)

  after removing the cwd from sys.path.
  


In [7]:
from nltk import tokenize

reviews = []
labels = []
texts = []
i=0

for idx in data_train['Unnamed: 0']:
    i = idx
    text = BeautifulSoup(data_train.Content[idx])
    text = clean_str(text.get_text())
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(data_train.cat_id[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')




In [8]:
# texts have the entire content as an entry. (25000,1)
# reviews have a list of sentences as an entry. (25000,number_of_sentences)
# labels have the label of a document. (25000,1)

In [9]:
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            # convert sentence to words
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                # If the number of words in a sentence is less than 100 and it's index is < 20000 then assign it.
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Total 100277 unique tokens.
Shape of data tensor: (17467, 15, 60)
Shape of label tensor: (17467, 2)


In [10]:
# Every document is converted to a 3d array. 
# 1st dimension is the index of the document.
# 2nd dimension is the number of sentences in the document (Atmost 15 sentences, rest ignored)
# 3rd dimension is the number of words in each sentence (Atmost 200 words, rest ignored)
# The words are converted into indices and atmost 20000 unique words are allowed (rest ignored)
# word index is a dictionary mapping from a word to a unique index.

In [11]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in traing and validation set
[6871. 7103.]
[1704. 1789.]


In [12]:
#20000 are used for training and the remaining 5000 are used for validation.

In [13]:
GLOVE_DIR = "/Users/venkatakrishnamohansunkara/Desktop/DM/textClassifier"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [14]:
# embeddings_index is a dictionary mapping from a word to a 100 dimensional vector

In [15]:

# building Hierachical Attention network
# Embedding matrix should consist of all the unique words (which is the lenght of word index) as rows and their embeddings as columns (100 dimensions)
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
         #words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = 100*[0]

In [16]:
# Now embedding matrix consists of 81503 rows and 100 columns.


In [17]:
# First words (a sentence of 100 words) are provided to the embedding layer and are converted to 100 dim vector using the embedding matrix.
# Here the embedding matrix is learnt during the training process.
# First argument is the size of the vocabulary.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)

In [18]:
def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
    

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        #a = K.exp(ait)
        a = K.softmax(ait)
        # apply mask after the exp. will be re-normalized next
        #if mask is not None:
        #    # Cast the mask to floatX to avoid float64 upcasting in theano
        #    a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        #a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [None]:
# None is present as the first dimension which represents the number of sentences passed.
# 100 words are input 
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
# These 100 words are converted to (100,100) array using the embedding matrix.
embedded_sequences = embedding_layer(sentence_input)
# These (100,100) are input to GRU at each time-step (100 time-steps) where each time step takes (1,100) as input and
# produces (1,100) as output. As bi-directional GRU is used, the 100 from front is concatenated with 100 from back and
# a 200 dimensional vector is output at each timestep.
# This occurs for 100 timesteps. so, (100,200) is the output shape of this layer.
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
#l_tan = TimeDistributed(Dense(1,activation='tanh'))(l_lstm)
# These 100 timesteps are converted to a single vector using the attention mechanism. So, output is 200 dim vector.
l_att = AttentionWithContext()(l_lstm)
sentEncoder = Model(sentence_input, l_att)

In [19]:
input_layer = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH),dtype='int32')
sentence_input = Input(shape=(MAX_SENT_LENGTH,),dtype='int32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)(sentence_input)
drop1 = SpatialDropout1D(0.3)(embedding_layer)
sent_lstm = Bidirectional(LSTM(100, name='blstm_1',
        activation='tanh',
        recurrent_activation='hard_sigmoid',
        recurrent_dropout=0.0,
        dropout=0.4, 
        kernel_initializer='glorot_uniform',
        return_sequences=True),
        merge_mode='concat')(drop1)
#sent_lstm_1 = BatchNormalization()(sent_lstm)
sent_att_layer = AttentionWithContext()(sent_lstm)
sentEncoder = Model(sentence_input, sent_att_layer)
sentEncoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 60, 100)           10027800  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 60, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 60, 200)           160800    
_________________________________________________________________
attention_with_context_1 (At (None, 200)               40400     
Total params: 10,229,000
Trainable params: 10,229,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
sentEncoder.summary()

In [None]:
# The input to the entire network is an entire document which consists of 15 sentences and each sentence have atmost 100 words.
# So, an array of size (15,100) is passed as input.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
# Now each sentence needs to go through the sentence encoder to obtain sentence representations. So, a time distributed 
# layer is used which distributes the same network across time (each sentence).
review_encoder = TimeDistributed(sentEncoder)(review_input)
# The output of the sentence encoder network is a 200 dim vector. But as we are passing 15 sentence every time we get 
# a 15,200 dim array as output.
# Now this 15,200 is input to a bi-direction GRU. Each time step takes 1,200 as input. Why not double??
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
# Attention network converts the 15 sentences to a single representation. So, we get just a 200 dim vector.
#l_tan_sent = TimeDistributed(Dense(1,activation='tanh'))(l_lstm_sent)
l_att_sent = AttentionWithContext()(l_lstm_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

In [20]:
textEncoder = TimeDistributed(sentEncoder)(input_layer)
drop2 = Dropout(0.4)(textEncoder)

lstm_1 = Bidirectional(LSTM(100, name='blstm_2',
        activation='tanh',
        recurrent_activation='hard_sigmoid',
        recurrent_dropout=0.0,
        dropout=0.4, 
        kernel_initializer='glorot_uniform',
        return_sequences=True),
        merge_mode='concat')(drop2)
lstm_1 = BatchNormalization()(lstm_1)
att_layer = AttentionWithContext()(lstm_1)
drop3 = Dropout(0.5)(att_layer)
predictions = Dense(2, activation='softmax')(drop3)
model = Model(inputs=input_layer, outputs=predictions)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15, 60)            0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 15, 200)           10229000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 15, 200)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 15, 200)           240800    
_________________________________________________________________
batch_normalization_1 (Batch (None, 15, 200)           800       
_________________________________________________________________
attention_with_context_2 (At (None, 200)               40400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
__________

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [None]:
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)

In [None]:
# Get output before the attention layer
get_layer_output = K.function([model.layers[0].input, K.learning_phase()], [model.layers[4].output])
out = get_layer_output([x_train[0],0])  # test mode
print(out[0].shape)

In [None]:
# Get the attention weights of the sentence network.
eij = np.tanh(np.dot(out[0],model.layers[5].get_weights()[0]))
t = np.dot(eij,model.layers[5].get_weights()[2])
ai = np.exp(t)
weights = ai/np.sum(ai)

In [None]:
# 15 weights for 15 sentences.
weights.shape
#np.sum(out[0]*weights,axis=1).shape

In [21]:
# Get the attention weights of the word network for every sentence.
get_layer_op_words = K.function([sentEncoder.layers[0].input, K.learning_phase()], [sentEncoder.layers[4].output])
op_words = get_layer_op_words([x_train[0],0])
print(op_words[0].shape)                                                                                    

(15, 200)


In [22]:
weight_all_words = []
for i in range(MAX_SENTS):
    eij_words = np.tanh(np.dot(op_words[0][i],sentEncoder.layers[5].get_weights()[0]))
    t_words = np.dot(eij_words,sentEncoder.layers[5].get_weights()[2])
    ai_words = np.exp(t_words)
    weights_words = ai_words/np.sum(ai_words)
    weight_all_words.append(weights_words)

IndexError: list index out of range

In [None]:
# 60 weights for 60 words 
len(weight_all_words)

In [None]:
#weight_all_words

In [None]:
id2word = {v: k for k, v in word_index.items()}
# Iterate through the top sentences
sent_no = 0
for i in x_train[0]:
    # Iterate through the words in the sentence
    weights_words = weight_all_words[sent_no]
    sent = ''
    word_no =0
    for j in i:
        if j!=0:
            sent+=id2word[j]+' '+str(weights_words[word_no] * weights[0][sent_no])+' ' 
        word_no+=1
    print(str(weights[0][sent_no])+'---'+sent+'.')
    sent_no+=1

In [None]:
with open("visualization.html", "w") as html_file:
    html_file.write('<!DOCTYPE html>\n')
    html_file.write('<html>\n')
    html_file.write('<body>\n')
    sent_no = 0
    for i in x_train[0]:
        # Iterate through the words in the sentence
        weights_words = weight_all_words[sent_no]
        sent = ''
        word_no =0
        html_file.write('<font style="background-color: rgba(255, 0, 0, %f)">%s</font>\n' %(weights[0][sent_no],str(weights[0][sent_no])))
        for j in i:
            if j!=0:
                alpha = weights_words[word_no]*50
                word = id2word[j]
                html_file.write('<font style="background-color: rgba(0, 0, 255, %f)">%s</font>\n' % (alpha, word))
                sent+=id2word[j]+' '+str(weights_words[word_no])+' ' 
            word_no+=1
        html_file.write('<br />\n')
        print(str(weights[0][sent_no])+'---'+sent+'.')
        sent_no+=1
    html_file.write('</body>\n')
    html_file.write('</html>\n')

In [None]:
import matplotlib.pyplot as plt

In [None]:
Blues = plt.get_cmap('Blues')

In [None]:
sentEncoder.layers[5]

In [30]:
sentEncoder.layers[4]

<__main__.AttentionWithContext at 0x1a3185ca20>