In [1]:
import numpy as np
import pandas as pd

## Load Corpus

We will use the *Reuters-21578 Text Categorization Collection* from NLTK

In [3]:
from keras.datasets import reuters

In [4]:
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader

In [5]:
# Rebind `reuters` to an NLTK corpus reader; this replaces the object of the
# same name imported from keras.datasets in an earlier cell.
reuters = LazyCorpusLoader(
    'reuters',
    CategorizedPlaintextCorpusReader,
    '(training|test).*',
    cat_file='cats.txt',
    encoding='ISO-8859-2',
)

In [6]:
# List the corpus file ids once and reuse them for every column.
file_ids = reuters.fileids()
df = pd.DataFrame({
    'id': file_ids,
    'text': [reuters.raw(fid) for fid in file_ids],
    'cats': [reuters.categories(fid) for fid in file_ids],
})

In [7]:
df.head()

Unnamed: 0,id,text,cats
0,test/14826,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,[trade]
1,test/14828,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,[grain]
2,test/14829,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,"[crude, nat-gas]"
3,test/14832,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,"[corn, grain, rice, rubber, sugar, tin, trade]"
4,test/14833,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,"[palm-oil, veg-oil]"


In [8]:
# True when the part of the file id before the first '/' is exactly 'test'.
df['test'] = df['id'].str.split('/').str[0].eq('test')

In [9]:
df.head()

Unnamed: 0,id,text,cats,test
0,test/14826,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,[trade],True
1,test/14828,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,[grain],True
2,test/14829,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,"[crude, nat-gas]",True
3,test/14832,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,"[corn, grain, rice, rubber, sugar, tin, trade]",True
4,test/14833,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,"[palm-oil, veg-oil]",True


## Get Tokens & Text Features

In [10]:
import nltk

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

Split off the text title

In [12]:
# The title is everything that precedes the first line break of the raw text.
df['title'] = df['text'].map(lambda raw: raw.partition('\n')[0])

In [15]:
def _extract_body(raw):
    """Return the text after the first line, lower-cased, with whitespace runs collapsed."""
    _, *rest = raw.split('\n')
    merged = "".join(rest).lower()
    return ' '.join(merged.split())


df['body'] = df['text'].apply(_extract_body)

In [16]:
def _drop_numeric_tokens(text):
    """Tokenize `text` and re-join it with single spaces, leaving out all-digit tokens."""
    kept = [tok for tok in word_tokenize(text) if not tok.isdigit()]
    return ' '.join(kept)


df['body'] = df['body'].apply(_drop_numeric_tokens)

Tokenize sentences

In [23]:
# One list of sentence strings per document.
df['sentences'] = df['body'].map(sent_tokenize)

In [24]:
# Number of sentences found in each document.
df['n_sentences'] = df['sentences'].map(len)

In [25]:
# Token count of the longest sentence in each document.
# `default=0` keeps a document with an empty sentence list from raising
# ValueError, which max() does on an empty sequence.
df['max_sen_length'] = df['sentences'].apply(
    lambda sents: max((len(word_tokenize(s)) for s in sents), default=0)
)

In [26]:
def _count_tokens(text):
    """Return the number of word tokens in `text`."""
    return len(word_tokenize(text))


df['text_length'] = df['body'].map(_count_tokens)

We will need these properties to use as matrix dimensions in the network

In [27]:
MAX_TEXT_LENGTH = df['text_length'].max()

In [28]:
MAX_NUM_SENTS = df['n_sentences'].max()

In [29]:
MAX_SEN_LENGTH = df['max_sen_length'].max()

In [30]:
print('Max text length in words: {}\nMax sent length in words: {}\nMax No sentences: {}'
      .format(MAX_TEXT_LENGTH, MAX_SEN_LENGTH, MAX_NUM_SENTS))

Max text length in words: 1661
Max sent length in words: 428
Max No sentences: 51


Pad the sentences for use in the attention network

In [31]:
def _pad_sentence_list(sents):
    """Return `sents` followed by enough empty strings to reach MAX_NUM_SENTS entries."""
    missing = MAX_NUM_SENTS - len(sents)
    return sents + [''] * missing


df['pad_sentences'] = df['sentences'].apply(_pad_sentence_list)

### Labels

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer

In [33]:
one_hot = MultiLabelBinarizer()

In [34]:
oh_labels = one_hot.fit_transform(df['cats'])

In [35]:
N_LABELS = oh_labels.shape[1]

## Attention Network

In [36]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input
from keras.models import Model

In [37]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df['body'])

In [38]:
def text_processer(raw_texts, maxlen=100):
    """Turn raw strings into integer id sequences and hand them to `pad_sequences`.

    Args:
        raw_texts: Iterable of strings, encoded with the notebook-level `tokenizer`.
        maxlen: Forwarded to `pad_sequences` as its `maxlen` argument.

    Returns:
        Whatever `pad_sequences` returns for the encoded texts.
    """
    return pad_sequences(tokenizer.texts_to_sequences(raw_texts), maxlen=maxlen)

### Import GLOVE Embeddings

In [39]:
# Maps each token in the GloVe file to its vector (float32 array).
embeddings_index = {}

# The context manager closes the file even if parsing fails part-way through.
# The encoding is spelled out so decoding does not depend on the platform default.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no token to index; skip it instead of raising IndexError.
            continue
        # First field is the token, the remaining fields are the vector components.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [40]:
# One row per tokenizer index, plus an extra row for index 0.
# Rows of tokens that have no entry in `embeddings_index` stay all-zero.
vocab_rows = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 100))

for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [41]:
embedding_matrix.shape

(27545, 100)

## Network

In [44]:
from sklearn.model_selection import train_test_split
import keras.backend as K
from keras.layers import Layer, GRU, TimeDistributed, Bidirectional, Dense
from keras import initializers

Build a data set in the format DOCUMENTS x SENTENCES x WORDS

In [45]:
sen_data = np.stack(df['pad_sentences'].apply(lambda x: text_processer(x, MAX_SEN_LENGTH)).values)

In [46]:
sen_data.shape

(10788, 51, 428)

In [47]:
# Split the raw text, the encoded sentences and the labels with the same shuffle.
# A fixed `random_state` makes the partition identical on every run, so reported
# metrics and the examples inspected later are reproducible.
txt_train, txt_test, x_train, x_test, y_train, y_test = train_test_split(
    df['body'].values, sen_data, oh_labels, random_state=42
)

This network code was adapted from https://github.com/richliao/textClassifier

In [48]:
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes::

        u_t = tanh(x_t . W + b)
        a_t = exp(u_t . u) / (sum_s exp(u_s . u) + epsilon)

    and returns ``sum_t a_t * x_t`` with shape ``(batch, features)``.
    When an input mask is supplied, masked steps get a weight of zero.

    Args:
        attention_dim: Width of the hidden projection ``W`` and of the context vector ``u``.
        **kwargs: Forwarded unchanged to ``Layer.__init__`` (for example ``name``).
    """

    def __init__(self, attention_dim, **kwargs):
        # Run the base initialiser first so that it cannot overwrite the
        # attributes assigned below (notably `supports_masking`).
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the weights W (features x attention_dim), b (attention_dim,) and u (attention_dim x 1)."""
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer; creation order
        # (W, b, u) is the order get_weights() reports them in.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return no mask: the output has no time axis left for a per-step mask to describe."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # x   : [batch_size, seq_len, features]
        # W   : [features, attention_dim]
        # u   : [attention_dim, 1]
        # uit = tanh(xW + b) -> [batch_size, seq_len, attention_dim]
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)  # [batch_size, seq_len]

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # Normalise over the time axis; epsilon guards against division by zero.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        """The time axis is summed away, leaving (batch, features)."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Return the layer config including `attention_dim`, so the layer can be re-created from it."""
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config

In [49]:
embedding_layer = Embedding(len(tokenizer.word_index) + 1, 100,
                            weights=[embedding_matrix],
                            input_length=MAX_SEN_LENGTH,
                            trainable=False, name='glove_embedding')

In [50]:
# Sentence encoder: takes one sentence as MAX_SEN_LENGTH integer word ids and
# produces a single vector for it.
sentence_input = Input(shape=(MAX_SEN_LENGTH,), dtype='int32', name='sentence_input')
embedded_sequences = embedding_layer(sentence_input)
# NOTE: despite the `l_lstm` name, the recurrent layer is a bidirectional GRU.
# return_sequences=True keeps one output per word so attention can weight them.
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
# Attention pooling collapses the per-word outputs into one vector per sentence.
l_att = AttLayer(100)(l_lstm)

sentEncoder = Model(sentence_input, l_att)








In [51]:
# Document-level model: input is MAX_NUM_SENTS sentences of MAX_SEN_LENGTH word ids each.
review_input = Input(shape=(MAX_NUM_SENTS, MAX_SEN_LENGTH), dtype='int32', name='review_input')
# Apply the same sentence encoder to every sentence of the document.
review_encoder = TimeDistributed(sentEncoder)(review_input)
# NOTE: despite the `l_lstm_sent` name, this is a bidirectional GRU over the sentence vectors.
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
# Second attention layer pools the per-sentence outputs into one document vector.
l_att_sent = AttLayer(100)(l_lstm_sent)
# One independent sigmoid output per label.
preds = Dense(N_LABELS, activation='sigmoid')(l_att_sent)

model = Model(review_input, preds)

In [52]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [53]:
sentEncoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_input (InputLayer)  (None, 428)               0         
_________________________________________________________________
glove_embedding (Embedding)  (None, 428, 100)          2754500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 428, 200)          120600    
_________________________________________________________________
att_layer_1 (AttLayer)       (None, 200)               20200     
Total params: 2,895,300
Trainable params: 140,800
Non-trainable params: 2,754,500
_________________________________________________________________


In [54]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
review_input (InputLayer)    (None, 51, 428)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 51, 200)           2895300   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 51, 200)           180600    
_________________________________________________________________
att_layer_2 (AttLayer)       (None, 200)               20200     
_________________________________________________________________
dense_1 (Dense)              (None, 90)                18090     
Total params: 3,114,190
Trainable params: 359,690
Non-trainable params: 2,754,500
_________________________________________________________________


In [None]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=50)

In [None]:
def test_example(n):
    """Print test document `n`, its true labels and the four highest-scoring predicted labels.

    Args:
        n: Row index into `x_test`, `y_test` and `txt_test`.
    """
    p = model.predict(x_test[n:n+1])[0]
    # `one_hot` is the binarizer that produced the label matrix, so its
    # `classes_` line up with the columns of both `y_test` and the predictions.
    top = sorted(zip(one_hot.classes_, p), key=lambda x: x[1], reverse=True)
    print(txt_test[n], '\n')
    print('Tagged as:', one_hot.classes_[np.nonzero(y_test[n])], '\n')
    for label, score in top[:4]:
        print("{:25}:{:03.4f}".format(label, score))

    print('=========')

### Attention Extraction

In [None]:
# Sub-model that exposes the output of sentEncoder.layers[2], the layer whose
# output is fed to the sentence-level attention layer.
hidden_out_model = Model(sentEncoder.input, sentEncoder.layers[2].output)
# Weights of the last layer of the sentence encoder, in get_weights() order.
word_ctx_0 = sentEncoder.layers[-1].get_weights()[0]
word_ctx_1 = sentEncoder.layers[-1].get_weights()[1]
word_ctx_2 = sentEncoder.layers[-1].get_weights()[2]

def word_importance(n):
    """Extract tokens and corresponding attention weight from an indexed test message.

    Args:
        n: Row index into `x_test`.

    Returns:
        A list with one ``[tokens, weights]`` pair per non-empty sentence, where
        `tokens` is a list of strings and `weights` a 1-D array of equal length.
    """
    example = x_test[n]
    # Drop sentences that consist of zeros only.
    example = example[~np.all(example == 0, axis=1)]
    word_enc = hidden_out_model.predict(example)

    # Recompute the unnormalised attention scores: exp(tanh(h.W + b).u)
    u_watt = np.exp(np.dot(np.tanh(np.dot(word_enc, word_ctx_0) + word_ctx_1), word_ctx_2)[:, :, 0])
    # L2-normalise every row with NumPy; the `normalize` helper called here
    # before is not imported in any visible cell. exp() is strictly positive,
    # so the row norm cannot be zero.
    u_watt = u_watt / np.linalg.norm(u_watt, axis=1, keepdims=True)

    wrd_wts = []

    for i, j in enumerate(example):
        wrds = np.trim_zeros(j)
        toks = [tokenizer.sequences_to_texts([wrds[k:k+1]])[0] for k in range(wrds.shape[0])]
        # Keep only the weights at the end of the row, one per remaining token.
        wrd_wts.append([toks, u_watt[i, -len(toks):]])

    return wrd_wts

In [None]:
def plot_attention(n):
    """Plot a bar chart of normalized attention weights with corresponding text labels.

    One subplot is drawn per sentence returned by `word_importance(n)`. Nothing
    is drawn when that list is empty.

    Args:
        n: Row index into `x_test`, forwarded to `word_importance`.
    """
    # Local import: pyplot is not imported in any visible cell of the notebook.
    import matplotlib.pyplot as plt

    wts = word_importance(n)
    if not wts:
        # No sentences to show; max() and plt.subplots() below would fail on an empty list.
        return

    width = max(len(toks) for toks, _ in wts)
    positions = list(range(width))
    # squeeze=False always returns a 2-D array of Axes, so documents with one
    # sentence and with many sentences share a single code path.
    f, axes = plt.subplots(len(wts), sharex=True, sharey=True,
                           figsize=(15, 2 * len(wts)), squeeze=False)

    for ax, (toks, weights) in zip(axes[:, 0], wts):
        # Pad shorter sentences with zero-height bars so all rows share the x positions.
        heights = weights.tolist() + [0] * (width - weights.shape[0])
        rects = ax.bar(positions, heights, color='0.8')
        ax.set_xticks([])
        ax.set_yticks([])
        # Write each token centred over its bar.
        for rect, tok in zip(rects, toks):
            ax.text(rect.get_x() + rect.get_width() / 2., 0.5, tok, ha='center', va='bottom')

    plt.show()