In [46]:
import numpy as np
import pandas as pd

from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, LSTM, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

import os
import pickle

In [47]:
# Load the pickled "reliable" dataset. pickle.load can execute arbitrary code,
# so this path must only ever point at a trusted, locally produced file.
with open('data_final/reliable_final.pkl', mode='rb') as reliable_file:
    reliable = pickle.load(reliable_file)

In [48]:
# Load the pickled "unreliable" dataset. pickle.load can execute arbitrary code,
# so this path must only ever point at a trusted, locally produced file.
with open('data_final/unreliable_final.pkl', mode='rb') as unreliable_file:
    unreliable = pickle.load(unreliable_file)

In [49]:
# Tag each source frame with its class label (0 = reliable, 1 = unreliable);
# the label column is added in place because later cells sample from these frames.
reliable['label'] = 0
unreliable['label'] = 1

# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# Original row indices are preserved, exactly as append did.
data = pd.concat([reliable, unreliable])

In [50]:
# --- Input truncation ---
max_words = 30       # words kept per sentence
max_sentences = 30   # sentences kept per article
max_vocab = 50000    # most frequent words kept in the vocabulary

# --- Model / training hyperparameters ---
embedding_dim = 100  # dimensionality of the pretrained word vectors
attention_dim = 128  # units in the attention layer
batch_size = 128
test_size = 0.2      # fraction of the data held out by train_test_split

# --- Lookup tables, filled in by later cells ---
word_index = {}      # word -> integer index
embeddings = {}      # word -> pretrained vector

# --- Filesystem locations ---
vector_dir = './embeddings'
vector_file = 'glove.6B.100d.txt'
model_dir = './model_output/glove_100'
tb_logs = './tb_logs/glove_100'

In [51]:
# Hold out a stratified test set; it stays untouched until the model is trained.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data['content'],
    data['label'],
    test_size=test_size,
    random_state=77,
    stratify=data['label'],
)

# Labels as plain numpy arrays
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [59]:
# Count lower-cased word frequencies over the training articles only.
# (The word -> index mapping itself is derived from these counts in a later cell.)

from collections import Counter  # the original trailing comma here was a SyntaxError

words = Counter()
for article in x_train:
    for sentence in article:
        # each sentence is iterated word by word; case is folded before counting
        words.update(word.lower() for word in sentence)

# Persist the raw counts so the vocabulary can be rebuilt without re-scanning the corpus.
with open('data_final/words.pkl', 'wb') as outfile:
    pickle.dump(words, outfile)

An embedding matrix must be constructed so that the embedding layer can look up and use the vector associated with each word. As an intermediary step, the words are first replaced with their respective indices in the word index. The embedding matrix uses these same indices for the lookup.

Only the *n* most common words are kept in the word index. Because both the data matrices and the embedding matrix are pre-populated with zeros, a word outside the *n*-word vocabulary is encoded as index 0, and a vocabulary word that is not found in the set of pretrained embedding vectors starts out as a row of zeros in the embedding matrix. Index 0 is masked by the embedding layer, and the GRU and attention layers support masking, which avoids having to process the padded and out-of-vocabulary positions.

Trial runs to ensure that the network is working properly and to determine learning rate bounds (to be used with a cyclic learning rate approach) will be conducted with a small subset of the training data comprising 30,000 articles.

In [78]:
# Number the max_vocab most frequent words from 1 upwards;
# index 0 stays reserved for masking out-of-vocab words.
for rank, (token, _count) in enumerate(words.most_common(max_vocab), start=1):
    word_index[token] = rank

In [80]:
def create_data_matrix(data, max_sentences=max_sentences, max_words=max_words, max_vocab=max_vocab,
                      word_index=word_index):
    """Encode articles as a zero-padded integer tensor of word indices.

    Args:
        data: Sized iterable of articles; each article is an iterable of
            sentences and each sentence an iterable of word strings.
        max_sentences: Sentences kept per article; extra sentences are dropped.
        max_words: Words kept per sentence; extra words are dropped.
        max_vocab: Largest word index that is kept; words with a higher index
            are left as 0.
        word_index: Mapping from lower-cased word to integer index (>= 1).

    Returns:
        int32 array of shape (len(data), max_sentences, max_words). Positions
        that are padding, or whose word is unknown / beyond max_vocab, hold 0.
    """
    data_matrix = np.zeros((len(data), max_sentences, max_words), dtype='int32')
    for i, article in enumerate(data):
        # zip against a range truncates at the limit without requiring slicing support
        for j, sentence in zip(range(max_sentences), article):
            for k, word in zip(range(max_words), sentence):
                ix = word_index.get(word.lower())
                # Word indices run from 1 to max_vocab inclusive (0 is the mask),
                # so the comparison must be inclusive or the last word is lost.
                if ix is not None and ix <= max_vocab:
                    data_matrix[i, j, k] = ix
    return data_matrix

In [81]:
# Balanced 30,000-article subset for trial runs. The draws are seeded so the subset
# is reproducible, and pd.concat replaces DataFrame.append (removed in pandas 2.0).
sample_data = pd.concat([
    reliable.sample(15000, random_state=77),
    unreliable.sample(15000, random_state=77),
])

In [82]:
# Stratified split of the trial subset, using the same seed and test fraction as the full split.
sample_x_train, sample_x_test, sample_y_train, sample_y_test = train_test_split(
    sample_data['content'],
    sample_data['label'],
    test_size=test_size,
    random_state=77,
    stratify=sample_data['label'],
)

In [83]:
# Encode both halves of the trial subset as padded word-index tensors.
sample_train_matrix, sample_test_matrix = (
    create_data_matrix(articles) for articles in (sample_x_train, sample_x_test)
)

In [95]:
# One-hot encode the trial-subset labels to match the 2-unit softmax output.
# NOTE: this overwrites its own inputs, so the cell must be run exactly once per split.
sample_y_train = np.asarray(to_categorical(pd.DataFrame(sample_y_train)))
sample_y_test = np.asarray(to_categorical(pd.DataFrame(sample_y_test)))

In [84]:
# Read the pretrained vectors: each line is a word followed by its float components.
# The encoding is given explicitly because the platform default is not always UTF-8.
with open(os.path.join(vector_dir, vector_file), encoding='utf-8') as vectors:
    for line in vectors:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        weights = np.asarray(values[1:], dtype='float32')
        embeddings[word] = weights

In [85]:
# One row per indexed word plus row 0 for the masking index. Sizing from word_index
# keeps the row count equal to the Embedding layer's input_dim (len(word_index) + 1)
# even when the vocabulary does not contain exactly max_vocab words.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # words not found in the pretrained embeddings keep their all-zero row
        embedding_matrix[i] = embedding_vector

In [86]:
# Construct attention layer

from tensorflow import matmul
class HierarchicalAttentionNetwork(Layer):
    """Attention pooling over the step axis of a rank-3 input.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes
    ``uit = tanh(x . W + b)``, scores every step with ``exp(uit . u)``, zeroes
    the scores of masked steps, normalises the scores over the step axis and
    returns the score-weighted sum of ``x`` with shape ``(batch, features)``.

    Args:
        attention_dim: Number of units in the hidden attention projection.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, attention_dim, **kwargs):
        # Initialise the base layer first so that its own defaults (notably
        # for `supports_masking`) cannot overwrite the attributes set below.
        super().__init__(**kwargs)
        self.init_weights = initializers.get('glorot_normal')
        self.init_bias = initializers.get('zeros')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the projection matrix W, bias b and context vector u."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]
        # add_weight registers the variables with Keras, so they are tracked as
        # trainable weights without assigning to `trainable_weights` by hand.
        self.W = self.add_weight(name='W', shape=(feature_dim, self.attention_dim),
                                 initializer=self.init_weights, trainable=True)
        self.b = self.add_weight(name='b', shape=(self.attention_dim,),
                                 initializer=self.init_bias, trainable=True)
        self.u = self.add_weight(name='u', shape=(self.attention_dim, 1),
                                 initializer=self.init_weights, trainable=True)
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return no mask: the step axis is summed away, so nothing is left to mask."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Hidden representation of every step, then one unnormalised score per step.
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        # Normalise over the step axis; epsilon guards against a fully masked row.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        """Output keeps the batch and feature axes and drops the step axis."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config['attention_dim'] = self.attention_dim
        return config

In [87]:
# Reset the backend session before any layers are created.
K.clear_session()

# Word-embedding lookup initialised from the pretrained matrix. It stays trainable,
# and index 0 is treated as a mask value.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_words,
    trainable=True,
    mask_zero=True,
)

In [88]:
# Word-level encoder: embeds the words of one sentence, runs a bidirectional GRU
# over them and pools the GRU states with attention into a single sentence vector.

sentence_input = Input(shape=(max_words,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
# NOTE: despite the `lstm_` prefix this is a bidirectional GRU.
lstm_word = Bidirectional(GRU(128, return_sequences=True))(embedded_sequences)
# Use the configured attention size rather than a second hard-coded 128.
attn_word = HierarchicalAttentionNetwork(attention_dim)(lstm_word)
sentence_encoder = Model(sentence_input, attn_word)

In [89]:
# Sentence-level model: applies the sentence encoder to every sentence of an article,
# runs a bidirectional GRU over the sentence vectors, pools them with attention and
# predicts the class with a 2-unit softmax.

article_input = Input(shape=(max_sentences, max_words), dtype='int32')
article_encoder = TimeDistributed(sentence_encoder)(article_input)
# NOTE: despite the `lstm_` prefix this is a bidirectional GRU.
lstm_sentence = Bidirectional(GRU(128, return_sequences=True))(article_encoder)
# Use the configured attention size rather than a second hard-coded 128.
attn_sentence = HierarchicalAttentionNetwork(attention_dim)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(article_input, preds)

In [90]:
# create checkpoints to save information from each training epoch

# Make sure both output directories exist before training starts;
# exist_ok=True already makes a separate existence check unnecessary.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(tb_logs, exist_ok=True)

# Join with a path separator so the files are written inside model_dir. Plain string
# concatenation produced './model_output/glove_100weights.NN.hdf5' next to it instead.
model_checkpoint = ModelCheckpoint(filepath=os.path.join(model_dir, 'weights.{epoch:02d}.hdf5'))
tb_checkpoint = TensorBoard(log_dir=tb_logs, histogram_freq=1, batch_size=batch_size, write_graph=False,
                            write_grads=True, write_images=True)

In [91]:
# The network ends in a 2-unit softmax and is trained on one-hot targets, so the matching
# loss is categorical cross-entropy. For exactly two classes the value equals the former
# binary_crossentropy, but this form stays correct if more classes are added.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [96]:
# Trial run on the 30,000-article subset; the batch size comes from the config cell
# so it cannot drift from the value used elsewhere.
model.fit(
    sample_train_matrix,
    sample_y_train,
    validation_data=(sample_test_matrix, sample_y_test),
    epochs=4,
    batch_size=batch_size,
    callbacks=[model_checkpoint, tb_checkpoint],
)

Train on 24000 samples, validate on 6000 samples
Epoch 1/4

KeyboardInterrupt: 

In [135]:
# One-hot encode the full training labels (column 0 = reliable, column 1 = unreliable).
# NOTE: overwrites y_train in place, so this cell must be run only once.
y_train = to_categorical(pd.DataFrame(y_train).values)

In [136]:
# One-hot encode the held-out test labels (column 0 = reliable, column 1 = unreliable).
# NOTE: overwrites y_test in place, so this cell must be run only once.
y_test = to_categorical(pd.DataFrame(y_test).values)

In [142]:
# Display the one-hot encoded test labels as a sanity check.
y_test

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]], dtype=float32)

In [155]:
# Sanity check: (number of training articles, number of classes)
y_train.shape

(13043, 2)

In [156]:
# Sanity check: one entry per training article, matching the first axis of y_train
x_train.shape

(13043,)

In [180]:
# Print the layer-by-layer summary of the word-level sentence encoder.
sentence_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_3 (Embedding)      multiple                  8047400   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 30, 256)           175872    
_________________________________________________________________
hierarchical_attention_netwo (None, 256)               33024     
Total params: 8,256,296
Trainable params: 8,256,296
Non-trainable params: 0
_________________________________________________________________


In [181]:
# Print the layer-by-layer summary of the full article-level model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 30, 30)            0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 30, 256)           8256296   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 30, 256)           295680    
_________________________________________________________________
hierarchical_attention_netwo (None, 256)               33024     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 514       
Total params: 8,585,514
Trainable params: 8,585,514
Non-trainable params: 0
_________________________________________________________________
