In [23]:
import numpy as np
import pandas as pd
import pickle
import os

import utils.config as config
from cyclic.rate_cycler import CyclicLR

from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Input, Dropout, Dense
from keras.layers import Embedding, GRU, LSTM, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint, Callback, LambdaCallback
from keras.optimizers import SGD, Adam

from tensorflow import set_random_seed

In [24]:
# NOTE(review): this CSV load looks redundant -- `reliable` is reassigned from
# data_final/reliable_final.pkl in a later cell before it is used. Confirm and consider removing.
reliable = pd.read_csv('data_final/reliable_final.csv')

In [None]:
# Seed NumPy's and TensorFlow's global random number generators so runs are repeatable.
np.random.seed(3)
set_random_seed(24)

In [None]:
# Load the two pickled article tables.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('data_final/reliable_final.pkl', 'rb') as infile:
    reliable = pickle.load(infile)

with open('data_final/unreliable_final.pkl', 'rb') as infile:
    unreliable = pickle.load(infile)

# Target encoding: 0 for rows from the reliable table, 1 for rows from the unreliable table.
reliable['label'] = 0
unreliable['label'] = 1

# pd.concat is the supported replacement for DataFrame.append (deprecated in pandas 1.4,
# removed in 2.0) and produces the same stacked frame. A random 1000-row subset is kept.
data = pd.concat([reliable, unreliable]).sample(1000)

In [None]:
# Model and training hyperparameters, all read from the project-local utils.config module.
max_words = config.max_words  # max num words processed for each sentence
max_sentences = config.max_sentences  # max num sentences processed for each article
max_vocab = config.max_vocab  # number of most-frequent words that receive their own index
embedding_dim = config.embedding_dim  # size of pretrained word vectors
attention_dim = config.attention_dim  # num units in attention layer
GRU_dim = config.GRU_dim  # num units in GRU layer, but it is bidirectional so outputs double this number
epochs = config.epochs
batch_size = config.batch_size
test_size = config.test_size  # fraction passed to train_test_split for both the test and the validation split

# NOTE(review): the file name pins 200-dimensional vectors -- confirm config.embedding_dim is 200.
vector_file = 'embeddings/glove.6B.200d.txt'
# Output directory for trained-model artifacts.
model_dir = 'model_output/'

In [None]:
# Hold out the test set first, then carve a validation set out of what remains.
# The test split is not touched again until the model has been trained.
data_train, X_test, labels_train, y_test = train_test_split(
    data['content'], data['label'],
    test_size=test_size, random_state=77, stratify=data['label'],
)
X_train, X_val, y_train, y_val = train_test_split(
    data_train, labels_train,
    test_size=test_size, random_state=77, stratify=labels_train,
)

# One-hot encode the labels of every split.
y_train, y_val, y_test = (
    np.asarray(to_categorical(split_labels))
    for split_labels in (y_train, y_val, y_test)
)

# Number of training examples; used later to size the learning-rate schedule.
num_samples = len(X_train)

In [None]:
# Load the pickled vocabulary. It is queried with .most_common() in the next cell, so it is
# presumably a collections.Counter of token frequencies -- TODO confirm.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('data_final/words.pkl', 'rb') as infile:
    words = pickle.load(infile)

In [None]:
# Map each of the max_vocab most common words to its frequency rank, starting at 1.
# Index 0 is left unassigned: it is reserved for masking out-of-vocab words.
word_index = {
    word: rank
    for rank, (word, _) in enumerate(words.most_common(max_vocab), start=1)
}

In [None]:
def create_data_matrix(data, max_sentences=max_sentences, max_words=max_words, max_vocab=max_vocab,
                      word_index=word_index):
    """Encode articles as a zero-padded integer array of word indices.

    Args:
        data: sized iterable of articles; each article is iterated as sentences and each
            sentence as word strings (assumes pre-tokenised text -- TODO confirm upstream).
        max_sentences: sentences kept per article; extra sentences are dropped.
        max_words: words kept per sentence; extra words are dropped.
        max_vocab: largest word index that may be written to the matrix.
        word_index: mapping from lower-cased word to integer index.

    Returns:
        int32 array of shape (len(data), max_sentences, max_words). Positions holding
        unknown words, and all padding positions, are 0.
    """
    data_matrix = np.zeros((len(data), max_sentences, max_words), dtype='int32')
    for i, article in enumerate(data):
        # zip with range() truncates to the configured limits without manual counters.
        for j, sentence in zip(range(max_sentences), article):
            for k, word in zip(range(max_words), sentence):
                ix = word_index.get(word.lower())
                # The default word_index assigns 1..max_vocab inclusive, and the embedding
                # matrix has max_vocab + 1 rows, so index max_vocab itself is valid
                # (the previous `<` silently discarded that word).
                if ix is not None and ix <= max_vocab:
                    data_matrix[i, j, k] = ix
    return data_matrix

In [None]:
# Encode each split with the default vocabulary and size limits.
train_matrix, val_matrix, test_matrix = (
    create_data_matrix(split) for split in (X_train, X_val, X_test)
)

In [None]:
def store_embeddings(vector_file=vector_file):
    """Read pretrained word vectors from a whitespace-separated text file.

    Each non-empty line is parsed as a token followed by its vector components.

    Args:
        vector_file: path to the vector text file.

    Returns:
        dict mapping each token to a float32 NumPy vector.
    """
    embeddings = {}
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's default
    # encoding, which can fail or garble non-ASCII tokens on some systems.
    with open(vector_file, encoding='utf-8') as vectors:
        for line in vectors:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings
            
# Load every pretrained vector from the default vector_file into memory.
embeddings = store_embeddings()

In [None]:
def create_embedding_matrix(max_vocab=max_vocab, embeddings=embeddings, word_index=word_index,
                            embedding_dim=embedding_dim):
    """Assemble the weight matrix for the embedding layer.

    Row i holds the pretrained vector of the word whose index is i. Row 0 (the masking
    index) and the rows of words missing from `embeddings` stay all zeros.

    Returns:
        array of shape (max_vocab + 1, embedding_dim).
    """
    # One extra row so that index 0 can serve as the masking index.
    matrix = np.zeros((max_vocab + 1, embedding_dim))
    for token, row in word_index.items():
        vector = embeddings.get(token)
        if vector is None:
            # No pretrained vector for this word: leave its row at zero.
            continue
        matrix[row] = vector
    return matrix
            
# Build the embedding weight matrix from the default vocabulary and pretrained vectors.
embedding_matrix = create_embedding_matrix()

In [None]:
# Construct attention layer

from tensorflow import matmul
class HierarchicalAttentionNetwork(Layer):
    """Attention pooling layer that collapses axis 1 of a 3-D input.

    For an input `x` with three dimensions, each step along axis 1 is scored as
    `exp(tanh(x . W + b) . u)`. Scores are zeroed where the incoming mask is false,
    normalised to sum to one along axis 1, and used to form a weighted sum of `x`.
    The output therefore has shape (input_shape[0], input_shape[-1]).

    Args:
        attention_dim: number of units in the hidden tanh projection.
        **kwargs: standard Keras Layer keyword arguments (e.g. `name`).
    """

    def __init__(self, attention_dim, **kwargs):
        # Call the base constructor first: it initialises `supports_masking` itself,
        # so a value assigned before it runs would not be kept.
        super().__init__(**kwargs)
        self.init_weights = initializers.get('glorot_normal')
        self.init_bias = initializers.get('zeros')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the projection weights W, bias b and context vector u."""
        assert len(input_shape) == 3
        # add_weight registers the variables as trainable weights under stable names,
        # in the same order (W, b, u) as before.
        self.W = self.add_weight(name='W', shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init_weights, trainable=True)
        self.b = self.add_weight(name='b', shape=(self.attention_dim,),
                                 initializer=self.init_bias, trainable=True)
        self.u = self.add_weight(name='u', shape=(self.attention_dim, 1),
                                 initializer=self.init_weights, trainable=True)
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # Axis 1 is summed away in call(), so no mask is passed on to later layers.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait = ait * K.cast(mask, K.floatx())
        # epsilon keeps the division finite when every step is masked out.
        ait = ait / K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(ait)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Include attention_dim so a saved model can rebuild this layer."""
        config = super().get_config()
        config['attention_dim'] = self.attention_dim
        return config

In [None]:
def build_model(attention_dim=attention_dim, GRU_dim=GRU_dim, drop=False, drop_pct=None,
                embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, word_index=word_index):
    """Build the two-level (words -> sentence, sentences -> article) attention classifier.

    Args:
        attention_dim: units in each attention layer.
        GRU_dim: units per direction in each bidirectional GRU.
        drop: if True, apply dropout before the output layer.
        drop_pct: dropout rate; required when `drop` is True.
        embedding_matrix: 2-D array of frozen embedding weights, one row per word index.
        embedding_dim: width of the embedding vectors.
        word_index: kept for backward compatibility; the embedding table is now sized
            from `embedding_matrix` instead.

    Returns:
        An uncompiled Keras Model taking int32 input of shape (max_sentences, max_words)
        and producing a 2-way softmax.

    Raises:
        ValueError: if `drop` is True but `drop_pct` is None.
    """
    if drop and drop_pct is None:
        raise ValueError('drop_pct must be set when drop=True')

    # Size the lookup table from the weight matrix itself. len(word_index) + 1 only equals
    # the matrix's row count when the vocabulary holds exactly max_vocab words; a smaller
    # vocabulary would make the supplied weights mismatch the layer's shape.
    embedding_layer = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix],
                                input_length=max_words, trainable=False, mask_zero=True)

    # Word level: encode the words of one sentence into a single sentence vector.
    sentence_input = Input(shape=(max_words,), dtype='int32')
    embedded_words = embedding_layer(sentence_input)
    gru_word = Bidirectional(GRU(GRU_dim, return_sequences=True))(embedded_words)
    attn_word = HierarchicalAttentionNetwork(attention_dim)(gru_word)
    sentence_encoder = Model(sentence_input, attn_word)

    # Sentence level: run the sentence encoder over every sentence, then encode the
    # resulting sequence of sentence vectors into one article vector.
    article_input = Input(shape=(max_sentences, max_words), dtype='int32')
    encoded_sentences = TimeDistributed(sentence_encoder)(article_input)
    gru_sentence = Bidirectional(GRU(GRU_dim, return_sequences=True))(encoded_sentences)
    attn_sentence = HierarchicalAttentionNetwork(attention_dim)(gru_sentence)

    # Optional dropout ahead of the classifier head.
    features = Dropout(drop_pct)(attn_sentence) if drop else attn_sentence
    preds = Dense(2, activation='softmax')(features)

    return Model(article_input, preds)

In [None]:
# Checkpoint callback: the file name template records the epoch number and validation loss.
# Make sure the output directory exists before the first checkpoint is written.
os.makedirs(model_dir, exist_ok=True)
model_checkpoint = ModelCheckpoint(
    filepath=os.path.join(model_dir, 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'))

The CyclicLR class defaults to the min and max learning rates determined previously, $10^{-2.5}$ and $10^{-1}$.

In [None]:
# Learning-rate callback from the project-local cyclic.rate_cycler module, configured with the
# epoch count, training-set size and batch size; min/max rates are left at the class defaults.
clr = CyclicLR(epochs=epochs, num_samples=num_samples, batch_size=batch_size)

In [None]:
# Reset the backend session and drop any model left over from a previous run of this
# cell, so re-running trains a fresh network.
K.clear_session()
try:
    del model
except NameError:
    pass  # first run: there is no earlier model to discard

model = build_model()
opt = SGD(momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])

# Make sure the output directory exists before training writes checkpoints or the final model.
os.makedirs(model_dir, exist_ok=True)

model.fit(train_matrix, y_train, validation_data=(val_matrix, y_val),
          batch_size=batch_size, epochs=epochs, callbacks=[clr, model_checkpoint])

# Use the configured output directory rather than repeating the literal path.
model.save(os.path.join(model_dir, 'model.h5'))

In [None]:
# Presumably plots the learning rates applied during training -- confirm against cyclic.rate_cycler.
clr.plot_lr()