In [1]:
import numpy as np
import pandas as pd

from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, LSTM, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

import os
import pickle

Using TensorFlow backend.


In [3]:
# Load the pickled "cred" sample; later cells treat it as a DataFrame
# (it receives a 'label' column and is concatenated with `hate`).
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only load pickles that were produced by a trusted source.
with open('cleaned/cred_sample', 'rb') as infile:
    cred = pickle.load(infile)

In [4]:
# Load the pickled "hate" collection; later cells call .sample() on it and
# add a 'label' column, so it is presumably a DataFrame -- confirm.
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only load pickles that were produced by a trusted source.
with open('cleaned/hate', 'rb') as infile:
    hate = pickle.load(infile)

In [5]:
len(hate) > len(cred)

True

In [6]:
# Downsample `hate` to the size of `cred` so both classes are balanced.
# A fixed seed (the same one used for the train/test split) keeps the
# selected rows -- and therefore every downstream result -- reproducible.
hate = hate.sample(len(cred), random_state=19)

In [8]:
len(hate) == len(cred)

True

In [22]:
# Attach the class labels: 0 for the `hate` frame, 1 for the `cred` frame.
hate['label'] = 0
cred['label'] = 1
# Stack both frames, keeping their original indices. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
data = pd.concat([cred, hate])

In [101]:
# --- input geometry -------------------------------------------------------
max_sentences = 30  # sentences kept per article
max_words = 30  # words kept per sentence
max_vocab = 50001  # word indices at or above this value are ignored

# --- model / training hyper-parameters -------------------------------------
embedding_dim = 100  # dimensionality of the pretrained word vectors
attention_dim = 128  # number of units in the attention layer
batch_size = 128
test_val_size = 0.2  # fraction of the data held out by the split

# --- filesystem locations ---------------------------------------------------
vector_dir = './embeddings'
vector_file = 'glove.6B.100d.txt'
model_dir = './model_output/glove_100'
tb_logs = './tb_logs/glove_100'

# --- mutable containers filled by later cells -------------------------------
articles = []
texts = []
embeddings = {}

In [196]:
from tensorflow import matmul
class HierarchicalAttentionNetwork(Layer):
    '''Attention pooling over the middle (time) axis of a 3-D tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes

        u_t = tanh(x_t . W + b)              W: (features, attention_dim)
        a_t = exp(u_t . u) / sum_s exp(u_s . u)      u: (attention_dim, 1)
        out = sum_t a_t * x_t

    and returns ``out`` with shape ``(batch, features)``. When a mask is
    supplied, masked steps get weight 0 before normalisation. The mask is
    consumed here: ``compute_mask`` returns ``None``.

    # Arguments
        attention_dim: number of units in the hidden attention projection.
        **kwargs: forwarded to the base ``Layer`` constructor (e.g. ``name``),
            which is what model deserialisation passes back in.
    '''
    def __init__(self, attention_dim, **kwargs):
        # Call the base constructor first: it initialises its own bookkeeping
        # (including supports_masking), so our values must be set afterwards.
        super(HierarchicalAttentionNetwork, self).__init__(**kwargs)
        self.init_weights = initializers.get('glorot_normal')
        self.init_bias = initializers.get('zeros')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        '''Create W, b and u for a 3-D input shape ``(batch, steps, features)``.'''
        assert len(input_shape) == 3
        feature_dim = int(input_shape[-1])
        # add_weight registers the variables with the layer, so they are
        # tracked as trainable weights without touching trainable_weights.
        self.W = self.add_weight(name='W',
                                 shape=(feature_dim, self.attention_dim),
                                 initializer=self.init_weights,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init_bias,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init_weights,
                                 trainable=True)
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        '''Stop mask propagation: the time axis is reduced away by this layer.'''
        return None

    def call(self, x, mask=None):
        '''Return the attention-weighted sum of ``x`` over axis 1.

        # Arguments
            x: tensor of shape ``(batch, steps, features)``.
            mask: optional tensor broadcastable to ``(batch, steps)``;
                zero/False entries are excluded from the attention weights.

        # Returns
            Tensor of shape ``(batch, features)``.
        '''
        # uit: (batch, steps, attention_dim)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # ait: (batch, steps) -- unnormalised attention scores
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        # Normalise to a distribution over steps; epsilon guards against a
        # fully masked row producing a division by zero.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        '''The steps axis is summed out: ``(batch, steps, f) -> (batch, f)``.'''
        return input_shape[0], input_shape[-1]

    def get_config(self):
        '''Return the constructor arguments so a saved model can be rebuilt.'''
        config = super(HierarchicalAttentionNetwork, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config



In [14]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out `test_val_size` of the articles for evaluation. The split is
# stratified on the label column and uses a fixed random_state, so the same
# rows land in each partition on every run (given the same `data`).
x_train, x_test, y_train, y_test = train_test_split(data['content'], data['label'], test_size=test_val_size,
                                                    random_state=19, stratify=data['label'])

In [113]:
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [28]:
# Rebuild `texts` from scratch instead of appending to the global list, so
# re-running this cell does not duplicate the corpus.
texts = list(data['content'])

# NOTE(review): the tokenizer is fit on ALL articles (train and test), so the
# vocabulary is informed by the held-out data -- confirm this is intended.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(texts)

# word_index still contains every word seen (its length exceeds max_vocab);
# the `ix < max_vocab` check applied when the matrices are filled is what
# actually restricts the vocabulary.
word_index = tokenizer.word_index

In [29]:
len(word_index)

80473

In [31]:
import spacy
nlp = spacy.load('en')
def sentencize(article):
    '''Run the module-level ``nlp`` pipeline on ``article`` and return its
    sentences (the items of the parsed document's ``.sents``) as a list.'''
    doc = nlp(article)
    return list(doc.sents)

In [32]:
# Replace every article in both partitions with the list of sentences
# returned by sentencize().
# NOTE(review): x_train / x_test are overwritten in place, so this cell is not
# safely re-runnable -- a second run would feed lists of sentences, not the
# original article values, to the pipeline.
x_train = x_train.apply(sentencize)
x_test = x_test.apply(sentencize)

In [58]:
def no_punct(article):
    '''Lower-case the tokens of each sentence and drop punctuation.

    ``article`` is an iterable of sentences, each an iterable of tokens that
    expose ``.text`` and ``.pos_``. Returns one list per sentence holding the
    lower-cased text of every token whose ``pos_`` is not ``'PUNCT'``.
    '''
    return [
        [tok.text.lower() for tok in sentence if tok.pos_ != 'PUNCT']
        for sentence in article
    ]

In [64]:
# Turn each article (a list of sentences) into a list of lists of lower-cased,
# non-punctuation token strings via no_punct().
# NOTE(review): overwrites x_train / x_test in place; re-running this cell on
# its own output would fail because plain strings have no .pos_ attribute.
x_train = x_train.apply(no_punct)
x_test = x_test.apply(no_punct)

In [85]:
data_matrix = np.zeros((len(x_train), max_sentences, max_words), dtype='int32')

In [91]:
# Encode the training articles: row i, sentence j receives the indices of the
# first `max_words` in-vocabulary words; remaining slots keep their current
# value (0 from the allocation). Unknown / too-rare words are skipped rather
# than given a placeholder index.
for i, sentences in enumerate(x_train):
    for j, sent in enumerate(sentences[:max_sentences]):
        candidates = (word_index.get(word.lower()) for word in sent)
        kept = [ix for ix in candidates if ix is not None and ix < max_vocab][:max_words]
        if kept:
            data_matrix[i, j, :len(kept)] = kept

In [160]:
# Encode the held-out articles exactly like the training ones: a zero-filled
# (articles, max_sentences, max_words) int32 tensor in which each sentence row
# holds the indices of its first `max_words` in-vocabulary words.
test_matrix = np.zeros((len(x_test), max_sentences, max_words), dtype='int32')
for i, sentences in enumerate(x_test):
    for j, sent in enumerate(sentences[:max_sentences]):
        candidates = (word_index.get(word.lower()) for word in sent)
        kept = [ix for ix in candidates if ix is not None and ix < max_vocab][:max_words]
        if kept:
            test_matrix[i, j, :len(kept)] = kept

In [95]:
# Read the pretrained vectors into `embeddings` (word -> float32 array).
# Each line is "<word> <v1> <v2> ...". The encoding is given explicitly
# because the platform default is not UTF-8 everywhere and the vocabulary
# contains non-ASCII tokens.
with open(os.path.join(vector_dir, vector_file), encoding='utf-8') as vectors:
    for line in vectors:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with an IndexError.
            continue
        word = values[0]
        weights = np.asarray(values[1:], dtype='float32')
        embeddings[word] = weights

In [220]:
K.clear_session()


# embedding layer
# The table has one row per entry of word_index plus row 0, and starts out
# uniformly random in [0, 1).
# NOTE(review): np.random is not seeded here, so the initial values of rows
# without a pretrained vector differ between runs.
# NOTE(review): the table covers the whole word_index although the input
# matrices only ever contain indices below max_vocab -- rows past that are
# never looked up.

embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # Overwrite the random row with the pretrained vector. Words that are
        # NOT found in the embedding index keep their random initialisation
        # (they are not zeroed).
        embedding_matrix[i] = embedding_vector

# trainable=True: the pretrained vectors are fine-tuned during training.
# mask_zero=True: index 0 (the zero fill of the input matrices) is masked.
embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=max_words, trainable=True, mask_zero=True)

In [221]:
# Word-level encoder: maps one sentence (max_words word indices) to a single
# sentence vector via embedding -> bidirectional GRU -> attention pooling.

sentence_input = Input(shape=(max_words,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
# NOTE: despite the `lstm_` prefix this is a bidirectional GRU; the name is
# kept so other cells that refer to it keep working.
lstm_word = Bidirectional(GRU(128, return_sequences=True))(embedded_sequences)
# Use the configured attention size instead of repeating the literal.
attn_word = HierarchicalAttentionNetwork(attention_dim)(lstm_word)
sentence_encoder = Model(sentence_input, attn_word)

In [222]:
# Sentence-level encoder and classifier: applies `sentence_encoder` to every
# sentence of an article, runs a bidirectional GRU with attention pooling
# over the resulting sentence vectors, and predicts a 2-class softmax.

article_input = Input(shape=(max_sentences, max_words), dtype='int32')
article_encoder = TimeDistributed(sentence_encoder)(article_input)
# NOTE: despite the `lstm_` prefix this is a bidirectional GRU; the name is
# kept so other cells that refer to it keep working.
lstm_sentence = Bidirectional(GRU(128, return_sequences=True))(article_encoder)
# Use the configured attention size instead of repeating the literal.
attn_sentence = HierarchicalAttentionNetwork(attention_dim)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(article_input, preds)

In [223]:
# Create the output directories up front; exist_ok=True already makes this a
# no-op when they exist, so no separate existence check is needed.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(tb_logs, exist_ok=True)

# Callbacks that record information after each training epoch.
# The checkpoint path is built with os.path.join so files are written INSIDE
# model_dir; plain string concatenation produced
# './model_output/glove_100weights.NN.hdf5' next to the directory instead.
model_checkpoint = ModelCheckpoint(filepath=os.path.join(model_dir, 'weights.{epoch:02d}.hdf5'))
tb_checkpoint = TensorBoard(log_dir=tb_logs, histogram_freq=1, batch_size=batch_size, write_graph=False,
                            write_grads=True, write_images=True)

In [224]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [225]:
def _one_hot_labels(labels, num_classes=2):
    '''Return `labels` as a (n, num_classes) one-hot array.

    Labels that already have that shape are passed through unchanged, so the
    cell works whether or not the to_categorical cells have been run.
    '''
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[-1] == num_classes:
        return labels
    return to_categorical(labels.ravel(), num_classes=num_classes)


# The model ends in Dense(2, softmax) and is compiled with
# categorical_crossentropy, so the targets must be one-hot encoded.
model.fit(data_matrix, _one_hot_labels(y_train),
          validation_data=(test_matrix, _one_hot_labels(y_test)),
          epochs=4, batch_size=batch_size,
          callbacks=[model_checkpoint, tb_checkpoint])

Train on 13043 samples, validate on 3261 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1ee143e748>

In [135]:
# One-hot encode the training labels into shape (n, 2). The guard makes the
# cell idempotent: labels that are already one-hot are left untouched instead
# of being encoded a second time.
if np.ndim(y_train) != 2 or np.shape(y_train)[-1] != 2:
    y_train = to_categorical(np.asarray(y_train).ravel(), num_classes=2)

In [136]:
# One-hot encode the held-out labels into shape (n, 2). The guard makes the
# cell idempotent: labels that are already one-hot are left untouched instead
# of being encoded a second time.
if np.ndim(y_test) != 2 or np.shape(y_test)[-1] != 2:
    y_test = to_categorical(np.asarray(y_test).ravel(), num_classes=2)

In [142]:
y_test

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]], dtype=float32)

In [155]:
np.shape(y_train)

(13043, 2)

In [156]:
np.shape(x_train)

(13043,)

In [180]:
sentence_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_3 (Embedding)      multiple                  8047400   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 30, 256)           175872    
_________________________________________________________________
hierarchical_attention_netwo (None, 256)               33024     
Total params: 8,256,296
Trainable params: 8,256,296
Non-trainable params: 0
_________________________________________________________________


In [181]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 30, 30)            0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 30, 256)           8256296   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 30, 256)           295680    
_________________________________________________________________
hierarchical_attention_netwo (None, 256)               33024     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 514       
Total params: 8,585,514
Trainable params: 8,585,514
Non-trainable params: 0
_________________________________________________________________


In [166]:
data_matrix.shape

(13043, 30, 30)

In [170]:
data_matrix[1]

array([[   54,   907,    51,   211,    17,    80,   257,   117,   169,
           20,     8,   822,     2,  2855,    39,  1292,     8,     1,
         1435,   114,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [  262,   625,    34,     6,   474,     2,   591,     4,   857,
          584,     3,    26,  3921,    48,  4239,    62,     6,   613,
          474,     2,   877,     4,  1195,     0,     0,     0,     0,
            0,     0,     0],
       [ 2097,     3,     5,     4,   167,    24,     1,  1292,    12,
           24,   767,    15,  3825,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [   14,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [   33,    50,    26,