In [1]:
import numpy as np
import pandas as pd

from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, LSTM, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

import os
import pickle

Using TensorFlow backend.


In [3]:
# Load the pickled `cred` sample. pickle can execute arbitrary code on load,
# so only point this at files produced by our own cleaning step.
with open('cleaned/cred_sample', 'rb') as cred_file:
    cred = pickle.load(cred_file)

In [4]:
# Load the pickled `hate` set. pickle can execute arbitrary code on load,
# so only point this at files produced by our own cleaning step.
with open('cleaned/hate', 'rb') as hate_file:
    hate = pickle.load(hate_file)

In [5]:
# Is `cred` the smaller of the two sets?
len(cred) < len(hate)

True

In [6]:
# Downsample `hate` to exactly as many rows as `cred` so the two classes are balanced.
# random_state pins the draw (same seed as the train/test split) so the selected
# subset is identical after a kernel restart.
hate = hate.sample(n=len(cred), random_state=19)

In [8]:
# Both sets should now hold the same number of rows.
len(cred) == len(hate)

True

In [22]:
# Binary target: 0 for rows from `hate`, 1 for rows from `cred`.
hate['label'] = 0
cred['label'] = 1
# Stack both frames (cred first, original indices kept). pd.concat replaces
# DataFrame.append, which is deprecated and no longer exists in pandas >= 2.0.
data = pd.concat([cred, hate])

In [23]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,id,type,domain,content,label
411362,8147326,reliable,abcnews.go.com,Email A former college basketball star and org...,1
418499,8166066,reliable,www.theguardian.com,T he organization supporting press has mounted...,1
461099,8280513,reliable,www.nytimes.com,whose software helps police departments organi...,1
419932,8170120,reliable,abcnews.go.com,Email She was the secret power behind the Sout...,1
369785,8026492,reliable,www.politico.com,Share on Facebook Share on Twitter A typical b...,1


In [24]:
# --- Text / model dimensions ---
# NOTE(review): presumably max_words caps words per sentence and max_sentences
# caps sentences per article; their use is not visible in this part of the notebook.
embedding_dim = 100      # agrees with the 100-d GloVe file named below
attention_dim = 128
max_vocab = 50000
max_sentences = 15
max_words = 100
test_val_size = 0.2      # passed as test_size to train_test_split

# --- Containers populated by later cells ---
texts, articles = [], []
embeddings = dict()

# --- Filesystem locations ---
tb_logs = './tb_logs/glove_100'
model_dir = './model_output/glove_100'
vector_file = 'glove.6B.100d.txt'
vector_dir = './embeddings'

In [11]:
class HierarchicalAttentionNetwork(Layer):
    '''Attention pooling over the step axis of a 3D tensor.

    For an input ``x`` of shape (batch, steps, features) the layer computes

        uit = tanh(x . W + b)                     -> (batch, steps, attention_dim)
        ait = exp(uit . u) / sum_t exp(uit . u)   -> (batch, steps)
        out = sum_t ait * x                       -> (batch, features)

    Steps that are masked out receive zero weight before normalisation.

    # Arguments
        attention_dim: width of the hidden projection used to score each step.
        **kwargs: standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    '''

    def __init__(self, attention_dim, **kwargs):
        # Call the base initializer first: it sets default layer attributes
        # (supports_masking among them), so anything assigned before it runs
        # may be reset.
        super(HierarchicalAttentionNetwork, self).__init__(**kwargs)
        self.init_weights = initializers.get('glorot_normal')
        self.init_bias = initializers.get('zeros')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        '''Create W (features x attention_dim), b (attention_dim,) and u (attention_dim x 1).'''
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer, so they are tracked
        # as trainable weights without assigning to `trainable_weights` by hand.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init_weights,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init_bias,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init_weights,
                                 trainable=True)
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The step axis is summed away in call(), so a per-step mask no longer
        # matches the output; stop propagating it to downstream layers.
        return None

    def call(self, x, mask=None):
        '''Return the attention-weighted sum of ``x`` over axis 1.'''
        # x:   (batch, steps, features)
        # W:   (features, attention_dim), b: (attention_dim,)
        # u:   (attention_dim, 1)
        # uit = tanh(xW + b)       -> (batch, steps, attention_dim)
        # ait = softmax(uit . u)   -> (batch, steps)

        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        # Unnormalised scores; the trailing singleton axis from the dot is dropped.
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting.
            # Assumes the mask is (batch, steps) so it lines up with ait - TODO confirm.
            ait *= K.cast(mask, K.floatx())
        # epsilon guards against division by zero when every step is masked.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the weights over the feature axis and pool over steps.
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        # (batch, steps, features) -> (batch, features)
        return input_shape[0], input_shape[-1]

    def get_config(self):
        '''Expose attention_dim so a saved model can rebuild this layer.'''
        config = super(HierarchicalAttentionNetwork, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config



In [14]:
from sklearn.model_selection import train_test_split

In [27]:
# Split article text and labels into train and test parts. `test_val_size` is the
# held-out fraction, the split is stratified on the label, and random_state
# pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    data['content'],
    data['label'],
    test_size=test_val_size,
    random_state=19,
    stratify=data['label'],
)

In [28]:
# Collect every article body and fit the tokenizer's word index on them.
# `texts` is rebuilt in one step instead of appended to in a loop, so re-running
# this cell no longer duplicates the corpus (and the tokenizer's word counts).
# NOTE(review): the index is fitted on all of `data`, including rows held out in x_test.
texts = list(data['content'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [29]:
# Number of distinct tokens in the fitted word index.
len(tokenizer.word_index)

80473

In [None]:
# Make sure the checkpoint directory exists before training tries to write into it.
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# Checkpoint files are written to <model_dir>/weights.<epoch>.hdf5.
# os.path.join supplies the separator; plain string concatenation produced
# './model_output/glove_100weights.NN.hdf5', i.e. files beside the intended directory.
model_checkpoint = ModelCheckpoint(filepath=os.path.join(model_dir, 'weights.{epoch:02d}.hdf5'))
# TensorBoard logging under `tb_logs`, with histograms, gradients and weight images enabled.
tb_checkpoint = TensorBoard(log_dir=tb_logs, histogram_freq=1, batch_size=128, write_graph=False, write_grads=True,
                            write_images=True)

# callbacks=[model_checkpoint, tb_checkpoint]