In [1]:
import pickle
# Restore the pre-processed corpus and its per-entry class labels from disk.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# pickles produced by a trusted run of the cleaning step.
with open('cleaned_texts.pickle', 'rb') as texts_file:
    texts = pickle.load(texts_file)

with open('labels.pickle', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [2]:
# Tunable settings shared by the rest of the notebook.
MAX_NB_WORDS = 100000  # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 400  # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2  # fraction of the entries held out for validation
EMBEDDING_DIM = 50  # embedding dimensions for word vectors (word2vec/GloVe)
# Path of the GloVe vector file whose dimensionality matches EMBEDDING_DIM.
GLOVE_DIR = "glove/glove.6B/glove.6B.{}d.txt".format(EMBEDDING_DIM)

In [3]:
# Clip every entry to its first MAX_SEQUENCE_LENGTH elements before tokenizing.
# NOTE(review): the slice counts characters if `texts` holds strings and tokens
# if it holds lists -- confirm which unit is intended.
shortened_texts = [text[:MAX_SEQUENCE_LENGTH] for text in texts]

In [4]:
# Class names in a fixed order; a label's integer id is its position in this list.
classes = [
    "fake", "satire", "bias", "conspiracy", "state", "junksci",
    "hate", "clickbait", "unreliable", "political", "reliable",
]
# Map each textual label to its integer id (raises ValueError on an unknown label).
cat_labels = [classes.index(label) for label in labels]

In [5]:
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# Echo the Keras version into the cell output so the environment used for this run is recorded.
print("[i] Using Keras version",keras.__version__)

Using TensorFlow backend.


[i] Using Keras version 2.2.0


In [6]:
# Load the word tokenizer from its cache file, or build and cache it when the
# file does not exist yet. This replaces the former "uncomment this chunk"
# workflow so the notebook runs top to bottom on a fresh checkout.
# To force a rebuild, delete tokenizer.pickle and re-run this cell.
TOKENIZER_PATH = 'tokenizer.pickle'

if os.path.exists(TOKENIZER_PATH):
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a tokenizer pickle produced by a trusted run of this notebook.
    with open(TOKENIZER_PATH, 'rb') as handle:
        tokenizer = pickle.load(handle) # load a previously generated Tokenizer
else:
    # Integer division: MAX_NB_WORDS/2 is a float in Python 3, while the
    # vocabulary cap is meant to be a word count.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS // 2)
    tokenizer.fit_on_texts(shortened_texts)
    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("[i] Saved word tokenizer to file: tokenizer.pickle")

In [7]:
# Word -> integer id mapping taken from the fitted tokenizer, plus a size report.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('[i] Found %s unique tokens.' % vocabulary_size)

[i] Found 1404620 unique tokens.


In [8]:
# Turn every clipped entry into a list of word ids ...
sequences = tokenizer.texts_to_sequences(shortened_texts)
# ... then bring all lists to MAX_SEQUENCE_LENGTH, padding short ones at the end.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

In [9]:
# Build one-hot targets and a reproducible, re-runnable train/validation split.
labels = to_categorical(np.asarray(cat_labels)) # convert the category label to one-hot encoding
print('[i] Shape of data tensor:', data.shape)
print('[i] Shape of label tensor:', labels.shape)
assert data.shape[0] == labels.shape[0], "data and labels must have one row per entry"

# Seeded permutation so the same split is produced on every run.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])

# Split by a positive boundary index: negative slicing (data[:-n] / data[-n:])
# would yield an empty training set and an all-data validation set when n == 0.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - nb_validation_samples
train_indices = indices[:split_at]
val_indices = indices[split_at:]

# Index into `data`/`labels` instead of overwriting them with shuffled copies.
# Previously `data` was replaced by its shuffled version while `labels` was
# rebuilt from `cat_labels`, so re-running this cell paired samples with the
# wrong targets. The trade-off is that the unshuffled arrays stay in memory
# alongside the split copies.
x_train = data[train_indices]
y_train = labels[train_indices]
x_val = data[val_indices]
y_val = labels[val_indices]

print('[i] Number of entries in each category:')
print("[+] Training:",y_train.sum(axis=0))
print("[+] Validation:",y_val.sum(axis=0))

[i] Shape of data tensor: (6319863, 400)
[i] Shape of label tensor: (6319863, 11)
[i] Number of entries in each category:
[+] Training: [ 634544.   89082.  801549.  586993.       0.   81484.   48620.  171051.
  221723. 1098747. 1322098.]
[+] Validation: [158380.  22354. 200538. 146668.      0.  20437.  12209.  42702.  55470.
 274801. 330413.]


In [10]:
# Parse the GloVe text file into a word -> vector lookup, then assemble the
# initial weight matrix for the embedding layer (row i belongs to word id i).
embeddings_index = {}
print("[i] (long) Loading GloVe from:",GLOVE_DIR,"...",end="")
# Context manager closes the file even if a line fails to parse; the explicit
# encoding keeps the read independent of the platform's default codec.
with open(GLOVE_DIR, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")
# Every row starts as uniform random values in [0, 1); the extra row accounts
# for word ids starting at 1 (row 0 is never assigned here).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their random initial row (they are
        # NOT zeroed); words found in GloVe get the pretrained vector.
        embedding_matrix[i] = embedding_vector
print(" Completed!")

[i] (long) Loading GloVe from: glove/glove.6B/glove.6B.50d.txt ...Done.
[+] Proceeding with Embedding Matrix... Completed!


In [11]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per entry.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding table seeded from embedding_matrix and fine-tuned during training.
# NOTE(review): the table has one row per word_index entry (+1); if the loaded
# Tokenizer was built with a num_words cap, rows beyond that cap may never be
# looked up -- confirm whether the table can be shrunk.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)

embedded_sequences = embedding_layer(sequence_input)

In [12]:
# Three parallel convolution branches over the embedded sequence that differ
# only in kernel width (3, 5 and 7). Note that the numeric suffixes of the
# variable names do not all match the kernel sizes.
l_conv_3, l_conv_4, l_conv_5 = [
    Conv1D(filters=256, kernel_size=width, activation='relu')(embedded_sequences)
    for width in (3, 5, 7)
]

# Join the branch outputs along axis 1.
l_conv = Concatenate(axis=1)([l_conv_3, l_conv_4, l_conv_5])

In [13]:
# Classifier head: pool -> dropout -> global average -> dense -> softmax.
l_pool = MaxPooling1D(4)(l_conv)
l_drop = Dropout(0.3)(l_pool)
# Despite the name, this is a global average pooling step, not a Flatten.
l_flat = GlobalAveragePooling1D()(l_drop)
l_dense = Dense(128, activation='relu')(l_flat)
# Output width is read from the one-hot label tensor instead of a hard-coded
# 11, so the softmax always matches the targets passed to fit().
preds = Dense(y_train.shape[1], activation='softmax')(l_dense) #follows the number of classes

In [14]:
from keras.utils import multi_gpu_model
import tensorflow as tf

# Instantiate the template model inside a CPU device scope; the multi-GPU
# wrapper below is built from this template.
with tf.device('/cpu:0'):
    model = Model(sequence_input, preds)
    
# NOTE(review): lr=2.0 together with decay=0.1 -- confirm how this Keras
# version applies `decay` and that the resulting schedule is intended.
adadelta = optimizers.Adadelta(lr=2.0, rho=0.95, epsilon=None, decay=0.1)

# Wrap the template for 2 GPUs; the wrapper (not `model`) is what gets
# compiled here and trained later.
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(loss='categorical_crossentropy',
              optimizer=adadelta,
              metrics=['acc'])

# Summary of the template model (layers, output shapes, parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 400, 50)      70231050    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 398, 256)     38656       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 396, 256)     64256       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [None]:
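# Disabled single-device alternative to the multi-GPU setup above: it compiles
# `model` directly instead of `parallel_model`. Kept for reference only.
#adadelta = optimizers.Adadelta(lr=2.0, rho=0.95, epsilon=None, decay=0.1) # let's use a hipster optimizer because we can
#model.compile(loss='categorical_crossentropy',
#              optimizer=adadelta,
#              metrics=['acc'])
#model.summary()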

In [16]:
# Train the multi-GPU wrapper and keep the returned history object in model_log.
print("Training Progress:")
model_log = parallel_model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=30,
    validation_data=(x_val, y_val),
)

Training Progress:
Train on 5055891 samples, validate on 1263972 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

KeyboardInterrupt: 