In [1]:
import pickle
# Restore the pre-processed corpus and its labels from disk.
# Caution: unpickling executes arbitrary code, so only load files produced by
# this project's own preprocessing step.
with open('cleaned_texts.pickle', 'rb') as texts_file:
    texts = pickle.load(texts_file)

with open('labels.pickle', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [2]:
# Keep only the first 401 items of every entry in `texts`.
# NOTE(review): if the entries are plain strings this cuts at 401 *characters*,
# not words, which is far shorter than a 400-token sequence; confirm the limit is intended.
shortened_texts = [text[:401] for text in texts]

In [3]:
# Class names in a fixed order: a label's position in this list is its integer id.
classes = [
    "fake", "satire", "bias", "conspiracy", "state", "junksci",
    "hate", "clickbait", "unreliable", "political", "reliable",
]
# Translate every string label into its integer id.
# list.index raises ValueError for a label that is not listed in `classes`.
cat_labels = [classes.index(label) for label in labels]

In [4]:
# Tokenizer vocabulary cap (passed as num_words when a Tokenizer is created).
MAX_NB_WORDS = 100000
# Length every entry is padded / truncated to.
MAX_SEQUENCE_LENGTH = 400
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.2
# Dimensionality of the pre-trained word vectors; also selects the GloVe file below.
EMBEDDING_DIM = 100
# Despite the name this is the path of the GloVe *file* matching EMBEDDING_DIM.
GLOVE_DIR = "/home/timothy/simple-news-classifier/glove/glove.6B/glove.6B.{}d.txt".format(EMBEDDING_DIM)

In [6]:
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# Print the version string of the imported keras package.
print("[i] Using Keras version",keras.__version__)

Using TensorFlow backend.


[i] Using Keras version 2.2.0


In [7]:
# Load the cached Tokenizer when it exists; otherwise fit a new one on
# `shortened_texts` and cache it. This replaces the former "uncomment this chunk"
# workflow, so the cell also works on a fresh checkout without tokenizer.pickle.
# Delete tokenizer.pickle to force a rebuild.
# Caution: unpickling executes arbitrary code, so only load a tokenizer.pickle
# that this notebook wrote itself.
TOKENIZER_PATH = 'tokenizer.pickle'
if os.path.exists(TOKENIZER_PATH):
    with open(TOKENIZER_PATH, 'rb') as handle:
        tokenizer = pickle.load(handle) # load a previously generated Tokenizer
else:
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(shortened_texts)
    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("[i] Saved word tokenizer to file: tokenizer.pickle")

In [8]:
# Full word -> integer-id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Encode every shortened text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(shortened_texts)
print('[i] Found %s unique tokens.' % len(word_index))
# Bring every sequence to MAX_SEQUENCE_LENGTH entries; shorter ones are padded
# at the end (padding='post').
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

[i] Found 1406265 unique tokens.


In [9]:
# One-hot encode the integer class ids. num_classes is pinned to len(classes) so
# the label width always equals the number of classes, instead of being inferred
# from the largest id that happens to occur in the data.
labels = to_categorical(np.asarray(cat_labels), num_classes=len(classes))
print('[i] Shape of data tensor:', data.shape)
print('[i] Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation. A dedicated, seeded
# generator makes the train/validation split repeatable on a fresh run without
# touching NumPy's global random state.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split on an explicit boundary index. The previous negative slices
# (data[:-n] / data[-n:]) invert the split when n == 0: the training set becomes
# empty and the validation set receives every sample.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
nb_train_samples = data.shape[0] - nb_validation_samples
x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

# Column sums of the one-hot labels give the per-class sample counts.
print('[i] Number of entries in each category:')
print("[+] Training:",y_train.sum(axis=0))
print("[+] Validation:",y_val.sum(axis=0))

[i] Shape of data tensor: (6319863, 400)
[i] Shape of label tensor: (6319863, 11)
[i] Number of entries in each category:
[+] Training: [ 634232.   89155.  801751.  586909.       0.   81467.   48805.  171146.
  222371. 1098503. 1321552.]
[+] Validation: [158692.  22281. 200336. 146752.      0.  20454.  12024.  42607.  54822.
 275045. 330959.]


In [10]:
# Sanity check: display the first sample and its label after shuffling.
first_sequence, first_label = data[0], labels[0]
print("Tokenized sequence:\n", first_sequence)
print("One-hot label:", first_label)

Tokenized sequence:
 [  553 14420 92705   344   640 99046  5138   400 22194  2002 48948   344
  8923  2162 64524     4 13988 31022 18986  1189 10075  1789 29237 21718
  1102  1189    66   840     4 24777 29237 16545  2517  2454  2517   686
 35359  1102  2577  4938   657 54213  2171  9863  2162 26873  9170  1189
 10075   553 14420   365     4  9034  2171  2162  1872     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0

In [19]:
# Build the word -> vector lookup from the GloVe text file. Each line is split on
# whitespace: the first field is the word, the remaining fields are its vector.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit UTF-8 encoding keeps the read independent of the platform's default.
with open(GLOVE_DIR, encoding="utf-8") as f:
    print("[i] (long) Loading GloVe from:",GLOVE_DIR,"...",end="")
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")

# One row per tokenizer index, plus one extra row for index 0. Rows start out as
# uniform random values in [0, 1) and are overwritten where GloVe has the word.
# NOTE(review): the matrix covers the whole word_index, although a Tokenizer
# created with num_words=MAX_NB_WORDS presumably never emits ids beyond that cap.
# Shrinking it would also require changing the Embedding layer's input size.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words missing from GloVe keep their random initial row (they are NOT zeroed).
print(" Completed!")

[i] (long) Loading GloVe from: /home/timothy/simple-news-classifier/glove/glove.6B/glove.6B.100d.txt ...Done.
[+] Proceeding with Embedding Matrix... Completed!


In [20]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding lookup initialised from embedding_matrix; trainable=True lets the
# vectors be updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)

embedded_sequences = embedding_layer(sequence_input)

In [21]:
# Three parallel convolutions over the embedded sequence, with kernel sizes
# 3, 5 and 7. The historical names l_conv_3/4/5 do not match the kernel sizes.
l_conv_3, l_conv_4, l_conv_5 = [
    Conv1D(filters=128, kernel_size=width, activation='relu')(embedded_sequences)
    for width in (3, 5, 7)
]

# Join the branch outputs along axis 1. The branches differ in length on that
# axis (398 and 396 for the first two, per the model summary), so they are
# stacked end to end rather than feature-wise.
conv_branches = [l_conv_3, l_conv_4, l_conv_5]
l_conv = Concatenate(axis=1)(conv_branches)

In [22]:
# Downsample the concatenated feature maps, then regularise with dropout.
l_pool = MaxPooling1D(pool_size=4)(l_conv)
l_drop = Dropout(rate=0.3)(l_pool)
# Despite its name, l_flat is a global average over the remaining steps, not a Flatten.
l_flat = GlobalAveragePooling1D()(l_drop)
l_dense = Dense(units=64, activation='relu')(l_flat)
# One softmax output per class; 11 follows the number of entries in `classes`.
preds = Dense(units=11, activation='softmax')(l_dense)

In [23]:
# Wire the input tensor to the softmax output to obtain the trainable model.
model = Model(inputs=sequence_input, outputs=preds)

# Adadelta with a non-default learning rate and decay.
# NOTE(review): decay=0.1 presumably shrinks the learning rate on every update
# step, which would make it tiny long before the first epoch ends; confirm it is intended.
adadelta = optimizers.Adadelta(lr=2.0, rho=0.95, epsilon=None, decay=0.1)

model.compile(
    optimizer=adadelta,
    loss='categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 400, 100)     140626600   input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 398, 128)     38528       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 396, 128)     64128       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (

In [25]:
# Train for 30 epochs in batches of 128 samples, evaluating on the held-out
# validation split; the object returned by fit() is kept in model_log.
print("Training Progress:")
model_log = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=30,
    validation_data=(x_val, y_val),
)

Training Progress:
Train on 5055891 samples, validate on 1263972 samples
Epoch 1/30
 312192/5055891 [>.............................] - ETA: 46:16 - loss: 1.6732 - acc: 0.4101

KeyboardInterrupt: 