# Setup

In [79]:
import pandas as pd
import numpy as np
import keras
import nltk

In [2]:
# for pkg in ['stopwords', 'punkt', 'wordnet']:
#   nltk.download(pkg)

# Loading data

In [37]:
import re

# Split rows of train.csv into [text, label] if possible
def split_text_label(text):
  # The first group is the text, the second is the label at the end
  match = re.match(r'^(.*)\s+?([^\s]+)$', text)
  
  if not match: raise ValueError(f'format wrong arg={text}')
  
  text = match.group(1)
  label = match.group(2)
  
  if label == '1':
    label = 1.0
  elif label == '0':
    label = 0.0
  elif label == 'label':
    return None # There's a line "content label" in the data
  else:
    raise ValueError(f'label wrong label={label}, text={text}')
  
  return [label, text.strip()]

def mapl(f, list):
  '''Map a list eagerly'''
  return [f(elem) for elem in list]

def load_csv(file_name):
  # Data obtained from https://www.kaggle.com/c/fakenewskdd2020/data
  with open(f'fakenewskdd2020/{file_name}.csv', 'r', encoding='utf8') as csv:
    next(csv) # Skip the header with the column titles
    return list(csv)

In [52]:
train_orig = mapl(split_text_label, load_csv('train'))
train_orig = [res for res in train_raw if res]

X_train_orig = mapl(lambda text_label: text_label[1], train_orig)
y_train_orig = mapl(lambda text_label: text_label[0], train_orig)
X_test_orig = mapl(lambda line: re.sub(re.compile('^\\d+\t'), '', line), load_csv('test'))
y_test_orig = mapl(lambda line: float(re.sub(re.compile('^\\d+,|\n'), '', line)), load_csv('sample_submission'))

assert len(X_train_orig) == len(y_train_orig)
assert len(X_test_orig) == len(y_test_orig)

### Make sure there are an equal number of fake and true articles

In [81]:
def split_pos_neg(X, y, num_vals):
  pos = [0] * num_vals
  neg = [0] * num_vals
  pos_ind, neg_ind = 0, 0
  for i in range(len(X)):
    if y[i]:
      if pos_ind < num_vals:
        pos[pos_ind] = X[i]
        pos_ind += 1
    else:
      if neg_ind < num_vals:
        neg[neg_ind] = X[i]
        neg_ind += 1
  
  # Associate each set of texts with their labels
  pos = mapl(lambda text: [1.0, text], pos)
  neg = mapl(lambda text: [0.0, text], neg)
  
  # Put them together into a DataFrame
  df = pd.DataFrame(pos + neg, columns=['label', 'text'])
  # Shuffle it just in case (from https://stackoverflow.com/a/34879805)
  df = df.sample(frac=1).reset_index(drop=True)
  
  X = df['text'].to_list()
  y = np.array(mapl(bool, df['label'].to_list()))

  return (X, y)

In [82]:
num_train = min(y_train_orig.count(0.0), y_train_orig.count(1.0))
num_test = min(y_test_orig.count(0.0), y_test_orig.count(1.0))

X_train_trimmed, y_train = split_pos_neg(X_train_orig, y_train_orig, num_train)
X_test_trimmed, y_test = split_pos_neg(X_test_orig, y_test_orig, num_test)

# Processing text

### Clean text

In [64]:
# Taken from Angad's LDA_Demo.ipynb
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.WordNetLemmatizer()

def preprocess_text(document):
  # Remove all the special characters
  document = re.sub(r'\W', ' ', str(document))
  # remove all single characters
  document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
  # Remove single characters from the start
  document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
  # Substituting multiple spaces with single space
  document = re.sub(r'\s+', ' ', document, flags=re.I)
  # Converting to Lowercase
  document = document.lower()
  # Lemmatization
  tokens = mapl(stemmer.lemmatize, document.split())
  tokens = [word for word in tokens if len(word) > 3  and word not in en_stop]

  return tokens

In [66]:
X_train_clean = mapl(preprocess_text, X_train_trimmed)
X_test_clean = mapl(preprocess_text, X_test_trimmed)

### Encode text

In [70]:
# Much of the following is from https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/

tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train_clean)

max_length = max([len(text) for text in X_train_clean])
vocab_size = len(tokenizer.word_index) + 1
print('Max document length:', length)
print('Vocabulary size:', vocab_size)

Max document length: 100000
Vocabulary size: 49272


In [71]:
def encode(tokenizer, lines, length):
  encoded = tokenizer.texts_to_sequences(lines)
  return keras.preprocessing.sequence.pad_sequences(encoded, maxlen=max_length, padding='pad')

# encode data
X_train = encode(tokenizer, X_train_trimmed, length)
X_test = encode(tokenizer, X_test_trimmed, length)
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (4028, 8718)
X_test shape:  (1234, 8718)


(array([[ 3463,  1440,   669, ...,     0,     0,     0],
        [   12,   674,  1574, ...,     0,     0,     0],
        [  340,  4364,     0, ...,     0,     0,     0],
        ...,
        [ 5145,   104, 49263, ...,     0,     0,     0],
        [  498,   524,   607, ...,     0,     0,     0],
        [12832,  5372,  3753, ...,     0,     0,     0]]),
 array([[ 1186,  1186,   600, ...,     0,     0,     0],
        [   81,   434,    93, ...,     0,     0,     0],
        [ 4744,  2133,  6652, ...,     0,     0,     0],
        ...,
        [   23,    16,    23, ...,     0,     0,     0],
        [  860, 29915,   393, ...,     0,     0,     0],
        [  258,   406, 14045, ...,     0,     0,     0]]))

# Training model

In [69]:
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

In [73]:
model = keras.Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8718, 100)         4927200   
_________________________________________________________________
conv1d (Conv1D)              (None, 8711, 32)          25632     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4355, 32)          0         
_________________________________________________________________
flatten (Flatten)            (None, 139360)            0         
_________________________________________________________________
dense (Dense)                (None, 10)                1393610   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 6,346,453
Trainable params: 6,346,453
Non-trainable params: 0
______________________________________________

In [83]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X_train, y_train, epochs=10, verbose=2)

Epoch 1/10
126/126 - 119s - loss: 0.6983 - accuracy: 0.4888
Epoch 2/10
126/126 - 129s - loss: 0.6931 - accuracy: 0.5084
Epoch 3/10
126/126 - 141s - loss: 0.6577 - accuracy: 0.6209
Epoch 4/10
126/126 - 137s - loss: 0.3132 - accuracy: 0.8875
Epoch 5/10
126/126 - 127s - loss: 0.1419 - accuracy: 0.9580
Epoch 6/10
126/126 - 128s - loss: 0.1043 - accuracy: 0.9625
Epoch 7/10
126/126 - 127s - loss: 0.0820 - accuracy: 0.9630
Epoch 8/10
126/126 - 127s - loss: 0.0664 - accuracy: 0.9685
Epoch 9/10
126/126 - 121s - loss: 0.0628 - accuracy: 0.9662
Epoch 10/10
126/126 - 119s - loss: 0.0615 - accuracy: 0.9675


<keras.callbacks.History at 0x1d23ba3d040>

# Testing model

In [84]:
# evaluate
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 50.081038
