# Setup

In [2]:
import pandas as pd
import numpy as np
import keras
import nltk

In [2]:
# Run once on a fresh environment to download the NLTK data packages:
# for pkg in ['stopwords', 'punkt', 'wordnet']:
#   nltk.download(pkg)

# Loading data

In [9]:
import re

# Parse one raw row of train.csv into [label, text] if possible
def split_text_label(text):
  '''Split a raw train.csv line into its trailing label and leading text.

  The last whitespace-delimited token of the line is taken as the label and
  everything before it as the article text.

  Returns [label, text] where label is 1.0 or 0.0 and text is stripped of
  surrounding whitespace, or None when the label token is the literal
  'label' (the data contains a stray "content label" row).

  Raises ValueError when the line has no whitespace-separated trailing token
  or when that token is not '0', '1' or 'label'.
  '''
  # Group 1 captures the text, group 2 the final token (the label)
  parsed = re.match(r'^(.*)\s+?([^\s]+)$', text)
  if parsed is None:
    raise ValueError(f'format wrong arg={text}')

  text, label = parsed.groups()

  # There's a line "content label" in the data; signal it with None
  if label == 'label':
    return None

  if label not in ('0', '1'):
    raise ValueError(f'label wrong label={label}, text={text}')

  return [float(label), text.strip()]

def mapl(f, list):
  '''Apply f to each element of the given iterable and return the results as a list (eager map).'''
  mapped = []
  for item in list:
    mapped.append(f(item))
  return mapped

def load_csv(file_name, data_dir='fakenewskdd2020'):
  '''Read the raw lines of <data_dir>/<file_name>.csv, skipping the header row.

  file_name: base name of the file, without the '.csv' extension.
  data_dir: directory holding the file; defaults to the folder the Kaggle
    data was downloaded into.

  Returns the remaining lines as a list of strings, exactly as read (trailing
  newline characters are kept). An empty file yields [].
  '''
  # Data obtained from https://www.kaggle.com/c/fakenewskdd2020/data
  with open(f'{data_dir}/{file_name}.csv', 'r', encoding='utf8') as lines:
    # Skip the header with the column titles; the default stops an empty
    # file from leaking StopIteration to the caller
    next(lines, None)
    return list(lines)

In [10]:
# Parse every training row; split_text_label returns None for the stray
# "content label" row, which is dropped here
train_orig = [
  parsed
  for parsed in (split_text_label(line) for line in load_csv('train'))
  if parsed
]

# Each parsed row is [label, text]
X_train_orig = [text for _, text in train_orig]
y_train_orig = [label for label, _ in train_orig]

# Strip the leading "<digits><tab>" prefix from each test row
X_test_orig = [re.sub('^\\d+\t', '', line) for line in load_csv('test')]
# Strip the leading "<digits>," prefix and the newline, then parse the label.
# NOTE(review): these labels are read from sample_submission.csv, which is
# presumably a placeholder submission rather than ground truth - confirm
# before trusting any metric computed against y_test_orig
y_test_orig = [
  float(re.sub('^\\d+,|\n', '', line))
  for line in load_csv('sample_submission')
]

assert len(X_train_orig) == len(y_train_orig)
assert len(X_test_orig) == len(y_test_orig)

In [37]:
# Make sure there are an equal number of fake and true articles
def balance_pos_neg(X, y, random_state=None):
  '''Downsample to equally many positive and negative examples, then shuffle.

  X: list of texts.
  y: list of labels parallel to X (must support list.count); truthy labels
    are treated as positive, falsy ones as negative.
  random_state: optional seed forwarded to DataFrame.sample so the shuffle is
    reproducible. The default None keeps the shuffle random.

  The first n examples of each class (in input order) are kept, where
  n = min(y.count(0.0), y.count(1.0)).

  Returns (texts, labels): a list of texts and a boolean numpy array aligned
  with it.
  '''
  num_vals = min(y.count(0.0), y.count(1.0))

  # Collect at most num_vals texts per class, preserving input order
  pos, neg = [], []
  for i, text in enumerate(X):
    bucket = pos if y[i] else neg
    if len(bucket) < num_vals:
      bucket.append(text)

  # Associate each set of texts with their labels and put them together
  rows = [[1.0, text] for text in pos] + [[0.0, text] for text in neg]
  df = pd.DataFrame(rows, columns=['label', 'text'])
  # Shuffle so the classes are interleaved (from https://stackoverflow.com/a/34879805)
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  X = df['text'].to_list()
  # Explicit dtype keeps the result boolean even when nothing was selected
  y = np.array([bool(label) for label in df['label'].to_list()], dtype=bool)

  return (X, y)

In [38]:
# Downsample each split so it holds equally many fake and true articles;
# balance_pos_neg also shuffles and returns the labels as a numpy array
X_train_trimmed, y_train = balance_pos_neg(X_train_orig, y_train_orig)
X_test_trimmed, y_test = balance_pos_neg(X_test_orig, y_test_orig)

# Processing text

### Clean text

In [39]:
# Taken from Angad's LDA_Demo.ipynb
en_stop = set(nltk.corpus.stopwords.words('english'))
# NOTE: despite its name this is a WordNet lemmatizer, not a stemmer
stemmer = nltk.stem.WordNetLemmatizer()

def preprocess_text(document):
  '''Normalize a raw document into a list of lemmatized content tokens.

  document: any object; it is converted with str() before processing.

  Returns the lowercased, lemmatized tokens that are longer than 3 characters
  and are not in the English stopword set en_stop.
  '''
  # Remove all the special characters
  document = re.sub(r'\W', ' ', str(document))
  # remove all single characters
  document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
  # Remove a single character from the start. The anchor must be an unescaped
  # '^': the previous r'\^' looked for a literal caret, which the \W pass
  # above has already replaced, so the rule never fired
  document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
  # Substituting multiple spaces with single space
  document = re.sub(r'\s+', ' ', document)
  # Converting to Lowercase
  document = document.lower()
  # Lemmatization
  tokens = mapl(stemmer.lemmatize, document.split())
  # Keep only tokens longer than 3 characters that are not stopwords
  tokens = [word for word in tokens if len(word) > 3 and word not in en_stop]

  return tokens

In [40]:
# Turn every article of both splits into its list of cleaned tokens
X_train_clean = [preprocess_text(article) for article in X_train_trimmed]
X_test_clean = [preprocess_text(article) for article in X_test_trimmed]

### Encode text

In [34]:
# Much of the following is from https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/
from keras.preprocessing.text import Tokenizer
# Fit the vocabulary on the training documents only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_clean)

# Longest document across BOTH splits. The test split used to be measured
# twice, so any longer training document was cut down to the test maximum
# by pad_sequences
max_len = max(max(map(len, X_train_clean)), max(map(len, X_test_clean)))
# +1 because Keras word indices start at 1; index 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1
print('Max document length:', max_len)
print('Vocabulary size:', vocab_size)

# Map token lists to integer sequences, zero-padded at the end to max_len
encode = lambda texts: keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(texts),
    maxlen=max_len,
    padding='post'
)

X_train = encode(X_train_clean)
X_test = encode(X_test_clean)
print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

Max document length: 7812
Vocabulary size: 49272
X_train shape: (4028, 7812), X_test shape: (1234, 7812)


# Training model

In [41]:
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [42]:
# Embedding -> 1D convolution -> max pooling -> small dense classifier head,
# assembled layer by layer
model = keras.Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# Single sigmoid unit: one output per document for the binary label
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 7812, 100)         4927200   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7805, 32)          25632     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3902, 32)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 124864)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                1248650   
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 6,201,493
Trainable params: 6,201,493
Non-trainable params: 0
____________________________________________

In [43]:
# compile network: binary cross-entropy on the single sigmoid output, Adam
# optimizer, accuracy reported as the metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network; no validation data is passed, so the accuracy logged per epoch
# is measured on the training data itself
model.fit(X_train, y_train, epochs=10, verbose=2)
# NOTE(review): the save path still carries a 'changethis' placeholder -
# rename before relying on the saved artifact
model.save('model_7-10-11-33changethis')

Epoch 1/10
126/126 - 115s - loss: 0.7062 - accuracy: 0.4935
Epoch 2/10
126/126 - 108s - loss: 0.6796 - accuracy: 0.5598
Epoch 3/10
126/126 - 111s - loss: 0.5454 - accuracy: 0.7783
Epoch 4/10
126/126 - 117s - loss: 0.2219 - accuracy: 0.9536
Epoch 5/10
126/126 - 122s - loss: 0.1351 - accuracy: 0.9640
Epoch 6/10
126/126 - 122s - loss: 0.0978 - accuracy: 0.9677
Epoch 7/10
126/126 - 118s - loss: 0.0803 - accuracy: 0.9677
Epoch 8/10
126/126 - 119s - loss: 0.0678 - accuracy: 0.9685
Epoch 9/10
126/126 - 123s - loss: 0.0590 - accuracy: 0.9697
Epoch 10/10
126/126 - 120s - loss: 0.0553 - accuracy: 0.9717
INFO:tensorflow:Assets written to: model_7-10-11-33\assets


# Testing model

In [44]:
# evaluate on the encoded test split and report accuracy as a percentage
# NOTE(review): y_test is derived from sample_submission.csv; if that file
# holds placeholder labels rather than ground truth, this figure is not a
# meaningful accuracy - confirm the label source
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 49.432740
