# Setup

In [26]:
import pandas as pd
import nltk

In [27]:
for pkg in ['stopwords', 'punkt', 'wordnet']:
  nltk.download(pkg)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading data

In [43]:
import re

# Split rows of train.csv into [text, label] if possible
def split_text_label(text):
  # The first group is the text, the second is the label at the end
  match = re.match(r'^(.*)\s+?([^\s]+)$', text)
  
  if not match: raise ValueError(f'format wrong arg={text}')
  
  text = match.group(1)
  label = match.group(2)
  
  if label == '1':
    label = 1.0
  elif label == '0':
    label = 0.0
  elif label == 'label':
    return None # There's a line "content label" in the data
  else:
    raise ValueError(f'label wrong label={label}, text={text}')
  
  return [label, text.strip()]

def mapl(f, list):
  '''Map a list eagerly'''
  return [f(elem) for elem in list]

def load_csv(file_name):
  # Data obtained from https://www.kaggle.com/c/fakenewskdd2020/data
  with open(f'/content/drive/MyDrive/fakenewskdd2020/{file_name}.csv', 'r') as csv:
    next(csv) # Skip the header with the column titles
    return list(csv)

In [None]:
train_raw = mapl(split_text_label, load_csv('train'))
train_raw = [res for res in train_raw if res]

X_train_raw = mapl(lambda text_label: text_label[1], train_raw)
y_train = mapl(lambda text_label: text_label[0], train_raw)
X_test_raw = mapl(lambda line: re.sub(re.compile('^\\d+\t'), '', line), load_csv('test'))
y_test = mapl(lambda line: float(re.sub(re.compile('^\\d+,|\n'), '', line)), load_csv('sample_submission'))

assert len(X_train_raw) == len(y_train)
assert len(X_test_raw) == len(y_test)

# Processing text

In [40]:
en_stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.WordNetLemmatizer()

In [50]:
# Copied from Angad's LDA_Demo.ipynb
def preprocess_text(document):
  # Remove all the special characters
  document = re.sub(r'\W', ' ', str(document))

  # remove all single characters
  document = re.sub(r'\s+\w\s+', ' ', document)

  # Remove single characters from the start
  document = re.sub(r'^\w\s+', ' ', document)

  # Substituting multiple spaces with single space
  document = re.sub(r'\s+', ' ', document, flags=re.I)

  # Converting to Lowercase
  document = document.lower()

  # Lemmatization
  tokens = document.split()
  tokens = [stemmer.lemmatize(word) for word in tokens]
  tokens = [word for word in tokens if len(word) > 3 and word not in en_stopwords]

  return tokens

In [None]:
X_train = mapl(preprocess_text, X_train_raw)
X_test = mapl(preprocess_text, X_test_raw)
X_train, X_test

# Training model

In [51]:
import tensorflow as tf
import tensorflow.keras as tfkeras

model = tfkeras.Sequential([
  tfkeras.layers.Conv1D
])

TypeError: ignored

# Testing model