## Cleaning Dataset
Note: at this point the data has already been labelled.

1. Remove web links
2. Replace non-alphanumeric characters (other than apostrophes, `$`, underscores and spaces) with a space
3. Replace contractions (e.g. I've -> I have)
4. Convert the text to lowercase

In [17]:
import re
import pandas as pd
import contractions

In [12]:
# Location of the input CSV (going by the notes above, the already-labelled data).
file = 'datasets/all_reddit_labelled.csv'
# Load the CSV into the DataFrame that the following cells add columns to.
df = pd.read_csv(file)

In [15]:
def remove_https_links(text):
    """Strip every ``http://`` or ``https://`` URL out of ``text``.

    A URL is taken to run from the scheme up to (not including) the next
    whitespace character. Everything else, including the whitespace
    around the URL, is returned unchanged.
    """
    url_pattern = re.compile(r'https?://\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

In [19]:
def replace_non_alphanumeric(text):
    """Replace each character that is not kept with a single space.

    Kept characters are word characters (letters, digits, underscore),
    the ASCII apostrophe, ``$`` and the space character. Everything else,
    including newlines and tabs, becomes one space per character, so runs
    of punctuation turn into runs of spaces.

    Typographic single quotes (U+2018 and U+2019) are converted to the
    ASCII apostrophe first. Without that, "don\u2019t" would come out as
    "don t" and the contraction could no longer be recognised.
    """
    # Treat curly single quotes exactly like the straight apostrophe.
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    return re.sub(r'[^\w\'\$ ]', ' ', text)

In [20]:
def replace_contractions(text):
    """Return ``text`` after running it through ``contractions.fix``.

    Per the notebook's notes, this expands contractions
    (e.g. "I've" -> "I have").
    """
    expanded = contractions.fix(text)
    return expanded

In [21]:
def normalize(text):
    """Clean one raw text entry and return it in lowercase.

    The cleaning steps run in a fixed order: web links are removed,
    non-alphanumeric characters are replaced, and contractions are
    expanded. The result is lowercased last.

    NOTE: a later cell in this notebook defines another function that is
    also called ``normalize``; once that cell has run, this name refers
    to the later function.
    """
    cleaning_steps = (
        remove_https_links,
        replace_non_alphanumeric,
        replace_contractions,
    )
    for step in cleaning_steps:
        text = step(text)
    return text.lower()

In [23]:
# Run the cleaning `normalize` on every entry of the 'text' column and keep
# the result in a new 'cleaned' column. This must run before the later cell
# that redefines `normalize` for tokenizing.
df['cleaned'] = df['text'].apply(normalize)

## Tokenize

In [27]:
import spacy
import unicodedata
import inflect
nlp = spacy.load('en_core_web_sm')
# Words that spaCy's default list would treat as stop words but that should
# be kept in the tokenized text.
STOPWORD_EXCEPTIONS = ["whatever", "whenever", "about", "nothing", "empty", "none", "more", "somewhere", "most", "not", "never"]
nlp.Defaults.stop_words -= set(STOPWORD_EXCEPTIONS)
# Also clear the flag on the vocabulary entries themselves: depending on the
# spaCy version, lexemes that already exist may keep the `is_stop` value that
# was computed before the default set was edited.
for _word in STOPWORD_EXCEPTIONS:
    nlp.vocab[_word].is_stop = False

In [28]:
def lemmatize_and_remove_stop_words(text):
    """Return the lemmas of the tokens in ``text`` that are worth keeping.

    ``text`` is parsed with the notebook-level ``nlp`` pipeline. A token's
    lemma is kept only if the token is not a stop word, is not pure
    whitespace, and the lemma is longer than one character.

    The whitespace check matters because runs of several spaces are
    tokenized as whitespace tokens; a run of two or more characters would
    otherwise pass the length filter and end up in the result.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and not token.is_space and len(token.lemma_) > 1
    ]

In [29]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words.

    Each word is decomposed with Unicode NFKD normalization, so accented
    letters are split into a base letter plus combining marks, and every
    character that cannot be encoded as ASCII is then discarded
    (e.g. "caf\u00e9" -> "cafe").

    Words that have no ASCII characters left are dropped from the result
    instead of being returned as empty strings, so the returned list may
    be shorter than ``words``.
    """
    ascii_words = []
    for word in words:
        decomposed = unicodedata.normalize('NFKD', word)
        ascii_word = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # Skip words that consisted solely of non-ASCII characters.
        if ascii_word:
            ascii_words.append(ascii_word)
    return ascii_words

In [30]:
def replace_numbers(words):
    """Spell out every all-digit word in a list of tokenized words.

    Words for which ``str.isdigit()`` is true are passed to inflect's
    ``number_to_words``; all other words are copied through unchanged.
    A new list is returned and ``words`` is not modified.

    NOTE(review): the spelled-out form comes straight from inflect and
    may contain punctuation such as commas or hyphens - confirm that is
    acceptable downstream.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

In [34]:
def normalize(text):
    """Tokenize cleaned text and return the tokens joined by single spaces.

    The text is lemmatized with stop words removed, non-ASCII characters
    are stripped from each token, and all-digit tokens are spelled out.

    NOTE: this definition reuses the name of the cleaning ``normalize``
    defined in an earlier cell and replaces it once this cell has run.
    """
    lemmas = lemmatize_and_remove_stop_words(text)
    tokens = replace_numbers(remove_non_ascii(lemmas))
    return ' '.join(tokens)

In [38]:
# Run the tokenizing `normalize` (the most recent definition of that name) on
# every entry of the 'cleaned' column and keep the result in 'tokenized'.
df['tokenized'] = df['cleaned'].apply(normalize)

In [41]:
# Write the DataFrame, including the added 'cleaned' and 'tokenized' columns,
# to disk; index=False leaves the row index out of the file.
df.to_csv('datasets/tokenized_text.csv', index=False)