# Twitter Preprocessor

## Algorithm

1. Normalise unicode
2. Remove accents
3. Strip html tags
4. Tokenise with spacy, removing URLs
5. Return a list of tokens that do not match the following criteria according to spaCy's parsing:

	- token is an emoji
	- token is in a list of text emoticons
	- token is a stop word
	- token is a punctuation mark
	- token is a quotation mark
	- token is a space
	- token is like a number
	- token is like a url
	- token starts with `pic.twitter.com`
    - token is classified as a MONEY entity
    - token is classified as a DATE entity
    - token is classified as a TIME entity
    - token is classified as a QUANTITY entity
	- token is only one character
	- token is `'s`
    
6. Remove any tokens that can be parsed as dates by Python's dateutil package.
7. Join the tokens into a space-separated string.

This still leaves a good deal of garbage, but the results are probably as clean as we can get Twitter data. The Twitter stoplist is the same as the standard WE1S stoplist, but with added contractions and common abbreviations on Twitter (a total of 629 stop words).

In [120]:
# Configuration: input and output locations of the tweet records
json_file = "project_data/json/2014-2017_humanities_tweets_deduped.json"
output_file = "project_data/json/2014-2017_humanities_tweets_scrubbed.json"
# Stop words: either a list of words or the path to a text file
STOPWORDS = "twitter_stoplist.txt"

In [121]:
import pandas as pd
import re
import spacy
import unicodedata
import ujson as json
from bs4 import BeautifulSoup
from dateutil.parser import parse
from ftfy import fix_text
from spacy.language import Language
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.symbols import ORTH, LEMMA, POS, TAG
from spacy.tokenizer import Tokenizer
from spacy.tokens import Token
from spacymoji import Emoji

# Text emoticons expressing a positive sentiment
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}
# Text emoticons expressing a negative sentiment
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}
# Every text emoticon, regardless of sentiment
EMOTICONS = emoticons_happy | emoticons_sad

# One or more consecutive characters from the emoji code point ranges below.
# NOTE: the last range runs from U+24C2 to U+1F251, so it also covers many
# code points that are not emoji.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

# Handle lemmatisation exceptions
# Maps a surface form to the tokenizer special-case attributes registered for
# it; here "humanities" is given the lemma "humanities" (tagged as a plural noun).
LEMMATIZATION_CASES = {
    "humanities": [{ORTH: u'humanities', LEMMA: u'humanities', POS: u'NOUN', TAG: u'NNS'}]
}
# NOTE(review): `nlp` is only assigned further down in this cell (spacy.load),
# so this loop relies on an `nlp` object left over from an earlier run. The
# tokenizer is also replaced after the model is loaded -- confirm that the
# special case is still registered on the tokenizer that is actually used.
for k, v in LEMMATIZATION_CASES.items():
    nlp.tokenizer.add_special_case(k, v)
    
# Import stopwords
# STOPWORDS is either a list of words or the path to a text file holding one
# stop word per line.
if isinstance(STOPWORDS, str):
    with open(STOPWORDS, 'r', encoding='utf-8') as f:
        # Strip surrounding whitespace and drop blank lines so that a trailing
        # newline does not register the empty string as a stop word
        STOPWORDS = [line.strip() for line in f if line.strip()]
# The custom list extends spaCy's built-in English stop words; it does not
# replace them.
# NOTE(review): `nlp` is only assigned further down in this cell (spacy.load),
# so this loop relies on an `nlp` object left over from an earlier run --
# confirm that the flags end up on the vocabulary of the model actually used.
for item in STOPWORDS:
    STOP_WORDS.add(item)
    nlp.vocab[item].is_stop = True

def skip_ents(doc, skip=('CARDINAL', 'DATE', 'QUANTITY', 'TIME')):
    """Merge each named-entity span of `doc` into a single token.

    Entities whose label is listed in `skip` are left untouched, so their
    tokens stay separate. The function is written as a pipeline component:
    it takes a document, modifies it in place and returns it.

    Args:
        doc: the document to retokenize; its entities (`doc.ents`) must
            already have been set.
        skip: entity labels that must not be merged.

    Returns:
        The same `doc` object, after merging.
    """
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            if ent.label_ in skip:
                continue
            # The merged token takes its tag and dependency from the span's
            # root token and its entity type from the entity label
            attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
            retokenizer.merge(ent, attrs=attrs)
    return doc

# Custom tokeniser that keeps URLs and hashtags as single tokens
def create_tokenizer(nlp):
    """Build a tokenizer for `nlp` that treats URLs and hashtags as whole tokens.

    The prefix, infix and suffix rules are the `Language.Defaults` ones; they
    have to be passed explicitly because a Tokenizer built from scratch does
    not include them. Any string matching spaCy's URL pattern or consisting
    solely of a hashtag (`#` followed by word characters, `_` or `-`) is kept
    as one token.

    Args:
        nlp: the language object whose vocabulary the tokenizer shares.

    Returns:
        A new `Tokenizer` instance.
    """
    # spaCy's regex matching all sorts of URLs
    from spacy.lang.tokenizer_exceptions import URL_PATTERN

    defaults = Language.Defaults
    # The URL regex, extended with an alternative ("|") for hashtags
    token_match_re = re.compile(URL_PATTERN + r'''|^(#[\w_-]+)$''')

    return Tokenizer(
        nlp.vocab,
        prefix_search=spacy.util.compile_prefix_regex(defaults.prefixes).search,
        suffix_search=spacy.util.compile_suffix_regex(defaults.suffixes).search,
        infix_finditer=spacy.util.compile_infix_regex(defaults.infixes).finditer,
        token_match=token_match_re.match,
    )

def remove_accents(text, method='unicode'):
    """Replace accented characters in `text` with their unaccented equivalents.

    The text is first decomposed with NFKD normalisation, which separates
    base characters from their combining marks.

    Args:
        text: the string to process.
        method: 'unicode' removes only the combining marks and keeps every
            other character; 'ascii' additionally drops every character
            that cannot be encoded as ASCII.

    Returns:
        The processed string.

    Raises:
        ValueError: if `method` is neither 'unicode' nor 'ascii'.
    """
    if method not in ('unicode', 'ascii'):
        msg = '`method` must be either "unicode" or "ascii", not {!r}'.format(method)
        raise ValueError(msg)
    decomposed = unicodedata.normalize('NFKD', text)
    if method == 'ascii':
        return decomposed.encode('ascii', errors='ignore').decode('ascii')
    return ''.join(c for c in decomposed if not unicodedata.combining(c))
        
def stripHtmlTags(html):
    """Return the text content of `html` with all markup removed.

    Args:
        html: a string that may contain HTML markup, or None.

    Returns:
        The concatenated text nodes of the parsed document, or None when
        `html` is None.
    """
    if html is None:
        return None
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines
    soup = BeautifulSoup(html, 'html.parser')
    return ''.join(soup.find_all(text=True))

# Entity types whose tokens are dropped from the cleaned tweet
_DROPPED_ENT_TYPES = ('MONEY', 'DATE', 'TIME', 'QUANTITY')


def _keep_token(token):
    """Return True if the parsed `token` should appear in the cleaned tweet."""
    text = token.text
    return (
        not token._.is_emoji
        and text not in EMOTICONS
        and not token.is_stop
        and not token.is_punct
        and not token.is_quote
        and not token.is_space
        and not token.like_num
        and not token.like_url
        and not text.startswith('pic.twitter.com')
        and token.ent_type_ not in _DROPPED_ENT_TYPES
        and text != "'s"
        and len(text) > 1
    )


def _is_date_like(text):
    """Return True if dateutil's fuzzy parser accepts `text` as a date."""
    try:
        parse(text, fuzzy_with_tokens=True)
    except Exception:
        # Any parsing failure means "not a date". Catching Exception rather
        # than using a bare `except` lets KeyboardInterrupt and SystemExit
        # propagate, so a long run can still be interrupted.
        return False
    return True


def preprocess(tweet):
    """Clean the text of one tweet and return it as a space-separated string.

    Steps: fix and normalise the unicode, remove accents, strip HTML tags,
    parse the text with the `nlp` pipeline, drop every token rejected by
    `_keep_token`, then drop every remaining token that dateutil can parse
    as a date.

    Args:
        tweet: the raw tweet text.

    Returns:
        The normalised forms of the surviving tokens joined by single
        spaces; spaces inside a token are replaced with underscores.
    """
    tweet = fix_text(tweet, normalization='NFC')
    tweet = remove_accents(tweet, method='unicode')
    tweet = stripHtmlTags(tweet).strip()
    doc = nlp(tweet)
    # Merged multi-word tokens contain spaces; join their parts with "_" so
    # each one stays a single unit in the space-separated output
    tokens = [
        token.norm_.strip().replace(' ', '_')
        for token in doc
        if _keep_token(token)
    ]
    return ' '.join(token for token in tokens if not _is_date_like(token))

# Load the language model
nlp = spacy.load('en_core_web_sm')
# Swap in the custom tokenizer so that URLs and hashtags stay single tokens
nlp.tokenizer = create_tokenizer(nlp)
# Add spacymoji to the pipeline
# It is placed first in the pipeline (first=True); preprocess() reads the
# `token._.is_emoji` attribute, presumably set by this component -- TODO confirm.
emoji = Emoji(nlp, merge_spans=False)
nlp.add_pipe(emoji, first=True)

# Add entity skipping to the pipeline
# It has to run after 'ner' because skip_ents reads the entities of the doc.
nlp.add_pipe(skip_ents, after='ner')

print('Preprocessor ready.')

Preprocessor ready.


In [122]:
%%capture output

%%time

# Load every record (one JSON object per line) into a list
with open(json_file, encoding='utf-8') as infile:
    # Blank lines are not valid JSON, so skip them instead of failing
    records = [json.loads(line) for line in infile if line.strip()]
# Write each record back out, one JSON object per line, with the cleaned text
with open(output_file, 'w', encoding='utf-8') as f:
    for row in records:
        row['tidy_tweet'] = preprocess(row['tweet'])
        # Build an identifier from the tweet's date and the path of its URL
        row['name'] = row['date'] + row['link'].replace('https://twitter.com/', '__').replace('/', '_')
        f.write(json.dumps(row) + '\n')
print('Done')

In [123]:
# Display what the previous cell stored in `output` via %%capture
# (its printed text and the %%time report)
output.show()

Done
CPU times: user 5h 42min 13s, sys: 3min 37s, total: 5h 45min 50s
Wall time: 13h 41min 51s


## Notes

On a 446 MB data file, the processing time was recorded as:

```
CPU times: user 5h 42min 13s, sys: 3min 37s, total: 5h 45min 50s
Wall time: 13h 41min 51s
```