In [1]:
# import necessary modules
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import html
import emoji
from string import punctuation as punctuations
from nltk.corpus import stopwords
from collections import defaultdict
from wordcloud import STOPWORDS

%matplotlib inline

In [None]:
data = pd.read_csv("congressional_tweet_training_data.csv") # if training data

In [50]:
data = pd.read_csv("congressional_tweet_test_data.csv") # if test data

In [3]:
downloads = ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords')
for download in downloads:
    nltk.download(download)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Interpreting byte-strings and unescaping HTML sequences

In [84]:
def decode(tweet):
    return html.unescape(eval(tweet).decode('utf-8'))

data.full_text = data.full_text.map(decode)

#### Identifying features to incorporate into learning

In [85]:
import re # regex

def count_chars(tweet):
    return len(tweet)

def count_words(tweet):
    return len(tweet.split())

def count_htags(htags):
    return len(htags.split())

def count_mentions(tweet):
    mention_matches = re.findall(r'(@[A-Za-z0-9]*)', tweet)
    return len(mention_matches)

def get_mentions(tweet):
    return re.findall(r'(@[A-Za-z0-9]*)', tweet)

# Adding some features
data['char_count'] = data["full_text"].apply(lambda tweet:count_chars(tweet))
data['word_count'] = data["full_text"].apply(lambda tweet:count_words(tweet))
data['htag_count'] = data["hashtags"].apply(lambda htags:count_htags(htags))
data['mention_count'] = data["full_text"].apply(lambda tweet:count_mentions(tweet))
data['mentions'] = data['full_text'].apply(lambda tweet:get_mentions(tweet))

#### Cleaning

The sequence was as follows:

- Populating any empty tweet cells
- Tokenization
- Part-of-speech tagging
- Lemmatization
- Eliminating stopwords (in English)
- Eliminating unwanted tokens, including:
    - Emojis
    - Hyperlinks
    - Numbers
    - Twitter noise: RT/VIDEO/AUDIO
    - Other HTML characters

In [86]:
# fill NAN cells with empty strings
data.full_text = data.full_text.fillna('').apply(nltk.word_tokenize)

In [87]:
data.full_text = data.full_text.apply(nltk.pos_tag)

In [88]:
wnl = WordNetLemmatizer()

# nltk to wordnet tag
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def pos_tagger(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [90]:
# Lemmatization with our in-house POS tagger. How fancy!
for index1 in tqdm(range(len(data.full_text))):
    tagged_pairs = data.full_text[index1]
    for index2 in range(len(tagged_pairs)):
        tagged_pair = tagged_pairs[index2]
        word, tag = tagged_pair
        try:
            word = wnl.lemmatize(word, pos_tagger(tag))
        except KeyError:
            pass
        data.full_text[index1][index2] = word

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 265000/265000 [01:35<00:00, 2763.77it/s]


In [92]:
stopwords = set(stopwords.words('english'))

for index in tqdm(range(len(data.full_text))):
    words = data.full_text[index]
    for word in words:
        if word in stopwords:
            data.full_text[index].remove(word)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 265000/265000 [00:13<00:00, 18959.00it/s]


In [93]:
# Filtering punctuations, emoji, and other undesirable tokens
def is_punctuation(string: str) -> bool:    
    for ch in string:
        if ch in punctuations:
            return True
    return False


def is_emoji(string: str) -> bool:
    for ch in string:
        if ch in emoji.UNICODE_EMOJI:
            return True
    return False

def should_remove(string: str) -> bool:
    """
    Flags a string that is either:
        - a number
        - less than 2 characters long
        - a punctuation
        - a link
        - an emoji
    """
    return string.isnumeric() or \
            len(string) < 2 or \
            string == 'RT' or \
            string == 'amp' or \
            string == 'http' or \
            string == 'https' or \
            string == 'VIDEO' or \
            string == 'AUDIO' or \
            is_punctuation(string) or \
            re.search(r'^https?:\/\/.*[\r\n]*', string, flags=re.MULTILINE) is not None or \
            re.search(r'^http?:\/\/.*[\r\n]*', string, flags=re.MULTILINE) is not None or \
            is_emoji(string)  



In [94]:
# Going through each tweet and cleaning according to the prescriptions above
data['text_clean'] = [''] * len(data.full_text)
for index in tqdm(range(len(data.full_text))):
    words = data.full_text[index]
    clean_string = ''
    for word in words:
        if not should_remove(word):
            clean_string += word + ' '
    data.text_clean[index] = clean_string[:-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text_clean[index] = clean_string[:-1]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 265000/265000 [11:06<00:00, 397.69it/s]


#### Generating bigrams to incorporate as features

In [79]:
def generate_ngrams(text, n_gram=1):
    token = [token for token in text.lower().split(' ') if token != '' if token not in STOPWORDS]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]

In [97]:
data['bigrams'] = data["text_clean"].apply(lambda tweet:generate_ngrams(tweet, n_gram=2))

In [82]:
data.to_csv("cleaned_test_data.csv", index=False) # if test data

In [98]:
data.to_csv("cleaned_training_data.csv", index=False) # if training data