In [3]:
import re
from string import punctuation

from nltk.tokenize.casual import reduce_lengthening, remove_handles
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm_notebook as tqdm_n

!python -m spacy download en

# Registers `progress_apply` on pandas objects (used by the tokenization cells).
# NOTE(review): instantiating tqdm_n() here appears to be what renders the
# empty progress widget in this cell's output.
tqdm_n().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# INTRNLP MCO: Twitter Emoji Prediction
# Preprocessing
In this step, we preprocess the raw tweet text into tokens.

Note: Some of the "magic" cells contain Bash shell commands; these may not work on all platforms. However, all cells important in producing the final output have been written in Python for portability.

In [92]:
!ls data-raw

Mapping.csv      OutputFormat.csv Test.csv         Train.csv


There are two files containing the tweets themselves: Test.csv and Train.csv. However, only Train.csv is labeled with corresponding emoji. We use Train.csv as our basis for the final preprocessed dataset.

In [93]:
# Read the labelled training set, discard its first column, and give the
# remaining two columns lowercase, descriptive names.
tweets = (
    pd.read_csv('data-raw/Train.csv')
    .iloc[:, 1:]
    .rename(columns={'TEXT': 'text', 'Label': 'emoji'})
)
tweets.head()

Unnamed: 0,text,emoji
0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,Been friends since 7th grade. Look at us now w...,2
3,This is what it looks like when someone loves ...,3
4,RT @user this white family was invited to a Bl...,3


# Cleaning tweets

Checking for null rows:

In [94]:
# Count missing values in each column.
tweets.isna().sum()

text     0
emoji    0
dtype: int64

None of the rows are null.

Before feeding the raw tweets into the tokenizer, we first replace every run of the bullet character &#x2022; (`U+2022`) with a single space. The bullet appears in several tweets in the dataset and tends to throw off tokenizers that don't recognize it.

In [95]:
# Collapse each run of bullet characters into a single space.
# `regex=True` must be explicit: since pandas 2.0 the default is a literal
# match, which would search for the two-character string "•+" and silently
# replace nothing.
tweets['text'] = tweets['text'].str.replace('\u2022+', ' ', regex=True)
tweets['text'].head()

0    Vacation wasted ! #vacation2017 #photobomb #ti...
1    Oh Wynwood, you’re so funny! : @user #Wynwood ...
2    Been friends since 7th grade. Look at us now w...
3    This is what it looks like when someone loves ...
4    RT @user this white family was invited to a Bl...
Name: text, dtype: object

Saving the tweets to file

In [96]:
# Write the cleaned tweets; index=False leaves the row index out of the CSV.
# NOTE(review): assumes the data-clean/ directory already exists.
tweets.to_csv('data-clean/tweets.csv', index=False)

## Tokenization

We use SpaCy's `en_core_web_sm` language model as the basis for our tokenizer.
SpaCy's documentation describes this model:
> English multi-task CNN trained on OntoNotes. Assigns context-specific token vectors, POS tags, dependency parse and named entities.

We load the model as a pipeline of operations. By default, a SpaCy language processing pipeline includes:

|Part|Description|
|---|---|
|tokenizer|Segment text into tokens.|
|tagger|Assign part-of-speech tags.|
|parser|Assign dependency labels.|
|ner|Detect and label named entities.|
|textcat|Assign document labels.|

To shorten processing time, we disable `parser`, `ner`, and `textcat`.

In [111]:
# Load the model by its full package name, matching the description above;
# the 'en' shortcut link is no longer supported in spaCy v3.
# The disabled components are not needed for tokenization and lemmatization.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'textcat'])

We store the tokens in a tabular format, with each token assigned the index of the tweet it originated from.

In [112]:
# Run the spaCy pipeline on every tweet, then flatten the result into one row
# per token, tagged with the index of the tweet it came from.
# A direct comprehension replaces `.apply(pd.Series).stack()`, which built one
# Series per tweet plus a NaN-padded wide frame before stacking it back down.
tokens = pd.DataFrame(
    [
        (tweet_idx, token)
        for tweet_idx, doc in tweets['text'].progress_apply(nlp).items()
        for token in doc
    ],
    columns=['index', 'token'],
)
tokens.head()

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))

Unnamed: 0,index,token
0,0,Vacation
1,0,wasted
2,0,!
3,0,#
4,0,vacation2017


Extracting only the lemma of each token.

In [113]:
# Swap every spaCy token object for its lemma string.
tokens['token'] = [tok.lemma_ for tok in tokens['token']]
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,!
3,0,#
4,0,vacation2017


Saving tokens to file.

In [114]:
# Persist the lemmatised tokens; the cleaning section reloads this file.
tokens.to_csv('data-clean/tokens_spacy.csv', index=False)

## Cleaning tokens

We now preprocess the SpaCy-obtained tokens.

In [115]:
# Reload the saved tokens as plain strings. `keep_default_na=False` stops
# tokens such as "nan" or "null" from being parsed as missing values.
tokens = pd.read_csv(
    'data-clean/tokens_spacy.csv',
    keep_default_na=False,
    dtype={'token': str},
)
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,!
3,0,#
4,0,vacation2017


Converting tokens to lowercase.

In [116]:
# Normalise case so that e.g. "Happy" and "happy" count as the same token.
tokens = tokens.assign(token=tokens['token'].str.lower())
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,!
3,0,#
4,0,vacation2017


Removing user mentions.

Mentions have already been anonymized in the dataset as "@user". The regular expression `@.+` is used to remove any token that begins with the `@` symbol followed by at least one more character; a bare `@` is not matched here.

In [117]:
# Keep only the rows whose token does not start with "@" plus further text.
tokens = tokens.loc[lambda frame: ~frame['token'].str.match('@.+')]
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,!
3,0,#
4,0,vacation2017


Removing whitespace tokens.

In [118]:
# Drop tokens that start with whitespace (spaCy emits runs of spaces/newlines
# as their own tokens). The pattern is a raw string because `\s` inside a
# normal string literal is an invalid escape sequence and triggers a warning.
tokens = tokens[~tokens['token'].str.match(r'\s+')]
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,!
3,0,#
4,0,vacation2017


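Removing punctuation tokens. Punctuation is defined by the Python standard library's `string.punctuation`. Because the pattern is only anchored at the start of each token, any token that *begins* with a punctuation character is removed, not just tokens made up solely of punctuation.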

In [27]:
# Display the characters treated as punctuation. The name imported at the top
# of the notebook is `punctuation`; `punc` was never defined and raised a
# NameError on a fresh kernel.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [119]:
# Build the character class with re.escape so `\`, `]`, `^` and `-` stay
# literal. Concatenating the raw string let its `\]` escape the bracket, so
# the class never actually contained a backslash.
# NOTE(review): str.match only anchors at the start, so any token that merely
# begins with punctuation (e.g. "'s") is dropped as well — confirm intended.
tokens = tokens[~tokens['token'].str.match('[' + re.escape(punctuation) + ']')]
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
4,0,vacation2017
6,0,photobomb
8,0,tired


We use the `reduce_lengthening` function, which does the following:
> Replace[s] repeated character sequences of length 3 or greater with sequences of length 3.

For instance, "yes" and "yesss" will be treated separately, while "yessss" will be counted as an instance of "yesss". This is so we can capture words with greater emphasis. Since we are doing sentiment analysis, we place great value on these variations of words that might imply stronger emotions than their more plain counterparts.

In [120]:
# Cap runs of a repeated character at three (e.g. "yessss" -> "yesss").
# `tokens` is a filtered slice at this point, so build a new frame with
# .assign rather than assigning into it, which avoids SettingWithCopyWarning.
tokens = tokens.assign(token=tokens['token'].apply(reduce_lengthening))
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
4,0,vacation2017
6,0,photobomb
8,0,tired


Saving tokens to file.

In [121]:
# Write the cleaned token table; index=False leaves the row index out of the CSV.
tokens.to_csv('data-clean/tokens_spacy_clean.csv', index=False)

<h1 style='background-color: red; color: white; padding: 0.5rem;'>DEPRECATED! Do not include in the final paper!</h1>

## Using the NRC Hashtag Emotion Lexicon

The NRC Hashtag Emotion Lexicon, automatically generated based on the presence of emotion hashtags in a corpus of tweets, provides mappings of emotions to unigrams. We use the Hashtag Emotion Lexicon to replace our tokens with their corresponding emotions.

For instance, if the original tweet contains
```
word_a word_b
```
where `word_a` is associated with `anticipation` and `fear`, and `word_b` is associated with `anger` and `trust`, then the final tokens will be
```
anticipation fear anger trust
```

In [15]:
# Start from the cleaned tokens on disk, read strictly as strings so that no
# token text is interpreted as a missing value.
tokens = pd.read_csv(
    'data-clean/tokens_spacy_clean.csv',
    keep_default_na=False,
    dtype={'token': str},
)
tokens.head()

Unnamed: 0,index,token
0,0,vacation
1,0,waste
2,0,vacation2017
3,0,photobomb
4,0,tired


In [16]:
# The lexicon is a tab-separated file without a header row. Only the first
# two columns are kept; the third is discarded.
emotions = pd.read_csv(
    'data-raw/NRC-Hashtag-Emotion-Lexicon-v0.2.txt',
    sep='\t',
    header=None,
    dtype=str,
    keep_default_na=False,
).drop(columns=2)
emotions.columns = ['emotion', 'word']
emotions.head()

Unnamed: 0,emotion,word
0,anticipation,crae
1,anticipation,#mycolour
2,anticipation,#vigilance
3,anticipation,#vigilant
4,anticipation,#hmmmmm


In [18]:
# Distinct emotion labels present in the lexicon.
emotions['emotion'].unique()

array(['anticipation', 'fear', 'anger', 'trust', 'surprise', 'sadness',
       'joy', 'disgust'], dtype=object)

In [19]:
# Word -> list of every emotion the lexicon associates with that word.
emotions2 = emotions['emotion'].groupby(emotions['word']).apply(list)
emotions2.head()

word
#&lt                                      [anticipation, surprise, joy]
#1                           [anticipation, fear, trust, surprise, joy]
#100thingsaboutme                                               [anger]
#100thingsthatmakemehappy                                         [joy]
#121212concert                                               [surprise]
Name: emotion, dtype: object

In [20]:
# Tweet index -> list of that tweet's cleaned tokens.
tokens2 = tokens['token'].groupby(tokens['index']).apply(list)
tokens2.head()

index
0    [vacation, waste, vacation2017, photobomb, tir...
1    [oh, wynwood, be, so, funny, wynwood, art, itw...
2    [be, friend, since, 7th, grade, look, at, now,...
3    [this, be, what, look, like, when, someone, lo...
4    [rt, this, white, family, be, invite, to, a, b...
Name: token, dtype: object

In [21]:
# Dry run on the first tweet: gather the lexicon emotions for each token.
sentence = tokens2[0]
new_words = []
for word in sentence:
    try:
        new_words.extend(emotions2[word])
    except KeyError:
        # The word has no lexicon entry, so it contributes no emotions.
        # Only KeyError is caught; a bare `except` would also hide real
        # errors and swallow KeyboardInterrupt.
        continue
new_words

['anticipation',
 'fear',
 'sadness',
 'joy',
 'anger',
 'disgust',
 'anger',
 'anticipation',
 'sadness',
 'joy',
 'fear',
 'sadness']

In [22]:
def replace_tokens_with_emotions(doc, lexicon=None):
    """Replace each token in ``doc`` with the emotions the lexicon lists for it.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single tweet.
    lexicon : mapping of str -> list of str, optional
        Word-to-emotions lookup supporting ``lexicon[token]`` and raising
        ``KeyError`` for unknown words. Defaults to the notebook-level
        ``emotions2`` Series.

    Returns
    -------
    list of str
        The emotions of every known token, concatenated in token order.
        Tokens without a lexicon entry contribute nothing.
    """
    if lexicon is None:
        lexicon = emotions2
    new_doc = []
    for token in doc:
        try:
            new_doc.extend(lexicon[token])
        except KeyError:
            # Unknown word: skip it. Only KeyError is caught so that genuine
            # errors (and KeyboardInterrupt) are no longer silently swallowed.
            continue
    return new_doc

In [23]:
# Apply the lexicon lookup to each tweet's token list, with a progress bar.
tokens3 = tokens2.progress_apply(replace_tokens_with_emotions)
tokens3.head()

HBox(children=(IntProgress(value=0, max=69999), HTML(value='')))




index
0    [anticipation, fear, sadness, joy, anger, disg...
1    [fear, surprise, fear, trust, anger, surprise,...
2    [fear, trust, trust, sadness, disgust, surpris...
3    [surprise, fear, trust, anticipation, surprise...
4    [anticipation, trust, sadness, surprise, antic...
Name: token, dtype: object

In [24]:
# Flatten the per-tweet emotion lists back into one row per emotion, tagged
# with the tweet index. A direct comprehension replaces
# `.progress_apply(pd.Series).stack()`, which built one Series per tweet plus
# a NaN-padded wide frame; tweets with no emotions yield no rows either way.
tokens4 = pd.DataFrame(
    [
        (tweet_idx, emotion)
        for tweet_idx, doc_emotions in tokens3.items()
        for emotion in doc_emotions
    ],
    columns=['index', 'token'],
)
tokens4.head()

HBox(children=(IntProgress(value=0, max=69999), HTML(value='')))




Unnamed: 0,index,token
0,0,anticipation
1,0,fear
2,0,sadness
3,0,joy
4,0,anger


In [25]:
# Write the emotion tokens; index=False leaves the row index out of the CSV.
tokens4.to_csv('data-clean/tokens_emotions.csv', index=False)