### Downloading and exploring the dataset

In [1]:
import pandas as pd
import nltk

In [2]:
df = pd.read_csv("../raw_data/go_emotions_dataset.csv")
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
df.shape

(211225, 31)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   id                    211225 non-null  object
 1   text                  211225 non-null  object
 2   example_very_unclear  211225 non-null  bool  
 3   admiration            211225 non-null  int64 
 4   amusement             211225 non-null  int64 
 5   anger                 211225 non-null  int64 
 6   annoyance             211225 non-null  int64 
 7   approval              211225 non-null  int64 
 8   caring                211225 non-null  int64 
 9   confusion             211225 non-null  int64 
 10  curiosity             211225 non-null  int64 
 11  desire                211225 non-null  int64 
 12  disappointment        211225 non-null  int64 
 13  disapproval           211225 non-null  int64 
 14  disgust               211225 non-null  int64 
 15  embarrassment    

In [5]:
df.columns

Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')

In [6]:
# exploring unclear emotions

df.iloc[1]

id                                                                eemcysk
text                     >sexuality shouldn’t be a grouping category I...
example_very_unclear                                                 True
admiration                                                              0
amusement                                                               0
anger                                                                   0
annoyance                                                               0
approval                                                                0
caring                                                                  0
confusion                                                               0
curiosity                                                               0
desire                                                                  0
disappointment                                                          0
disapproval                           

In [7]:
#Transforming unclear emotions from booleans to 1 or 0
df["example_very_unclear"] = df["example_very_unclear"].astype(int)
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Frequency of the emotions types

In [8]:
df[['example_very_unclear','admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral']].sum().sort_values(ascending=False)

neutral                 55298
approval                17620
admiration              17131
annoyance               13618
gratitude               11625
disapproval             11424
curiosity                9692
amusement                9245
realization              8785
optimism                 8715
disappointment           8469
love                     8191
anger                    8084
joy                      7983
confusion                7359
sadness                  6758
caring                   5999
excitement               5629
surprise                 5514
disgust                  5301
desire                   3817
example_very_unclear     3411
fear                     3197
remorse                  2525
embarrassment            2476
nervousness              1810
pride                    1302
relief                   1289
grief                     673
dtype: int64

In [9]:
#checking unclear examples

df[df['example_very_unclear']== True].sum()

id                      eemcyskee0ai3ted0rtl7eei8tjaee8zjasedyc9jkee7w...
text                     >sexuality shouldn’t be a grouping category I...
example_very_unclear                                                 3411
admiration                                                              0
amusement                                                               0
anger                                                                   0
annoyance                                                               0
approval                                                                0
caring                                                                  0
confusion                                                               0
curiosity                                                               0
desire                                                                  0
disappointment                                                          0
disapproval                           

### Cleaning the dataset - Text Preprocessing

#### pre-cleaning operations 

In [10]:
# dealing with lowercase + numbers + punctuation/symbols + strip
import string

def basic_cleaning(text):
    text = text.lower()
    text = ''.join(char for char in text if not char.isdigit())
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '') 
    
    text = text.strip()
    
    return text

In [11]:
df['clean_text'] = df.text.apply(basic_cleaning)
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text
0,eew5j0j,That game hurt.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,that game hurt
1,eemcysk,>sexuality shouldn’t be a grouping category I...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sexuality shouldn’t be a grouping category it ...
2,ed2mah1,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,you do right if you dont care then fuck em
3,eeibobj,Man I love reddit.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,man i love reddit
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,name was nowhere near them he was by the falcon


#### preprocessing techniques 

In [12]:
# Tokenize (Stopwords were not removed for sentimental analysis purporse)

from nltk import word_tokenize

def tokenize (text):
    tokenized = word_tokenize(text)
    
    return tokenized

In [13]:
df['clean_text'] = df.clean_text.apply(tokenize)
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text
0,eew5j0j,That game hurt.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"[that, game, hurt]"
1,eemcysk,>sexuality shouldn’t be a grouping category I...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[sexuality, shouldn, ’, t, be, a, grouping, ca..."
2,ed2mah1,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[you, do, right, if, you, dont, care, then, fu..."
3,eeibobj,Man I love reddit.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[man, i, love, reddit]"
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[name, was, nowhere, near, them, he, was, by, ..."


In [14]:
#Lemmatize 
#(Technique used to find the root of words, in order to group them by their meaning rather than by their exact form)

from nltk.stem import WordNetLemmatizer

def lemma(text):
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    lemmatized_string = " ".join(lemmatized)
   
    return lemmatized_string


In [15]:
df['clean_text'] = df.clean_text.apply(lemma)
df.head(10)

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text
0,eew5j0j,That game hurt.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,that game hurt
1,eemcysk,>sexuality shouldn’t be a grouping category I...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sexuality shouldn ’ t be a grouping category i...
2,ed2mah1,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,you do right if you dont care then fuck em
3,eeibobj,Man I love reddit.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,man i love reddit
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,name wa nowhere near them he wa by the falcon
5,eespn2i,Right? Considering it’s such an important docu...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,right considering it ’ s such an important doc...
6,eczuekb,"He isn't as big, but he's still quite popular....",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,he isnt a big but he still quite popular ive h...
7,ed5tx8y,That's crazy; I went to a super [RELIGION] hig...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,thats crazy i went to a super religion high sc...
8,ef961hv,that's adorable asf,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,thats adorable asf
9,edl7cr3,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sponge blurb pub quaw haha gurr ha aaa finale ...


#### Function for cleaning and preprocessing the dataset

In [17]:

def cleaning_and_preprocessing(text):
    text = text.lower()
    text = ''.join(char for char in text if not char.isdigit())
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '') 
    
    text = text.strip()
    
    tokenized = word_tokenize(text)
    
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized] # Lemmatize
    lemmatized_string = " ".join(lemmatized)
    
    return lemmatized_string

In [18]:
df['clean_text_2'] = df.text.apply(cleaning_and_preprocessing)


In [19]:
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text,clean_text_2
0,eew5j0j,That game hurt.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,that game hurt,that game hurt
1,eemcysk,>sexuality shouldn’t be a grouping category I...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,sexuality shouldn ’ t be a grouping category i...,sexuality shouldn ’ t be a grouping category i...
2,ed2mah1,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,you do right if you dont care then fuck em,you do right if you dont care then fuck em
3,eeibobj,Man I love reddit.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,man i love reddit,man i love reddit
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,name wa nowhere near them he wa by the falcon,name wa nowhere near them he wa by the falcon
