## Hugging Face

In [127]:
import pickle


In [128]:
!pip install datasets



In [129]:
from datasets import load_dataset
dataset = load_dataset(
   'emotion')

Using custom data configuration default
Reusing dataset emotion (/Users/vprentice/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


In [130]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [131]:
import pandas as pd
df = pd.DataFrame(dataset['train'])

In [132]:
df['label'].value_counts()

1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: label, dtype: int64

In [133]:
df2 = pd.DataFrame(dataset['validation'])
df3 = pd.DataFrame(dataset['test'])

In [134]:
full_df = pd.concat([df,df2,df3])

In [135]:
full_df.shape

(20000, 2)

In [136]:
full_df.dtypes

text     object
label     int64
dtype: object

In [137]:
full_df['word_label'] = full_df['label'].map({0:'sad',1:'happy', 2:'love', 3:'angry', 4:'fear', 5:'surprised'})

In [138]:
full_df

Unnamed: 0,text,label,word_label
0,i didnt feel humiliated,0,sad
1,i can go from feeling so hopeless to so damned...,0,sad
2,im grabbing a minute to post i feel greedy wrong,3,angry
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,angry
...,...,...,...
1995,i just keep feeling like someone is being unki...,3,angry
1996,im feeling a little cranky negative after this...,3,angry
1997,i feel that i am useful to my people and that ...,1,happy
1998,im feeling more comfortable with derby i feel ...,1,happy


In [139]:
simplified_df = full_df[full_df['word_label'].isin(['sad','angry','happy'])]

In [140]:
simplified_df = simplified_df[['text', 'word_label']]

In [141]:
simplified_df

Unnamed: 0,text,word_label
0,i didnt feel humiliated,sad
1,i can go from feeling so hopeless to so damned...,sad
2,im grabbing a minute to post i feel greedy wrong,angry
4,i am feeling grouchy,angry
5,ive been feeling a little burdened lately wasn...,sad
...,...,...
1994,i can feel its suffering,sad
1995,i just keep feeling like someone is being unki...,angry
1996,im feeling a little cranky negative after this...,angry
1997,i feel that i am useful to my people and that ...,happy


In [142]:
simplified_df['source'] = 'HuggingFace'

In [143]:
simplified_df

Unnamed: 0,text,word_label,source
0,i didnt feel humiliated,sad,HuggingFace
1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
4,i am feeling grouchy,angry,HuggingFace
5,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...
1994,i can feel its suffering,sad,HuggingFace
1995,i just keep feeling like someone is being unki...,angry,HuggingFace
1996,im feeling a little cranky negative after this...,angry,HuggingFace
1997,i feel that i am useful to my people and that ...,happy,HuggingFace


In [144]:
simplified_df.to_csv('../raw_data/huggingface_happysadangry.csv')

## SemEval2018

In [146]:
semeval = pd.read_csv('../raw_data/emotion_data/2018-E-c-En-train-dev.csv')

In [147]:
semeval

Unnamed: 0,ID,Tweet,anger,joy,sadness
0,2017-En-21441,‚ÄúWorry is a down payment on a problem you ma...,0,0,0
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,1,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,1,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,1,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,0
...,...,...,...,...,...
7719,2018-En-01993,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,0
7720,2018-En-01784,Excited to watch #stateoforigin tonight! Come ...,0,1,0
7721,2018-En-04047,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1
7722,2018-En-03041,#ThingsIveLearned The wise #shepherd never tru...,0,0,0


In [148]:
semeval_anger = semeval[semeval['anger'] == 1]
semeval_anger['word_label'] = 'angry'
semeval_joy = semeval[semeval['joy'] == 1]
semeval_joy['word_label'] = 'happy'
semeval_sadness =semeval[semeval['sadness'] == 1]
semeval_sadness['word_label'] = 'sad'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [149]:
semeval_hsa = pd.concat([semeval_anger, semeval_joy, semeval_sadness])
semeval_hsa.reset_index(inplace=True)

In [150]:
semeval_hsa.drop(columns=['index','anger','joy','sadness'], inplace=True)

In [151]:
semeval_hsa.drop(columns=['ID'], inplace=True)

In [152]:
semeval_hsa.rename(columns={'Tweet':'text'}, inplace=True)

In [153]:
semeval_hsa

Unnamed: 0,text,word_label
0,@Max_Kellerman it also helps that the majorit...,angry
1,My roommate: it's okay that we can't spell bec...,angry
2,Rooneys fucking untouchable isn't he? Been fuc...,angry
3,@BossUpJaee but your pussy was weak from what ...,angry
4,S/O to the girl that just hit my car...not onl...,angry
...,...,...
8004,@ProfessorF @Mediaite @law_newz Childish tempe...,sad
8005,Still 19 days left before I go home to the Phi...,sad
8006,Literally hanging on by a thread need some tay...,sad
8007,was one moron driving his oversize tonka truc...,sad


In [154]:
semeval_hsa['source'] = 'SemEval-2018'

In [104]:
semeval_hsa.to_csv('../raw_data/semeval-2018_happysadangry.csv')

## Meld-Crowdflower

In [101]:
mc = pd.read_csv('../raw_data/emotion_data/meld-crowdflower_happysadangry.csv')

In [104]:
mc[mc['source'] == 'https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv']['sentiment'].unique()

array(['sadness', 'anger'], dtype=object)

In [93]:
mc = mc[['text', 'sentiment', 'source']]

In [94]:
mc.rename(columns={'sentiment':'word_label'}, inplace=True)
mc

Unnamed: 0,text,word_label,source
0,Layin n bed with a headache ughhhh...waitin o...,sadness,https://data.world/crowdflower/sentiment-analy...
1,Funeral ceremony...gloomy friday...,sadness,https://data.world/crowdflower/sentiment-analy...
2,"I should be sleep, but im not! thinking about ...",sadness,https://data.world/crowdflower/sentiment-analy...
3,@charviray Charlene my love. I miss you,sadness,https://data.world/crowdflower/sentiment-analy...
4,@kelcouch I'm sorry at least it's Friday?,sadness,https://data.world/crowdflower/sentiment-analy...
...,...,...,...
12271,"It's a money thing, we don't have any.",sadness,https://raw.githubusercontent.com/declare-lab/...
12272,"Uh, hang out?! How long?",sadness,https://raw.githubusercontent.com/declare-lab/...
12273,A week?,anger,https://raw.githubusercontent.com/declare-lab/...
12274,No!,anger,https://raw.githubusercontent.com/declare-lab/...


In [95]:
mc['source'].unique()

array(['https://data.world/crowdflower/sentiment-analysis-in-text/workspace/file?filename=text_emotion.csv',
       'https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv'],
      dtype=object)

In [96]:
mc['source'] = mc['source'].map({'https://data.world/crowdflower/sentiment-analysis-in-text/workspace/file?filename=text_emotion.csv':'CrowdFlower',
       'https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv':'MELD-Friends'})
mc

Unnamed: 0,text,word_label,source
0,Layin n bed with a headache ughhhh...waitin o...,sadness,CrowdFlower
1,Funeral ceremony...gloomy friday...,sadness,CrowdFlower
2,"I should be sleep, but im not! thinking about ...",sadness,CrowdFlower
3,@charviray Charlene my love. I miss you,sadness,CrowdFlower
4,@kelcouch I'm sorry at least it's Friday?,sadness,CrowdFlower
...,...,...,...
12271,"It's a money thing, we don't have any.",sadness,MELD-Friends
12272,"Uh, hang out?! How long?",sadness,MELD-Friends
12273,A week?,anger,MELD-Friends
12274,No!,anger,MELD-Friends


In [97]:
mc['word_label'] = mc['word_label'].map({'sadness':'sad', 'anger':'angry', 'happiness':'happy'})


In [98]:
mc['word_label'].unique()

array(['sad', 'happy', 'angry'], dtype=object)

In [99]:
mc.to_csv('../meld-and-crowdflower_happysadangry.csv')

## Combine

In [105]:
full_hsa_df = pd.concat([simplified_df, semeval_hsa, mc])
full_hsa_df

Unnamed: 0,text,word_label,source
0,i didnt feel humiliated,sad,HuggingFace
1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
4,i am feeling grouchy,angry,HuggingFace
5,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...
12271,"It's a money thing, we don't have any.",sad,MELD-Friends
12272,"Uh, hang out?! How long?",sad,MELD-Friends
12273,A week?,angry,MELD-Friends
12274,No!,angry,MELD-Friends


In [106]:
full_hsa_df.to_csv('../raw_data/combined_happysadangry.csv')

In [107]:
full_hsa_df['source'].unique()

array(['HuggingFace', 'SemEval-2018', 'CrowdFlower', 'MELD-Friends'],
      dtype=object)

In [108]:
full_hsa_df['word_label'].value_counts()

happy    14847
sad      13918
angry     6787
Name: word_label, dtype: int64

## Baseline Model 
Most frequent class in training set: Happy. 

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("../raw_data/combined_happysadangry.csv")

In [6]:
14847/df.shape[0] # Percent proba of Happy in training set

0.41761363636363635

## Preprocessing

Steps:
* Lowercase
* Dealing with Twitter usernames, numbers and punctuation
* Removing "stopwords"
* Tokenizing
* Stemming or Lemmatizing

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vprentice/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/vprentice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vprentice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Lower

In [41]:
df_preproc = df.copy()
df_preproc['preproc_text'] = df_preproc['text'].str.lower()

In [17]:
df_preproc

Unnamed: 0.1,Unnamed: 0,text,word_label,source,preproc_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...
...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,"it's a money thing, we don't have any."
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,"uh, hang out?! how long?"
35549,12273,A week?,angry,MELD-Friends,a week?
35550,12274,No!,angry,MELD-Friends,no!


### Remove Twitter handles

In [12]:
import re

In [13]:
username_pattern = r"@\w+"

In [15]:
def delete_username(text):
    new_text = re.sub(username_pattern, '', text)
    return new_text

In [42]:
df_preproc['preproc_text'] = df_preproc['preproc_text'].apply(delete_username)

In [36]:
df_preproc[df_preproc['source'] == 'SemEval-2018']

Unnamed: 0.1,Unnamed: 0,text,word_label,source,preproc_text
15267,0,@Max_Kellerman it also helps that the majorit...,angry,SemEval-2018,it also helps that the majority of nfl coach...
15268,1,My roommate: it's okay that we can't spell bec...,angry,SemEval-2018,my roommate: it's okay that we can't spell bec...
15269,2,Rooneys fucking untouchable isn't he? Been fuc...,angry,SemEval-2018,rooneys fucking untouchable isn't he? been fuc...
15270,3,@BossUpJaee but your pussy was weak from what ...,angry,SemEval-2018,but your pussy was weak from what i heard so ...
15271,4,S/O to the girl that just hit my car...not onl...,angry,SemEval-2018,s/o to the girl that just hit my car...not onl...
...,...,...,...,...,...
23271,8004,@ProfessorF @Mediaite @law_newz Childish tempe...,sad,SemEval-2018,childish temper #tantrums are all the left ...
23272,8005,Still 19 days left before I go home to the Phi...,sad,SemEval-2018,still 19 days left before i go home to the phi...
23273,8006,Literally hanging on by a thread need some tay...,sad,SemEval-2018,literally hanging on by a thread need some tay...
23274,8007,was one moron driving his oversize tonka truc...,sad,SemEval-2018,was one moron driving his oversize tonka truc...


### Remove numbers

In [43]:
def remove_num(text):
    return ''.join(word for word in text if not word.isdigit())
df_preproc['preproc_text'] = df_preproc['preproc_text'].apply(remove_num)

In [38]:
df_preproc[df_preproc['source'] == 'SemEval-2018']

Unnamed: 0.1,Unnamed: 0,text,word_label,source,preproc_text
15267,0,@Max_Kellerman it also helps that the majorit...,angry,SemEval-2018,it also helps that the majority of nfl coach...
15268,1,My roommate: it's okay that we can't spell bec...,angry,SemEval-2018,my roommate: it's okay that we can't spell bec...
15269,2,Rooneys fucking untouchable isn't he? Been fuc...,angry,SemEval-2018,rooneys fucking untouchable isn't he? been fuc...
15270,3,@BossUpJaee but your pussy was weak from what ...,angry,SemEval-2018,but your pussy was weak from what i heard so ...
15271,4,S/O to the girl that just hit my car...not onl...,angry,SemEval-2018,s/o to the girl that just hit my car...not onl...
...,...,...,...,...,...
23271,8004,@ProfessorF @Mediaite @law_newz Childish tempe...,sad,SemEval-2018,childish temper #tantrums are all the left ...
23272,8005,Still 19 days left before I go home to the Phi...,sad,SemEval-2018,still days left before i go home to the phili...
23273,8006,Literally hanging on by a thread need some tay...,sad,SemEval-2018,literally hanging on by a thread need some tay...
23274,8007,was one moron driving his oversize tonka truc...,sad,SemEval-2018,was one moron driving his oversize tonka truc...


### Remove Punctuation

In [26]:
import string 

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
def remove_punct(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '') 
    return text
df_preproc['preproc_text'] = df_preproc['preproc_text'].apply(remove_punct)

In [45]:
df_preproc

Unnamed: 0.1,Unnamed: 0,text,word_label,source,preproc_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...
...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,its a money thing we dont have any
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,uh hang out how long
35549,12273,A week?,angry,MELD-Friends,a week
35550,12274,No!,angry,MELD-Friends,no


### Remove Stopwords

In [52]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) - {'into', 'against', 'myself', 'doing', 'own', 'above', 'our', 'now', 'up', 'down', 'been', 'not', 'no', 'would', 'should', 'again', 'won', 'if', 'only', 'yours', 'your', 'you', 'ours', 'here', 'there', 'below', 'before'}
print(stop_words)

{"you're", 'wasn', 'can', "isn't", 'didn', "mightn't", 'having', 'o', 'shouldn', 'she', 'the', 'hadn', 'because', 'on', 'these', 've', 'does', 'such', 'its', 'they', 'him', "should've", 'a', 'yourselves', 'itself', 'than', 'we', 'during', 'same', 'their', 'he', 'any', 'through', 'was', 'themselves', "she's", "that'll", "hasn't", 'hers', 'm', 'too', "hadn't", 'do', 'ain', 'at', "shouldn't", 'between', "won't", 'will', 'has', "you'll", 'ourselves', 'most', 'this', 'and', 'so', 'isn', "needn't", 'where', "don't", 'i', "haven't", 'd', "you've", 'am', 'as', "mustn't", 'just', 'were', "aren't", 'weren', 'is', 'haven', 'off', 'himself', 'couldn', 'her', "it's", 'shan', 'what', 'that', 're', 'of', "wasn't", 'yourself', 'have', 'those', 'an', 'wouldn', 'more', 'to', 'needn', 'his', "couldn't", 'nor', 'but', 'very', 'being', 'aren', 'in', 'while', 'did', 'out', 'who', 'some', 'don', "shan't", 'under', 'when', 'further', 'each', 'over', "weren't", 'doesn', 'other', 'until', 'for', 'then', 'why', 

In [53]:
def remove_stopwords(text):
    word_tokens = word_tokenize(text) 
    word_list = [w for w in word_tokens if not w in stop_words]
    return ' '.join(word_list)

In [54]:
df_preproc['simple_clean_text'] = df_preproc['preproc_text'].apply(remove_stopwords)

In [55]:
df_preproc

Unnamed: 0.1,Unnamed: 0,text,word_label,source,preproc_text,simple_clean_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated,didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong,im grabbing minute post feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy,feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...,ive been feeling little burdened lately wasnt ...
...,...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,its a money thing we dont have any,money thing dont
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,uh hang out how long,uh hang long
35549,12273,A week?,angry,MELD-Friends,a week,week
35550,12274,No!,angry,MELD-Friends,no,no


### Combined Function

In [57]:
import re
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english')) - {'into', 'against', 'myself', 'doing', 'own', 'above', 'our',
                                                'now', 'up', 'down', 'been', 'not', 'no', 'would', 'should',
                                                'again', 'won', 'if', 'only', 'yours', 'your', 'you', 'ours',
                                                'here', 'there', 'below', 'before'}

def clean(text):
    # lowercase
    new_text = text.lower()
    
    # remove twitter handles
    new_text = re.sub(r"@\w+", '', new_text)
    
    # remove punctuation 
    for punctuation in string.punctuation:
        new_text = new_text.replace(punctuation, '')

    # remove numbers
    new_text = ''.join(word for word in new_text if not word.isdigit())
    
    return new_text

def remove_stopwords(text):
    word_list = [word for word in word_tokenize(text) if not word in stop_words]
    return ' '.join(word_list)

def lemma_text(text):
    lemmatized = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return ' '.join(lemmatized)

In [58]:
df_pre = df.copy()
df_pre['clean_text'] = df_pre['text'].apply(clean)
df_pre

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...
...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,its a money thing we dont have any
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,uh hang out how long
35549,12273,A week?,angry,MELD-Friends,a week
35550,12274,No!,angry,MELD-Friends,no


In [59]:
df_pre['simplified_text'] = df_pre['clean_text'].apply(remove_stopwords)
df_pre

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text,simplified_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated,didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong,im grabbing minute post feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy,feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...,ive been feeling little burdened lately wasnt ...
...,...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,its a money thing we dont have any,money thing dont
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,uh hang out how long,uh hang long
35549,12273,A week?,angry,MELD-Friends,a week,week
35550,12274,No!,angry,MELD-Friends,no,no


## Add angry data from Google Reddit Dataset

In [60]:
df_pre['clean_lemmas'] = df_pre['clean_text'].apply(lemma_text)
df_pre['simplified_lemmas'] = df_pre['simplified_text'].apply(lemma_text)
df_pre

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text,simplified_text,clean_lemmas,simplified_lemmas
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated,didnt feel humiliated,i didnt feel humiliated,didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...,go feeling hopeless damned hopeful around some...,i can go from feeling so hopeless to so damned...,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong,im grabbing minute post feel greedy wrong,im grabbing a minute to post i feel greedy wrong,im grabbing minute post feel greedy wrong
3,4,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy,feeling grouchy,i am feeling grouchy,feeling grouchy
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...,ive been feeling little burdened lately wasnt ...,ive been feeling a little burdened lately wasn...,ive been feeling little burdened lately wasnt ...
...,...,...,...,...,...,...,...,...
35547,12271,"It's a money thing, we don't have any.",sad,MELD-Friends,its a money thing we dont have any,money thing dont,it a money thing we dont have any,money thing dont
35548,12272,"Uh, hang out?! How long?",sad,MELD-Friends,uh hang out how long,uh hang long,uh hang out how long,uh hang long
35549,12273,A week?,angry,MELD-Friends,a week,week,a week,week
35550,12274,No!,angry,MELD-Friends,no,no,no,no


In [2]:
import pandas as pd

In [6]:
google_df = pd.read_excel("../raw_data/emotion_data/google_reddit_emotion_data.xls")

In [22]:
google_df.drop(columns=['emotion_4', 'emotion_5'], inplace=True)

In [26]:
google_angry = google_df[google_df['emotion_3'].isin([2,3,10])]

In [28]:
google_angry = google_angry.append(google_df[google_df['emotion_2'].isin([2,3,10])])

In [29]:
google_angry = google_angry.append(google_df[google_df['emotion_1'].isin([2,3,10])])

In [32]:
google_angry.duplicated().sum()

484

In [33]:
google_angry.drop_duplicates(inplace=True)

In [36]:
google_angry.reset_index(inplace=True, drop=True)

In [37]:
google_angry

Unnamed: 0,text,emotion_1,emotion_2,emotion_3
0,- But mom! I don‚Äôt wanna go to school today!...,2,3.0,10.0
1,"It's not his fault, it's your wife's. Shes the...",2,3.0,10.0
2,I'm not the disgusting. I take good care of he...,5,7.0,10.0
3,"""Feed the wars and fuck the poors"" is pretty m...",2,3.0,10.0
4,It‚Äôs amazing what they can justify to themse...,0,6.0,10.0
...,...,...,...,...
5570,There's doing stupid things when you're young....,3,,
5571,So now disabled people don‚Äôt have rights . Y...,2,9.0,
5572,It is. I bet chantex and all the other 'quit s...,10,,
5573,[NAME] was half-blood but it wasn't a smart id...,10,,


In [38]:
google_angry_df = google_angry.copy()[['text']]

In [41]:
google_angry_df['word_label'] = 'angry'

In [42]:
google_angry_df['source'] = 'Google_GoEmotions'

In [46]:
google_angry_df.loc[5571, 'text']

'So now disabled people don‚Äôt have rights . You get worse .'

In [47]:
google_angry_df.to_csv("../raw_data/emotion_data/google_angry.csv")

In [48]:
from happysadsongs.data import get_training_data

In [49]:
df = get_training_data()

In [50]:
df

Unnamed: 0.1,Unnamed: 0,text,word_label,source
0,0,i didnt feel humiliated,sad,HuggingFace
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
3,3,i am feeling grouchy,angry,HuggingFace
4,4,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...,...
35547,35547,"It's a money thing, we don't have any.",sad,MELD-Friends
35548,35548,"Uh, hang out?! How long?",sad,MELD-Friends
35549,35549,A week?,angry,MELD-Friends
35550,35550,No!,angry,MELD-Friends


In [51]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [53]:
df_append = df.append(google_angry_df)

In [54]:
df_append

Unnamed: 0,text,word_label,source
0,i didnt feel humiliated,sad,HuggingFace
1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
3,i am feeling grouchy,angry,HuggingFace
4,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...
5570,There's doing stupid things when you're young....,angry,Google_GoEmotions
5571,So now disabled people don‚Äôt have rights . Y...,angry,Google_GoEmotions
5572,It is. I bet chantex and all the other 'quit s...,angry,Google_GoEmotions
5573,[NAME] was half-blood but it wasn't a smart id...,angry,Google_GoEmotions


In [56]:
df_append['word_label'].value_counts()

happy    14847
sad      13918
angry    12362
Name: word_label, dtype: int64

In [57]:
df_append.to_csv("../raw_data/emotion_dataset.csv")

In [58]:
df_append[df_append['source'] == 'HuggingFace']['word_label'].value_counts()

happy    6761
sad      5797
angry    2709
Name: word_label, dtype: int64

In [65]:
df_append[df_append['source'] == 'CrowdFlower']['word_label'].value_counts()

happy    5209
sad      5165
angry     110
Name: word_label, dtype: int64

In [61]:
df_append['source'].unique()

array(['HuggingFace', 'SemEval-2018', 'CrowdFlower', 'MELD-Friends',
       'Google_GoEmotions'], dtype=object)

In [64]:
df_append[df_append['source'] == 'CrowdFlower']

Unnamed: 0,text,word_label,source
23276,Layin n bed with a headache ughhhh...waitin o...,sad,CrowdFlower
23277,Funeral ceremony...gloomy friday...,sad,CrowdFlower
23278,"I should be sleep, but im not! thinking about ...",sad,CrowdFlower
23279,@charviray Charlene my love. I miss you,sad,CrowdFlower
23280,@kelcouch I'm sorry at least it's Friday?,sad,CrowdFlower
...,...,...,...
33755,going to watch boy in the striped pj's hope i ...,happy,CrowdFlower
33756,"gave the bikes a thorough wash, degrease it an...",happy,CrowdFlower
33757,"had SUCH and AMAZING time last night, McFly we...",happy,CrowdFlower
33758,Succesfully following Tayla!!,happy,CrowdFlower


## Recheck Crowdflower Data 

In [66]:
crowdflower = pd.read_csv('../raw_data/emotion_data/crowdflower_text_emotion.csv')

In [68]:
crowdflower['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [69]:
crowdflower['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [76]:
crowdflower_append = crowdflower[crowdflower['sentiment'] == 'hate']['content'].reset_index(drop=True)

In [79]:
type(crowdflower_append)

pandas.core.series.Series

In [82]:
crowdflower_dict = {'text': crowdflower_append, 'word_label': ['angry' for i in range(len(crowdflower_append))], 'source': ['CrowdFlower' for i in range(len(crowdflower_append))]}

In [85]:
crowdflower_append_df = pd.DataFrame(crowdflower_dict)

In [86]:
df_append_2 = df_append.append(crowdflower_append_df).reset_index(drop=True)

In [88]:
df_append_2['word_label'].value_counts()

happy    14847
sad      13918
angry    13685
Name: word_label, dtype: int64

In [90]:
df_append_2[df_append_2['source'] == 'CrowdFlower']['word_label'].value_counts()

happy    5209
sad      5165
angry    1433
Name: word_label, dtype: int64

In [97]:
df_append_2[(df_append_2['source'] == 'MELD-Friends') & (df_append_2['word_label'] == 'angry')]

Unnamed: 0,text,word_label,source
33765,"Oh no-no-no, give me some specifics.",angry,MELD-Friends
33766,You fell asleep!!,angry,MELD-Friends
33767,There was no kangaroo!,angry,MELD-Friends
33768,They didn’t take any of my suggestions!,angry,MELD-Friends
33772,This guy fell asleep!,angry,MELD-Friends
...,...,...,...
35543,What about me?! You-you just said I could!,angry,MELD-Friends
35544,I can’t believe you’re not picking me.,angry,MELD-Friends
35545,"Fine, y’know what, that’s it. From now on, Joe...",angry,MELD-Friends
35549,A week?,angry,MELD-Friends


In [98]:
df_append_2.to_csv("../raw_data/emotion_data/emotion_dataset_with_additional_crowdflower.csv")

## Re-check Friends data

In [107]:
meld_df = pd.read_excel("../raw_data/emotion_data/MELD_train_sent_emo.xls")

In [109]:
meld_happy = meld_df[meld_df['Emotion'] == 'joy']

In [110]:
meld_sad = meld_df[meld_df['Emotion'] == 'sadness']

In [114]:
meld_angry = meld_df[meld_df['Emotion'].isin(['anger', 'disgust'])]

In [115]:
meld_hsa = pd.concat([meld_happy, meld_sad, meld_angry])

In [118]:
meld_hsa['word_label'] = meld_hsa['Emotion'].map({'joy':'happy', 'sadness':'sad', 'anger':'angry', 'disgust': 'angry'})

In [120]:
meld_hsa['source'] = 'MELD-Friends'

In [122]:
meld_hsa.drop(columns=['Emotion'], inplace=True)

In [124]:
meld_hsa.rename(columns={'Utterance':'text'}, inplace=True)

In [125]:
meld_hsa

Unnamed: 0,text,word_label,source
23,Do I ever.,happy,MELD-Friends
31,You betcha!,happy,MELD-Friends
33,"Um-mm, yeah right!",happy,MELD-Friends
44,Hi!,happy,MELD-Friends
51,"Oh well, the woman I interviewed with was pret...",happy,MELD-Friends
...,...,...,...
9962,No!,angry,MELD-Friends
9976,"They were huge. When she sneezed, bats flew ou...",angry,MELD-Friends
9978,"I'm tellin' you, she leaned back; I could see ...",angry,MELD-Friends
9982,"When I first moved to the city, I went out a c...",angry,MELD-Friends
