## Hugging Face

In [127]:
import pickle


In [128]:
!pip install datasets



In [129]:
from datasets import load_dataset
dataset = load_dataset(
   'emotion')

Using custom data configuration default
Reusing dataset emotion (/Users/vprentice/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


In [130]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [131]:
import pandas as pd
df = pd.DataFrame(dataset['train'])

In [132]:
df['label'].value_counts()

1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: label, dtype: int64

In [133]:
df2 = pd.DataFrame(dataset['validation'])
df3 = pd.DataFrame(dataset['test'])

In [134]:
full_df = pd.concat([df,df2,df3])

In [135]:
full_df.shape

(20000, 2)

In [136]:
full_df.dtypes

text     object
label     int64
dtype: object

In [137]:
full_df['word_label'] = full_df['label'].map({0:'sad',1:'happy', 2:'love', 3:'angry', 4:'fear', 5:'surprised'})

In [138]:
full_df

Unnamed: 0,text,label,word_label
0,i didnt feel humiliated,0,sad
1,i can go from feeling so hopeless to so damned...,0,sad
2,im grabbing a minute to post i feel greedy wrong,3,angry
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,angry
...,...,...,...
1995,i just keep feeling like someone is being unki...,3,angry
1996,im feeling a little cranky negative after this...,3,angry
1997,i feel that i am useful to my people and that ...,1,happy
1998,im feeling more comfortable with derby i feel ...,1,happy


In [139]:
simplified_df = full_df[full_df['word_label'].isin(['sad','angry','happy'])]

In [140]:
simplified_df = simplified_df[['text', 'word_label']]

In [141]:
simplified_df

Unnamed: 0,text,word_label
0,i didnt feel humiliated,sad
1,i can go from feeling so hopeless to so damned...,sad
2,im grabbing a minute to post i feel greedy wrong,angry
4,i am feeling grouchy,angry
5,ive been feeling a little burdened lately wasn...,sad
...,...,...
1994,i can feel its suffering,sad
1995,i just keep feeling like someone is being unki...,angry
1996,im feeling a little cranky negative after this...,angry
1997,i feel that i am useful to my people and that ...,happy


In [142]:
simplified_df['source'] = 'HuggingFace'

In [143]:
simplified_df

Unnamed: 0,text,word_label,source
0,i didnt feel humiliated,sad,HuggingFace
1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
4,i am feeling grouchy,angry,HuggingFace
5,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...
1994,i can feel its suffering,sad,HuggingFace
1995,i just keep feeling like someone is being unki...,angry,HuggingFace
1996,im feeling a little cranky negative after this...,angry,HuggingFace
1997,i feel that i am useful to my people and that ...,happy,HuggingFace


In [240]:
simplified_df['word_label'].value_counts()

happy    6761
sad      5797
angry    2709
Name: word_label, dtype: int64

In [144]:
simplified_df.to_csv('../raw_data/huggingface_happysadangry.csv')

## SemEval2018

In [146]:
semeval = pd.read_csv('../raw_data/emotion_data/2018-E-c-En-train-dev.csv')

In [147]:
semeval

Unnamed: 0,ID,Tweet,anger,joy,sadness
0,2017-En-21441,‚ÄúWorry is a down payment on a problem you ma...,0,0,0
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,1,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,1,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,1,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,0
...,...,...,...,...,...
7719,2018-En-01993,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,0
7720,2018-En-01784,Excited to watch #stateoforigin tonight! Come ...,0,1,0
7721,2018-En-04047,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1
7722,2018-En-03041,#ThingsIveLearned The wise #shepherd never tru...,0,0,0


In [148]:
semeval_anger = semeval[semeval['anger'] == 1]
semeval_anger['word_label'] = 'angry'
semeval_joy = semeval[semeval['joy'] == 1]
semeval_joy['word_label'] = 'happy'
semeval_sadness =semeval[semeval['sadness'] == 1]
semeval_sadness['word_label'] = 'sad'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [149]:
semeval_hsa = pd.concat([semeval_anger, semeval_joy, semeval_sadness])
semeval_hsa.reset_index(inplace=True)

In [150]:
semeval_hsa.drop(columns=['index','anger','joy','sadness'], inplace=True)

In [151]:
semeval_hsa.drop(columns=['ID'], inplace=True)

In [152]:
semeval_hsa.rename(columns={'Tweet':'text'}, inplace=True)

In [153]:
semeval_hsa

Unnamed: 0,text,word_label
0,@Max_Kellerman it also helps that the majorit...,angry
1,My roommate: it's okay that we can't spell bec...,angry
2,Rooneys fucking untouchable isn't he? Been fuc...,angry
3,@BossUpJaee but your pussy was weak from what ...,angry
4,S/O to the girl that just hit my car...not onl...,angry
...,...,...
8004,@ProfessorF @Mediaite @law_newz Childish tempe...,sad
8005,Still 19 days left before I go home to the Phi...,sad
8006,Literally hanging on by a thread need some tay...,sad
8007,was one moron driving his oversize tonka truc...,sad


In [154]:
semeval_hsa['source'] = 'SemEval-2018'

In [239]:
semeval_hsa['word_label'].value_counts()

happy    2877
angry    2859
sad      2273
Name: word_label, dtype: int64

In [104]:
semeval_hsa.to_csv('../raw_data/semeval-2018_happysadangry.csv')

## Add angry data from Google Reddit Dataset

In [209]:
google_emotion_dict = {0:'admiration',
1:'amusement',
2:'anger',
3:'annoyance',
4:'approval',
5:'caring',
6:'confusion',
7:'curiosity',
8:'desire',
9:'disappointment',
10:'disapproval',
11:'disgust',
12:'embarrassment',
13:'excitement',
14:'fear',
15:'gratitude',
16:'grief',
17:'joy',
18:'love',
19:'nervousness',
20:'optimism',
21:'pride',
22:'realization',
23:'relief',
24:'remorse',
25:'sadness',
26:'surprise',
27:'neutral'}

In [155]:
import pandas as pd

In [156]:
google_df = pd.read_excel("../raw_data/emotion_data/google_reddit_emotion_data.xls")

In [157]:
google_df.drop(columns=['emotion_4', 'emotion_5'], inplace=True)

In [158]:
google_angry = google_df[google_df['emotion_3'].isin([2,3,10])]

In [159]:
google_angry = google_angry.append(google_df[google_df['emotion_2'].isin([2,3,10])])

In [160]:
google_angry = google_angry.append(google_df[google_df['emotion_1'].isin([2,3,10])])

In [161]:
google_angry.duplicated().sum()

484

In [162]:
google_angry.drop_duplicates(inplace=True)

In [163]:
google_angry.reset_index(inplace=True, drop=True)

In [164]:
google_angry

Unnamed: 0,text,emotion_1,emotion_2,emotion_3
0,- But mom! I don‚Äôt wanna go to school today!...,2,3.0,10.0
1,"It's not his fault, it's your wife's. Shes the...",2,3.0,10.0
2,I'm not the disgusting. I take good care of he...,5,7.0,10.0
3,"""Feed the wars and fuck the poors"" is pretty m...",2,3.0,10.0
4,It‚Äôs amazing what they can justify to themse...,0,6.0,10.0
...,...,...,...,...
5570,There's doing stupid things when you're young....,3,,
5571,So now disabled people don‚Äôt have rights . Y...,2,9.0,
5572,It is. I bet chantex and all the other 'quit s...,10,,
5573,[NAME] was half-blood but it wasn't a smart id...,10,,


In [165]:
google_angry_df = google_angry.copy()[['text']]

In [166]:
google_angry_df['word_label'] = 'angry'

In [167]:
google_angry_df['source'] = 'Google_GoEmotions'

In [168]:
google_angry_df.loc[5571, 'text']

'So now disabled people don‚Äôt have rights . You get worse .'

In [169]:
google_angry_df.to_csv("../raw_data/emotion_data/google_angry.csv")

### Google Happy and Sad

In [218]:
google_happy = pd.DataFrame(google_df[google_df['emotion_1'] == 17]['text']).reset_index(drop=True)

In [221]:
google_happy['word_label'] = 'happy'
google_happy['source'] = 'Google_GoEmotions'

In [230]:
google_sad1 = pd.DataFrame(google_df[google_df['emotion_1'] == 25]['text']).reset_index(drop=True)
google_sad2 = pd.DataFrame(google_df[google_df['emotion_2'] == 25]['text']).reset_index(drop=True)
google_sad3 = pd.DataFrame(google_df[google_df['emotion_3'] == 25]['text']).reset_index(drop=True)

In [231]:
google_sad = pd.concat([google_sad1, google_sad2, google_sad3]).reset_index(drop=True)

In [232]:
google_sad

Unnamed: 0,text
0,Pretty sure I‚Äôve seen this. He swings away w...
1,sorry [NAME]! üòòüòòüòò
2,my brain hurts...
3,Go pursue that education. Let [NAME] wallow in...
4,I was suffering a delirium flu during the phan...
...,...
1315,This hurts to hear because we know it's true. ...
1316,Yep happening to me all the time at my college...
1317,"Shit, I lost the original page. Please forgive..."
1318,I want to die


In [233]:
google_sad['word_label'] = 'sad'
google_sad['source'] = 'Google_GoEmotions'

In [235]:
google_hsa = pd.concat([google_angry_df, google_happy, google_sad])
google_hsa['word_label'].value_counts()

angry    5575
sad      1320
happy    1013
Name: word_label, dtype: int64

In [236]:
google_hsa.to_csv('../raw_data/emotion_data/google_hsa_mostly_angry.csv')

## Recheck Crowdflower Data 

In [66]:
crowdflower = pd.read_csv('../raw_data/emotion_data/crowdflower_text_emotion.csv')

In [68]:
crowdflower['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [69]:
crowdflower['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [178]:
crowdflower_hate = crowdflower[crowdflower['sentiment'] == 'hate'][['sentiment', 'content']].reset_index(drop=True)

In [180]:
crowdflower_angry = crowdflower[crowdflower['sentiment'] == 'anger'][['sentiment', 'content']].reset_index(drop=True)

In [181]:
crowdflower_happy = crowdflower[crowdflower['sentiment'] == 'happiness'][['sentiment', 'content']].reset_index(drop=True)

In [182]:
crowdflower_sad = crowdflower[crowdflower['sentiment'] == 'sadness'][['sentiment', 'content']].reset_index(drop=True)

In [194]:
crowdflower_hsa = pd.concat([crowdflower_hate, crowdflower_angry, crowdflower_happy, crowdflower_sad])

In [195]:
crowdflower_hsa = crowdflower_hsa[['content', 'sentiment']]

In [196]:
crowdflower_hsa.rename(columns={'content':'text','sentiment':'word_label'}, inplace=True)

In [197]:
crowdflower_hsa['word_label'] = crowdflower_hsa['word_label'].map({'hate':'angry', 'anger':'angry', 'happiness':'happy', 'sadness':'sad'})

In [241]:
crowdflower_hsa['word_label'].value_counts()

happy    5209
sad      5165
angry    1433
Name: word_label, dtype: int64

In [204]:
crowdflower_hsa['source'] = 'CrowdFlower'

In [205]:
crowdflower_hsa.to_csv('../raw_data/emotion_data/crowdflower_hsa.csv')

## Re-check Friends data

In [107]:
meld_df = pd.read_excel("../raw_data/emotion_data/MELD_train_sent_emo.xls")

In [109]:
meld_happy = meld_df[meld_df['Emotion'] == 'joy']

In [110]:
meld_sad = meld_df[meld_df['Emotion'] == 'sadness']

In [114]:
meld_angry = meld_df[meld_df['Emotion'].isin(['anger', 'disgust'])]

In [115]:
meld_hsa = pd.concat([meld_happy, meld_sad, meld_angry])

In [118]:
meld_hsa['word_label'] = meld_hsa['Emotion'].map({'joy':'happy', 'sadness':'sad', 'anger':'angry', 'disgust': 'angry'})

In [120]:
meld_hsa['source'] = 'MELD-Friends'

In [122]:
meld_hsa.drop(columns=['Emotion'], inplace=True)

In [124]:
meld_hsa.rename(columns={'Utterance':'text'}, inplace=True)

In [242]:
meld_hsa['word_label'].value_counts()

happy    1743
angry    1380
sad       683
Name: word_label, dtype: int64

In [202]:
meld_hsa.to_csv('../raw_data/emotion_data/meld_hsa.csv')

## Combine

In [243]:
new_df = pd.concat([simplified_df, semeval_hsa, crowdflower_hsa, meld_hsa, google_hsa])

In [244]:
new_df['word_label'].value_counts()

happy    17603
sad      15238
angry    13956
Name: word_label, dtype: int64

In [278]:
new_df

Unnamed: 0,text,word_label,source
0,i didnt feel humiliated,sad,HuggingFace
1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
4,i am feeling grouchy,angry,HuggingFace
5,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...
1315,This hurts to hear because we know it's true. ...,sad,Google_GoEmotions
1316,Yep happening to me all the time at my college...,sad,Google_GoEmotions
1317,"Shit, I lost the original page. Please forgive...",sad,Google_GoEmotions
1318,I want to die,sad,Google_GoEmotions


### Take Subsample of Happy / Sad observations from some sources to balance set

In [256]:
new_df['source'].unique()

array(['HuggingFace', 'SemEval-2018', 'CrowdFlower', 'MELD-Friends',
       'Google_GoEmotions'], dtype=object)

In [263]:
new_df[(new_df['word_label'] == 'sad')\
       & (new_df['source'].isin(['SemEval-2018', 'MELD-Friends', 'Google_GoEmotions']))]

Unnamed: 0,text,word_label,source
5736,it's pretty depressing when u hit pan on ur fa...,sad,SemEval-2018
5737,Making that yearly transition from excited and...,sad,SemEval-2018
5738,S/O to the girl that just hit my car...not onl...,sad,SemEval-2018
5739,People you need to look up the definition of p...,sad,SemEval-2018
5740,Star trek online has a update to download oh f...,sad,SemEval-2018
...,...,...,...
1315,This hurts to hear because we know it's true. ...,sad,Google_GoEmotions
1316,Yep happening to me all the time at my college...,sad,Google_GoEmotions
1317,"Shit, I lost the original page. Please forgive...",sad,Google_GoEmotions
1318,I want to die,sad,Google_GoEmotions


In [282]:
new_df[(new_df['word_label'] == 'happy') & (new_df['source'].isin(['HuggingFace', 'CrowdFlower']))]

Unnamed: 0,text,word_label,source
8,i have been with petronas for years i feel tha...,happy,HuggingFace
11,i do feel that running is a divine experience ...,happy,HuggingFace
14,i have immense sympathy with the general point...,happy,HuggingFace
15,i do not feel reassured anxiety is on each side,happy,HuggingFace
22,i have the feeling she was amused and delighted,happy,HuggingFace
...,...,...,...
5204,going to watch boy in the striped pj's hope i ...,happy,CrowdFlower
5205,"gave the bikes a thorough wash, degrease it an...",happy,CrowdFlower
5206,"had SUCH and AMAZING time last night, McFly we...",happy,CrowdFlower
5207,Succesfully following Tayla!!,happy,CrowdFlower


In [280]:
hugging_and_crowd_happy = new_df[(new_df['word_label'] == 'happy') & (new_df['source'].isin(['HuggingFace', 'CrowdFlower']))].sample(8896)

In [259]:
14500 - 5633

8867

In [281]:
hugging_and_crowd_happy

Unnamed: 0,text,word_label,source
13898,i am thankful that she continues to feel comfo...,happy,HuggingFace
11115,i want to do it the right way oh orihime whisp...,happy,HuggingFace
1956,i was gaining weight getting a lot stronger an...,happy,HuggingFace
7315,i feel that there is no way to determine if a ...,happy,HuggingFace
11722,i feel sure that i wouldnt have gained so much...,happy,HuggingFace
...,...,...,...
2134,I just went to Metro Diner for the first time....,happy,CrowdFlower
301,Heading to Beaumont,happy,CrowdFlower
1940,i could feel the radiant heat of emanating fro...,happy,HuggingFace
2041,i love lots of different kinds of sports and l...,happy,HuggingFace


In [265]:
14000 - 4276

9724

In [268]:
hugging_and_crowd_sad = new_df[(new_df['word_label'] == 'sad') & (new_df['source'].isin(['HuggingFace', 'CrowdFlower']))].sample(9724)

In [283]:
hugging_and_crowd_sad

Unnamed: 0,text,word_label,source
0,Layin n bed with a headache ughhhh...waitin o...,sad,CrowdFlower
11623,i hate feel needy,sad,HuggingFace
737,im not used to feeling the dependency or the n...,sad,HuggingFace
3018,Holidays are now over - not fair!!,sad,CrowdFlower
7967,im feeling horrible,sad,HuggingFace
...,...,...,...
5104,i got shots from as many likely angles as i co...,sad,HuggingFace
826,ive been feeling kinda gloomy lately,sad,HuggingFace
195,i died would alex and matt feel regretful for ...,sad,HuggingFace
910,i know what you feel like that when fake ones ...,sad,HuggingFace


In [284]:
hugging_and_crowd_angry = new_df[(new_df['word_label'] == 'angry') & (new_df['source'].isin(['HuggingFace', 'CrowdFlower']))]

In [285]:
hugging_and_crowd_angry

Unnamed: 0,text,word_label,source
2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
4,i am feeling grouchy,angry,HuggingFace
12,i think it s the easiest time of year to feel ...,angry,HuggingFace
20,i feel irritated and rejected without anyone d...,angry,HuggingFace
24,i already feel like i fucked up though because...,angry,HuggingFace
...,...,...,...
105,my gawwddd ! 6 headshotss inna row? im on fyaa...,angry,CrowdFlower
106,I'm way to sleepy.. Ill watch my shows lata..G...,angry,CrowdFlower
107,@NerdIndian Take that back. I am insulted.,angry,CrowdFlower
108,@anieszkaa haha i did a ltiitle bit yesterday ...,angry,CrowdFlower


In [286]:
hugging_and_crowd_subsample = pd.concat([hugging_and_crowd_happy, hugging_and_crowd_sad, hugging_and_crowd_angry]).reset_index(drop=True)

In [287]:
hugging_and_crowd_subsample

Unnamed: 0,text,word_label,source
0,i am thankful that she continues to feel comfo...,happy,HuggingFace
1,i want to do it the right way oh orihime whisp...,happy,HuggingFace
2,i was gaining weight getting a lot stronger an...,happy,HuggingFace
3,i feel that there is no way to determine if a ...,happy,HuggingFace
4,i feel sure that i wouldnt have gained so much...,happy,HuggingFace
...,...,...,...
22757,my gawwddd ! 6 headshotss inna row? im on fyaa...,angry,CrowdFlower
22758,I'm way to sleepy.. Ill watch my shows lata..G...,angry,CrowdFlower
22759,@NerdIndian Take that back. I am insulted.,angry,CrowdFlower
22760,@anieszkaa haha i did a ltiitle bit yesterday ...,angry,CrowdFlower


In [288]:
hugging_and_crowd_subsample.to_csv('../raw_data/emotion_data/hugging_and_crowdflower_subsample_for_balancing.csv')

In [289]:
final_df = pd.concat([hugging_and_crowd_subsample, semeval_hsa, meld_hsa, google_hsa])

In [290]:
final_df['word_label'].value_counts()

happy    14529
sad      14000
angry    13956
Name: word_label, dtype: int64

In [291]:
final_df.to_csv('../raw_data/balanced_hsa_dataset.csv')