In [1]:
import pandas as pd

In [2]:
# Load the tweet table from the CSV export sitting next to the notebook.
paris_data = pd.read_csv(filepath_or_buffer='full_paris.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were read.
paris_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id_str,created_at,full_text,retweet_count,favorite_count,user_id,screen_name,name,location
0,0,1028954737675251712,Mon Aug 13 10:41:41 +0000 2018,looking forward to be back @melblawschool for ...,4,20,969493444677787648,AustHelmut,Helmut Aust,"Berlin, Deutschland"
1,1,1028961689679605760,Mon Aug 13 11:09:18 +0000 2018,@garden_hoe61 \nzero impact architecture\n#pod...,0,0,174786533,GPeconews,The GreenPreneur,United States
2,2,1028962674871128064,Mon Aug 13 11:13:13 +0000 2018,@genejharriscb @repdwightevans this isn't the ...,0,0,194308084,Bestsurvivor,Myra,
3,3,1028964314298896386,Mon Aug 13 11:19:44 +0000 2018,there is no planet “b”! there is no where to h...,1,0,86438480,CharlesBullar,Charles Bullard,"Wilmington, Delaware"
4,4,1028966105841315842,Mon Aug 13 11:26:51 +0000 2018,a really good development... we want all parti...,2,2,41349406,JamiePolitics,Jamie Livingstone,Edinburgh


In [4]:
# Word-level NRC Emotion Lexicon: one tab-separated (word, emotion, association) triple per line.
filepath = 'NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

emolex_df = pd.read_csv(
    filepath,
    sep='\t',
    names=["word", "emotion", "association"],
    # Turn off pandas' default NaN markers so every token is kept as a literal string
    # (presumably to protect lexicon entries that look like NA markers -- TODO confirm).
    keep_default_na=False,
)

emolex_df.head()

Unnamed: 0,word,emotion,association
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [5]:
# Reshape the long (word, emotion, association) table into a wide one:
# one row per word and one association column per emotion/sentiment label.
emolex_words = (
    emolex_df
    .pivot(index='word', columns='emotion', values='association')
    .reset_index()  # bring `word` back from the index into a regular column
)

emolex_words.head()

emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandoned,1,0,0,1,0,1,0,1,0,0
4,abandonment,1,0,0,1,0,1,0,1,1,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw token counts over the full vocabulary found in the tweets.
# NOTE(review): `vec`, `matrix`, `vocab` and `wordcount_df` are all reassigned by the
# lexicon-restricted TfidfVectorizer cell that follows, so nothing downstream uses
# this result. `matrix.toarray()` also densifies the whole count matrix, which can be
# very memory-hungry on a large corpus -- consider removing this cell.
vec = CountVectorizer()
matrix = vec.fit_transform(paris_data['full_text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on installs that predate it.
if hasattr(vec, 'get_feature_names_out'):
    vocab = vec.get_feature_names_out()
else:
    vocab = vec.get_feature_names()

wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Count only words that appear in the emotion lexicon. With `use_idf=False` and
# `norm='l1'` each row holds plain term frequencies scaled so that the non-zero
# entries of a tweet sum to 1, i.e. every value is that word's share of the
# lexicon words found in the tweet. Tweets with no lexicon words stay all-zero.
vec = TfidfVectorizer(vocabulary=emolex_words.word,
                      use_idf=False,
                      norm='l1')
matrix = vec.fit_transform(paris_data['full_text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on installs that predate it.
if hasattr(vec, 'get_feature_names_out'):
    vocab = vec.get_feature_names_out()
else:
    vocab = vec.get_feature_names()

# One row per tweet (same order as `paris_data`), one column per lexicon word.
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

Unnamed: 0,aback,abacus,abandon,abandoned,abandonment,abate,abatement,abba,abbot,abbreviate,...,zephyr,zeppelin,zest,zip,zodiac,zone,zoo,zoological,zoology,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Sentiment scores: for each tweet, add up the normalized frequencies of all
# lexicon words tagged with the given polarity.
# (A stray mid-cell `paris_data.head(3)` was removed: only the last expression
# of a cell is displayed, so it had no effect.)
negative_words = emolex_words.loc[emolex_words['negative'] == 1, 'word']
paris_data['negative'] = wordcount_df[negative_words].sum(axis=1)

positive_words = emolex_words.loc[emolex_words['positive'] == 1, 'word']
paris_data['positive'] = wordcount_df[positive_words].sum(axis=1)

paris_data.head(3)

Unnamed: 0.1,Unnamed: 0,id_str,created_at,full_text,retweet_count,favorite_count,user_id,screen_name,name,location,negative,positive
0,0,1028954737675251712,Mon Aug 13 10:41:41 +0000 2018,looking forward to be back @melblawschool for ...,4,20,969493444677787648,AustHelmut,Helmut Aust,"Berlin, Deutschland",0.0,0.181818
1,1,1028961689679605760,Mon Aug 13 11:09:18 +0000 2018,@garden_hoe61 \nzero impact architecture\n#pod...,0,0,174786533,GPeconews,The GreenPreneur,United States,0.0,0.0
2,2,1028962674871128064,Mon Aug 13 11:13:13 +0000 2018,@genejharriscb @repdwightevans this isn't the ...,0,0,194308084,Bestsurvivor,Myra,,0.0,0.166667


In [9]:
# anger, anticipation, disgust, fear, joy, sadness, surprise, trust
#
# Emotion scores: one new column per emotion, computed the same way for each --
# the row-wise sum of the normalized frequencies of the lexicon words tagged
# with that emotion. The list order fixes the order of the added columns.
emotions = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'sadness',
    'surprise',
    'trust',
]

for emotion in emotions:
    # Lexicon words whose association flag for this emotion is 1.
    tagged_words = emolex_words.loc[emolex_words[emotion] == 1, 'word']
    paris_data[emotion] = wordcount_df[tagged_words].sum(axis=1)

paris_data.head(3)

Unnamed: 0.1,Unnamed: 0,id_str,created_at,full_text,retweet_count,favorite_count,user_id,screen_name,name,location,negative,positive,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,0,1028954737675251712,Mon Aug 13 10:41:41 +0000 2018,looking forward to be back @melblawschool for ...,4,20,969493444677787648,AustHelmut,Helmut Aust,"Berlin, Deutschland",0.0,0.181818,0.0,0.0,0.0,0.181818,0.0,0.0,0.0,0.0
1,1,1028961689679605760,Mon Aug 13 11:09:18 +0000 2018,@garden_hoe61 \nzero impact architecture\n#pod...,0,0,174786533,GPeconews,The GreenPreneur,United States,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
2,2,1028962674871128064,Mon Aug 13 11:13:13 +0000 2018,@genejharriscb @repdwightevans this isn't the ...,0,0,194308084,Bestsurvivor,Myra,,0.0,0.166667,0.0,0.083333,0.0,0.083333,0.0,0.0,0.0,0.166667


In [10]:
# Drop the "Unnamed: 0" column (it appears to be a row index saved by an earlier
# CSV export -- TODO confirm). `errors='ignore'` keeps the cell safe to re-run:
# an in-place drop raises KeyError the second time because the column is gone.
paris_data = paris_data.drop(columns=["Unnamed: 0"], errors='ignore')
paris_data

Unnamed: 0,id_str,created_at,full_text,retweet_count,favorite_count,user_id,screen_name,name,location,negative,positive,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,1028954737675251712,Mon Aug 13 10:41:41 +0000 2018,looking forward to be back @melblawschool for ...,4,20,969493444677787648,AustHelmut,Helmut Aust,"Berlin, Deutschland",0.000000,0.181818,0.000000,0.000000,0.000000,0.181818,0.000000,0.000000,0.000000,0.000000
1,1028961689679605760,Mon Aug 13 11:09:18 +0000 2018,@garden_hoe61 \nzero impact architecture\n#pod...,0,0,174786533,GPeconews,The GreenPreneur,United States,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.111111
2,1028962674871128064,Mon Aug 13 11:13:13 +0000 2018,@genejharriscb @repdwightevans this isn't the ...,0,0,194308084,Bestsurvivor,Myra,,0.000000,0.166667,0.000000,0.083333,0.000000,0.083333,0.000000,0.000000,0.000000,0.166667
3,1028964314298896386,Mon Aug 13 11:19:44 +0000 2018,there is no planet “b”! there is no where to h...,1,0,86438480,CharlesBullar,Charles Bullard,"Wilmington, Delaware",0.266667,0.400000,0.066667,0.266667,0.000000,0.266667,0.200000,0.133333,0.000000,0.200000
4,1028966105841315842,Mon Aug 13 11:26:51 +0000 2018,a really good development... we want all parti...,2,2,41349406,JamiePolitics,Jamie Livingstone,Edinburgh,0.000000,0.250000,0.000000,0.250000,0.000000,0.125000,0.250000,0.000000,0.250000,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67861,1121448529904455680,2019-04-25 16:18:59+00:00,"@warriorwoman91 like net-neutrality, pulling o...",0,0,847482300,ALowd34,"⚒Andrew Lowden, EA ⚒",Tampa FL,0.142857,0.142857,0.000000,0.000000,0.142857,0.285714,0.000000,0.000000,0.000000,0.285714
67862,1121448820976566272,2019-04-25 16:20:09+00:00,@NPR How about teaching the truth!\nThe Paris ...,0,0,966188899365158912,KodiaksPal2,KodiaksPal2 ⭐️⭐️⭐️,Native Texan living in TN,0.111111,0.555556,0.111111,0.000000,0.111111,0.111111,0.111111,0.111111,0.111111,0.555556
67863,1121450110943223808,2019-04-25 16:25:16+00:00,UNDP Development Chief Calls for Action on Deb...,1,1,4088842828,SDGscameroon,SDGsCameroon,"Yaounde,Cameroon",0.300000,0.100000,0.100000,0.100000,0.000000,0.100000,0.000000,0.200000,0.000000,0.100000
67864,1121457890328502272,2019-04-25 16:56:11+00:00,Write today and urge your representative to fu...,0,0,874127060,WendyDianaLukow,Wendy Diana Lukowitz,,0.000000,0.300000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000


In [11]:
# Persist the scored tweets; `index=False` keeps the row index out of the file.
paris_data.to_csv(path_or_buf='full_paris_emolex.csv', index=False)