Program Objective:
    - load two files from EmoLex: NRC-Emotion-Lexicon-v0.92\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt
                                  NRC-Sentiment-Emotion-Lexicons\NRC-AffectIntensity-Lexicon.txt
    - load opinion lexicon from nltk (Liu, 2011)
    - load words & associated sentiment ( pos = 1; neg = 0) into csv 
    - export csv 

In [3]:
import string
import numpy as np
import pandas as pd
import os 
import nltk

In [4]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [6]:
filepath1 = get_rel_path("NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
filepath2 = get_rel_path("NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt")

In [2]:
# load files 
f1 = open(filepath1, 'r')
f2 = open(filepath2, 'r')

In [3]:
# assemble words & associated sentiment into dataframe
emo_lex_dict = {} # key = words; value = sentiment (pos = 1; neg = 0 )

lines = f1.readlines()
for line in lines:
    line = line.split()
    word = line[0]
    sentiment = line[1]
    #print(line)
    if sentiment == "positive": 
        emo_lex_dict[word] = 1
    elif sentiment == "negative": emo_lex_dict[word] = 0
    else: continue 

In [4]:
print(len(emo_lex_dict.keys()))

14182


In [5]:
lines = f2.readlines()
for line in lines:
    line = line.split()
    word = line[0]
    emotion = line[2]
    if emotion == 'anger' or emotion == 'fear' or emotion == 'sadness': emo_lex_dict[word] = 0
    elif emotion == 'joy': emo_lex_dict[word] = 1
    else: continue 

In [6]:
print(len(emo_lex_dict.keys()))

15090


In [7]:
df = pd.DataFrame(emo_lex_dict.items(), columns = ['Word', 'Sentiment'] )
df

Unnamed: 0,Word,Sentiment
0,aback,1
1,abacus,1
2,abandon,0
3,abandoned,0
4,abandonment,0
...,...,...
15085,latte,1
15086,sipping,1
15087,troubles,1
15088,explosions,1


In [8]:
from nltk.corpus import opinion_lexicon
positive = opinion_lexicon.positive()
negative = opinion_lexicon.negative()
print(len(positive))
print(len(negative))

2006
4783


In [9]:
opinion_lex = positive + negative
print(len(opinion_lex))
sentiment = ([1] * len(positive)) + ([0] * len(negative))
data = {"Word": opinion_lex, 'Sentiment': sentiment}
df2 = pd.DataFrame(data)
df2

6789


Unnamed: 0,Word,Sentiment
0,a+,1
1,abound,1
2,abounds,1
3,abundance,1
4,abundant,1
...,...,...
6784,zaps,0
6785,zealot,0
6786,zealous,0
6787,zealously,0


In [10]:
frames = [df, df2]
result = pd.concat(frames, ignore_index=True)
result

Unnamed: 0,Word,Sentiment
0,aback,1
1,abacus,1
2,abandon,0
3,abandoned,0
4,abandonment,0
...,...,...
21874,zaps,0
21875,zealot,0
21876,zealous,0
21877,zealously,0


In [11]:
result = result.drop_duplicates(subset=['Word'], keep='last')
result

Unnamed: 0,Word,Sentiment
0,aback,1
1,abacus,1
2,abandon,0
3,abandoned,0
4,abandonment,0
...,...,...
21874,zaps,0
21875,zealot,0
21876,zealous,0
21877,zealously,0


In [12]:
filepath = get_rel_path("sentiment_lexicon.csv")
result.to_csv(filepath, index = False)