# A dictionary based sentiment analyzer which reads in a dictionary of words matched to sentiments

In [1]:
# libraries for: regular expressions, file I/O
import re
import sys
import unicodecsv  # this may need to be installed through pip or conda install via the command line

In [2]:
# sample input: emoticons and punctuation written with no surrounding spaces
text = "hello I am happy.:) :)Are you?"

In [3]:
def preProcess(text):
    """Tokenise ``text`` and return its tokens as a list of lower-cased strings.

    A space is inserted between a word character and an adjacent punctuation
    mark, the result is split on whitespace, and every token is lower-cased.
    Runs of punctuation are left joined together (e.g. ``':)'`` stays one token).
    The text before and after spacing is printed for inspection.

    Leading/trailing whitespace and empty input yield no empty-string tokens.
    """
    print("original:", text)
    # sentence segmentation - assume already done
    # word tokenisation
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) # separates punctuation at ends of strings
    text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text) # separates punctuation at beginning of strings
    print("tokenising:", text)
    # str.split() with no separator splits on whitespace runs and, unlike
    # re.split(r"\s+", ...), never yields '' for leading/trailing whitespace
    tokens = text.split()
    # normalisation - only by lower casing for now
    return [t.lower() for t in tokens]

In [4]:
# show the token list produced for the sample text
print(preProcess(text))

original: hello I am happy.:) :)Are you?
tokenising: hello I am happy .:) :) Are you ?
['hello', 'i', 'am', 'happy', '.:)', ':)', 'are', 'you', '?']


In [6]:
# load an external dictionary from local file
# each row is read as: column 0 = word, column 1 = numeric sentiment score
sentimentDict = {}
with open('sentiment.csv', 'rb') as f:
    # the file is opened in binary mode and decoded by the reader via `encoding`
    reader = unicodecsv.reader(f, encoding='utf-8')
    for line in reader:
        if not line:
            # an empty row (e.g. a blank line left while editing the file)
            # has no columns to index, so skip it instead of raising IndexError
            continue
        sentimentDict[line[0]] = float(line[1])
print(sentimentDict)

{'happy': 1.0, 'sad': -1.0, 'good': 1.0, 'bad': -1.0, 'angry': -2.0, 'ecstatic': 2.0}


In [7]:
def getSentiment(word):
    """Return the sentiment score stored for ``word`` in ``sentimentDict``.

    Words that are not in the dictionary are treated as neutral and score 0.0.
    """
    # dict.get supplies the neutral default when the word is missing
    return sentimentDict.get(word, 0.0)

In [8]:
# An alternative, membership-test formulation of getSentiment
# (illustration only: the `if False` guard means it is never defined)
if False:
    def getSentiment(word):
        # words missing from the dictionary score a neutral 0
        if word not in sentimentDict:
            return 0.0
        return sentimentDict[word]

In [9]:
def analyseSentiment(text):
    """Return the overall sentiment of ``text``.

    The text is tokenised with ``preProcess``, each token is scored with
    ``getSentiment``, and the per-token scores are summed. The token list and
    the score list are printed along the way.
    """
    tokens = preProcess(text)
    print("words:", tokens)
    # one score per token, in token order
    scores = list(map(getSentiment, tokens))
    print("scores", scores)
    total = sum(scores)
    return total

In [10]:
# score the sample text; the result is the sum of the per-word scores
s = analyseSentiment(text)
print("sentiment = ", s)

original: hello I am happy.:) :)Are you?
tokenising: hello I am happy .:) :) Are you ?
words: ['hello', 'i', 'am', 'happy', '.:)', ':)', 'are', 'you', '?']
scores [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
sentiment =  1.0


In [11]:
# try a sentence containing a negatively scored dictionary word
analyseSentiment("i am very angry")

original: i am very angry
tokenising: i am very angry
words: ['i', 'am', 'very', 'angry']
scores [0.0, 0.0, 0.0, -2.0]


-2.0

# Exercise: Try adding new entries to the file 'sentiment.csv', then re-run the code from the top and test different sentences that contain the new target words.