# A simple dictionary-based sentiment analyser

In [11]:
# library for regular expressions
import re

In [12]:
# sample input: emoticons are glued to punctuation and to the following word,
# which makes it a useful stress test for the tokeniser below
text = "hello I am happy.:) :)Are you?"

In [13]:
def preProcess(text):
    """Tokenise *text* and return a list of lower-cased tokens.

    Steps:
      1. A single punctuation character that directly follows or precedes a
         word character is separated from it by a space. Runs of punctuation
         are not split further, so an emoticon stays in one piece and
         "happy.:)" becomes the two tokens "happy" and ".:)".
      2. The text is split on whitespace.
      3. Every token is lower-cased (the only normalisation for now).

    The original and the spaced-out text are printed as a trace.

    Leading/trailing whitespace never produces empty-string tokens, and an
    empty or whitespace-only input returns an empty list.
    """
    print("original:", text)
    # sentence segmentation - assume already done
    # word tokenisation
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) # separates punctuation at ends of strings
    text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text) # separates punctuation at beginning of strings
    print("tokenising:", text)
    # str.split() with no argument splits on runs of whitespace and, unlike
    # re.split(r"\s+", ...), drops the empty strings that leading or trailing
    # whitespace would otherwise create
    tokens = text.split()
    # normalisation - only by lower casing for now
    return [t.lower() for t in tokens]

In [14]:
# try the tokeniser on the sample sentence and on a second sentence that ends
# in an emoticon; preProcess also prints its own trace lines
print(preProcess(text))
print(preProcess("I am sad :("))

original: hello I am happy.:) :)Are you?
tokenising: hello I am happy .:) :) Are you ?
['hello', 'i', 'am', 'happy', '.:)', ':)', 'are', 'you', '?']
original: I am sad :(
tokenising: I am sad :(
['i', 'am', 'sad', ':(']


In [15]:
# a deliberately tiny sentiment lexicon: each known word maps to a score,
# +1.0 for positive and -1.0 for negative
sentimentDict = {
    'happy': +1.0,
    'sad': -1.0,
    'good': +1.0,
    'bad': -1.0,
}
print(sentimentDict)

{'happy': 1.0, 'sad': -1.0, 'good': 1.0, 'bad': -1.0}


In [16]:
def getSentiment(word):
    """Return the sentiment score of *word*.

    Words found in sentimentDict return their stored score; any other word
    is treated as neutral and returns 0.0.
    """
    return sentimentDict.get(word, 0.0)

In [17]:
# An alternative way of coding the above function
if False:
    def getSentiment(word):
        """Return the score of *word* from sentimentDict, or 0.0 if absent."""
        # words missing from the dictionary are neutral
        if word not in sentimentDict:
            return 0.0
        return sentimentDict[word]

In [18]:
def analyseSentiment(text):
    """Return the overall sentiment of *text*.

    The text is tokenised with preProcess, every token is scored with
    getSentiment, and the scores are summed. The token list and the
    per-token scores are printed along the way as a trace.
    """
    tokens = preProcess(text)
    print("words:", tokens)
    # one score per token, in token order
    token_scores = list(map(getSentiment, tokens))
    print("scores", token_scores)
    total = sum(token_scores)
    return total

In [19]:
# score the sample sentence; analyseSentiment prints its intermediate steps
# before the final total is printed here
s = analyseSentiment(text)
print("sentiment = ", s)

original: hello I am happy.:) :)Are you?
tokenising: hello I am happy .:) :) Are you ?
words: ['hello', 'i', 'am', 'happy', '.:)', ':)', 'are', 'you', '?']
scores [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
sentiment =  1.0


In [20]:
# a negative example: 'sad' is the only dictionary word in this sentence
analyseSentiment("i am very sad")

original: i am very sad
tokenising: i am very sad
words: ['i', 'am', 'very', 'sad']
scores [0.0, 0.0, 0.0, -1.0]


-1.0

# Exercise: Try adding new entries to the dictionary 'sentimentDict' above, then re-run the code from the top and try different sentences that contain the new target words.

In [21]:
# another negative example: 'bad' is the only dictionary word in this sentence
analyseSentiment("this is bad")

original: this is bad
tokenising: this is bad
words: ['this', 'is', 'bad']
scores [0.0, 0.0, -1.0]


-1.0