# A Naive Bayes classifier for sentiment analysis learning probabilities from examples

In [6]:
import re
import unicodecsv
import collections

In [7]:
text = "hello I am happy.:) :)Are you?"

In [8]:
def preProcess(text):
    # print("original:", text) # uncomment for debugging
    # sentence segmentation - assume already done
    # word tokenisation
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) # separates punctuation at ends of strings
    text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text) # separates punctuation at beginning of strings
    # print("tokenising:", text) # uncomment for debugging
    tokens = re.split(r"\s+",text)
    # normalisation - only by lower casing for now
    tokens = [t.lower() for t in tokens]
    return tokens

In [9]:
print(preProcess(text))

['hello', 'i', 'am', 'happy', '.:)', ':)', 'are', 'you', '?']


In [12]:
# turn each line in the file into a feature vectors
featureDict = {}  # Global feature dictionary maps from words/features to unique index
i = 0 # index counter variable for the global feature dict

def toFeatureVector(words, training=False):
    """Convert word tokens into a feature vector dictionary of
    feature index keys and weight values.
    
    Update the featureVector if in training phase, i.e. where training=True.
    """
    
    featureVector = {}  # local feature vector for this example (in dict form)
    for w in words:
        
        # First add to the global feature dictionary, and give it a position
        # if in training
        try:
            # if the word w is in the dictionary,
            # then assign i to its value in the dictionary
            i = featureDict[w]
        except KeyError:
            # else if it's not in the dictionary,
            # it's a new feature in training, add new entry
            if training:
                i = len(featureDict) + 1
                featureDict[w] = i
            else:
                # At test time don't add new entry
                i = featureDict["<unk/>"]  # get the special unknown word token
        
        # Add to local feature vector (as a dictionary entry with a weight)

        try:
            # if the word w is in the local feature vector (repeat word), add 1/n weight
            featureVector[i] += 1 #(1.0/len(words))
        except KeyError:
            # else if it's not in the local feature vector, create the entry with 1/n weight
            featureVector[i] = 1 #(1.0/len(words))
            
        if False:
            # example binary alternative to the above (just add it as 1 if present)
            if not i in featureVector.keys():
                featureVector[i] = 1
        
            
    return featureVector

In [13]:
print(toFeatureVector(preProcess(text),training=True))

{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}


In [14]:
print(featureDict)

{'hello': 1, 'i': 2, 'am': 3, 'happy': 4, '.:)': 5, ':)': 6, 'are': 7, 'you': 8, '?': 9}


In [15]:
# load training data from external files
trainData = []
trainLabels = []
featureDict = {}  # Global feature dictionary

i = 0 # index variable for the global feature dict

# load positive examples
with open('pos.txt', encoding='UTF-8') as f:
    for line in f:
        trainData.append(toFeatureVector(preProcess(line), training=True))
        trainLabels.append(1.0)

# load negative examples
with open('neg.txt', encoding='UTF-8') as f:
    for line in f:
        trainData.append(toFeatureVector(preProcess(line), training=True))
        trainLabels.append(-1.0)
print(len(featureDict), "features")
print(len(trainData), "training instances")

8761 features
2000 training instances


In [16]:
# Add an unknown token at the end for unknown words
featureDict["<unk/>"] = len(featureDict)+1

In [22]:
# have a look at the feature dict with all features and their index
featureDict

{'@zenalison': 1,
 'thanks': 2,
 ',': 3,
 'sweetie': 4,
 ':)': 5,
 'particularly': 6,
 'with': 7,
 'hip': 8,
 'tendonitis': 9,
 'it': 10,
 'was': 11,
 'a': 12,
 'wiggle': 13,
 'alright': 14,
 '(': 15,
 'heatmat': 16,
 '': 17,
 'thankyou': 18,
 'mutia': 19,
 'rt': 20,
 '@mutiandn': 21,
 ':': 22,
 '@ariezzo': 23,
 '<---': 24,
 'go': 25,
 'follow': 26,
 'him': 27,
 'kalo': 28,
 'tl': 29,
 'lo': 30,
 'mau': 31,
 'rame': 32,
 '.': 33,
 'folback': 34,
 '??': 35,
 'just': 36,
 'mention': 37,
 'd': 38,
 '@isma_nara': 39,
 'asiikk': 40,
 'ada': 41,
 'testimonial': 42,
 'boleh': 43,
 'diemail': 44,
 'ke': 45,
 'admin@mommeworld': 46,
 'com': 47,
 'mom': 48,
 '@melegner': 49,
 'you': 50,
 "'": 51,
 're': 52,
 'the': 53,
 'best': 54,
 '!': 55,
 'this': 56,
 'is': 57,
 'option': 58,
 '#2': 59,
 'http': 60,
 '://twitpic': 61,
 'com/3cpb06': 62,
 '@sophiegeldard': 63,
 '://j': 64,
 'mp/gjj6wt': 65,
 'listen': 66,
 'to': 67,
 'that': 68,
 '@ashlilliott': 69,
 'as': 70,
 'if': 71,
 'ewww': 72,
 'mcdona

In [23]:
print("features = ", trainData[0])
print("label = ", trainLabels[0])

features =  {1: 1, 2: 1, 3: 1, 4: 1, 5: 3, 6: 1, 7: 2, 8: 1, 9: 1, 10: 1, 11: 1, 12: 2, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1}
label =  1.0


In [24]:
# Zip up the training set into one list of tuple of (feature_vals, labels)
train_set = [(fv, l) for fv, l in zip(trainData, trainLabels)] 
train_set[0]

({1: 1,
  2: 1,
  3: 1,
  4: 1,
  5: 3,
  6: 1,
  7: 2,
  8: 1,
  9: 1,
  10: 1,
  11: 1,
  12: 2,
  13: 1,
  14: 1,
  15: 1,
  16: 1,
  17: 1},
 1.0)

In [27]:
# check the balance of the dataset
from collections import Counter
Counter([label for df, label in train_set])

Counter({1.0: 1000, -1.0: 1000})

# A simple machine learning/statistical model: Naive Bayes classifier

In [28]:
# libraries for: NLP, machine learning
import nltk 

In [29]:
# train the NB classifier
model = nltk.NaiveBayesClassifier.train(train_set)

In [30]:
# have a little look at what it's learned/induced:
model.show_most_informative_features(20)

Most Informative Features
                      75 = 1                 1.0 : -1.0   =     47.0 : 1.0
                      38 = 1                 1.0 : -1.0   =     11.0 : 1.0
                    1715 = 1                -1.0 : 1.0    =     11.0 : 1.0
                    1300 = 1                -1.0 : 1.0    =     10.3 : 1.0
                    1349 = 1                -1.0 : 1.0    =     10.1 : 1.0
                     861 = 1                 1.0 : -1.0   =      9.7 : 1.0
                    3083 = 1                -1.0 : 1.0    =      9.0 : 1.0
                     804 = 1                 1.0 : -1.0   =      8.3 : 1.0
                     934 = 1                 1.0 : -1.0   =      8.3 : 1.0
                    2388 = 1                -1.0 : 1.0    =      7.7 : 1.0
                     187 = 1                 1.0 : -1.0   =      7.7 : 1.0
                      38 = 2                 1.0 : -1.0   =      7.6 : 1.0
                    5369 = None              1.0 : -1.0   =      7.2 : 1.0

In [31]:
# have a look at the priors for each class
for s in model._label_probdist.samples():
    print(s, model._label_probdist.prob(s))

1.0 0.5
-1.0 0.5


In [32]:
# have a look at the feature weights
# for each class and feature pair, this is a distribution over values
# (if binary counts only 0 or 1, if full Bag-of-words counts a distribution over counts None/0,1,2,...m)
count = 100  # only print some out for space
for c,x in model._feature_probdist.items():
    print("class=", c[0], "feature=",c[1])
    for s in x.samples():
        print(s, x.prob(s))
    print("*****")
    count-=1
    if count < 1:
        break

class= 1.0 feature= 1
1 0.0014985014985014985
None 0.9985014985014985
*****
class= 1.0 feature= 2
1 0.045454545454545456
None 0.9545454545454546
*****
class= 1.0 feature= 3
1 0.1630922693266833
2 0.02144638403990025
3 0.009476309226932668
4 0.003491271820448878
None 0.8024937655860349
*****
class= 1.0 feature= 4
1 0.0034965034965034965
None 0.9965034965034965
*****
class= 1.0 feature= 5
3 0.004488778054862843
1 0.5740648379052369
2 0.05236907730673317
4 0.0024937655860349127
None 0.36658354114713215
*****
class= 1.0 feature= 6
1 0.0014985014985014985
None 0.9985014985014985
*****
class= 1.0 feature= 7
2 0.003494757863205192
1 0.04942586120818772
None 0.9470793809286071
*****
class= 1.0 feature= 8
1 0.0014985014985014985
None 0.9985014985014985
*****
class= 1.0 feature= 9
1 0.0014985014985014985
None 0.9985014985014985
*****
class= 1.0 feature= 10
1 0.11427145708582834
2 0.010479041916167664
3 0.0054890219560878245
None 0.8697604790419161
*****
class= 1.0 feature= 11
1 0.033433133732534

In [33]:
# predict from model
def analyseSentiment(text):
    v = toFeatureVector(preProcess(text))
    print(v)
    return model.classify(v)

In [34]:
# sentiment analyse the example at the top of the notebook
print(text)
s = analyseSentiment(text)
print("sentiment = ", s)

hello I am happy.:) :)Are you?
{934: 1, 110: 1, 527: 1, 674: 1, 8762: 1, 5: 1, 95: 1, 50: 1, 74: 1}
sentiment =  1.0


In [35]:
# See if we can make it output the negative label (-1)
analyseSentiment("oh no :(")

{372: 1, 438: 1, 5369: 1}


-1.0

# Exercise: Try inputting different texts to analyseSentiment which you think have negative or positive sentiment to see how the sentiment classification changes, and how good/bad it is. HINT: it can only be as good as the data it's trained on, so do look at pos.txt and neg.txt.

In [36]:
analyseSentiment("I don't like today's weather")

{110: 1, 191: 1, 51: 2, 192: 1, 128: 1, 1921: 1, 89: 1, 1259: 1}


-1.0

In [37]:
analyseSentiment("I like eating apple")

{110: 1, 128: 1, 994: 1, 1682: 1}


1.0

In [38]:
analyseSentiment("I like eating apple but I don't like eating orange")

{110: 2, 128: 2, 994: 2, 1682: 1, 332: 1, 191: 1, 51: 1, 192: 1, 8762: 1}


-1.0