# Naive Bayes for Sentiment Analysis

## Load Data

In [4]:
import pyspark
from pyspark import SparkContext 
import numpy as np
from collections import Counter

In [7]:
sc = pyspark.SparkContext(appName="NB")
path = "/Users/yinterian/teaching/ML-2/data/large-movie-review/aclImdb/"
train_path = path + "train/"
test_path = path + "test/"

In [8]:
data_raw_pos = sc.textFile(train_path + "pos/*.txt")
data_raw_neg = sc.textFile(train_path + "neg/*.txt")

In [9]:
# note that this is a whole review
data_raw_pos.first()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [10]:
# sample 20% of the data
data_raw_pos = data_raw_pos.sample(False, 0.2, 1)
data_raw_neg = data_raw_neg.sample(False, 0.2, 1)

In [11]:
# number of partitions
data_raw_pos.getNumPartitions()

12500

In [12]:
# You may OR may NOT want to repartition or coalesce
# num_partitions = 3 or 4 times the number of CPUs
num_partitions = 8
data_raw_pos = data_raw_pos.repartition(num_partitions)
data_raw_neg = data_raw_neg.repartition(num_partitions)

In [14]:
# count 2529 elements
print(data_raw_pos.count())
print(data_raw_neg.count())

2482
2482


## Training NB

In [15]:
# split into words (here we could filter stepwords, clean, rm punctuation)
data_pos = data_raw_pos.flatMap(lambda x: x.split())
data_pos.take(10)

['I',
 'really',
 'dislike',
 'both',
 'Shrek',
 'films.',
 '(Since',
 'their',
 'both',
 '"PG"']

In [16]:
# transform to value pairs to be able to count
data_pos = data_pos.map(lambda x: (x, 1))
data_pos.take(10)

[('I', 1),
 ('really', 1),
 ('dislike', 1),
 ('both', 1),
 ('Shrek', 1),
 ('films.', 1),
 ('(Since', 1),
 ('their', 1),
 ('both', 1),
 ('"PG"', 1)]

In [17]:
# counting number of words
data_pos = data_pos.reduceByKey(lambda x,y:x+y)
data_pos.take(10)

[('really', 1033),
 ('dislike', 12),
 ('them', 536),
 ('would', 1003),
 ('disliked', 11),
 ('/><br', 4990),
 ('when', 1208),
 ('of', 14958),
 ('other', 794),
 ('Pixar.', 1)]

In [18]:
# we can do all together
data_neg = data_raw_neg.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x,y:x+y)
data_neg.take(10)

[('movie', 3480),
 ('scandal,', 3),
 ('like', 2062),
 ('where', 560),
 ('believe', 238),
 ('but', 3518),
 ('he', 1949),
 ('of', 13635),
 ('Cradle', 7),
 ('Will', 16)]

How should we compute count(pos) and count(neg)?

In [19]:
count_pos = data_pos.map(lambda x: x[1]).reduce(lambda x,y:x+y)
count_neg = data_neg.map(lambda x: x[1]).reduce(lambda x,y:x+y)

In [21]:
print(count_pos, count_neg)

589459 575558


In [23]:
## Let's get V
v1 = data_pos.map(lambda x: x[0]) # pos vocabulary
v2 = data_neg.map(lambda x: x[0]) # neg vocabulary
v = v1.union(v2)
#v.count()
v0 = v.distinct()
V = v0.count()
print(V)

101238


In [24]:
pos_denom = float(count_pos + V + 1)
neg_denom = float(count_neg + V + 1)

In [25]:
# log probabities
pos_prob = data_pos.map(lambda x: (x[0], np.log(float(x[1] + 1)/pos_denom)))

neg_prob = data_neg.map(lambda x: (x[0], np.log(float(x[1] + 1)/neg_denom))) 

In [26]:
pos_prob.take(10)

[('really', -6.5042679043913516),
 ('dislike', -10.880508601998189),
 ('them', -7.1594598649508612),
 ('would', -6.5337106592080509),
 ('disliked', -10.960551309671725),
 ('/><br', -4.9300663899901167),
 ('when', -6.3479091088449335),
 ('of', -3.8323895550852649),
 ('other', -6.7671158448053941),
 ('Pixar.', -12.75231077889978)]

In [27]:
pos_prob = dict(pos_prob.collect())
neg_prob = dict(neg_prob.collect())
print(pos_prob)



In [29]:
# broadcast = shared by all nodes
pos_prob_b = sc.broadcast(pos_prob)
neg_prob_b = sc.broadcast(neg_prob)

## Prediction

In [31]:
test_raw_pos = sc.textFile(test_path + "pos/*.txt")
test_raw_neg = sc.textFile(test_path + "neg/*.txt")

test_raw_pos = test_raw_pos.sample(False, 0.2, 1)
test_raw_neg = test_raw_neg.sample(False, 0.2, 1)

num_partitions = 8
test_raw_pos = test_raw_pos.repartition(num_partitions)
test_raw_neg = test_raw_neg.repartition(num_partitions)

print(test_raw_pos.count())
print(test_raw_neg.count())

2482
2482


In [36]:
doc = test_raw_pos.first()
print(doc)

"Seeing Other People" is a daring romantic comedy about a couple named Ed and Alice (Jay Mohr and Julianne Nicholson) who are engaged and plan to be wed soon. They live together but are both having doubts about their relationship. Alice realizes she's had so few sexual relationships in the past, she might just be marrying Ed because she's never felt anything else. So they agree to begin fooling around with other people for a while to test their own relationship.<br /><br />The movie balances a prescient question - by focusing too much on the "What if?" aspects of life, can it in fact do the opposite and only make you feel more constrained? When Ed begins having sex with a college girl he begins to become addicted and almost forget about Alice - when he realizes this, it scares him.<br /><br />I hadn't heard anything about this film in advance but I enjoyed it. It's not extremely well-made and definitely has that purposefully low-budget indie feel to it - but it's a lot better than most

In [37]:
def pred_class(doc):
    words = doc.split(" ")
    counts = Counter(words)
    log_pos = 0.0
    log_neg = 0.0
    for w in counts:
        log_pos += counts[w]* pos_prob_b.value.get(w, np.log(1.0/pos_denom))
        log_neg += counts[w]* neg_prob_b.value.get(w, np.log(1.0/neg_denom))
    if log_pos > log_neg:
        return "pos"
    return "neg"

In [38]:
pred_class(doc)

'pos'

In [39]:
test_pos_res = test_raw_pos.map(pred_class)
test_pos_res.take(10)

['pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos']

In [41]:
test_pos_res = test_raw_pos.map(pred_class).map(lambda x: (x, 1)).reduceByKey(lambda x,y:x+y)
pos_results = dict(test_pos_res.collect())
print(pos_results)

{'neg': 592, 'pos': 1890}


In [42]:
test_neg_res = test_raw_neg.map(pred_class).map(lambda x: (x, 1)).reduceByKey(lambda x,y:x+y)
neg_results = dict(test_neg_res.collect())
print(neg_results)

{'neg': 2156, 'pos': 326}


In [43]:
# compute accuracy
total = sum(neg_results.values()) + sum(pos_results.values())
acc = float(neg_results["neg"] + pos_results["pos"]) / float(total)
print(acc)

0.815068493150685


Your turn now. Improve data cleaning and try again. Can you add top K bi-grams?

Here is how we can  