## Load Data

In [1]:
import numpy as np
from collections import Counter

In [1]:
path = "/Users/zefeng-zhang/data/aclImdb/"
train_path = path + 'train/'
test_path = path + 'test/'

In [2]:
data_raw_pos = sc.textFile(train_path + "pos/*.txt")
data_raw_neg = sc.textFile(train_path + "neg/*.txt")

In [3]:
data_raw_pos.first()

u'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [4]:
data_raw_pos = data_raw_pos.sample(False, 0.9, 1)
data_raw_neg = data_raw_neg.sample(False, 0.9, 1)

In [29]:
data_raw_pos.getNumPartitions()

12500

In [30]:
num_partitions = 8
data_raw_pos = data_raw_pos.repartition(num_partitions)
data_raw_neg = data_raw_neg.repartition(num_partitions)

## Training NB

In [31]:
# split into words
data_pos = data_raw_pos.flatMap(lambda x : x.split()).map(lambda x : (x, 1)).reduceByKey(lambda x, y : x+y)
data_pos.take(10)

[(u'convolute', 1),
 (u'remastered', 8),
 (u'drunkenness.', 1),
 (u'Northam)', 4),
 (u'gangs.', 1),
 (u'anyways.I', 1),
 (u'Vindhyan.', 1),
 (u'bar!', 2),
 (u'actioner,', 1),
 (u'revelers', 3)]

In [32]:
data_neg = data_raw_neg.flatMap(lambda x : x.split()).map(lambda x : (x, 1)).reduceByKey(lambda x, y : x+y)
data_neg.take(10)

[(u'fawn', 2),
 (u'Dh\xe9ry,', 1),
 (u'boys),', 1),
 (u'"SPOILER', 1),
 (u'planted-in-life', 1),
 (u'gangs.', 1),
 (u'her,in', 1),
 (u'Megalodon"', 1),
 (u'(269.)', 1),
 (u"syberberg's", 1)]

In [33]:
count_pos = data_pos.map(lambda x : x[1]).reduce(lambda x, y : x+y)
count_neg = data_neg.map(lambda x : x[1]).reduce(lambda x, y : x+y)
print count_pos, count_neg

2661940 2593243


In [34]:
v1 = data_pos.map(lambda x : x[0])
v2 = data_neg.map(lambda x : x[0])
v0 = v1.union(v2).distinct()
v = v0.count()
print v

262059


In [35]:
pos_denom = float(count_pos + v + 1)
neg_denom = float(count_neg + v + 1)

In [36]:
pos_prob = data_pos.map(lambda x : (x[0], np.log(float(x[1] + 1)/pos_denom)))
neg_prob = data_neg.map(lambda x : (x[0], np.log(float(x[1] + 1)/neg_denom)))

In [37]:
pos_prob.take(10)
neg_prob.take(10)

[(u'fawn', -13.766076236369988),
 (u'Dh\xe9ry,', -14.171541344478152),
 (u'boys),', -14.171541344478152),
 (u'"SPOILER', -14.171541344478152),
 (u'planted-in-life', -14.171541344478152),
 (u'gangs.', -14.171541344478152),
 (u'her,in', -14.171541344478152),
 (u'Megalodon"', -14.171541344478152),
 (u'(269.)', -14.171541344478152),
 (u"syberberg's", -14.171541344478152)]

In [38]:
pos_prob = dict(pos_prob.collect())
neg_prob = dict(neg_prob.collect())

In [39]:
pos_prob_b = sc.broadcast(pos_prob)
neg_prob_b = sc.broadcast(neg_prob)

## Prediction

In [40]:
def pred_class(doc):
    words = doc.split(" ")
    counts = Counter(words)
    log_pos = 0.0
    log_neg = 0.0
    for w in counts:
        log_pos += counts[w] * pos_prob_b.value.get(w, np.log(1.0/pos_denom)) # default probability of "unk"
        log_neg += counts[w] * neg_prob_b.value.get(w, np.log(1.0/neg_denom))
    if log_pos > log_neg:
        return "pos"
    return "neg"

In [6]:
test_raw_pos = sc.textFile(test_path + "pos/*.txt")
print test_raw_pos.first()
test_raw_neg = sc.textFile(test_path + "neg/*.txt")

I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.


In [42]:
test_pos_res = test_raw_pos.map(pred_class)
pos_results = Counter(test_pos_res.collect())

test_neg_res = test_raw_neg.map(pred_class)
neg_results = Counter(test_neg_res.collect())

In [45]:
accuracy = (pos_results['pos'] + neg_results['neg'])/float(pos_results['pos'] + pos_results['neg'] + neg_results['pos'] + neg_results['neg'])
print accuracy

0.8248
