In [1]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.linalg import Vectors
import re

In [3]:
train_pos = sc.textFile('NB_files/train_pos.txt')
train_neg = sc.textFile('NB_files/train_neg.txt')
test_pos = sc.textFile('NB_files/test_pos.txt')
test_neg = sc.textFile('NB_files/test_neg.txt')

In [4]:
stop_words = ['a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
              'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because',
              'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',
              'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
              'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
              'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',
              'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may',
              'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor',
              'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
              'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since',
              'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then',
              'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us',
              've', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which',
              'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet',
              'you', 'your']

In [5]:
def parse(blob):
    blob = blob.split('\n')
    blob = [re.sub('[^A-Za-z]', ' ', i) for i in blob]
    blob = [i.split() for i in blob]
    blob = [item for sublist in blob for item in sublist]
    blob = [word.strip().lower() for word in blob if word not in stop_words if len(word) >= 3]
    return blob

In [6]:
# Parse Text into Words
train_pos_bag = train_pos.map(parse)
train_neg_bag = train_neg.map(parse)
test_pos_bag = test_pos.map(parse)
test_neg_bag = test_neg.map(parse)

In [7]:
# TF Mapping
train_tf_pos = HashingTF().transform(train_pos_bag)
train_tf_neg = HashingTF().transform(train_neg_bag)
test_tf_pos = HashingTF().transform(test_pos_bag)
test_tf_neg = HashingTF().transform(test_neg_bag)

In [8]:
# IDF Transformation
train_idf_pos = IDF().fit(train_tf_pos)
train_idf_neg = IDF().fit(train_tf_neg)
test_idf_pos = IDF().fit(test_tf_pos)
test_idf_neg = IDF().fit(test_tf_neg)

train_tfidf_pos = train_idf_pos.transform(train_tf_pos)
train_tfidf_neg = train_idf_neg.transform(train_tf_neg)
test_tfidf_pos = test_idf_pos.transform(test_tf_pos)
test_tfidf_neg = test_idf_neg.transform(test_tf_neg)

In [9]:
# Label Positive and Negative
train_tfidf_pos = train_tfidf_pos.map(lambda x: LabeledPoint(1, x))
train_tfidf_neg = train_tfidf_neg.map(lambda x: LabeledPoint(0, x))
test_tfidf_pos = test_tfidf_pos.map(lambda x: LabeledPoint(1, x))
test_tfidf_neg = test_tfidf_neg.map(lambda x: LabeledPoint(0, x))

In [10]:
# Join Positive and Negative
train_all_tfidf = train_tfidf_pos.union(train_tfidf_neg)
test_all_tfidf = test_tfidf_pos.union(test_tfidf_neg)

train_all_tfidf.cache()
test_all_tfidf.cache()

UnionRDD[41] at union at NativeMethodAccessorImpl.java:-2

### Model Accuracy

In [11]:
train_data, test_data = train_all_tfidf.randomSplit([0.6, 0.4], seed = 0)

model_train = NaiveBayes.train(train_data)
predictionAndLabel = test_data.map(lambda p : (model_train.predict(p.features), p.label))
train_accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_data.count()
print "TRAINING ACCURACY:  ", train_accuracy

TRAINING ACCURACY:   0.782765531062


In [12]:
#### TRAIN SET ACCURACY ####

train_data, test_data = train_all_tfidf.randomSplit([0.6, 0.4], seed = 0)

model_train = NaiveBayes.train(train_data)
predictionAndLabel = test_data.map(lambda p : (model_train.predict(p.features), p.label))
train_accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_data.count()
print "TRAINING ACCURACY:  ", train_accuracy

TRAINING ACCURACY:   0.782765531062


In [13]:
#### TEST SET ACCURACY ####

model_test = NaiveBayes.train(train_all_tfidf)
predictionAndLabel = test_all_tfidf.map(lambda p : (model_test.predict(p.features), p.label))
test_accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_all_tfidf.count()
print "TEST ACCURACY:  ", test_accuracy

TEST ACCURACY:   0.72904
