In [3]:
from pyspark import SparkConf, SparkContext

# Application settings: run Spark locally ('local' master) under the name
# 'EmailSpam'.
conf = SparkConf().setMaster('local').setAppName('EmailSpam')

# getOrCreate() hands back the SparkContext that is already running, if any,
# so re-executing this cell does not fail with "Cannot run multiple
# SparkContexts at once". On a fresh kernel it creates a new context from
# `conf`; when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

# Load both corpora as RDDs of text lines. The cells below treat every line as
# one email: 'spam.txt' feeds the positive class, 'ham.txt' the negative one.
spam, ham = (sc.textFile(corpus) for corpus in ('spam.txt', 'ham.txt'))

In [5]:
# Term-frequency hasher that maps every token to one of 10000 feature indices.
tf = HashingTF(numFeatures=10000)


def emailToFeatures(email):
    """Split one email on single spaces and return its term-frequency vector.

    NOTE(review): split(' ') produces empty-string tokens for leading,
    trailing or repeated spaces, and those are hashed like any other term.
    Consider split() if that is not intended -- confirm against the data.
    """
    return tf.transform(email.split(' '))


# Apply the same featurisation to both corpora.
spamFeatures = spam.map(emailToFeatures)
hamFeatures = ham.map(emailToFeatures)

In [6]:
# Attach the class label to every feature vector:
# label 1 marks spam (positive examples), label 0 marks ham (negative examples).
positiveExamples = spamFeatures.map(lambda vec: LabeledPoint(1, vec))
negativeExamples = hamFeatures.map(lambda vec: LabeledPoint(0, vec))

# Combine both classes into a single training set (spam first, then ham).
trainingData = positiveExamples.union(negativeExamples)

# Mark the training set for caching so it is kept once computed instead of
# being rebuilt from the text files on every pass. cache() returns the RDD
# itself, which is why the cell displays it.
trainingData.cache()

UnionRDD[6] at union at NativeMethodAccessorImpl.java:0

In [7]:
# Peek at the first two training records as a sanity check. Both carry label
# 1.0 because the spam examples come first in the union.
trainingData.take(2)

[LabeledPoint(1.0, (10000,[0,365,455,509,1320,1363,1583,2321,2403,3289,3342,4995,5336,5706,5831,6052,6300,6582,6744,8971,8977,9232,9604,9646,9878],[1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])),
 LabeledPoint(1.0, (10000,[0,365,940,2220,3122,4460,4671,5336,5849,8479,9604],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]))]

### Logistic Regression

In [8]:
# Fit a logistic-regression classifier on the labelled emails using the
# trainer's default settings.
# NOTE(review): LogisticRegressionWithSGD is marked deprecated as of Spark 2.0
# (the docs point to LogisticRegressionWithLBFGS or ml.classification's
# LogisticRegression) -- consider migrating; confirm against the Spark version
# in use.
model = LogisticRegressionWithSGD.train(data=trainingData)

In [9]:
# Featurise one spam-like and one ham-like sentence exactly the way the
# training emails were featurised (split on single spaces, then hash).
posTest = tf.transform('O M G GET cheap stuff by sending money to ...'.split(' '))
negTest = tf.transform('Hi Dad, I started studying Spark the other ...'.split(' '))

# Score both vectors with the trained model; 1 means spam, 0 means ham.
posPrediction = model.predict(posTest)
negPrediction = model.predict(negTest)

# NOTE(review): the recorded output shows 0 for both sentences, i.e. the
# spam-like example is not flagged -- worth checking the model/training data.
print('Prediction for positive test example: %g' % posPrediction)
print('Prediction for negative test example: %g' % negPrediction)

Prediction for positive test example: 0
Prediction for negative test example: 0


### Creating vectors

In [12]:
from numpy import array
from pyspark.mllib.linalg import Vectors

# Two ways to spell the dense vector [1.0, 2.0, 3.0]:
# a plain NumPy array, and MLlib's own dense vector built by the factory.
denseVec1 = array([1.0, 2.0, 3.0])
denseVec2 = Vectors.dense([1.0, 2.0, 3.0])

# Two ways to spell the same sparse vector of size 4 (value 1.0 at index 0,
# value 2.0 at index 2, every other entry implicitly zero):
# as an {index: value} dict, and as parallel index / value lists.
sparseVec1 = Vectors.sparse(4, {0:1.0, 2:2.0})
sparseVec2 = Vectors.sparse(4, [0, 2], [1.0, 2.0])

### Using HashingTF

In [17]:
# Tokenise a tiny sentence on whitespace; 'hello' occurs twice on purpose so
# the resulting vector shows a count of 2.0 for one index.
sentence = 'hello hello world'
words = sentence.split()

# Re-create the hasher with the same 10000-feature space used earlier and
# display the term-frequency vector of the token list.
tf = HashingTF(numFeatures=10000)
tf.transform(words)

SparseVector(10000, {7772: 2.0, 9657: 1.0})

In [20]:
# wholeTextFiles yields one (file name, file content) pair per file under
# "data". Python 3 removed tuple parameters such as `lambda (name, text): ...`
# (PEP 3113) -- that is what raised the SyntaxError -- so the pair is received
# as a single argument and the content is picked out by index.
rdd = sc.wholeTextFiles("data").map(lambda name_and_text: name_and_text[1].split())

# Given an RDD of token lists, transform() hashes every document and returns
# an RDD of term-frequency vectors.
tfVectors = tf.transform(rdd)