In [1]:
import os

# PYSPARK_PYTHON names the Python executable PySpark should launch; it is set
# here, before the SparkContext is created, to the python2 conda environment.
os.environ['PYSPARK_PYTHON'] = '/opt/conda/envs/python2/bin/python2'

from pyspark import SparkConf, SparkContext

# Master "local[*]": run Spark inside this process rather than on a cluster.
sconf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=sconf)

In [2]:
from pyspark.sql import SQLContext

# SQL entry point bound to the SparkContext created in the first cell.
sqlCtx = SQLContext(sparkContext=sc)

In [3]:
# Load the gzipped JSON dump of labelled newsgroup posts into a DataFrame.
data = (
    sqlCtx
    .read
    .json('../notebook_data/20newsgroups.labelled.json.gz')
)

In [4]:
# Print the DataFrame's column names, types and nullability.
data.printSchema()

root
 |-- approved: string (nullable = true)
 |-- article-id: string (nullable = true)
 |-- content: string (nullable = true)
 |-- date: string (nullable = true)
 |-- distribution: string (nullable = true)
 |-- followup-to: string (nullable = true)
 |-- from: string (nullable = true)
 |-- in-reply-to: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- label: string (nullable = true)
 |-- lines: string (nullable = true)
 |-- message-id: string (nullable = true)
 |-- newsgroups: string (nullable = true)
 |-- nntp-posting-host: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- originator: string (nullable = true)
 |-- path: string (nullable = true)
 |-- references: string (nullable = true)
 |-- reply-to: string (nullable = true)
 |-- sender: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- x-newsreader: string (nullable = true)
 |-- xref: string (nullable = true)



In [5]:
# Expose the DataFrame to Spark SQL under the table name 'newsgroups'.
data.registerTempTable('newsgroups')

# Distinct label values, and the (label, content) projection used by later cells.
labelsDistinct = sqlCtx.sql("SELECT DISTINCT(label) FROM newsgroups")
docs = sqlCtx.sql("SELECT label, content FROM newsgroups")

# Sanity check: let Spark SQL count the distinct labels.
labelCountRows = sqlCtx.sql("SELECT COUNT(DISTINCT(label)) as labelCount FROM newsgroups").collect()
print(labelCountRows)

[Row(labelCount=20)]


In [6]:
from IPython.display import display, HTML

# Collect the distinct labels to the driver once and report how many there are.
# (Previously count() and collect() each triggered their own Spark job.)
# The position of a label in this list is used as its numeric class id later on.
labelList = labelsDistinct.map(lambda r: r.label.encode('utf-8')).collect()
print(len(labelList))

# Keep the Broadcast handle: a broadcast variable can only be read through the
# object returned by sc.broadcast() (its `.value`), so discarding the return
# value made the call useless. Later cells in this notebook still capture
# `labelList` directly in their closures.
labelListBroadcast = sc.broadcast(labelList)

# Render the labels as a one-column HTML table.
th = "<th>Label</th>"
td = ["<tr><td>" + d + "</td></tr>" for d in labelList]
display(HTML("<table><thead><tr>" + th + "</tr></thead><tbody>" + "".join(td) + "</tbody></table>"))

20


Label
rec.sport.hockey
sci.electronics
sci.med
rec.autos
comp.sys.mac.hardware
comp.windows.x
rec.sport.baseball
comp.sys.ibm.pc.hardware
misc.forsale
rec.motorcycles


In [7]:
from IPython.display import display, HTML

# Count the messages per label: emit (label, 1) for every row and sum by key.
labels = sqlCtx.sql("SELECT label FROM newsgroups")
labelCounts = (labels
               .map(lambda r: (str(r.label), 1))
               .reduceByKey(lambda v1, v2: v1 + v2)
               .collect())

# Render ID / label / message-count as an HTML table. The ID is the label's
# position in labelList, i.e. the numeric class id used for training.
# Every <td> is now closed; the last cell of each row used to be left open.
th = "<th>ID</th><th>Label</th><th>Messages</th>"
td = ["<tr><td>" + str(labelList.index(l)) + "</td><td>" + l + "</td><td>" + str(m) + "</td></tr>"
      for (l, m) in labelCounts]
display(HTML("<table><thead><tr>" + th + "</tr></thead><tbody>" + "".join(td) + "</tbody></table>"))

ID,Label,Messages
9,rec.motorcycles,1000
4,comp.sys.mac.hardware,1000
11,talk.politics.misc,1000
15,soc.religion.christian,997
12,comp.graphics,1000
19,talk.religion.misc,1000
5,comp.windows.x,1000
7,comp.sys.ibm.pc.hardware,1000
16,talk.politics.guns,1000
13,alt.atheism,1000


In [10]:
import re


def _toLabelledDoc(row):
    """Turn a (label, content) row into (label id, label bytes, cleaned text).

    The label id is the label's position in labelList. The content is encoded
    to UTF-8, every whitespace run is collapsed to a single space, the ends
    are stripped, and the result is decoded back to unicode.
    """
    labelId = labelList.index(row.label)
    labelBytes = row.label.encode('utf-8')
    collapsed = re.sub('\s+', ' ', row.content.encode('utf-8')).strip()
    return (labelId, labelBytes, collapsed.decode('utf-8'))


labelledDocs = docs.map(_toLabelledDoc)
print(labelledDocs.take(10))
print(labelledDocs.count())

[(13, 'alt.atheism', u'Archive-name: atheism/resources Alt-atheism-archive-name: resources Last-modified: 11 December 1992 Version: 1.0 Atheist Resources Addresses of Atheist Organizations USA FREEDOM FROM RELIGION FOUNDATION Darwin fish bumper stickers and assorted other atheist paraphernalia are available from the Freedom From Religion Foundation in the US. Write to: FFRF, P.O. Box 750, Madison, WI 53701. Telephone: (608) 256-8900 EVOLUTION DESIGNS Evolution Designs sell the "Darwin fish". It\'s a fish symbol, like the ones Christians stick on their cars, but with feet and the word "Darwin" written inside. The deluxe moulded 3D plastic fish is $4.95 postpaid in the US. Write to: Evolution Designs, 7119 Laurel Canyon #4, North Hollywood, CA 91605. People in the San Francisco Bay area can get Darwin Fish from Lynn Gold -- try mailing <figmo@netcom.com>. For net people who go to Lynn directly, the price is $4.95 per fish. AMERICAN ATHEIST PRESS AAP publish various atheist books -- criti

In [15]:
from pyspark.mllib.feature import HashingTF
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of \w characters, so punctuation and whitespace
# act as separators.
# Alternative tokenizer (needs `import nltk`): nltk.word_tokenize(c)
tokenizer = RegexpTokenizer(r'\w+')

# Hashing term-frequency transformer with its default feature dimension.
htf = HashingTF()


def _hashDoc(doc):
    """Replace the text of an (id, label, text) triple by its hashed TF vector."""
    labelId, labelName, text = doc
    return (labelId, labelName, htf.transform(tokenizer.tokenize(text)))


htfDocs = labelledDocs.map(_hashDoc)
print(htfDocs.take(1))
print(htfDocs.count())

[(13, 'alt.atheism', SparseVector(1048576, {1542: 1.0, 2424: 1.0, 3407: 1.0, 3932: 5.0, 4048: 1.0, 4415: 3.0, 5438: 1.0, 7380: 1.0, 7801: 1.0, 9195: 1.0, 9669: 1.0, 10494: 2.0, 13193: 1.0, 14370: 1.0, 15702: 1.0, 16883: 1.0, 17001: 1.0, 19142: 1.0, 19278: 2.0, 19527: 3.0, 19550: 1.0, 19744: 1.0, 20838: 1.0, 21524: 3.0, 24165: 1.0, 24717: 1.0, 29899: 1.0, 30848: 1.0, 31335: 1.0, 31467: 1.0, 31529: 1.0, 32516: 1.0, 32626: 1.0, 33950: 1.0, 36749: 1.0, 36751: 3.0, 36754: 10.0, 36757: 4.0, 38019: 2.0, 38034: 1.0, 39073: 1.0, 39164: 1.0, 40517: 1.0, 43916: 1.0, 44593: 1.0, 46605: 1.0, 50270: 1.0, 50570: 24.0, 50573: 5.0, 50583: 18.0, 50591: 1.0, 52237: 1.0, 52574: 2.0, 53243: 1.0, 54818: 1.0, 56304: 1.0, 57166: 5.0, 57970: 1.0, 59273: 1.0, 59788: 1.0, 60338: 1.0, 62563: 1.0, 63131: 1.0, 65502: 1.0, 66413: 1.0, 66951: 5.0, 67434: 1.0, 69386: 1.0, 69970: 1.0, 71108: 1.0, 73900: 1.0, 74187: 1.0, 75111: 1.0, 75612: 1.0, 76149: 2.0, 76489: 1.0, 76746: 1.0, 76806: 1.0, 78123: 2.0, 80379: 2.0, 8093

In [16]:
from pyspark.mllib.regression import LabeledPoint

# Pair each numeric label id with its term-frequency vector; the label string
# in the middle of the triple is dropped.
inputData = htfDocs.map(lambda doc: LabeledPoint(doc[0], doc[2]))
print(inputData.first())

# NOTE(review): there is no hold-out split here. `training` and `testing` are
# both the full `inputData` RDD, so any accuracy computed from `testing`
# is measured on the same points the model was trained on.
# The disabled split was:
#   training, testing, validation = inputData.randomSplit([0.7,0.2,0.1])
training = testing = inputData
print(training.first())
print(testing.first())

(13.0,(1048576,[1542,2424,3407,3932,4048,4415,5438,7380,7801,9195,9669,10494,13193,14370,15702,16883,17001,19142,19278,19527,19550,19744,20838,21524,24165,24717,29899,30848,31335,31467,31529,32516,32626,33950,36749,36751,36754,36757,38019,38034,39073,39164,40517,43916,44593,46605,50270,50570,50573,50583,50591,52237,52574,53243,54818,56304,57166,57970,59273,59788,60338,62563,63131,65502,66413,66951,67434,69386,69970,71108,73900,74187,75111,75612,76149,76489,76746,76806,78123,80379,80932,81194,81581,82263,83223,87110,88470,90451,91216,93450,93972,94622,94754,94992,96914,96915,97060,98795,98990,101596,102446,103998,104961,105061,105859,106702,106922,107278,110066,110522,111058,111986,112640,117409,117618,118178,118992,120101,122311,122352,122601,123541,124474,124607,124775,125743,126234,129375,130175,130288,130546,132126,133535,134386,134751,136113,137162,139171,141783,141890,141912,145141,145380,147242,147433,147440,147842,149453,151459,151678,151890,156374,156454,163506,163843,164286,16

In [17]:
import timeit

# Wall-clock timing of the training step. The timer is started before the
# classification import, so that import is part of the measured interval.
start_time = timeit.default_timer()

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train naive Bayes on the training RDD with smoothing parameter lambda_ = 1.0.
model = NaiveBayes.train(training, lambda_=1.0)

elapsed = timeit.default_timer() - start_time
print(elapsed)

325.652728081


In [18]:
# Score every point in `testing`: (predicted label id, true label id).
predictionAndLabel = testing.map(lambda p: (model.predict(p.features), p.label))
print(predictionAndLabel.take(10))

# Accuracy = fraction of pairs whose prediction equals the true label.
numCorrect = predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count()
accuracy = float(numCorrect) / testing.count()
print(accuracy)

[(13.0, 13.0), (13.0, 13.0), (15.0, 13.0), (13.0, 13.0), (11.0, 13.0), (13.0, 13.0), (13.0, 13.0), (13.0, 13.0), (13.0, 13.0), (13.0, 13.0)]
0.793569035355


### Mess below: scratch experiments (the cells below have no recorded execution count)

In [None]:
from pyspark.mllib.feature import HashingTF

htf = HashingTF()

# HashingTF.transform() iterates over its argument and hashes each item as one
# term, so it must be given a list of tokens. Passing the raw string (as before)
# hashes it character by character. The text is therefore split on whitespace
# after the original newline / carriage-return normalisation.
hashedTuples = docs.map(
    lambda r: (
        labelList.index(r.label),
        htf.transform(r.content.encode('utf-8').replace('\n', ' ').replace('\r', '').split()),
    )
)
print(hashedTuples.first())

In [None]:
from pyspark.mllib.regression import LabeledPoint

# Wrap each (label id, feature vector) pair as a LabeledPoint.
inputData = hashedTuples.map(lambda pair: LabeledPoint(*pair))

# Seeded three-way split; the third weight is 0.0.
training, testing, validation = inputData.randomSplit(weights=[0.7, 0.3, 0.0], seed=5)

In [None]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Fit naive Bayes on the training split with smoothing parameter lambda_ = 1.0.
model = NaiveBayes.train(training, lambda_=1.0)

In [None]:
# Score every point in `testing`: (predicted label id, true label id).
predictionAndLabel = testing.map(lambda p: (model.predict(p.features), p.label))

# Show a sample only: collect() would pull every prediction pair to the driver
# and dump all of them into the cell output. take(10) matches the earlier
# evaluation cell of this notebook.
print(predictionAndLabel.take(10))

# Accuracy = fraction of pairs whose prediction equals the true label.
accuracy = 1.0 * predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count() / testing.count()
print(accuracy)

In [None]:
from pyspark.mllib.feature import HashingTF

htf = HashingTF()

# Numeric class id of every document: the label's position in labelList.
labels = docs.map(lambda r: labelList.index(r.label))

# HashingTF.transform() on an RDD maps over its elements and hashes each item
# of an element as one term, so every element must be a list of tokens. With
# raw strings (as before) the features were character counts, not word counts.
# Each document is therefore split on whitespace first.
tfDocs = htf.transform(docs.map(lambda r: r.content.encode('utf-8').split()))
tfDocs.cache()
print(tfDocs.first())

In [None]:
from pyspark.mllib.feature import IDF

# Fit IDF on the term-frequency vectors, then rescale those same vectors.
# `hidf` ends up bound to the fitted model, not to the IDF estimator.
hidf = IDF().fit(tfDocs)
tfIdfDocs = hidf.transform(tfDocs)
print(tfIdfDocs.first())

# Pair each label id with its TF-IDF vector. Despite the variable's name, the
# element order is (label, features).
featureAndLabelData = labels.zip(tfIdfDocs)
print(featureAndLabelData.first())


In [None]:
from pyspark.mllib.regression import LabeledPoint

# Wrap each (label id, TF-IDF vector) pair as a LabeledPoint.
inputData = featureAndLabelData.map(lambda pair: LabeledPoint(*pair))

In [None]:
# Seeded three-way split of the labelled points.
# NOTE(review): the weights sum to 1.1, not 1.0. Presumably Spark normalises
# them, which would make the real proportions differ from the numbers written
# here. Confirm the intended train/test/validation ratio.
training, testing, validation = inputData.randomSplit(weights=[0.9, 0.1, 0.1], seed=5)

# Cache the training split and force its evaluation by counting it.
training.cache()
print(training.count())

In [None]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Fit naive Bayes on the cached training split with smoothing parameter
# lambda_ = 1.0. Prediction and the accuracy computation are done in the
# following cell.
model = NaiveBayes.train(training, lambda_=1.0)

In [None]:
# Score every point in `testing`: (predicted label id, true label id).
predictionAndLabel = testing.map(lambda p: (model.predict(p.features), p.label))

# Accuracy = fraction of pairs whose prediction equals the true label.
numCorrect = predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count()
accuracy = float(numCorrect) / testing.count()
print(accuracy)