In [1]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setMaster("local[*]").setAppName("Naive_Bayes")
sc   = SparkContext(conf=conf)
print ("Running Spark Version %s" % (sc.version))

Running Spark Version 2.2.0


In [66]:
path = "mini_newsgroups/*"
newsgroupsRawData = sc.wholeTextFiles(path)

In [67]:
type(newsgroupsRawData)

pyspark.rdd.RDD

In [68]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rec').getOrCreate()

In [69]:
dfWithoutSchema = spark.createDataFrame(newsgroupsRawData)

In [70]:
dfWithoutSchema.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [72]:
dfWithoutSchema = dfWithoutSchema.selectExpr("_1 as str_label", "_2 as text")

In [73]:
from pyspark.sql.functions import split
dfWithoutSchema = dfWithoutSchema.withColumn('str_label',split('str_label','/')[4])

In [74]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="str_label", outputCol="label")
indexed_df = indexer.fit(dfWithoutSchema).transform(dfWithoutSchema)

In [75]:
indexed_df.count()

2000

In [76]:
(training_df,test_df) = indexed_df.randomSplit([0.8, 0.2], seed = 42)

In [77]:
training_df.show()

+-----------+--------------------+-----+
|  str_label|                text|label|
+-----------+--------------------+-----+
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
+-----------+--------------------+-----+
only showing top

In [78]:
training_df.head()

Row(str_label='alt.atheism', text='Newsgroups: alt.atheism\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!pacific.mps.ohio-state.edu!zaphod.mps.ohio-state.edu!howland.reston.ans.net!bogus.sura.net!darwin.sura.net!jhunix.hcf.jhu.edu!news.cs.jhu.edu!jyusenkyou!arromdee\nFrom: arromdee@jyusenkyou.cs.jhu.edu (Ken Arromdee)\nSubject: Re: Alt.Atheism FAQ: Constructing a Logical Argument\nMessage-ID: <C5uErx.HM@blaze.cs.jhu.edu>\nSender: news@blaze.cs.jhu.edu (Usenet news system)\nOrganization: Johns Hopkins University CS Dept.\nReferences: <19930419105214@mantis.co.uk>\nDate: Wed, 21 Apr 1993 16:50:20 GMT\nLines: 27\n\nHere\'s a suggestion for the logical argument FAQ.  I don\'t think it\'s covered,\nthough the fallacy probably has a better name than the one I used:  How about\nit, mathew?\n\nINCONSISTENCY AND COUNTEREXAMPLE\n\nThis occurs when one party points out that some source of information takes\nstand A, which is incon

In [79]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.classification import NaiveBayes
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[tokenizer,remover, hashingTF, nb])

# Fit the pipeline to training documents.
model = pipeline.fit(indexed_df)

In [80]:
predictions = model.transform(test_df)

In [81]:
predictions.select("text","prediction").show()

+--------------------+----------+
|                text|prediction|
+--------------------+----------+
|Newsgroups: alt.a...|       4.0|
|Newsgroups: alt.a...|       4.0|
|Newsgroups: alt.a...|       4.0|
|Newsgroups: alt.a...|       4.0|
|Newsgroups: alt.a...|       4.0|
|Path: cantaloupe....|       4.0|
|Path: cantaloupe....|       4.0|
|Path: cantaloupe....|       4.0|
|Path: cantaloupe....|       4.0|
|Path: cantaloupe....|       4.0|
|Path: cantaloupe....|       4.0|
|Xref: cantaloupe....|       4.0|
|Xref: cantaloupe....|       4.0|
|Xref: cantaloupe....|       4.0|
|Xref: cantaloupe....|       4.0|
|Newsgroups: comp....|       2.0|
|Path: cantaloupe....|       6.0|
|Path: cantaloupe....|       2.0|
|Path: cantaloupe....|       2.0|
|Path: cantaloupe....|       2.0|
+--------------------+----------+
only showing top 20 rows



In [82]:
from pyspark.ml.evaluation import RegressionEvaluator
# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="label")
error = reg_eval.evaluate(predictions)

In [83]:
error

2.4029714614743543

In [86]:
indexed_df.show()

+-----------+--------------------+-----+
|  str_label|                text|label|
+-----------+--------------------+-----+
|alt.atheism|Xref: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Xref: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Xref: cantaloupe....|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
|alt.atheism|Newsgroups: alt.a...|  4.0|
|alt.atheism|Path: cantaloupe....|  4.0|
+-----------+--------------------+-----+
only showing top

In [95]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.

#labelIndexer = StringIndexer(inputCol="str_label", outputCol="indexedLabel").fit(indexed_df)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
featureIndexer = VectorIndexer(inputCol=hashingTF.getOutputCol(), outputCol="features_", maxCategories=4)

gbt = GBTClassifier(labelCol="label", featuresCol="features_", maxIter=10)
pipeline2 = Pipeline(stages=[tokenizer,remover, hashingTF, featureIndexer, gbt])
model2 = pipeline2.fit(indexed_df)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61461)
Traceback (most recent call last):
  File "E:\Anaconda\Anaconda4.0\envs\py36\lib\site-packages\py4j\java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Anaconda\Anaconda4.0\envs\py36\lib\site-packages\py4j\java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝，无法连接。


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:61461)