In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [2]:
from pyspark.sql import Row  
from pyspark.sql.types import BooleanType 
from pyspark.sql.functions import udf 

In [3]:
data = spark.sparkContext.\
    textFile("./newlabeledTrainData25000.tsv").\
    map(lambda line: line.split("\t")).\
    map(lambda line_seperated: Row(id = line_seperated[0],label =float(line_seperated[1]),text=line_seperated[2])).\
    toDF()

In [4]:
data.show()

+-------+-----+--------------------+
|     id|label|                text|
+-------+-----+--------------------+
| 5814_8|  1.0|With all this stu...|
| 2381_9|  1.0|"\""The Classic W...|
| 7759_3|  0.0|The film starts w...|
| 3630_4|  0.0|"It must be assum...|
| 9495_8|  1.0|"Superbly trashy ...|
| 8196_8|  1.0|I dont know why p...|
| 7166_2|  0.0|This movie could ...|
|10633_1|  0.0|I watched this vi...|
|  319_1|  0.0|A friend of mine ...|
|8713_10|  1.0|"<br /><br />This...|
| 2486_3|  0.0|What happens when...|
|6811_10|  1.0|Although I genera...|
|11744_9|  1.0|"\""Mr. Harvey Li...|
| 7369_1|  0.0|"I had a feeling ...|
|12081_1|  0.0|"note to George L...|
| 3561_4|  0.0|"Stephen King ada...|
| 4489_1|  0.0|`The Matrix' was ...|
| 3951_2|  0.0|"Ulli Lommel's 19...|
|3304_10|  1.0|"This movie is on...|
|9352_10|  1.0|Most people, espe...|
+-------+-----+--------------------+
only showing top 20 rows



In [5]:
data

DataFrame[id: string, label: double, text: string]

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer

regexTokenizer = RegexTokenizer(inputCol="text", outputCol="word", pattern="\\W")

In [7]:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="word", outputCol="stopWord")

In [26]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="stopWord", outputCol="cv_vector", vocabSize=250, minDF=2.0)

In [27]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="stopWord", outputCol="htf_vector", numFeatures=500)
idf = IDF(inputCol="htf_vector", outputCol="tfidf_vector")

In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [29]:
(training, test) = data.randomSplit([0.8, 0.2])

In [35]:
rf = RandomForestClassifier(labelCol="label", featuresCol="tfidf_vector", numTrees=50)

In [36]:
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, rf])

In [37]:
model = pipeline.fit(training)

In [38]:
predictions = model.transform(test)

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g " % (accuracy))
print("Test Error = %g " % (1.0-accuracy))

accuracy = 0.720897 
Test Error = 0.279103 


# tf-idf ：500 RandomForest 
## tree 30 
## accuracy:0.720897 

In [40]:
from pyspark.ml.feature import Word2Vec

In [41]:
word2Vec = Word2Vec(vectorSize=5, minCount=1, inputCol="stopWord", outputCol="word2vec")

In [47]:
rf1= RandomForestClassifier(labelCol="label", featuresCol="word2vec", numTrees=50)

In [48]:
pipeline1 = Pipeline(stages=[regexTokenizer, remover, word2Vec, rf1 ])

In [49]:
model1= pipeline1.fit(training)

In [50]:
predictions1 = model1.transform(test)

In [51]:
evaluator1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")
accuracy1 = evaluator1.evaluate(predictions1)
print("accuracy = %g " % (accuracy1))
print("Test Error = %g " % (1.0-accuracy1))

accuracy = 0.660736 
Test Error = 0.339264 


# word2vec RandomForest

## vector5   tree 30 accuracy 0.660736 