In [32]:
import org.apache.spark.sql.SparkSession
// Create (or reuse) a Spark session running locally on all available cores.
val spark = (SparkSession.builder()
  .master("local[*]")
  .appName("spark session example")
  .getOrCreate())

In [33]:
import spark.implicits._

In [36]:
// One parsed input row: record id, numeric class label, and the raw review text.
case class DataRecord(
  id: String,
  label: Double,
  text: String
)

In [39]:
// Read the raw tab-separated file and convert it into a DataFrame of DataRecord rows.
// Each line is expected to hold: id <TAB> label <TAB> text.
val data = spark.
    sparkContext.
    textFile("labeledTrainData25000.tsv").
    map { row =>
      // Split into at most 3 fields: a tab inside the text stays part of `text`,
      // and a trailing empty text column is kept rather than dropped
      // (without the limit, cols(2) would throw on such a row).
      val cols = row.split("\t", 3)
      DataRecord(cols(0), cols(1).toDouble, cols(2))
    }.toDF()

In [40]:
// Print the first row to check that the file was parsed as expected.
data.show(numRows = 1)

+--------+-----+--------------------+
|      id|label|                text|
+--------+-----+--------------------+
|"5814_8"|  1.0|"With all this st...|
+--------+-----+--------------------+
only showing top 1 row



In [1]:
// Tokenization: split the "text" column into words.
import org.apache.spark.ml.feature.RegexTokenizer
// The pattern marks every non-word character or digit as a separator.
val regexTokenizer = (new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("word")
  .setPattern("(\\W|\\d)"))

In [42]:
// Stop-word removal: filter the tokens in "word" into the "stopWord" column.
import org.apache.spark.ml.feature.StopWordsRemover
val remover = (new StopWordsRemover()
  .setInputCol("word")
  .setOutputCol("stopWord"))

In [43]:
// Count vector (term-frequency) model built from the filtered tokens.
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// Tunables: vocabulary is capped at the 200 most frequent terms,
// and a term must reach a minimum document frequency of 1 to be kept.
val tf = (new CountVectorizer()
  .setInputCol("stopWord")
  .setOutputCol("cv_vector")
  .setVocabSize(200)
  .setMinDF(1))

In [44]:
// PCA dimensionality reduction: project the count vectors onto 5 principal components.
import org.apache.spark.ml.feature.PCA
val pca = (new PCA()
  .setInputCol("cv_vector")
  .setOutputCol("pca_vector")
  .setK(5))

In [45]:
// Classifier of choice: a multilayer perceptron (MLP).
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier

// Layer sizes: 5 inputs (same as the PCA k above), hidden layers of 75 and 50, 2 outputs.
val layers = Array(5, 75, 50, 2)
val trainer = (new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setLabelCol("label")
  .setFeaturesCol("pca_vector")
  .setPredictionCol("prediction")
  .setBlockSize(128)
  .setSeed(1234L)
  .setMaxIter(500))

In [46]:
// Chain all stages into one pipeline: tokenize -> remove stop words -> count vectors -> PCA -> MLP.
import org.apache.spark.ml.Pipeline
val pipeline = (new Pipeline()
  .setStages(Array(regexTokenizer, remover, tf, pca, trainer)))

In [47]:
// Randomly split the data into a training set (70%) and a test set (30%).
// A fixed seed keeps the split repeatable between runs, so the reported
// accuracy can be compared across experiments.
val Array(training, test) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

In [48]:
// Fit every pipeline stage on the training set.
val mlp_model = pipeline.fit(training)

In [49]:

// Run the fitted pipeline on the test set to obtain predictions.
val testResult = mlp_model.transform(test)

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// Only the predicted and the true label are needed for scoring.
val predictionAndLabels = testResult.select("prediction", "label")
// Evaluator configured to report accuracy.
val evaluator = (new MulticlassClassificationEvaluator()
  .setMetricName("accuracy"))

In [50]:
// Report the accuracy measured on the test-set predictions.
println(s"Accuracy:${evaluator.evaluate(predictionAndLabels)}")

Accuracy:0.5682641965364479


In [51]:
// Display the first 20 rows (the default of show()) with all intermediate pipeline columns.
testResult.show(numRows = 20)

+----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        id|label|                text|                word|            stopWord|           cv_vector|          pca_vector|prediction|
+----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     "0_9"|  1.0|"Bromwell High is...|[bromwell, high, ...|[bromwell, high, ...|(200,[3,6,11,13,2...|[-0.3557184500977...|       1.0|
| "10002_1"|  0.0|"Sorry everyone,,...|[sorry, everyone,...|[sorry, everyone,...|(200,[2,6,9,10,12...|[-0.8093691670658...|       0.0|
| "10004_8"|  1.0|"This isn't the c...|[this, isn, t, th...|[comedic, robin, ...|(200,[0,1,4,9,13,...|[-10.125619279929...|       0.0|
| "10005_3"|  0.0|"The second attem...|[the, second, att...|[second, attempt,...|(200,[0,1,2,3,4,8...|[-5.1893160399363...|       0.0|
| "10006_4"|  0.0|"I don't know who...|[i, don, t, know

# 改成完整的資料集 davinci_movie_reviews.txt

In [10]:
import org.apache.spark.sql.SparkSession
// Create (or reuse) a Spark session running locally on all available cores.
val spark = (SparkSession.builder()
  .master("local[*]")
  .appName("spark session example")
  .getOrCreate())

import spark.implicits._

// One parsed input row: numeric class label and the raw review text.
case class DataRecord(
  label: Double,
  text: String
)

// Read the raw tab-separated file and convert it into a DataFrame of DataRecord rows.
// Each line is expected to hold: label <TAB> text.
val data = spark.
    sparkContext.
    textFile("./davinci_movie_reviews.txt").
    map { row =>
      // Split into at most 2 fields: a tab inside the text stays part of `text`,
      // and a trailing empty text column is kept rather than dropped
      // (without the limit, cols(1) would throw on such a row).
      val cols = row.split("\t", 2)
      DataRecord(cols(0).toDouble, cols(1))
    }.toDF()
// End of file loading.

// Tokenization: split the "text" column into words.
import org.apache.spark.ml.feature.RegexTokenizer
// The pattern marks every non-word character or digit as a separator.
val regexTokenizer = (new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("word")
  .setPattern("(\\W|\\d)"))

// Stop-word removal: filter the tokens in "word" into the "stopWord" column.
import org.apache.spark.ml.feature.StopWordsRemover
val remover = (new StopWordsRemover()
  .setInputCol("word")
  .setOutputCol("stopWord"))

// Count vector (term-frequency) model built from the filtered tokens.
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// Tunables: vocabulary is capped at the 6000 most frequent terms (kept at or below
// 6000 because a larger vocabulary ran out of memory), and a term must reach
// a minimum document frequency of 1 to be kept.
val tf = (new CountVectorizer()
  .setInputCol("stopWord")
  .setOutputCol("cv_vector")
  .setVocabSize(6000)
  .setMinDF(1))

// PCA dimensionality reduction: project the count vectors onto 200 principal components.
import org.apache.spark.ml.feature.PCA
val pca = (new PCA()
  .setInputCol("cv_vector")
  .setOutputCol("pca_vector")
  .setK(200))

// Classifier of choice: a multilayer perceptron (MLP).
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier

// Layer sizes: 200 inputs (same as the PCA k above), hidden layers of 60 and 50, 2 outputs.
val layers = Array(200, 60, 50, 2)
val trainer = (new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setLabelCol("label")
  .setFeaturesCol("pca_vector")
  .setPredictionCol("prediction")
  .setBlockSize(128)
  .setSeed(1234L)
  .setMaxIter(500))

// Chain all stages into one pipeline: tokenize -> remove stop words -> count vectors -> PCA -> MLP.
import org.apache.spark.ml.Pipeline
val pipeline = (new Pipeline()
  .setStages(Array(regexTokenizer, remover, tf, pca, trainer)))

// Randomly split the data into a training set (80%) and a test set (20%).
// A fixed seed keeps the split repeatable between runs, so the reported
// accuracy can be compared across experiments.
val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 1234L)

// Disabled alternative, kept for reference: train and test on the full data set.
//val training = data
//val test = data

// Fit every pipeline stage on the training set.
val mlp_model = pipeline.fit(training)

// Predict the labels of the test set with the fitted pipeline.
val testResult = mlp_model.transform(test)


// Accuracy computation.
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// Only the predicted and the true label are needed for scoring.
val predictionAndLabels = testResult.select("prediction", "label")
// Evaluator configured to report accuracy.
val evaluator = (new MulticlassClassificationEvaluator()
  .setMetricName("accuracy"))

In [11]:
// Report the accuracy measured on the test-set predictions.
println(s"Accuracy:${evaluator.evaluate(predictionAndLabels)}")

Accuracy:0.9916955017301038


In [12]:
// Display the first two prediction rows with all intermediate pipeline columns.
testResult.show(numRows = 2)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|                text|                word|            stopWord|           cv_vector|          pca_vector|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|" I could have di...|[i, could, have, ...|[could, discussed...|(1748,[5,6,7,11,1...|[-0.5518235524981...|       1.0|
|  1.0|* He deemed me co...|[he, deemed, me, ...|[deemed, cool, li...|(1748,[0,1,11,95]...|[-0.8196292397423...|       1.0|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 2 rows

