In [1]:
import org.apache.spark.sql.SparkSession

In [2]:
val spark=SparkSession.builder.
    master("local[*]").
    appName("None").
    getOrCreate()

In [4]:
case class Datarecord(id:String,label:Double,text:String)
import spark.implicits._

In [5]:
val data=spark.sparkContext.
    textFile("./newlabeledTrainData25000.tsv").map({
        x =>
        var line=x.split("\t")
        Datarecord(line(0),line(1).toDouble,line(2))
    }).toDF()

In [6]:
data.show(1)

+------+-----+--------------------+
|    id|label|                text|
+------+-----+--------------------+
|5814_8|  1.0|With all this stu...|
+------+-----+--------------------+
only showing top 1 row



In [8]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF}

In [9]:
val regexTokenizer = new RegexTokenizer().
  setInputCol("text").
  setOutputCol("word").
  setPattern("\\W")

In [10]:
val remover = new StopWordsRemover().
  setInputCol("word").
  setOutputCol("stopWord")

In [11]:
val tf = new CountVectorizer().
      setInputCol("stopWord").
      setOutputCol("cv_vector").
      setVocabSize(300).  
      setMinDF(2) 

In [12]:
val hashingTF = new HashingTF().
    setInputCol("stopWord").
    setOutputCol("htf_vector").
    setNumFeatures(500)

In [13]:
val idf = new IDF().
    setInputCol(hashingTF.getOutputCol).
    setOutputCol("tfidf_vector")

In [14]:
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

In [15]:
val layers = Array[Int](300,15,14, 2)
val trainer = new MultilayerPerceptronClassifier().
  setLayers(layers).
  setLabelCol("label").
  setFeaturesCol("cv_vector").
  setPredictionCol("prediction").
  setBlockSize(128).
  setSeed(1234L).
  setMaxIter(500)

In [16]:
val pipeline = new Pipeline().
      setStages(Array(regexTokenizer, remover, tf, hashingTF, idf, trainer))

In [17]:
val Array(training, test) = data.randomSplit(Array(0.8, 0.2))
val mlp_model = pipeline.fit(training)
val testResult = mlp_model.transform(test)

In [20]:
testResult.show(2)

+----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        id|label|                text|                word|            stopWord|           cv_vector|          htf_vector|        tfidf_vector|prediction|
+----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|""10116_1"|  0.0|"I honestly want ...|[i, honestly, wan...|[honestly, want, ...|(300,[0,1,3,4,14,...|(500,[27,31,44,55...|(500,[27,31,44,55...|       1.0|
|""10205_2"|  0.0|"So, I'm wonderin...|[so, i, m, wonder...|[wondering, watch...|(300,[0,1,2,3,5,7...|(500,[3,20,25,31,...|(500,[3,20,25,31,...|       0.0|
+----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 2 rows



In [21]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
val predictionAndLabels = testResult.select("prediction", "label")
val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy")
println("Accuracy:"+evaluator.evaluate(predictionAndLabels))

Accuracy:0.7499010684606252


#  tf_idf ---MLP

### Layers:300  tf-idf:500

### Accuracy:0.749

In [23]:
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}

In [24]:
val rf = new RandomForestClassifier().
  setLabelCol("label").
  setFeaturesCol("tfidf_vector").
  setNumTrees(40)

In [25]:
val pipeline1 = new Pipeline().
      setStages(Array(regexTokenizer, remover, hashingTF, idf, rf))

In [26]:
val rf_model = pipeline1.fit(training)
val rftestResult = rf_model.transform(test)

In [28]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
val predictionAndLabels1 = rftestResult.select("prediction", "label")
val evaluator1 = new MulticlassClassificationEvaluator().setMetricName("accuracy")
println("Accuracy:"+evaluator1.evaluate(predictionAndLabels1))

Accuracy:0.7130985358132172


# RandomForest

## tree:40 tf-idf:500
##  Accuracy:0.713

In [42]:
val pca_hashingTF30 = new HashingTF().
    setInputCol("stopWord").
    setOutputCol("htf_vector3").
    setNumFeatures(500)

In [43]:
val pca_idf30 = new IDF().
    setInputCol(pca_hashingTF3.getOutputCol).
    setOutputCol("pca_tfidf_vector3")

In [44]:
val pca_rf30 = new RandomForestClassifier().
  setLabelCol("label").
  setFeaturesCol("pca_tfidf_vector3").
  setNumTrees(30)

In [45]:
val pipeline30 = new Pipeline().
      setStages(Array(regexTokenizer, remover, pca_hashingTF30, pca_idf30, pca_rf30))

In [46]:
val pca_rf_model30 = pipeline30.fit(training)
val pca_rftestResult30 = pca_rf_model30.transform(test)

In [48]:
val pca_predictionAndLabels30 = pca_rftestResult30.select("prediction", "label")
val evaluator30 = new MulticlassClassificationEvaluator().setMetricName("accuracy")
println("Accuracy:"+evaluator30.evaluate(pca_predictionAndLabels30))

Accuracy:0.7063711911357341


# PCA-RandomForest

# tf-idf:500 tree:30 accuracy:0.706


In [36]:
import org.apache.spark.ml.feature.Word2Vec
val word2vec5 = new Word2Vec().
  setInputCol("stopWord").
  setOutputCol("w2v_vector5").
  setVectorSize(100).
  setMinCount(1)

In [37]:
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}

In [38]:
val word2vec_rf5 = new RandomForestClassifier().
  setLabelCol("label").
  setFeaturesCol("w2v_vector5").
  setNumTrees(30)

In [39]:
val word2vex_pipeline5 = new Pipeline().
      setStages(Array(regexTokenizer, remover, word2vec5, word2vec_rf5))

In [40]:
val word2vex_rf_model5 = word2vex_pipeline5.fit( training )
val word2vex_testResult5 = word2vex_rf_model5.transform(test)

In [41]:
val word2vex_predictionAndLabels5 = word2vex_testResult5.select("prediction", "label")
val evaluator5 = new MulticlassClassificationEvaluator().setMetricName("accuracy")
println("Accuracy:"+evaluator5.evaluate(word2vex_predictionAndLabels5))

Accuracy:0.7853185595567868


# Word2Vec-RF

# size:100 tree:30 accuracy:0.785