In [53]:
// Print the version reported by the `spark` session this notebook is attached to.
println(s"Current spark version is ${spark.version}")

Current spark version is 2.4.4


In [54]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Column layout of the header-less CSV, in file order. Only `target` and `tweet`
// are used below; the remaining columns are parsed and then dropped.
val dataSchema = StructType(Seq(
    StructField("target", IntegerType),
    StructField("id", LongType),
    StructField("raw_timestamp", StringType),
    StructField("query_status", StringType),
    StructField("author", StringType),
    StructField("tweet", StringType)
))

val dataPath= "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Parse the file with the explicit schema, then keep a binary label
// (target 4 -> 1, anything else -> 0) and the whitespace-trimmed tweet text.
val raw_sentiment = spark.read
    .schema(dataSchema)
    .option("header", "false")
    .csv(dataPath)
    .selectExpr("(case when target=4 then 1 else 0 end) as label","trim(tweet) as tweet")

// Sanity checks: class balance first, then five sample tweets for each label.
raw_sentiment.groupBy("label").count().show()
Seq("label=0", "label=1").foreach(condition => raw_sentiment.where(condition).show(5, 150))

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+

+-----+-------------------------------------------------------------------------------------------------------------------+
|label|                                                                                                              tweet|
+-----+-------------------------------------------------------------------------------------------------------------------+
|    0|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|    0|    is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!|
|    0|                          @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds|
|    0|                                                                     my whole body feels itchy and like its on fire|
|    0|     @nationwideclass no, it's not

dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [label: int, tweet: string]


[label: int, tweet: string]

In [55]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

// Stage 1: split each tweet into words.
val tokenizer = new Tokenizer()
  .setOutputCol("words")
  .setInputCol("tweet")

// Stage 2: hash the words into a term-frequency vector with 1000 slots.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Stage 3: index the label column, adding metadata to it.
// Fitted immediately on the whole dataset so that every label is in the index
// and the label list is available to labelConverter below.
val labelIndexer = new StringIndexer()
  .setOutputCol("indexedLabel")
  .setInputCol("label")
  .fit(raw_sentiment)

// Stage 4: detect categorical features and index them; features with more than
// 4 distinct values are treated as continuous.
// Not fitted here (raw_sentiment has no "features" column yet) - the pipeline fits it.
val featureIndexer = new VectorIndexer()
  .setMaxCategories(4)
  .setInputCol(hashingTF.getOutputCol)
  .setOutputCol("indexedFeatures")

// Stage 5: random forest of 10 trees trained on the indexed label/feature columns.
val rf = new RandomForestClassifier()
  .setNumTrees(10)
  .setLabelCol(labelIndexer.getOutputCol)
  .setFeaturesCol(featureIndexer.getOutputCol)

// Stage 6: map the predicted label index back to the original label value.
val labelConverter = new IndexToString()
  .setLabels(labelIndexer.labels)
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")

// Chain all stages, in execution order, into a single Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(
    tokenizer,
    hashingTF,
    labelIndexer,
    featureIndexer,
    rf,
    labelConverter
  ))

tokenizer = tok_c5f7dc22a611
hashingTF = hashingTF_2882650a5058
labelIndexer = strIdx_e875a8015380
featureIndexer = vecIdx_561eaf79084a
rf = rfc_31d0ae4bc60c


labelConverter: org.apache.spark.ml.feature.Ind...


rfc_31d0ae4bc60c

In [None]:
// Split the data into training and test sets (30% held out for testing).
// val Array(trainingData, testData) = raw_sentiment.randomSplit(Array(0.7, 0.3))

In [56]:
// Train the model by fitting the whole pipeline; this also runs the indexers.
// Training uses all of raw_sentiment - the randomSplit cell above is commented out,
// so no rows are held out.
val model = pipeline.fit(raw_sentiment)

model = pipeline_1d3eb1631313


pipeline_1d3eb1631313

In [57]:
// Location the fitted pipeline is saved to and reloaded from in the following cells.
val modelPath = "/home/jovyan/models/spark-ml-model"

modelPath = /home/jovyan/models/spark-ml-model


/home/jovyan/models/spark-ml-model

In [58]:
// Persist the fitted pipeline at modelPath, overwriting a previously saved model if present.
model.write.overwrite().save(modelPath)

In [59]:
// Reload the persisted pipeline from modelPath; the predictions below use this reloaded copy.
val sameModel = PipelineModel.load(modelPath)

sameModel = pipeline_1d3eb1631313


pipeline_1d3eb1631313

In [63]:
// Make predictions with the reloaded pipeline on the full dataset.
// NOTE: this is the same data the model was fitted on (the randomSplit cell is
// commented out), not a held-out 30% test set, so any metric computed from
// predictionsDF is a training-set figure.
val predictionsDF = sameModel.transform(raw_sentiment)

predictionsDF = [label: int, tweet: string ... 8 more fields]


[label: int, tweet: string ... 8 more fields]

In [66]:
// Peek at the first 20 scored rows: the input tweet, the intermediate words/features
// columns, and the classifier outputs (rawPrediction, probability, prediction).
predictionsDF.select("tweet", "words", "features", "rawPrediction", "probability", "prediction").show(20)
// Alternative kept for reference: inspect only the tweets containing "Need a hug".
//predictionsDF.where("tweet like '%Need a hug%'").select("tweet", "words", "features").show(20,70)

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               tweet|               words|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|[4.43931449741300...|[0.44393144974130...|       1.0|
|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|[5.46349235938573...|[0.54634923593857...|       0.0|
|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|[5.60297627210827...|[0.56029762721082...|       0.0|
|my whole body fee...|[my, whole, body,...|(1000,[82,191,296...|[5.50443548555005...|[0.55044354855500...|       0.0|
|@nationwideclass ...|[@nationwideclass...|(1000,[18,96,130,...|[5.00477983480453...|[0.50047798348045...|       0.0|
|@Kwesidei not the...|[@kwesidei, not, ...|(1000,[18,223

In [None]:
// Some examples from doc...

/*
// Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictionsDF)
println(s"Test Error = ${(1.0 - accuracy)}")

val rfModel = model.stages(4).asInstanceOf[RandomForestClassificationModel] // rf is stage index 4 in this pipeline; index 2 is the label indexer
println(s"Learned classification forest model:\n ${rfModel.toDebugString}")
*/

In [68]:
// Display 10 example rows per true label (label 0 first, then label 1),
// showing the predicted label next to the actual one.
Seq("label=0", "label=1")
  .map(condition => predictionsDF.where(condition).limit(10))
  .reduce(_.union(_))
  .select("predictedLabel", "label", "tweet", "words", "features")
  .show(20, 40)

+--------------+-----+----------------------------------------+----------------------------------------+----------------------------------------+
|predictedLabel|label|                                   tweet|                                   words|                                features|
+--------------+-----+----------------------------------------+----------------------------------------+----------------------------------------+
|             1|    0|@switchfoot http://twitpic.com/2y1zl ...|[@switchfoot, http://twitpic.com/2y1z...|(1000,[7,14,21,54,91,170,220,246,311,...|
|             0|    0|is upset that he can't update his Fac...|[is, upset, that, he, can't, update, ...|(1000,[170,193,223,248,281,333,343,37...|
|             0|    0|@Kenichan I dived many times for the ...|[@kenichan, i, dived, many, times, fo...|(1000,[10,36,77,188,207,248,329,338,3...|
|             0|    0|my whole body feels itchy and like it...|[my, whole, body, feels, itchy, and, ...|(1000,[82,191,296,33

In [None]:
// Export 15 scored rows per label (label 0 first, then label 1) as CSV with a header row.
// coalesce(1) funnels the result into a single partition; the words/features/
// rawPrediction/probability columns are cast to string before writing
// (presumably because the CSV writer cannot store array/vector columns - confirm).
Seq("label=0", "label=1")
   .map(condition => predictionsDF.where(condition).limit(15))
   .reduce(_.union(_))
   .selectExpr(
       "label",
       "tweet",
       "cast (words as string)",
       "cast (features as string)",
       "cast (rawPrediction as string)",
       "cast (probability as string)",
       "prediction"
   )
   .coalesce(1)
   .write
   .mode("overwrite")
   .option("header", "true")
   .csv("/home/jovyan/work/predictionsDF.csv")

In [None]:
import org.apache.spark.sql.functions._

// UDF that renders a class-probability vector as "p0 / p1 / ...", each entry rounded
// half-up to two decimals (e.g. [0.4439, 0.5561] -> "0.44 / 0.56").
//
// The whole vector is formatted instead of hard-coding indices 0 and 1, so the UDF
// keeps working for any number of classes while producing exactly the same text
// for the binary case. mkString also replaces the `BigDecimal + String` concatenation,
// which only compiled through the implicit any2stringadd conversion.
val getProbability =
    udf(
        (probability: org.apache.spark.ml.linalg.Vector) =>
            probability.toArray
                .map(p => BigDecimal(p).setScale(2, BigDecimal.RoundingMode.HALF_UP))
                .mkString(" / ")
    )

// Show the rounded probabilities next to the true label for 10 rows of each label.
predictionsDF.where("label=0").limit(10).union(predictionsDF.where("label=1").limit(10))
    .select(getProbability($"probability").alias("clean_probability_0_1"),$"label").show(20,100)