In [1]:
// Environment sanity check: report the Spark version of the active session.
{
  val sparkVersion = spark.version
  println(s"Current spark version is $sparkVersion")
}

Current spark version is 2.4.4


In [2]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Column layout of the header-less CSV, listed in file order.
// Every field keeps the StructField defaults (nullable, no metadata).
val dataSchema = StructType(Seq(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

// Location of the training CSV inside the notebook container.
val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Load the CSV with the explicit schema and keep only what the model needs:
// a binary `label` (1 when target = 4, otherwise 0) and the tweet text.
val raw_sentiment = spark.read
  .schema(dataSchema)
  .option("header", false)
  .format("csv")
  .load(dataPath)
  .selectExpr("(case when target=4 then 1 else 0 end) as label", "tweet")

// Show how many rows fall under each label value.
raw_sentiment.groupBy("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+



dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [label: int, tweet: string]


[label: int, tweet: string]

In [3]:
// Preview five tweets for each label (0 first, then 1), truncating cells at 150 characters.
Seq(0, 1).foreach { label =>
  raw_sentiment.where(s"label=$label").show(5, 150)
}

+-----+-------------------------------------------------------------------------------------------------------------------+
|label|                                                                                                              tweet|
+-----+-------------------------------------------------------------------------------------------------------------------+
|    0|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|    0|    is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!|
|    0|                          @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds|
|    0|                                                                    my whole body feels itchy and like its on fire |
|    0|    @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. |
+-----+-

In [4]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Stage 1: turn the raw `tweet` text into a `words` token array.
val tokenizer = new Tokenizer().setInputCol("tweet").setOutputCol("words")

// Stage 2: hash the tokens into a 1000-dimensional term-frequency vector called `features`.
// The input column is taken from the tokenizer so the two stages stay wired together.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Stage 3: logistic regression with 10 iterations and light regularisation.
// No input/label columns are set here, so the estimator relies on its defaults.
val lr = new LogisticRegression().setRegParam(0.001).setMaxIter(10)

// Chain the three stages so they are fitted and applied as a single unit.
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))


tokenizer = tok_382bb66be034
hashingTF = hashingTF_aba39531e215
lr = logreg_7d7c9218d06a
pipeline = pipeline_c88e57c57142


pipeline_c88e57c57142

In [5]:
// Fit all pipeline stages on the full labelled data set; the result is the trained pipeline.
val model: PipelineModel = pipeline.fit(raw_sentiment)

model = pipeline_c88e57c57142


pipeline_c88e57c57142

In [6]:
// Persist the fitted pipeline to disk, replacing any model previously saved at this location.
{
  val modelDir = "/home/jovyan/models/spark-ml-model"
  model.write.overwrite().save(modelDir)
}

In [7]:
// Read the persisted pipeline back from disk to confirm that the saved copy is usable.
val sameModel: PipelineModel = PipelineModel.read.load("/home/jovyan/models/spark-ml-model")

sameModel = pipeline_c88e57c57142


pipeline_c88e57c57142

In [8]:
// Score the data set with the reloaded pipeline; this appends the intermediate and
// prediction columns produced by each stage to the original `label` / `tweet` columns.
val predictionsDF = sameModel.transform(raw_sentiment)

// Print the first 20 rows with long cell values truncated (the defaults of `show()`).
predictionsDF.show(20, truncate = true)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|               words|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|[-0.9010125659403...|[0.28884245921783...|       1.0|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|[1.84195706807745...|[0.86318000204742...|       0.0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|[1.56488554961140...|[0.82705328017345...|       0.0|
|    0|my whole body fee...|[my, whole, body,...|(1000,[82,191,296...|[0.22286270195627...|[0.55548620895353...|       0.0|
|    0|@nationwideclass ...|[@nationwideclass...|(1000,[18,96,130,...|[3.23587893775227...|[0.96216236372478...|       0.0|
|    0|@

predictionsDF = [label: int, tweet: string ... 5 more fields]


[label: int, tweet: string ... 5 more fields]

In [9]:
// Export a small sample of predictions (up to 15 rows per label) as a single CSV part file.
{
  val label0Sample = predictionsDF.where("label=0").limit(15)
  val label1Sample = predictionsDF.where("label=1").limit(15)

  // Non-scalar columns are cast to strings before writing
  // (presumably because the CSV writer cannot serialise array/vector columns — confirm).
  val exportColumns = Seq(
    "label",
    "tweet",
    "cast (words as string)",
    "cast (features as string)",
    "cast (rawPrediction as string)",
    "cast (probability as string)",
    "prediction"
  )

  label0Sample.union(label1Sample)
    .selectExpr(exportColumns: _*)
    .coalesce(1) // collapse to one partition so only one part file is written
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/home/jovyan/work/predictionsDF.csv")
}

In [10]:
import org.apache.spark.sql.functions._

// UDF that renders a probability vector as a human-readable string.
//
// Each component is rounded to two decimals (HALF_UP) and the components are joined
// with " / ", e.g. a two-class vector (0.2888, 0.7112) becomes "0.29 / 0.71".
// All components of the vector are formatted, so the UDF is not limited to binary
// models; for a two-element vector the output is identical to formatting indices 0 and 1.
val getProbability =
  udf(
    (probability: org.apache.spark.ml.linalg.Vector) =>
      probability.toArray
        .map(p => BigDecimal(p).setScale(2, BigDecimal.RoundingMode.HALF_UP).toString)
        .mkString(" / ")
  )

// Show the formatted probabilities next to the true label for up to 10 rows of each label.
predictionsDF.where("label=0").limit(10).union(predictionsDF.where("label=1").limit(10))
  .select(getProbability($"probability").alias("clean_probability_0_1"), $"label")
  .show(20, 100)

+---------------------+-----+
|clean_probability_0_1|label|
+---------------------+-----+
|          0.29 / 0.71|    0|
|          0.86 / 0.14|    0|
|          0.83 / 0.17|    0|
|          0.56 / 0.44|    0|
|          0.96 / 0.04|    0|
|          0.58 / 0.42|    0|
|          0.52 / 0.48|    0|
|          0.09 / 0.91|    0|
|          0.70 / 0.30|    0|
|          0.47 / 0.53|    0|
|          0.19 / 0.81|    1|
|          0.69 / 0.31|    1|
|          0.12 / 0.88|    1|
|          0.88 / 0.12|    1|
|          0.48 / 0.52|    1|
|          0.22 / 0.78|    1|
|          0.42 / 0.58|    1|
|          0.06 / 0.94|    1|
|          0.16 / 0.84|    1|
|          0.54 / 0.46|    1|
+---------------------+-----+



getProbability = UserDefinedFunction(<function1>,StringType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,StringType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

# Time 1:24:00