In [1]:
import org.apache.spark.sql.types.{StructType, StringType, IntegerType}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame


In [2]:
// Print the Spark version of the active session.
// NOTE(review): `spark` is not defined in any visible cell — presumably injected by the notebook kernel; confirm.
println(s"Current spark version is ${spark.version}")

Current spark version is 2.4.4


In [12]:
// Load the Spark ML PipelineModel stored under modelPath.
// `model` is later applied to every streaming micro-batch via model.transform.
val modelPath = "/home/jovyan/models/spark-ml-model"
val model = PipelineModel.load(modelPath)

modelPath = /home/jovyan/models/spark-ml-model
model = pipeline_e3869da6fdc9


pipeline_e3869da6fdc9

In [3]:
// Directory that is read as a stream of JSON event files.
val inputStreamPath = "/home/jovyan/work/events-stream"
// NOTE(review): modelPath is also defined in the model-loading cell and is not used in this cell.
val modelPath = "/home/jovyan/models/spark-ml-model"

// Explicit schema of one incoming JSON record; every field is nullable.
val dataSchema = new StructType()
    .add("tweet", StringType, nullable = true)
    .add("hiddentargetclue", IntegerType, nullable = true)
    .add("arrived_key", StringType, nullable = true)

// Streaming DataFrame over the JSON files in inputStreamPath,
// with maxFilesPerTrigger set to 1 (one new file per trigger).
val inputDF = spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1L)
    .json(inputStreamPath)

inputStreamPath = /home/jovyan/work/events-stream
modelPath = /home/jovyan/models/spark-ml-model
dataSchema = StructType(StructField(tweet,StringType,true), StructField(hiddentargetclue,IntegerType,true), StructField(arrived_key,StringType,true))
inputDF = [tweet: string, hiddentargetclue: int ... 1 more field]


[tweet: string, hiddentargetclue: int ... 1 more field]

In [None]:
// UDF that extracts a single element from an ML probability vector.
//   probabilities - vector to read from (this notebook passes the "probability" column)
//   pos           - zero-based index of the element to return (0 or 1 for a binary classifier)
// Returns the Double stored at `pos`.
// `pos` is a Scala Int rather than java.lang.Integer: this avoids the implicit unboxing
// of a boxed value, which throws NullPointerException when the box is null.
val getProbability =
    udf(
        (probabilities: org.apache.spark.ml.linalg.Vector, pos: Int) =>
            probabilities(pos)
    )

// Streaming query that scores every micro-batch with the loaded model and prints the result.
// For each batch it shows arrived_key, tweet and the probability of the tweet being negative
// (up to 100 rows, cell values truncated to 50 characters).
// NOTE: the assignment states that the negative probability is the last column of the
// probability vector, but here it appears to be the first one (position 0).
// Errors inside a batch are reported on a single line instead of terminating the query.
val stream = inputDF.writeStream.foreachBatch {
    (batchDF: DataFrame, batchId: Long) => {
        // Local import keeps this cell self-contained.
        import scala.util.control.NonFatal
        try {
            // Apply the model and project the columns of interest from the predictions.
            model.transform(batchDF)
                .select(
                    $"arrived_key",
                    $"tweet",
                    // $"hiddentargetclue",
                    getProbability($"probability", lit(0)).alias("Negative Probability")
                )
                .show(100, 50)
        } catch {
            // Only non-fatal errors are handled; OutOfMemoryError, InterruptedException
            // and similar fatal conditions must propagate.
            case NonFatal(e) =>
                // getMessage may be null; fall back to the exception's string form.
                val message = Option(e.getMessage).getOrElse(e.toString)
                // Collapse the message to one line and finish with a carriage return (char 13).
                print(message.replaceAll("\n", " "))
                print(13.toChar)
        }
    }
}.start()

+-------------------------------+--------------------------------------------------+--------------------+
|                    arrived_key|                                             tweet|Negative Probability|
+-------------------------------+--------------------------------------------------+--------------------+
|2020-01-26 20:40:46.799 - 44975|@vegnotes I had a veggie burger earlier that di...|  0.5238818808632117|
|2020-01-26 20:40:46.799 - 70139|Just back from paintballing. Too sore to move now |  0.5065995378974235|
|2020-01-26 20:40:46.799 - 93063|@scoutercleary I just figured y'all made approp...|  0.5238808338292043|
|2020-01-26 20:40:46.799 - 97039|@2SistersGallery Well, I am sincere! Glad I cou...|  0.5536874946050802|
|2020-01-26 20:40:46.799 - 10245|       @PBoogie The countdown has begun! FINALLY! |   0.483722604995963|
|2020-01-26 20:40:46.799 - 11149|would like to take some pictures now..but my ca...|  0.5371339440653956|
|2020-01-26 20:40:46.799 - 11157|To achieve gr

In [None]:
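// TODO: sliding-window statistics (10-second window, 1-second slide) are not implemented yet.
// NOTE: the sketch below does not compile — DataStreamWriter has no `window` method.
// Windowing is done on the DataFrame instead, e.g. groupBy(window(<timestamp column>, "10 seconds", "1 second")),
// which requires a timestamp column (presumably to be derived from arrived_key — confirm).
//val window = inputDF.writeStream.window(seconds(10), seconds(1)) {
    
//}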

In [84]:
// Stop the streaming query started by the foreachBatch cell.
stream.stop()

- Comment all notebooks and remove everything that is not needed
- Попробуйте выводить статистику по количеству "негативных" и "позитивных" твитов за последние 10 секунд скользящим окном.