In [None]:
// Sanity check: report which Spark release this notebook session is running against.
Console.println(s"Current spark version is ${spark.version}")

In [None]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Explicit column layout for the headerless CSV (all columns nullable).
// Only `target` and `tweet` are used by the projection below.
val dataSchema = StructType(Seq(
    StructField("target", IntegerType),
    StructField("id", LongType),
    StructField("raw_timestamp", StringType),
    StructField("query_status", StringType),
    StructField("author", StringType),
    StructField("tweet", StringType)
))

val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Load the CSV and derive the three columns the rest of the notebook works with:
//   tweet            - the raw tweet text
//   hiddentargetclue - 1 when target = 4, otherwise 0
//   arrived_key      - "<current timestamp> - <zero-padded row number>"
//
// The row number is padded to 7 digits: Spark's lpad shortens any input longer than the
// requested length, so a 5-digit pad would cut off every row number above 99999 and
// produce colliding keys (e.g. row 100000 would become "10000").
// NOTE(review): 7 digits is sized from the "1600000" in the file name - widen it if the
// input can hold 10 million rows or more.
val raw_sentiment = spark.read
    .format("csv")
    .option("header", false)
    .schema(dataSchema)
    .load(dataPath)
    .selectExpr(
        "tweet",
        "(case when target=4 then 1 else 0 end) as hiddentargetclue",
        "current_timestamp||' - '||lpad((row_number() over (order  by null)),7,'0') as arrived_key"
    )

// count() is an action: it triggers a read of the file.
println(s"Total tweets in file: ${raw_sentiment.count()}")

In [None]:
// Show the largest arrived_key, reset the terminal, then show it again.
// show(1, 100): at most one row, cell text truncated at 100 characters.
raw_sentiment.selectExpr("max(arrived_key) as arr").show(1, 100)
// ESC followed by 'c'. Written as a unicode escape because octal escapes such as "\033"
// are deprecated in Scala 2.11/2.12 and rejected by newer compilers; the emitted bytes
// are identical.
println("\u001bc")
raw_sentiment.selectExpr("max(arrived_key) as arr").show(1, 100)

In [None]:
import java.util.{Calendar, Timer, TimerTask}

import scala.util.control.NonFatal

// Directory that receives one small JSON batch per timer tick.
val outputStreamPath = "/home/jovyan/work/events-stream"
// The first batch overwrites the destination (initially empty dir); later ones append.
var writeMode = "overwrite"

// Running totals, updated by the timer thread after each successful batch.
var fRuns: Long = 0
var fTweets: Long = 0

val timer = new Timer()

// Periodic job: draw a tiny random sample (with replacement) of raw_sentiment, write it
// as a single JSON part file under outputStreamPath, and update the running totals.
val task = new TimerTask {
  def run(): Unit = {
    // java.util.Timer shuts down for good when a task lets an exception escape, so a
    // failed batch is reported and skipped instead of silently ending the stream.
    try {
      // Persist the sample so the write and the count below share one computation
      // rather than each re-running the whole upstream pipeline.
      val data = raw_sentiment.sample(fraction = 0.00001, withReplacement = true).persist()
      try {
        data.coalesce(1).write.format("json").mode(writeMode).save(outputStreamPath)
        writeMode = "append" // switched only once the first save has succeeded
        fRuns += 1
        fTweets += data.count()
      } finally {
        data.unpersist() // release the cached sample whether or not the batch succeeded
      }
      // 13.toChar is a carriage return: the status line is rewritten in place.
      print(s"${Calendar.getInstance().toInstant} - saved to the events stream $fTweets tweets $fRuns times"+13.toChar)
    } catch {
      case NonFatal(e) =>
        println(s"${Calendar.getInstance().toInstant} - events stream batch failed: ${e.getMessage}")
    }
  }
}

println("Streaming started!")

// First run after 100 ms, then fixed-delay: 100 ms after the previous run finishes.
timer.schedule(task, 100L, 100L)

In [None]:
// Stop producing events. Cancelling only the task would leave the Timer's background
// thread alive for the rest of the session, so the timer itself is shut down as well.
// To resume streaming, re-run the cell that creates `timer` and `task`.
task.cancel()
timer.cancel()
println("Streaming stopped!")

In [None]:
// Evaluate the current local date-time of the JVM running this notebook; as the cell's
// last expression its value is echoed in the cell output.
java.time.LocalDateTime.now()