# Генерирование случайных твитов в поток

In [11]:
// Sanity check: report the version of the Spark runtime backing this session.
locally {
  val sparkVersion = spark.version
  println(s"Current spark version is $sparkVersion")
}

Current spark version is 2.4.4


In [12]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Column layout of the headerless CSV file, in file order. Only `target` and
// `tweet` are referenced by the projection below.
val dataSchema = StructType(Seq(
    StructField("target", IntegerType),
    StructField("id", LongType),
    StructField("raw_timestamp", StringType),
    StructField("query_status", StringType),
    StructField("author", StringType),
    StructField("tweet", StringType)
))

val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Lazily-evaluated view of the tweets: the text, a 0/1 label derived from
// `target` (4 maps to 1, anything else to 0), and a generated timestamp.
val raw_sentiment = {
    val projection = Seq(
        "tweet",
        "(case when target=4 then 1 else 0 end) as hiddentargetclue",
        // Generate a timestamp so that window functions can be exercised when the
        // stream is consumed. Local time in Spark did not come out right despite
        // configuration attempts, so 2 hours are added by hand to show Kyiv time.
        // NOTE(review): a fixed +2h offset ignores daylight saving time; the
        // `spark.sql.session.timeZone` setting may be the proper fix - confirm.
        "current_timestamp + INTERVAL 2 hours as timestamp"
    )

    spark.read
        .schema(dataSchema)
        .option("header", false)
        .csv(dataPath)
        .selectExpr(projection: _*)
}

println(s"Total tweets in file: ${raw_sentiment.count()}")

Total tweets in file: 1600000


dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [tweet: string, hiddentargetclue: int ... 1 more field]


[tweet: string, hiddentargetclue: int ... 1 more field]

In [13]:
import java.util.{Calendar, Timer, TimerTask}

// Directory that receives the generated JSON batches (the simulated event stream).
val outputStreamPath = "/home/jovyan/work/events-stream"
// Save mode for the next batch: "overwrite" for the first write so the destination
// directory starts out clean, then "append" for every later batch.
var writeMode = "overwrite"

// Running totals, updated from the timer thread after each batch.
var fRuns: Long = 0   // number of batches written so far
var fTweets: Long = 0 // number of tweets written so far

val timer = new Timer()

// Periodic job: draw a small random sample of tweets and write it to the stream
// directory as a single JSON file.
val task = new TimerTask {
  def run(): Unit = {
    // An exception escaping run() terminates the java.util.Timer thread and would
    // silently stop the stream, so non-fatal failures are reported and the next
    // tick simply tries again.
    try {
      // Persist the sample so that the write and the count below share a single
      // evaluation instead of scanning the source CSV twice per tick.
      val data = raw_sentiment.sample(fraction = 0.00001, withReplacement = true).persist()
      try {
        data.coalesce(1).write.format("json").mode(writeMode).save(outputStreamPath)
        writeMode = "append" // every batch after the first one is appended
        fRuns += 1
        fTweets += data.count()
      } finally {
        data.unpersist()
      }
      // The trailing carriage return makes each status line overwrite the previous one.
      print(s"${Calendar.getInstance().toInstant} - saved to the events stream $fTweets tweets $fRuns times\r")
    } catch {
      case scala.util.control.NonFatal(e) =>
        System.err.println(s"${Calendar.getInstance().toInstant} - failed to write a batch to the events stream: $e")
    }
  }
}

println("Streaming started!")

// First batch after 1 second, then one batch every second (fixed delay).
timer.schedule(task, 1000L, 1000L)

Streaming started!


outputStreamPath = /home/jovyan/work/events-stream
writeMode = overwrite
fRuns = 0
fTweets = 0
timer = java.util.Timer@6ded5899
task = $anon$1@6cad543e


$anon$1@6cad543e

2020-01-29T15:33:30.477Z - saved to the events stream 492 tweets 31 times

In [14]:
// Stop producing batches: cancel the periodic task, then shut the timer down so
// that its background thread terminates instead of idling for the rest of the
// session. A batch that is currently being written is allowed to finish.
task.cancel()
timer.cancel()
println("Streaming stopped!")

Streaming stopped!
