# Spark Structured Streaming

- structured streaming
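- How does structured streaming differ from batch? The input is unbounded: it is treated as a table that keeps growing as new data arrives, and the query result is updated incrementally.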

http://hortonworks.com/hadoop-tutorial/introduction-spark-streaming/
https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model

In [76]:
// Alias the session as a `val`: a Scala import needs a stable identifier
// (presumably `spark` is not one in this notebook kernel — confirm).
val sparkDummy = spark
// Brings the implicits used by later cells into scope (e.g. `.as[String]` and `$"..."`).
import sparkDummy.implicits._

# Streaming Shakespeare

In [127]:
import sys.process._

// Show the sonnet that is streamed below. `.!` runs the command, sends its output
// to the console and returns the exit code; the explicit method call avoids the
// postfix-operator form, which needs `scala.language.postfixOps`.
"more data/summer.txt".!

Shall I compare thee to a summer’s day?
Thou art more lovely and more temperate.
Rough winds do shake the darling buds of May,
And summer’s lease hath all too short a date.
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimmed;
And every fair from fair sometime declines,
By chance, or nature’s changing course, untrimmed;
But thy eternal summer shall not fade,
Nor lose possession of that fair thou ow’st,
Nor shall death brag thou wand’rest in his shade,
When in eternal lines to Time thou grow’st.
So long as men can breathe, or eyes can see,
So long lives this, and this gives life to thee.

In [108]:
/** Runs a streaming word count over lines read from a socket on localhost.
  *
  * Each line is split on whitespace, the words are lower-cased and counted, and
  * the full table of counts (most frequent first) is printed to the console
  * after every batch. The query is stopped once `duration` has elapsed.
  *
  * @param port     port on localhost that the text is read from
  * @param duration how long to let the query run, passed to `awaitTermination`
  *                 as a timeout in milliseconds
  */
def createStream(port: Int, duration: Int): Unit = {
    val lines = (spark.readStream
        .format("socket")
        .option("host", "localhost")
        .option("port", port)
        .load())

    val words = (lines
        .as[String]
        .flatMap(_.split("\\s+")))

    // groupByKey(...).count() names its columns "value" and "count(1)".
    val wordCounts = (words
        .groupByKey(_.toLowerCase)
        .count()
        .orderBy($"count(1)".desc))

    // "complete" mode re-emits the whole result table on every batch, which is
    // what allows the sorted aggregation above.
    val query = (wordCounts.writeStream
        .outputMode("complete")
        .format("console")
        .start())

    // awaitTermination(timeout) only stops *waiting* when the timeout expires;
    // the query itself keeps running. Stop it explicitly so it does not keep
    // printing batches and holding the socket after this call returns.
    try {
        query.awaitTermination(duration)
    } finally {
        query.stop()
    }
}

In [129]:
val port = 9001

// Run Broadcast.scala in a background thread so this cell returns immediately
// instead of blocking on the external process. The script is expected to serve
// data/summer.txt on `port`, one line at a time.
(new Thread {
    override def run(): Unit = {
        s"scala Broadcast.scala $port data/summer.txt".!
    }
}).start()

In [130]:
// Word-count the broadcast sonnet; 12000 is the timeout handed to awaitTermination.
createStream(port = port, duration = 12000)

-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+
|   value|count(1)|
+--------+--------+
|    thee|       1|
|summer’s|       1|
|       i|       1|
|    day?|       1|
|   shall|       1|
|       a|       1|
|      to|       1|
| compare|       1|
+--------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------+--------+
|   value|count(1)|
+--------+--------+
|     and|       3|
|    more|       2|
|     too|       2|
|summer’s|       2|
|     the|       2|
|      of|       2|
|       a|       2|
|     art|       1|
|   often|       1|
|    thou|       1|
|  lovely|       1|
|     hot|       1|
|   winds|       1|
|    thee|       1|
|    buds|       1|
| shines,|       1|
|     eye|       1|
|      is|       1|
|    gold|       1|
|   date.|       1|
+--------+--------+
only showing top 20 rows

-------------------------------------------
Batch: 2
------

# Streaming Netcat

In [6]:
// First run `nc -lk 9002` in a bash shell, then type lines into it while this cell runs.
createStream(port = 9002, duration = 10000)

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+--------+
|value|count(1)|
+-----+--------+
|   hi|       1|
+-----+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+--------+
|value|count(1)|
+-----+--------+
|   hi|       2|
|there|       1|
+-----+--------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+--------+
|value|count(1)|
+-----+--------+
|there|       2|
|   hi|       2|
+-----+--------+



# Parsing Data using Case Classes and Schemas

In [170]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.catalyst.ScalaReflection

In [99]:
// One row of the people CSV files. `age` is an Option because it can be absent
// in the data (it then shows up as null in the output).
case class Person(name: String, city: String, country: String, age: Option[Int])

// Derive the schema from the case class through the public Encoders API rather
// than the internal catalyst ScalaReflection helper plus an unchecked downcast.
val caseSchema = org.apache.spark.sql.Encoders.product[Person].schema

val peopleStream = (spark.readStream
  .schema(caseSchema)
  .option("header", true)  // the first line of each file is a header, not data
  .option("maxFilesPerTrigger", 1)  // each file is read in a separate batch
  .csv("data/people/")
  .as[Person])

(peopleStream.writeStream
    .outputMode("append")  // emit only the rows added since the previous batch
    .format("console")  // write results to screen
    .start())

ACTIVE]

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+-------------+-------+---+
|   name|         city|country|age|
+-------+-------------+-------+---+
|    Amy|        Paris|     FR| 30|
|    Bob|     New York|     US| 22|
|Charlie|       London|     UK| 35|
| Denise|San Francisco|     US| 22|
+-------+-------------+-------+---+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+------+-------+----+
|   name|  city|country| age|
+-------+------+-------+----+
| Edward|London|     UK|  53|
|Francis|      |     FR|  22|
| George|London|     UK|null|
+-------+------+-------+----+



# Using Filter in a Stream

In [98]:
// Keep only the people whose country is "UK" and print each new batch of matches.
(peopleStream
    .where($"country" === "UK")
    .writeStream
    .format("console")  // write results to screen
    .outputMode("append")
    .start())

ACTIVE]

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+------+-------+---+
|   name|  city|country|age|
+-------+------+-------+---+
|Charlie|London|     UK| 35|
+-------+------+-------+---+

-------------------------------------------
Batch: 1
-------------------------------------------
+------+------+-------+----+
|  name|  city|country| age|
+------+------+-------+----+
|Edward|London|     UK|  53|
|George|London|     UK|null|
+------+------+-------+----+



# Using Groupby in a Stream

In [164]:
// Running head-count per country; "complete" mode reprints the whole table each batch.
(peopleStream
    .groupByKey(person => person.country)
    .count()
    .writeStream
    .format("console")
    .outputMode("complete")
    .start())

ACTIVE]

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+--------+
|value|count(1)|
+-----+--------+
|   US|       2|
|   FR|       1|
|   UK|       1|
+-----+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+--------+
|value|count(1)|
+-----+--------+
|   US|       2|
|   FR|       2|
|   UK|       3|
+-----+--------+



# Using SQL Aggregations in a Structured Stream

In [172]:
// Running average age per country, recomputed over all data seen so far.
(peopleStream
    .groupBy("country")
    .agg(avg("age"))
    .writeStream
    .format("console")
    .outputMode("complete")
    .start())

ACTIVE]

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+--------+
|country|avg(age)|
+-------+--------+
|     US|    22.0|
|     FR|    30.0|
|     UK|    35.0|
+-------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+--------+
|country|avg(age)|
+-------+--------+
|     US|    22.0|
|     FR|    26.0|
|     UK|    44.0|
+-------+--------+



# Joining with datasets

- Broadcast variables

In [202]:
// Row types for the static users file and the streamed transactions.
case class User(id: Int, name: String, email: String, country: String)
case class Transaction(userid: Int, product: String, cost: Double)

// Derive both schemas through the public Encoders API rather than the internal
// catalyst ScalaReflection helper plus an unchecked downcast.
val userSchema = org.apache.spark.sql.Encoders.product[User].schema

val transactionSchema = org.apache.spark.sql.Encoders.product[Transaction].schema

// Static (batch) dataset: read once with spark.read.
val users = (spark.read
    .schema(userSchema)
    .option("header", true)
    .csv("data/users.csv")
    .as[User]
)

// Streaming dataset: read with spark.readStream.
val transactionStream = (spark.readStream
    .schema(transactionSchema)
    .option("header", true)
    .csv("data/transactions.csv")
    .as[Transaction]
)

// Join each streamed transaction to its user, then total the cost per country.
val spendingByCountry = (transactionStream
    .join(users, users("id") === transactionStream("userid"))
    .groupBy($"country")
    .agg(sum($"cost")))

// "complete" mode reprints the whole aggregated table on every batch.
(spendingByCountry.writeStream
    .outputMode("complete")
    .format("console")
    .start())

ACTIVE]

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+---------+
|country|sum(cost)|
+-------+---------+
|     EN|     20.0|
|     FR|     50.0|
+-------+---------+

