# CHAPTER 8. Structured Streaming
> 

* https://spark.apache.org/docs/latest/streaming-programming-guide.html


In [2]:
from pyspark.sql import *

spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

In [3]:
lines = (
    spark
    .readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

In [5]:
from pyspark.sql.functions import *

def foreach_batch_function(df, epoch_id):
    print("{} - {}".format(epoch_id, df.collect()))

counts = lines.select(split(col("value"), " ").alias("word")).groupBy("word").count().alias("count")

streamingQuery = (
    counts
    .writeStream
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="1 second") # 1 second micro batch interval
    .foreachBatch(foreach_batch_function)
    .start()
)
streamingQuery.awaitTermination()

0 - []
8 - [Row(word=['1', '2', '3', '1', '1', '1', ''], count=1), Row(word=['3'], count=4), Row(word=['1', '1', '1'], count=1), Row(word=['3', '3', '3', '3', '3', '3'], count=1), Row(word=['1'], count=8), Row(word=['2'], count=1)]


KeyboardInterrupt: 

In [4]:
!rm -rf "tmp/checkpoint"

In [5]:
from pyspark.sql.functions import *

checkpointDir = "tmp/checkpoint"
words = lines.select(split(col("value"), "\\s").alias("word"))
counts = words.groupBy("word").count()
streamingQuery = (
    counts
    .writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime="1 second") # 1 second micro batch interval
    .option("checkpointLocation", checkpointDir)
    .start()
)
streamingQuery.awaitTermination()

KeyboardInterrupt: 

In [37]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working thread and batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by getOrCreate at <ipython-input-1-a6d1a7399678>:4 

In [2]:
# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 9999)

In [3]:
# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

In [4]:
# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

In [5]:
ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate

-------------------------------------------
Time: 2021-05-11 15:14:33
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:34
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:35
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:36
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:37
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:38
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:39
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:40
-------------------------------------------
('test', 1)

-------------------------------------------
Time: 2021-05-11 15:14:4

KeyboardInterrupt: 

-------------------------------------------
Time: 2021-05-11 15:14:51
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:52
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:53
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:54
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:55
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:56
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:57
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:58
-------------------------------------------

-------------------------------------------
Time: 2021-05-11 15:14:59
----------