### Start Spark Session

In [None]:
from pyspark.sql import SparkSession

# Stop any session left over from a previous run of this notebook.
# Only the name lookup is guarded: a NameError raised inside stop() itself
# would be a genuine error and must not be reported as "not defined".
try:
    spark
except NameError:
    print("SparkContext not defined")
else:
    spark.stop()

# Local mode (alternative to the cluster session below):
# spark = (
#     SparkSession.builder
#     .appName("Spark SQL basic example")
#     .master("local[*]")
#     .config("spark.some.config.option", "some-value")
#     .getOrCreate()
# )

# Cluster mode: build (or reuse) a session against the master at
# spark://spark:7077.
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .master("spark://spark:7077")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### Stream from a socket (the socket server must be running on another host)

In [None]:
from pyspark.sql.functions import *
# Read the socket stream served by socketstreamserver:12345 and tag every
# incoming event with the current timestamp.
lines = (
    spark.readStream
    .format("socket")
    .options(host="socketstreamserver", port=12345)
    .load()
    .withColumn("timestamp", current_timestamp())
)
lines

### Transform data stream (use window if necessary)

In [None]:
from pyspark.sql.functions import *
# Split every line on whitespace and emit one row per token, keeping the
# event timestamp alongside it.
words = lines.select(
    col("timestamp"),
    explode(split(col("value"), "\\s")).alias("word"),
)
print(words)
# Strip surrounding whitespace from each token.
words = words.withColumn("word", trim(col("word")))
print(words)
# Drop tokens that are empty after trimming.
words = words.where(col("word") != "")
print(words)
# Count occurrences of each word per 10-second window, sliding every 5 seconds.
sliding_window = window(
    timeColumn=col("timestamp"),
    windowDuration="10 seconds",
    slideDuration="5 seconds",
)
counts = words.groupBy(sliding_window, col("word")).count()
counts

### More transformations on the data

In [None]:
# Order the windowed word counts so the most frequent words come first.
top_counts = counts.sort(desc("count"))
top_counts

### Setup output sink

In [None]:
# Configure the sink: console output, cell values not truncated,
# "complete" output mode.
writer = (
    top_counts.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", False)
)
writer

### The trigger option defines the processing interval of the micro-batches

In [None]:
# Processing-time trigger interval applied to the writer.
trigger_interval = "5 second"
writer2 = writer.trigger(processingTime=trigger_interval)
writer2

### Start query with a timeout of 60 seconds

In [None]:
# Launch the streaming query, then wait for it to terminate with a
# 60-second timeout.
streamingQuery = writer2.start()
streamingQuery.awaitTermination(timeout=60)

In [None]:
# Stop the streaming query started in the previous cell.
streamingQuery.stop()