In [None]:
import sys
import os

# Check that a Java location is configured before starting Spark. The lookup
# result is bound and reported because a bare expression in the middle of a
# cell is neither displayed nor used.
java_home = os.environ.get('JAVA_HOME')
if not java_home:
    print("JAVA_HOME is not set; Spark needs a Java runtime discoverable via JAVA_HOME or PATH.")
import findspark

# Make the local Spark installation importable as `pyspark` in this kernel.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Create (or reuse, if one already exists in this kernel) the SparkSession
# named "StreamApp", running against the in-process `local[*]` master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StreamApp")
    .getOrCreate()
)

In [None]:
# Streaming DataFrame fed by Spark's socket source on localhost:9999.
# NOTE(review): presumably a text server (e.g. `nc -lk 9999`) must already be
# listening on that port -- confirm before running the query.
df = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

In [None]:
# Sanity check: shown as the cell output, True when `df` is a streaming DataFrame.
df.isStreaming

In [None]:
# Print the column names and types of the streaming DataFrame.
df.printSchema()

The "Output" is defined as what gets written out to the external storage. The output can be defined in one of several modes:

* **Complete Mode** - The entire updated Result Table will be written to the external storage. It is up to the storage connector to decide how to handle writing of the entire table.


* **Append Mode** - Only the new rows appended to the Result Table since the last trigger will be written to the external storage. This is applicable only to queries where existing rows in the Result Table are not expected to change.


* **Update Mode** - Only the rows that were updated in the Result Table since the last trigger will be written to the external storage (available since Spark 2.1.1). Note that this is different from the Complete Mode in that this mode only outputs the rows that have changed since the last trigger. If the query doesn’t contain aggregations, it will be equivalent to Append mode.

In [None]:
# Split each incoming line on single spaces and emit one row per token.
words_df = df.select(
    explode(split(df["value"], " ")).alias("word")
)

# Running count per distinct word.
wc_df = words_df.groupBy("word").count()

# Start the streaming query: "complete" mode writes the entire counts table
# to the console sink on every trigger.
# NOTE(review): console-sink output may appear in the terminal that launched
# the kernel rather than under this cell -- confirm in your environment.
query = (
    wc_df.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# Stop the streaming query held in `query` once you are done with it.
query.stop()