In [1]:
# Generate Spark Session
from pyspark.sql import SparkSession

# Build (or reuse) a local SparkSession that runs on every available core.
spark = (
    SparkSession
    .builder
    .appName("Reading from Sockets")
    .master("local[*]")
    .getOrCreate()
)

# The config key is plural ("partitions"). The singular spelling is accepted
# without error but is not a recognised setting, so the shuffle partition
# count would stay at its default instead of 8.
spark.conf.set("spark.sql.shuffle.partitions", 8)

# Keep the session as the cell's last expression so the notebook renders it.
spark

In [2]:
# Read lines of text from a TCP socket on localhost:9999.
# The socket source is configured only through the host/port options and does
# not read from a file path, so load() is called without arguments.
df_raw = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

In [3]:
# Print the schema of the streaming DataFrame; per the captured output below,
# it consists of a single nullable string column named `value`.
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [4]:
# df_raw.show()  # kept disabled: show() is not supported on a streaming DataFrame (use writeStream instead)

In [5]:
# Tokenise each incoming line: add a `words` array column obtained by
# splitting `value` on a single space.
from pyspark.sql.functions import split
df_words = df_raw.withColumn(
    "words",
    split(df_raw["value"], " "),
)

In [6]:
# df_words.show()  # kept disabled: show() is not supported on a streaming DataFrame (use writeStream instead)

In [7]:
# Turn the `words` array into one row per word (column `word`), then discard
# the original line and the intermediate array column.
from pyspark.sql.functions import explode
df_explode = (
    df_words
    .withColumn("word", explode(df_words["words"]))
    .drop("value", "words")
)

In [8]:
# df_explode.show()  # kept disabled: show() is not supported on a streaming DataFrame (use writeStream instead)

In [9]:
# Word count: group the rows by `word` and count the rows in each group,
# exposing the result as `cnt`.
from pyspark.sql.functions import count,lit
df_agg = (
    df_explode
    .groupBy("word")
    .agg(
        count(lit(1)).alias("cnt")
    )
)

In [10]:
# df_agg.show()  # kept disabled: show() is not supported on a streaming DataFrame (use writeStream instead)

In [None]:
# write the output to console
# "complete" mode writes the whole aggregated result table on every trigger.
query = (
    df_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

try:
    # Blocks the cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on the way out so an interrupted cell does not leave it
    # running in the background (re-running the cell would start another one).
    query.stop()

In [12]:
#spark.stop()

The three output modes are:

| Mode | Description |
|------|-------------|
| append | Writes only the new rows added to the result table since the last trigger. |
| update | Writes only the rows that were updated since the last trigger. |
| complete | Writes the entire result table on every trigger. |

⚙️ Output modes in streaming

| Mode | Description | Works with aggregation? |
|------|-------------|-------------------------|
| append | Only newly added rows are written to the sink | ❌ Not allowed for aggregations without a watermark |
| update | Only rows that are new or changed since the last trigger are written | ✅ Allowed |
| complete | The entire result table (all groups) is written | ✅ Allowed |