In [1]:
# Create the SparkSession shared by every cell below (an existing session is reused).
from pyspark.sql import SparkSession

# "local[*]" is the master URL for running Spark locally inside this process.
spark = SparkSession.builder.appName("Reading from Sockets").master("local[*]").getOrCreate()

# Bare expression so the notebook displays the session summary.
spark

In [20]:
# Read the input as an unbounded stream of text lines from a TCP socket.
# The socket source is configured only through host/port, so no file path is
# passed to load() (the old "data/input/example.txt" argument was a leftover
# and had no effect on what is read).
# NOTE(review): something must already be listening on localhost:9999 when the
# query starts (e.g. `nc -lk 9999`) - confirm for your environment.
df_raw = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

In [21]:
# Print the schema of the streaming DataFrame; this only inspects metadata and
# does not start a streaming query.
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
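# Disabled: df_raw comes from readStream, and show() is not supported on a
# streaming DataFrame (use the console sink via writeStream instead).
# The output below is presumably left over from an earlier batch (non-streaming) run.
#df_raw.show()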

+--------------------+
|               value|
+--------------------+
|simon had a dog a...|
+--------------------+



In [22]:
# Tokenise every incoming line: "words" holds the array obtained by splitting
# the "value" column on a single space.
from pyspark.sql.functions import split

df_words = df_raw.withColumn(
    "words",
    split(df_raw["value"], " "),
)

In [9]:
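# Disabled: df_words is derived from a streaming source, and show() is not
# supported on a streaming DataFrame.
# The output below is presumably left over from an earlier batch (non-streaming) run.
#df_words.show()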

+--------------------+--------------------+
|               value|               words|
+--------------------+--------------------+
|simon had a dog a...|[simon, had, a, d...|
+--------------------+--------------------+



In [23]:
# Flatten the word arrays: emit one row per array element in a new "word"
# column, then discard the source columns so only "word" remains.
from pyspark.sql.functions import explode

df_explode = (
    df_words
    .withColumn("word", explode(df_words["words"]))
    .drop("value", "words")
)

In [15]:
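# Disabled: df_explode is derived from a streaming source, and show() is not
# supported on a streaming DataFrame.
# The output below is presumably left over from an earlier batch (non-streaming) run.
#df_explode.show()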

+-----+
| word|
+-----+
|simon|
|  had|
|    a|
|  dog|
|  and|
|    a|
|  cat|
|  the|
|  dog|
|  and|
|  cat|
| used|
|   to|
| love|
|simon|
+-----+



In [24]:
# Word count: group rows by "word" and count the rows in each group as "cnt".
from pyspark.sql.functions import count, lit

df_agg = (
    df_explode
    .groupBy("word")
    .agg(count(lit(1)).alias("cnt"))
)

In [19]:
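# Disabled: df_agg is derived from a streaming source, and show() is not
# supported on a streaming DataFrame.
# The output below is presumably left over from an earlier batch (non-streaming) run.
#df_agg.show()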

+-----+---+
| word|cnt|
+-----+---+
| used|  1|
|simon|  2|
|  dog|  2|
| love|  1|
|  had|  1|
|  cat|  2|
|  the|  1|
|  and|  2|
|    a|  2|
|   to|  1|
+-----+---+



In [None]:
# Write the running word counts to the console sink.
# "complete" output mode re-emits the full aggregated result table on every trigger.

# Keep the StreamingQuery handle instead of discarding it, so the query can be
# stopped explicitly.
word_count_query = (
    df_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    word_count_query.awaitTermination()
finally:
    # Ensure the query does not keep running in the background after an
    # interrupt or a failure.
    word_count_query.stop()