In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
spark

In [3]:
stream_df = (spark.readStream.format('socket')
                             .option('host', 'localhost')
                             .option('port', 22223)
                             .load())

json_df = stream_df.selectExpr("CAST(value AS STRING) AS payload")

writer = (
    json_df.writeStream
           .queryName('askReddit')
           .format('memory')
           .outputMode('append')
)

streamer = writer.start()


In [7]:
import time

for _ in range(5):
    df = spark.sql("""
    SELECT get_json_object(payload, '$.id') AS id,
           CAST(get_json_object(payload, '$.score') AS INTEGER) AS score
    FROM askReddit
    """)
    
    df.show(10)
    
    print(df)
    time.sleep(5)
    
streamer.awaitTermination(timeout=10)
print('streaming done!')


+------+-----+
|    id|score|
+------+-----+
|tynl33| 5615|
|ty9i6m| 2319|
|ty2yxf| 1631|
|tyncry|   32|
|tybxkx|10303|
|tylv56|  256|
|tyllyi|   55|
|tygbgf| 2903|
|txzu59|40567|
|tyg6nx|  112|
+------+-----+
only showing top 10 rows

DataFrame[id: string, score: int]
+------+-----+
|    id|score|
+------+-----+
|tynl33| 5615|
|ty9i6m| 2319|
|ty2yxf| 1631|
|tyncry|   32|
|tybxkx|10303|
|tylv56|  256|
|tyllyi|   55|
|tygbgf| 2903|
|txzu59|40567|
|tyg6nx|  112|
+------+-----+
only showing top 10 rows

DataFrame[id: string, score: int]
+------+-----+
|    id|score|
+------+-----+
|tynl33| 5615|
|ty9i6m| 2319|
|ty2yxf| 1631|
|tyncry|   32|
|tybxkx|10303|
|tylv56|  256|
|tyllyi|   55|
|tygbgf| 2903|
|txzu59|40567|
|tyg6nx|  112|
+------+-----+
only showing top 10 rows

DataFrame[id: string, score: int]
+------+-----+
|    id|score|
+------+-----+
|tynl33| 5615|
|ty9i6m| 2319|
|ty2yxf| 1631|
|tyncry|   32|
|tybxkx|10303|
|tylv56|  256|
|tyllyi|   55|
|tygbgf| 2903|
|txzu59|40567|
|tyg6nx|  

In [16]:
from pyspark.sql import functions as F
(
    df.groupBy('id')
      .agg(F.min('score'), F.max('score'))
      .withColumn('pct_diff', (F.col('max(score)') - F.col('min(score)'))/F.col('min(score)'))
      .orderBy('pct_diff', ascending=False)
).show(25)

+------+----------+----------+--------------------+
|    id|min(score)|max(score)|            pct_diff|
+------+----------+----------+--------------------+
|typux9|        20|        27|                0.35|
|tyncry|        25|        32|                0.28|
|tyllyi|        53|        65| 0.22641509433962265|
|tyoc5e|        93|       112| 0.20430107526881722|
|tyrdu4|        15|        18|                 0.2|
|tylvsx|       387|       444| 0.14728682170542637|
|tyhrvs|        69|        77| 0.11594202898550725|
|tynl33|      5615|      6263| 0.11540516473731077|
|tyqttl|        18|        20|  0.1111111111111111|
|tyl5jf|       726|       801| 0.10330578512396695|
|tyl7md|        94|       103| 0.09574468085106383|
|tyk7qs|       191|       207| 0.08376963350785341|
|tyg6nx|       108|       116| 0.07407407407407407|
|tyk4k9|       131|       139|0.061068702290076333|
|tylv56|       255|       269|0.054901960784313725|
|tykh27|       703|       740| 0.05263157894736842|
|tybl6v|    