In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

In [2]:
# Assemble the Spark configuration in one fluent chain: the cluster master URL,
# the application name, and the driver/executor resource settings for this lab.
spark_conf = (
    SparkConf()
    .setMaster("spark://master:7077")
    .setAppName("Lab7_4")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The SparkSession is the entry point to the Spark SQL engine; reuse an
# existing session if there is one, otherwise create it from spark_conf.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

In [3]:
# (column name, Spark type) pairs for the activity records, in file order.
activity_fields = [
    ("Arrival_Time", LongType()),
    ("Creation_Time", LongType()),
    ("Device", StringType()),
    ("Index", LongType()),
    ("Model", StringType()),
    ("User", StringType()),
    ("gt", StringType()),
    ("x", DoubleType()),
    ("y", DoubleType()),
    ("z", DoubleType()),
]

# Explicit schema handed to the streaming reader; every column is declared nullable.
data_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in activity_fields]
)

In [4]:
# Streaming source: JSON files under ../data/activity, parsed with the explicit
# schema. maxFilesPerTrigger=1 caps each trigger at one new file.
sdf = (
    spark.readStream
    .schema(data_schema)
    .option("maxFilesPerTrigger", 1)
    .json("../data/activity")
)

In [5]:
# Create the event time column: Creation_Time is divided by 1e9 and cast to a
# timestamp (i.e. it is treated as nanoseconds since the epoch — TODO confirm
# against the data source).
with_event_time_df = sdf.selectExpr(
    "*",
    "cast(cast(Creation_Time as double)/1000000000 as timestamp) as event_time",
)

with_event_time_df.printSchema()

query_name = "activity_events_per_window"

# Re-running this cell must not fail: a query name can only be active once,
# so stop a previous run of this query before starting a new one.
for active_query in spark.streams.active:
    if active_query.name == query_name:
        active_query.stop()

# Count events per user in tumbling 10-minute event-time windows.
events_per_window = (
    with_event_time_df
    .groupBy(window(col("event_time"), "10 minutes"), "User")
    .count()
)

# Keep the StreamingQuery handle so the query can be inspected and stopped later.
# The "memory" sink exposes the full ("complete") result table under query_name,
# which makes it queryable through spark.sql.
activity_query = (
    events_per_window.writeStream
    .queryName(query_name)
    .format("memory")
    .outputMode("complete")
    .start()
)

# Display the handle as the cell output.
activity_query

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- event_time: timestamp (nullable = true)



<pyspark.sql.streaming.StreamingQuery at 0x7f761cf95730>

In [6]:
# Poll the in-memory result table ten times, pausing ten seconds after each
# snapshot, to watch the aggregation fill in as the stream processes files.
snapshot_sql = "SELECT * FROM activity_events_per_window"
polls_left = 10
while polls_left > 0:
    spark.sql(snapshot_sql).show()
    sleep(10)
    polls_left -= 1

+------+----+-----+
|window|User|count|
+------+----+-----+
+------+----+-----+

+------+----+-----+
|window|User|count|
+------+----+-----+
+------+----+-----+

+------+----+-----+
|window|User|count|
+------+----+-----+
+------+----+-----+

+--------------------+----+-----+
|              window|User|count|
+--------------------+----+-----+
|{2015-02-24 14:50...|   e| 1560|
|{2015-02-23 13:00...|   c| 1293|
|{2015-02-23 13:40...|   a| 1381|
|{2015-02-23 12:40...|   c| 1236|
|{2015-02-23 10:10...|   g|  121|
|{2015-02-24 12:50...|   d| 1420|
|{2015-02-23 14:20...|   h| 1126|
|{2015-02-23 14:10...|   h| 1450|
|{2015-02-24 11:50...|   i| 1386|
|{2015-02-24 14:10...|   b| 1235|
|{2015-02-24 13:50...|   b| 1223|
|{2015-02-23 12:20...|   c| 1318|
|{2015-02-24 12:00...|   f| 1366|
|{2015-02-24 15:20...|   e|  281|
|{2015-02-24 12:30...|   f| 1520|
|{2015-02-23 11:00...|   g| 1342|
|{2015-02-24 14:20...|   b| 1280|
|{2015-02-23 12:30...|   c| 1226|
|{2015-02-24 15:00...|   e| 1235|
|{2015-02

In [7]:
# Stop any streaming queries that are still running first, so they shut down
# cleanly instead of being torn down with the session underneath them.
for active_query in spark.streams.active:
    active_query.stop()

# Stop the Spark session (this also stops its underlying SparkContext).
spark.stop()