In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

In [2]:
# Cluster address, application name and resource settings for this lab,
# expressed as a single fluent chain of SparkConf setters.
spark_conf = (
    SparkConf()
    .setMaster("spark://master:7077")
    .setAppName("Lab7_7")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine;
# getOrCreate() hands back an existing session if there is one.
spark = (
    SparkSession.builder
    .config(conf=spark_conf)
    .getOrCreate()
)

In [3]:
# Explicit schema for the activity records: each (name, type) pair below
# becomes a nullable StructField, in the listed order.
data_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("Arrival_Time", LongType),
        ("Creation_Time", LongType),
        ("Device", StringType),
        ("Index", LongType),
        ("Model", StringType),
        ("User", StringType),
        ("gt", StringType),
        ("x", DoubleType),
        ("y", DoubleType),
        ("z", DoubleType),
    )
])

In [4]:
# Streaming source: JSON files under ../data/activity, parsed with
# data_schema; the maxFilesPerTrigger option is set to 1.
sdf = (
    spark.readStream
    .schema(data_schema)
    .option("maxFilesPerTrigger", 1)
    .json("../data/activity")
)

In [5]:
# Create the event time column: Creation_Time is divided by 1e9 and cast to a
# timestamp.
# NOTE(review): this presumes Creation_Time is nanoseconds since the epoch — confirm.
with_event_time_df = sdf.selectExpr(
    "*",
    "cast(cast(Creation_Time as double)/1000000000 as timestamp) as event_time",
)

with_event_time_df.printSchema()

# Spark refuses to start a second query under a name that is already active,
# so stop any earlier run of this query to keep the cell re-runnable.
for running_query in spark.streams.active:
    if running_query.name == "activity_events_per_window":
        running_query.stop()

# Count events per 10-minute window sliding every 5 minutes and expose the
# running result as the in-memory table "activity_events_per_window".
# The handle is kept so the query can be inspected or stopped later.
# NOTE(review): in "complete" output mode Spark retains all aggregate state, so
# the 10-minute watermark is not expected to drop late data or old windows —
# confirm whether "append"/"update" was intended for this lab.
activity_query = (
    with_event_time_df
    .withWatermark("event_time", "10 minutes")
    .groupBy(window(col("event_time"), "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .queryName("activity_events_per_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- event_time: timestamp (nullable = true)



<pyspark.sql.streaming.StreamingQuery at 0x7f5302af5250>

In [6]:
# Poll the in-memory result table ten times, pausing ten seconds after each
# snapshot so the stream has time to ingest more files.
polls_remaining = 10
while polls_remaining > 0:
    snapshot = spark.sql("SELECT * FROM activity_events_per_window")
    snapshot.show()
    sleep(10)
    polls_remaining -= 1

+------+-----+
|window|count|
+------+-----+
+------+-----+

+------+-----+
|window|count|
+------+-----+
+------+-----+

+------+-----+
|window|count|
+------+-----+
+------+-----+

+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2015-02-23 12:55...| 1461|
|{2015-02-23 10:10...|  121|
|{2015-02-24 12:55...| 2434|
|{2015-02-23 14:05...| 1315|
|{2015-02-24 13:35...| 2205|
|{2015-02-24 13:00...| 1644|
|{2015-02-24 14:05...| 1579|
|{2015-02-23 11:10...| 1136|
|{2015-02-23 13:55...| 2481|
|{2015-02-24 12:50...| 2657|
|{2015-02-24 13:10...| 1322|
|{2015-02-23 10:55...| 1567|
|{2015-02-23 10:25...| 1218|
|{2015-02-23 14:45...|  361|
|{2015-02-24 14:35...| 1799|
|{2015-02-23 12:10...|  738|
|{2015-02-23 14:40...|  936|
|{2015-02-24 14:30...| 2368|
|{2015-02-23 12:30...| 1226|
|{2015-02-23 10:40...| 1121|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2015-0

In [7]:
# Stop every streaming query that is still running first, so the background
# stream is shut down cleanly rather than being cut off mid-batch, then stop
# the Spark session itself.
for active_query in spark.streams.active:
    active_query.stop()
spark.stop()