In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

In [2]:
# Cluster and resource settings for this lab, applied in the same order as
# before: master URL, application name, then driver/executor sizing.
spark_conf = (
    SparkConf()
    .setMaster("spark://master:7077")
    .setAppName("Lab7_5")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; build it from
# the configuration above (getOrCreate returns an existing session if any).
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

In [3]:
# Explicit schema for the activity JSON records. Fields are appended one by
# one in column order; every column is declared nullable.
data_schema = (
    StructType()
    .add("Arrival_Time", LongType(), True)
    .add("Creation_Time", LongType(), True)
    .add("Device", StringType(), True)
    .add("Index", LongType(), True)
    .add("Model", StringType(), True)
    .add("User", StringType(), True)
    .add("gt", StringType(), True)
    .add("x", DoubleType(), True)
    .add("y", DoubleType(), True)
    .add("z", DoubleType(), True)
)

In [4]:
# Streaming source: JSON files under ../data/activity, parsed with the
# explicit schema and limited to one new file per trigger.
sdf = (
    spark.readStream
    .schema(data_schema)
    .option("maxFilesPerTrigger", 1)
    .json("../data/activity")
)

In [5]:
# Create the event-time column: Creation_Time is divided by 1e9 (i.e. it is
# treated as nanoseconds since the epoch) and cast to a timestamp.
with_event_time_df = sdf.selectExpr(
    "*",
    "cast(cast(Creation_Time as double)/1000000000 as timestamp) as event_time",
)

with_event_time_df.printSchema()

# Query names must be unique among active queries, so re-running this cell
# while the previous query is still alive would raise. Stop any earlier query
# with the same name first to keep the cell re-runnable.
for running_query in spark.streams.active:
    if running_query.name == "user_activity_events_per_window":
        running_query.stop()

# Count rows per 10-minute event-time window, User and gt label. The complete
# result is written to an in-memory table named after the query. The handle is
# kept (instead of being discarded) so the query can be inspected or stopped.
user_activity_query = (
    with_event_time_df
    .groupBy(window(col("event_time"), "10 minutes"), "User", "gt")
    .count()
    .writeStream
    .queryName("user_activity_events_per_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

# Last expression of the cell, so the StreamingQuery handle is still displayed.
user_activity_query

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- event_time: timestamp (nullable = true)



<pyspark.sql.streaming.StreamingQuery at 0x7f151c1796a0>

In [6]:
# Poll the in-memory result table a fixed number of times so the counts can be
# watched while the stream keeps ingesting files.
poll_count = 10
poll_interval_seconds = 10

for poll_index in range(poll_count):
    spark.sql("SELECT * FROM user_activity_events_per_window").show()
    # Wait only between polls; sleeping after the final one just delays the cell.
    if poll_index < poll_count - 1:
        sleep(poll_interval_seconds)

+------+----+---+-----+
|window|User| gt|count|
+------+----+---+-----+
+------+----+---+-----+

+------+----+---+-----+
|window|User| gt|count|
+------+----+---+-----+
+------+----+---+-----+

+------+----+---+-----+
|window|User| gt|count|
+------+----+---+-----+
+------+----+---+-----+

+--------------------+----+----------+-----+
|              window|User|        gt|count|
+--------------------+----+----------+-----+
|{2015-02-23 14:00...|   h|      walk|  225|
|{2015-02-23 10:40...|   g|  stairsup|  426|
|{2015-02-24 11:30...|   i|  stairsup|  568|
|{2015-02-23 13:20...|   a|  stairsup|  514|
|{2015-02-24 14:10...|   e|      null|   19|
|{2015-02-24 14:30...|   e|  stairsup|  537|
|{2015-02-23 11:10...|   g|stairsdown|  481|
|{2015-02-23 14:00...|   a|      bike|  528|
|{2015-02-23 13:10...|   a|      null|  252|
|{2015-02-23 13:40...|   a|      null|  218|
|{2015-02-23 13:00...|   c|      null|  131|
|{2015-02-23 14:40...|   h|  stairsup|   88|
|{2015-02-24 13:30...|   d|stairsd

In [7]:
# Stop every streaming query that is still running before shutting down, so
# none is left executing against a session that is being torn down.
for running_query in spark.streams.active:
    running_query.stop()

# Stop the Spark session (this also stops the underlying SparkContext).
spark.stop()