In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, mean
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

# Spark configuration: standalone master, 2g driver memory, one core each for
# the driver and for every executor.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Lab7_exercise")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Schema applied to the CSV files picked up by the stream.
dataSchema = StructType(
    [StructField("User", StringType(), True),
     StructField("Team", StringType(), True),
     StructField("Score", DoubleType(), True),
     StructField("timestamp_in_ms", LongType(), True),
     StructField("event_time", TimestampType(), True)
     ])

# Read from a source: the directory is consumed as a stream, with
# maxFilesPerTrigger=1 limiting each trigger to a single new file.
sdf = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1) \
    .csv("/home/jovyan/data/gamestream")

# Average Score per (10-second event-time window, User, Team).
sdf_User_Team_avg_window = sdf.groupBy(window(col('event_time'), '10 seconds').alias('window_slot'), 'User', 'Team').avg('Score')

# Write to the in-memory sink (testing only); the query name doubles as the
# Spark SQL table name queried below.
query = sdf_User_Team_avg_window.writeStream.queryName('sdf_avg').format('memory').outputMode('complete').start()

# Poll the in-memory table a few times, then always stop the query so it does
# not keep running in the background (a still-active 'sdf_avg' query would also
# make a re-run of this cell fail when it tries to reuse the name).
try:
    for i in range(10):
        spark.sql('SELECT * FROM sdf_avg').show(truncate=False)
        sleep(5)
finally:
    query.stop()

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+------------------------------------------+--------------------------+--------------------+------------------+
|window_slot                               |User                      |Team                |avg(Score)        |
+------------------------------------------+--------------------------+--------------------+------------------+
|{2015-11-16 16:11:50, 2015-11-16 16:12:00}|user8_AuburnDingo         |AuburnDingo         |9.805970

In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, mean, window
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

# Spark configuration: standalone master, 2g driver memory, one core each for
# the driver and for every executor.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Lab7_exercise")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Subscribe to the "game" Kafka topic as a streaming source, starting from the
# earliest available offsets.
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka1:9093") \
        .option("subscribe", "game") \
        .option("startingOffsets", "earliest") \
        .option("failOnDataLoss", 'False') \
        .load()

# The Kafka value is cast to a string and split on commas into an array column.
records = df.selectExpr("CAST(value AS STRING)").select(split(col('value'), ",").alias('splitted'))

# Project the array positions onto named, typed columns.
new_df =  records.selectExpr('splitted[0] as User', 
                             'splitted[1] as Team',
                             'cast(splitted[2] as double) as Score',
                             'cast(splitted[3] as long) as timestamp_in_ms',
                             'cast(splitted[4] as timestamp) as event_time')

# Average Score per (10-second event-time window, User, Team).
sdf_User_Team_avg_window = new_df.groupBy(window(col('event_time'), '10 seconds').alias('window_slot'), 'User', 'Team').avg('Score')

# Write to a sink - here, the output is memory (only for testing). The query name (i.e., sdf_avg_1) will be the Spark SQL table name.
activityQuery = sdf_User_Team_avg_window.writeStream.queryName("sdf_avg_1") \
                    .format("memory").outputMode("update") \
                    .start()
# Testing: poll the in-memory table a few times, then always stop the query so
# it does not keep running in the background (a still-active 'sdf_avg_1' query
# would also make a re-run of this cell fail when it tries to reuse the name).
try:
    for x in range(10):
        spark.sql("SELECT * FROM sdf_avg_1").show(truncate=False)
        sleep(5)
finally:
    activityQuery.stop()

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+-----------+----+----+----------+
|window_slot|User|Team|avg(Score)|
+-----------+----+----+----------+
+-----------+----+----+----------+

+------------------------------------------+--------------------------+--------------------+----------+
|window_slot                               |User                      |Team                |avg(Score)|
+------------------------------------------+--------------------------+--------------------+----------+
|{2015-11-16 16:11:00, 2015-11-16 16:11:10}|user1_AntiqueBrassPlatypus|AntiqueBrassPlatypus|7.0       |
|{2015-11-16 16:11:0

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, mean, window
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

# Spark configuration: standalone master, 2g driver memory, one core each for
# the driver and for every executor.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Lab7_exercise")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Subscribe to the "game" Kafka topic as a streaming source, starting from the
# earliest available offsets.
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka1:9093") \
        .option("subscribe", "game") \
        .option("startingOffsets", "earliest") \
        .option("failOnDataLoss", 'False') \
        .load()

# The Kafka value is cast to a string and split on commas into an array column.
records = df.selectExpr("CAST(value AS STRING)").select(split(col('value'), ",").alias('splitted'))

# Project the array positions onto named, typed columns.
new_df =  records.selectExpr('splitted[0] as User', 
                             'splitted[1] as Team',
                             'cast(splitted[2] as double) as Score',
                             'cast(splitted[3] as long) as timestamp_in_ms',
                             'cast(splitted[4] as timestamp) as event_time')

# Average Score per (10-second event-time window, User, Team), with the
# generated 'avg(Score)' column renamed to 'avg_score'.
sdf_User_Team_avg_window = new_df.groupBy(window(col('event_time'), '10 seconds').alias('window_slot'), 'User', 'Team').avg('Score')
sdf_User_Team_avg_window_avg = sdf_User_Team_avg_window.select('window_slot', 'User', 'Team', col('avg(Score)').alias('avg_score'))

# Cast every column to string so they can be concatenated into the Kafka
# key/value columns below.
string_sdf_User_Team_avg_window = sdf_User_Team_avg_window_avg.selectExpr( 
                             'cast(window_slot as string) as window_slot',
                             'cast(User as string) as User',
                             'cast(Team as string) as Team',
                             'cast(avg_score as string) as avg_score')

# Publish to the "avg_score" topic: key = "window_slot,User,Team",
# value = the average score.
query = string_sdf_User_Team_avg_window \
              .select(concat(col("window_slot"), lit(","), col("User"), lit(","), col("Team")).alias("key"), 
                       col("avg_score").alias('value') ) \
              .writeStream \
              .format("kafka") \
              .option("kafka.bootstrap.servers", "kafka1:9093") \
              .option("checkpointLocation", "/home/jovyan/checkpoint")\
              .option("topic", "avg_score") \
              .outputMode("complete") \
              .start()

# Block until the query ends. Cleanup runs in `finally` so the query and the
# Spark session are released not only on a kernel interrupt but also when
# awaitTermination() raises because the query failed.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end this cell.
    pass
finally:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

In [4]:
# Stop the Spark session created by an earlier cell (this cell does not define
# `spark` itself, so it must be run after one of the session-creating cells).
spark.stop()

In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

# Spark configuration built as a single chained expression: standalone master,
# 2g driver memory, one core each for the driver and for every executor.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_exercise")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Column layout of the game-score CSV file; every field is nullable.
dataSchema = (
    StructType()
    .add("User", StringType(), True)
    .add("Team", StringType(), True)
    .add("Score", DoubleType(), True)
    .add("timestamp_in_ms", LongType(), True)
    .add("event_time", TimestampType(), True)
)

# Batch-read a single split of the data set (no header row) with that schema.
df = (
    spark.read
    .format('csv')
    .schema(dataSchema)
    .option('header', 'False')
    .load("/home/jovyan/data/gamestream/game_data_split1.csv")
)

In [16]:
# Print the column names, types and nullability of the DataFrame.
df.printSchema()
# Preview the leading rows; truncate=False keeps long values fully visible.
df.show(truncate=False)

root
 |-- User: string (nullable = true)
 |-- Team: string (nullable = true)
 |-- Score: double (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)
 |-- event_time: timestamp (nullable = true)

+--------------------------+--------------------+-----+---------------+-----------------------+
|User                      |Team                |Score|timestamp_in_ms|event_time             |
+--------------------------+--------------------+-----+---------------+-----------------------+
|user16_AmaranthKoala      |AmaranthKoala       |18.0 |1447719060000  |2015-11-16 16:11:03.921|
|user10_AndroidGreenKoala  |AndroidGreenKoala   |2.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_AuburnCockatoo      |AuburnCockatoo      |5.0  |1447719060000  |2015-11-16 16:11:03.955|
|user1_AntiqueBrassPlatypus|AntiqueBrassPlatypus|7.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_BattleshipGreyPossum|BattleshipGreyPossum|14.0 |1447719060000  |2015-11-16 16:11:03.955|
|user1_AmaranthDingo       