In [1]:
# NOTE(review): `datetime` is part of the Python standard library. The PyPI
# distribution installed here is a different package (the log below shows
# `DateTime-4.3` plus `zope.interface`), and the version is not pinned.
# Presumably only the stdlib module is needed by the later
# `from datetime import datetime` - confirm and drop this cell.
! pip install datetime

Collecting datetime
  Downloading DateTime-4.3-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.4 MB/s eta 0:00:011
[?25hCollecting zope.interface
  Downloading zope.interface-5.4.0-cp39-cp39-manylinux2010_x86_64.whl (255 kB)
[K     |████████████████████████████████| 255 kB 10.4 MB/s eta 0:00:01
Installing collected packages: zope.interface, datetime
Successfully installed datetime-4.3 zope.interface-5.4.0


In [36]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, lit, window, sum, count, round, max, avg, broadcast
from pyspark.sql.window import Window
from time import sleep
from datetime import datetime 

# Connection and resource settings for this application on the standalone master.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment_2")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# Suffix for the Kafka topic name: zero-padded month, day and *minute* of the
# moment this cell runs (e.g. "112556").
# NOTE(review): presumably this has to match the suffix the producer used when
# it created the topic - confirm, since it changes every minute.
date = datetime.now().strftime("%m%d%M")

# The SparkSession is the entry point to the Spark SQL engine; an already
# running session is reused instead of creating a second one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Subscribe to the Kafka topic as a streaming source (readStream, not a batch
# read). Consumption starts at the earliest available offset, and missing
# offsets do not fail the query (failOnDataLoss is switched off).
kafka_source = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", f"records{date}")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", 'False')
)
df = kafka_source.load()

# Decode the Kafka message value to text and split it on commas into an array
# column named 'splitted'.
# NOTE(review): a plain split on "," would mis-align the fields if any value
# itself contains a comma - verify against the producer's record format.
value_as_text = df.selectExpr("CAST(value AS STRING)")
records = value_as_text.select(split(col('value'), ",").alias('splitted'))

# Positional layout of one record: (column name, SQL type to cast to).
# A type of None keeps the field as the raw string produced by the split.
field_specs = [
    ('id', None),
    ('event_time', 'timestamp'),
    ('cc_num', 'long'),
    ('merchant', None),
    ('category', None),
    ('amt', 'double'),
    ('first', None),
    ('last', None),
    ('gender', None),
    ('street', None),
    ('city', None),
    ('state', None),
    ('zip', None),
    ('lat', None),
    ('long', None),
    ('city_pop', 'long'),
    ('job', None),
    ('dob', None),
    ('trans_num', None),
    ('unix_time', 'long'),
    ('merch_lat', None),
    ('merch_long', None),
    ('is_fraud', 'int'),
]

# Turn the layout into Spark SQL projection expressions, one per field.
field_exprs = [
    f'splitted[{position}] as {name}' if sql_type is None
    else f'cast(splitted[{position}] as {sql_type}) as {name}'
    for position, (name, sql_type) in enumerate(field_specs)
]

# Apply the full schema, then keep only the columns the aggregation needs.
new_df = (
    records
    .selectExpr(*field_exprs)
    .select('event_time', 'category', 'amt', 'gender', 'is_fraud')
)

# Optional restriction to fraudulent transactions only (currently disabled):
#new_df = new_df.where(col('is_fraud') == 1)

# Bucket rows into 5-minute windows over event_time; the window struct is
# exposed under the name 'time_slot'.
window_5min = window(col('event_time'), '5 minutes').alias('time_slot')

# `sum` and `count` are the Spark column functions imported above, not the
# Python builtins.
total_amt = sum('amt').alias('amt')
row_count = count('*').alias('count')

# One output row per (time_slot, gender, category) with its total amount and
# number of records.
sdf = new_df.groupBy(window_5min, 'gender', 'category').agg(total_amt, row_count)

################ Non-time-based windows are not supported on streaming DataFrames/Datasets #################
#window_group_by = Window.partitionBy(window_5min, col('gender'))
#sdf = sdf.withColumn('max_amt_per_5_min', max('amt').over(window_group_by)) \
#         .where(col('amt') == col('max_amt_per_5_min'))\
#         .drop('max_amt_per_5_min')

############### Multiple streaming aggregations are not supported with streaming DataFrames/Datasets #############
#temp_df = sdf.groupBy(col('time_slot').alias('t_time_slot'), 
#                      col("gender").alias("t_gender"))\
#             .agg(max('amt').alias('max_amt'))
#join_exp = (sdf['time_slot'] == temp_df['t_time_slot']) & (sdf['gender'] == temp_df['t_gender']) & (sdf['amt'] == temp_df['max_amt'])
#sdf = sdf.join(broadcast(temp_df), join_exp)\
#         .drop('t_gender')\
#         .drop('t_time_slot')\
#         .drop('max_amt')

################ The sub-query in the where clause didn't seem to work #######################
#sdf.createOrReplaceTempView('sdf_table')
#sql = """
#    SELECT
#       *
#    FROM
#        sdf_table
#    WHERE   (time_slot, gender, amt) IN (
#                SELECT time_slot, gender, MAX(amt) max_amt
#                FROM sdf_table
#                GROUP BY time_slot, gender )
#"""
#
#sdf = spark.sql(sql)
            
# Write to a sink - here, the output is an in-memory table (only for testing).
# The window struct is flattened into string start/end columns and the summed
# amount is rounded to 2 decimals (`round` is the Spark function imported above).
# "complete" output mode re-emits the whole aggregated result on every trigger.
activityQuery = (
    sdf.select(sdf.time_slot.start.cast('string').alias('start_time'),
               sdf.time_slot.end.cast('string').alias('end_time'),
               'gender', 'category',
               round('amt', 2).alias('amt'), 'count')
    .writeStream
    .queryName("most_fraud_table")
    .format("memory")
    .outputMode("complete")
    .start()
)

# Testing: poll the in-memory table 10 times, 5 seconds apart.
# The query is always stopped afterwards (also on interrupt or error).
# Otherwise it keeps running in the background, and re-running this cell would
# try to start a second query under the still-active name "most_fraud_table".
try:
    for _ in range(10):
        spark.sql("SELECT * FROM most_fraud_table").show(truncate=False)
        sleep(5)
finally:
    activityQuery.stop()

AnalysisException: Multiple streaming aggregations are not supported with streaming DataFrames/Datasets;
Project [cast(time_slot#7640.start as string) AS start_time#7702, cast(time_slot#7640.end as string) AS end_time#7703, gender#7596, category#7592, round(amt#7647, 2) AS amt#7704, count#7649L]
+- Project [time_slot#7640, gender#7596, category#7592, amt#7647, count#7649L]
   +- Project [time_slot#7640, gender#7596, category#7592, amt#7647, count#7649L, max_amt#7664]
      +- Project [time_slot#7640, gender#7596, category#7592, amt#7647, count#7649L, t_time_slot#7656, max_amt#7664]
         +- Join Inner, (((time_slot#7640 = t_time_slot#7656) AND (gender#7596 = t_gender#7657)) AND (amt#7647 = max_amt#7664))
            :- Aggregate [window#7650, gender#7596, category#7592], [window#7650 AS time_slot#7640, gender#7596, category#7592, sum(amt#7593) AS amt#7647, count(1) AS count#7649L]
            :  +- Filter isnotnull(event_time#7589)
            :     +- Project [named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) as double) = (cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) THEN (CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) END + cast(0 as bigint)) - cast(1 as bigint)) * 300000000) + 0), LongType, TimestampType), end, precisetimestampconversion((((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) as double) = (cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) THEN (CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) END + cast(0 as bigint)) - cast(1 as bigint)) * 300000000) + 0) + 300000000), LongType, TimestampType)) AS window#7650, event_time#7589, category#7592, amt#7593, gender#7596, is_fraud#7610]
            :        +- Project [event_time#7589, category#7592, amt#7593, gender#7596, is_fraud#7610]
            :           +- Project [splitted#7586[0] AS id#7588, cast(splitted#7586[1] as timestamp) AS event_time#7589, cast(splitted#7586[2] as bigint) AS cc_num#7590L, splitted#7586[3] AS merchant#7591, splitted#7586[4] AS category#7592, cast(splitted#7586[5] as double) AS amt#7593, splitted#7586[6] AS first#7594, splitted#7586[7] AS last#7595, splitted#7586[8] AS gender#7596, splitted#7586[9] AS street#7597, splitted#7586[10] AS city#7598, splitted#7586[11] AS state#7599, splitted#7586[12] AS zip#7600, splitted#7586[13] AS lat#7601, splitted#7586[14] AS long#7602, cast(splitted#7586[15] as bigint) AS city_pop#7603L, splitted#7586[16] AS job#7604, splitted#7586[17] AS dob#7605, splitted#7586[18] AS trans_num#7606, cast(splitted#7586[19] as bigint) AS unix_time#7607L, splitted#7586[20] AS merch_lat#7608, splitted#7586[21] AS merch_long#7609, cast(splitted#7586[22] as int) AS is_fraud#7610]
            :              +- Project [split(value#7584, ,, -1) AS splitted#7586]
            :                 +- Project [cast(value#7571 as string) AS value#7584]
            :                    +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@54bde53b, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@1d74aaca, [startingOffsets=earliest, kafka.bootstrap.servers=kafka1:9093, subscribe=records112556, failOnDataLoss=False], [key#7570, value#7571, topic#7572, partition#7573, offset#7574L, timestamp#7575, timestampType#7576], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@340524c,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9093, subscribe -> records112556, startingOffsets -> earliest, failOnDataLoss -> False),None), kafka, [key#7563, value#7564, topic#7565, partition#7566, offset#7567L, timestamp#7568, timestampType#7569]
            +- ResolvedHint (strategy=broadcast)
               +- Aggregate [time_slot#7640, gender#7596], [time_slot#7640 AS t_time_slot#7656, gender#7596 AS t_gender#7657, max(amt#7647) AS max_amt#7664]
                  +- Aggregate [window#7650, gender#7596, category#7592], [window#7650 AS time_slot#7640, gender#7596, category#7592, sum(amt#7593) AS amt#7647, count(1) AS count#7649L]
                     +- Filter isnotnull(event_time#7589)
                        +- Project [named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) as double) = (cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) THEN (CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) END + cast(0 as bigint)) - cast(1 as bigint)) * 300000000) + 0), LongType, TimestampType), end, precisetimestampconversion((((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) as double) = (cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) THEN (CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(event_time#7589, TimestampType, LongType) - 0) as double) / cast(300000000 as double))) END + cast(0 as bigint)) - cast(1 as bigint)) * 300000000) + 0) + 300000000), LongType, TimestampType)) AS window#7650, event_time#7589, category#7592, amt#7593, gender#7596, is_fraud#7610]
                           +- Project [event_time#7589, category#7592, amt#7593, gender#7596, is_fraud#7610]
                              +- Project [splitted#7586[0] AS id#7588, cast(splitted#7586[1] as timestamp) AS event_time#7589, cast(splitted#7586[2] as bigint) AS cc_num#7590L, splitted#7586[3] AS merchant#7591, splitted#7586[4] AS category#7592, cast(splitted#7586[5] as double) AS amt#7593, splitted#7586[6] AS first#7594, splitted#7586[7] AS last#7595, splitted#7586[8] AS gender#7596, splitted#7586[9] AS street#7597, splitted#7586[10] AS city#7598, splitted#7586[11] AS state#7599, splitted#7586[12] AS zip#7600, splitted#7586[13] AS lat#7601, splitted#7586[14] AS long#7602, cast(splitted#7586[15] as bigint) AS city_pop#7603L, splitted#7586[16] AS job#7604, splitted#7586[17] AS dob#7605, splitted#7586[18] AS trans_num#7606, cast(splitted#7586[19] as bigint) AS unix_time#7607L, splitted#7586[20] AS merch_lat#7608, splitted#7586[21] AS merch_long#7609, cast(splitted#7586[22] as int) AS is_fraud#7610]
                                 +- Project [split(value#7584, ,, -1) AS splitted#7586]
                                    +- Project [cast(value#7571 as string) AS value#7584]
                                       +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@54bde53b, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@1d74aaca, [startingOffsets=earliest, kafka.bootstrap.servers=kafka1:9093, subscribe=records112556, failOnDataLoss=False], [key#7570, value#7571, topic#7572, partition#7573, offset#7574L, timestamp#7575, timestampType#7576], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@340524c,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9093, subscribe -> records112556, startingOffsets -> earliest, failOnDataLoss -> False),None), kafka, [key#7563, value#7564, topic#7565, partition#7566, offset#7567L, timestamp#7568, timestampType#7569]


In [None]:
# NOTE(review): `string_sdf_User_Team_avg_window` and `concat` are not defined
# or imported in any cell visible here - presumably they come from another
# notebook/cell; confirm before running.
#
# Publish the result to Kafka: the key is "window_slot,User,Team" joined with
# commas, the value is the avg_score column. The complete result is written
# on every trigger (output mode "complete").
query = (
    string_sdf_User_Team_avg_window
    .select(
        concat(col("window_slot"), lit(","), col("User"), lit(","), col("Team")).alias("key"),
        col("avg_score").alias('value'),
    )
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("checkpointLocation", "/home/jovyan/checkpoint")
    .option("topic", "avg_score")
    .outputMode("complete")
    .start()
)

# Block until the query ends; a manual interrupt shuts everything down.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    # Stop the streaming query first, then the Spark context behind it.
    query.stop()
    spark.stop()
    print("Stoped the streaming query and the spark context")

In [31]:
# Shut down the current Spark session before the next cell builds a new one.
spark.stop()

In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

# Connection and resource settings for the lab exercise application.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_exercise")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; an existing
# session is reused if one is still active.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Explicit schema for the game-score CSV: (column name, Spark type) in file
# order. Every column is declared nullable.
schema_columns = [
    ("User", StringType()),
    ("Team", StringType()),
    ("Score", DoubleType()),
    ("timestamp_in_ms", LongType()),
    ("event_time", TimestampType()),
]
dataSchema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in schema_columns]
)

# Batch-read one split of the data; the file is read without a header row, so
# the schema above supplies the column names.
df = spark.read.load(
    "/home/jovyan/data/gamestream/game_data_split1.csv",
    format='csv',
    schema=dataSchema,
    header='False',
)

In [16]:
# Sanity check of the load: print the column types, then preview the first
# 20 rows (the default row count) without truncating long cell values.
df.printSchema()
df.show(n=20, truncate=False)

root
 |-- User: string (nullable = true)
 |-- Team: string (nullable = true)
 |-- Score: double (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)
 |-- event_time: timestamp (nullable = true)

+--------------------------+--------------------+-----+---------------+-----------------------+
|User                      |Team                |Score|timestamp_in_ms|event_time             |
+--------------------------+--------------------+-----+---------------+-----------------------+
|user16_AmaranthKoala      |AmaranthKoala       |18.0 |1447719060000  |2015-11-16 16:11:03.921|
|user10_AndroidGreenKoala  |AndroidGreenKoala   |2.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_AuburnCockatoo      |AuburnCockatoo      |5.0  |1447719060000  |2015-11-16 16:11:03.955|
|user1_AntiqueBrassPlatypus|AntiqueBrassPlatypus|7.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_BattleshipGreyPossum|BattleshipGreyPossum|14.0 |1447719060000  |2015-11-16 16:11:03.955|
|user1_AmaranthDingo       