In [18]:
! pip install datetime

Collecting datetime
  Downloading DateTime-4.3-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.7 MB/s eta 0:00:011
Collecting zope.interface
  Downloading zope.interface-5.4.0-cp39-cp39-manylinux2010_x86_64.whl (255 kB)
[K     |████████████████████████████████| 255 kB 10.6 MB/s eta 0:00:01
Installing collected packages: zope.interface, datetime
Successfully installed datetime-4.3 zope.interface-5.4.0


In [58]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, mean, window, sum, count, round, max
from pyspark.sql.window import Window
from time import sleep
from datetime import datetime 

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Assignment_2")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

date = datetime.now().strftime("%m%d%M")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Read the whole dataset as a batch
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka1:9093") \
        .option("subscribe", f"records{date}") \
        .option("startingOffsets", "earliest") \
        .option("failOnDataLoss", 'False') \
        .load()

# split the value
records = df.selectExpr("CAST(value AS STRING)").select(split(col('value'), ",").alias('splitted'))

# define the schema in this way
new_df =  records.selectExpr('splitted[0] as id', 
                             'cast(splitted[1] as timestamp) as event_time',
                             'cast(splitted[2] as long) as cc_num',
                             'splitted[3] as merchant',
                             'splitted[4] as category',
                             'cast(splitted[5] as double) as amt',
                             'splitted[6] as first',
                             'splitted[7] as last',
                             'splitted[8] as gender',
                             'splitted[9] as street',
                             'splitted[10] as city',
                             'splitted[11] as state',
                             'splitted[12] as zip',
                             'splitted[13] as lat',
                             'splitted[14] as long',
                             'cast(splitted[15] as long) as city_pop',
                             'splitted[16] as job',
                             'splitted[17] as dob',
                             'splitted[18] as trans_num',
                             'cast(splitted[19] as long) as unix_time',
                             'splitted[20] as merch_lat',
                             'splitted[21] as merch_long',
                             'cast(splitted[22] as int) as is_fraud')

new_df = new_df.select('event_time', 'category', 'amt', 'gender', 'is_fraud')

# filter out the fraud
#new_df = new_df.where(col('is_fraud') == 1)

window_5min = window(col('event_time'), '5 minutes').alias('time_slot')
sdf = new_df.groupBy(window_5min, 'gender', 'category') \
            .agg(sum('amt').alias('amt'), count('*').alias('count'))

window_group_by = Window.partitionBy(window_5min, col('gender'))
sdf = sdf.withColumn('max_amt_per_5_min', max('amt').over(window_group_by)) \
         .where(col('amt') == col('max_amt_per_5_min'))\
         .drop('max_amt_per_5_min')
            
# Write to a sink - here, the output is memory (only for testing). 
activityQuery = sdf.select(sdf.time_slot.start.cast('string').alias('start_time'), 
                           sdf.time_slot.end.cast('string').alias('end_time'), 
                           'gender', round('amt', 2).alias('amt'), 'count', 'category') \
                   .writeStream \
                   .queryName("most_fraud_table") \
                   .format("memory").outputMode("update") \
                   .start()
# Testing 
for x in range(10):
    spark.sql("SELECT * FROM most_fraud_table").show(truncate=False)
    sleep(5)

+----------+--------+------+---+-----+--------+
|start_time|end_time|gender|amt|count|category|
+----------+--------+------+---+-----+--------+
+----------+--------+------+---+-----+--------+

+----------+--------+------+---+-----+--------+
|start_time|end_time|gender|amt|count|category|
+----------+--------+------+---+-----+--------+
+----------+--------+------+---+-----+--------+

+----------+--------+------+---+-----+--------+
|start_time|end_time|gender|amt|count|category|
+----------+--------+------+---+-----+--------+
+----------+--------+------+---+-----+--------+

+----------+--------+------+---+-----+--------+
|start_time|end_time|gender|amt|count|category|
+----------+--------+------+---+-----+--------+
+----------+--------+------+---+-----+--------+

+----------+--------+------+---+-----+--------+
|start_time|end_time|gender|amt|count|category|
+----------+--------+------+---+-----+--------+
+----------+--------+------+---+-----+--------+

+----------+--------+------+---+---

In [None]:
query = string_sdf_User_Team_avg_window \
              .select(concat(col("window_slot"), lit(","), col("User"), lit(","), col("Team")).alias("key"), 
                       col("avg_score").alias('value') ) \
              .writeStream \
              .format("kafka") \
              .option("kafka.bootstrap.servers", "kafka1:9093") \
              .option("checkpointLocation", "/home/jovyan/checkpoint")\
              .option("topic", "avg_score") \
              .outputMode("complete") \
              .start()

try:
    query.awaitTermination()
except KeyboardInterrupt:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

In [49]:
spark.stop()

In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType
from time import sleep

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Lab7_exercise")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

dataSchema = StructType(
    [StructField("User", StringType(), True),
     StructField("Team", StringType(), True),
     StructField("Score", DoubleType(), True),
     StructField("timestamp_in_ms", LongType(), True),
     StructField("event_time", TimestampType(), True)
     ])

df = spark.read.format('csv').schema(dataSchema).option('header', 'False').load("/home/jovyan/data/gamestream/game_data_split1.csv")

In [16]:
df.printSchema()
df.show(truncate=False)

root
 |-- User: string (nullable = true)
 |-- Team: string (nullable = true)
 |-- Score: double (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)
 |-- event_time: timestamp (nullable = true)

+--------------------------+--------------------+-----+---------------+-----------------------+
|User                      |Team                |Score|timestamp_in_ms|event_time             |
+--------------------------+--------------------+-----+---------------+-----------------------+
|user16_AmaranthKoala      |AmaranthKoala       |18.0 |1447719060000  |2015-11-16 16:11:03.921|
|user10_AndroidGreenKoala  |AndroidGreenKoala   |2.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_AuburnCockatoo      |AuburnCockatoo      |5.0  |1447719060000  |2015-11-16 16:11:03.955|
|user1_AntiqueBrassPlatypus|AntiqueBrassPlatypus|7.0  |1447719060000  |2015-11-16 16:11:03.955|
|user9_BattleshipGreyPossum|BattleshipGreyPossum|14.0 |1447719060000  |2015-11-16 16:11:03.955|
|user1_AmaranthDingo       