<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/usecase1/script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning Repo

https://medium.com/@ashwindesilva/how-to-use-google-colaboratory-to-clone-a-github-repository-e07cf8d3d22b

https://github.com/lucprosa/dataeng-basic-course

In [1]:
from google.colab import drive
drive.mount('/collab/')

Mounted at /collab/


In [2]:
%cd /collab/MyDrive/Collab/repos/

/collab/MyDrive/Collab/repos


In [3]:
! git clone https://github.com/lucprosa/dataeng-basic-course.git

Cloning into 'dataeng-basic-course'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 43 (delta 10), reused 31 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (43/43), 9.56 KiB | 699.00 KiB/s, done.
Resolving deltas: 100% (10/10), done.


In [4]:
! cd /content/
! mkdir /content/files
! mkdir /content/input
! mkdir /content/output
! cp /collab/MyDrive/Collab/repos/dataeng-basic-course/spark_streaming/usecase1/source/* /content/files

# Setting up PySpark

In [5]:
%pip install pyspark



In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').config('spark.ui.port', '4050').getOrCreate()

# Read CSVs as streaming

In [7]:
from pyspark.sql.types import *
schema = StructType([
StructField('timestamp',TimestampType(),True),
StructField('person_ID',IntegerType(),True),
StructField('name',StringType(),True),
StructField('first',StringType(),True),
StructField('last', StringType(), True),
StructField('middle', StringType(), True),
StructField('email', StringType(), True),
StructField('phone', StringType(), True),
StructField('fax', StringType(), True),
StructField('title', StringType(), True)])

In [None]:
! rm -rf /content/checkpoint

In [14]:
people_df = spark.readStream.format('csv').schema(schema).option('header', True).load('/content/input').withWatermark("timestamp", "10 minutes")

In [17]:
print(people_df.isStreaming)

True


In [16]:
results_df = people_df.withWatermark("timestamp", "10 minutes").groupBy("title").count()

query = (results_df.writeStream
.format('memory')
.queryName('my_query')
.option('checkpointLocation', '/content/checkpoint')
.option('path', '/content/output')
.outputMode('append')
.start()
)

AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Aggregate [title#101], [title#101, count(1) AS count#144L]
+- EventTimeWatermark timestamp#92: timestamp, 10 minutes
   +- EventTimeWatermark timestamp#92: timestamp, 10 minutes
      +- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@7599894e,csv,List(),Some(StructType(StructField(timestamp,TimestampType,true),StructField(person_ID,IntegerType,true),StructField(name,StringType,true),StructField(first,StringType,true),StructField(last,StringType,true),StructField(middle,StringType,true),StructField(email,StringType,true),StructField(phone,StringType,true),StructField(fax,StringType,true),StructField(title,StringType,true))),List(),None,Map(header -> true, path -> /content/input),None), FileSource[/content/input], [timestamp#92, person_ID#93, name#94, first#95, last#96, middle#97, email#98, phone#99, fax#100, title#101]


In [None]:
spark.sql("select * from my_query").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `my_query` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [my_query], [], false


In [None]:
query.stop()

In [244]:
import pyspark.sql.functions as F

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count()

# write stream
query = (agg.writeStream
.format('memory')
.queryName('my_query')
.outputMode('complete')
.start()
)

In [232]:
spark.sql("select * from my_query order by window desc").show(10,False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2024-11-05 18:12:25, 2024-11-05 18:12:30}|29   |
|{2024-11-05 18:12:20, 2024-11-05 18:12:25}|50   |
|{2024-11-05 18:12:15, 2024-11-05 18:12:20}|50   |
|{2024-11-05 18:12:10, 2024-11-05 18:12:15}|50   |
|{2024-11-05 18:12:05, 2024-11-05 18:12:10}|50   |
|{2024-11-05 18:12:00, 2024-11-05 18:12:05}|50   |
|{2024-11-05 18:11:55, 2024-11-05 18:12:00}|50   |
|{2024-11-05 18:11:50, 2024-11-05 18:11:55}|50   |
|{2024-11-05 18:11:45, 2024-11-05 18:11:50}|50   |
|{2024-11-05 18:11:40, 2024-11-05 18:11:45}|50   |
+------------------------------------------+-----+
only showing top 10 rows



In [245]:
query.stop()

In [243]:
import pyspark.sql.functions as F

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds"), transformed.minute).count()

# write stream
query = (agg.writeStream
.format('parquet')
.option('checkpointLocation', '/content/checkpoint')
.option('path', '/content/output')
.outputMode('append')
.partitionBy('minute')
.start()
)

AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Aggregate [window#108635, minute#108625], [window#108635 AS window#108629, minute#108625, count(1) AS count#108634L]
+- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) + 5000000) ELSE ((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) + 5000000) ELSE ((precisetimestampconversion(timestamp#108621, TimestampType, LongType) - 0) % 5000000) END) - 0) + 5000000), LongType, TimestampType))) AS window#108635, timestamp#108621, value#108622L, minute#108625]
   +- Filter isnotnull(timestamp#108621)
      +- Project [timestamp#108621, value#108622L, minute(timestamp#108621, Some(Etc/UTC)) AS minute#108625]
         +- StreamingRelationV2 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider@6bc2e48c, rate, org.apache.spark.sql.execution.streaming.sources.RateStreamTable@3ba379a5, [rowsPerSecond=10], [timestamp#108621, value#108622L]


In [None]:
# salvar em parquet , particionando por janela de 10 segundos


