<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/2-checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Checkpoint

# Setting up PySpark

In [None]:
%pip install pyspark



In [23]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()

In [146]:
!rm -rf content/input/*
!rm -rf content/output/*
!rm -rf content/checkpoint/*

In [29]:
from datetime import datetime
import csv

def generate_file():
  timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
  filename = f"content/input/file_{timestamp}.csv"
  with open(filename, 'w', newline='') as csvfile:
      fieldnames = ['col', 'value', 'file']
      writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=";")
      writer.writeheader()
      writer.writerow({'col': 'c1', 'value': 'v1', 'file': filename})
      writer.writerow({'col': 'c2', 'value': 'v2', 'file': filename})
      writer.writerow({'col': 'c3', 'value': 'v3', 'file': filename})

!mkdir -p content/input

rm: cannot remove 'content/input/*': No such file or directory


In [83]:
spark.read.format("csv").option("sep", ";").option("header", True).load("content/input/").show(100, False)

+---+-----+-------------------------------------+
|col|value|file                                 |
+---+-----+-------------------------------------+
|c1 |v1   |content/input/file_20241123003045.csv|
|c2 |v2   |content/input/file_20241123003045.csv|
|c3 |v3   |content/input/file_20241123003045.csv|
|c1 |v1   |content/input/file_20241123002642.csv|
|c2 |v2   |content/input/file_20241123002642.csv|
|c3 |v3   |content/input/file_20241123002642.csv|
|c1 |v1   |content/input/file_20241123003047.csv|
|c2 |v2   |content/input/file_20241123003047.csv|
|c3 |v3   |content/input/file_20241123003047.csv|
+---+-----+-------------------------------------+



In [166]:
generate_file()

In [147]:
from pyspark.sql.types import *

schema = StructType([
StructField('Col',StringType(),True),
StructField('Value',StringType(),True),
StructField('File',StringType(),True)
])

stream = spark.readStream.format('csv').schema(schema).option("sep", ";").option('header', True).load('content/input/')

In [163]:
query = (stream.writeStream
.format('csv')
.option("header", True)
.queryName("stream")
.option('checkpointLocation', 'content/checkpoint')
.option('path', 'content/output')
.trigger(processingTime='5 seconds')
.outputMode('append')
.start()
)

In [167]:
print(spark.read.csv('content/output', header=True, sep=";").count())
spark.read.csv('content/output', header=True, sep=",").show(100, False)

15
+---+-----+-------------------------------------+
|Col|Value|File                                 |
+---+-----+-------------------------------------+
|c1 |v1   |content/input/file_20241123004631.csv|
|c2 |v2   |content/input/file_20241123004631.csv|
|c3 |v3   |content/input/file_20241123004631.csv|
|c1 |v1   |content/input/file_20241123004633.csv|
|c2 |v2   |content/input/file_20241123004633.csv|
|c3 |v3   |content/input/file_20241123004633.csv|
|c1 |v1   |content/input/file_20241123004709.csv|
|c2 |v2   |content/input/file_20241123004709.csv|
|c3 |v3   |content/input/file_20241123004709.csv|
|c1 |v1   |content/input/file_20241123004711.csv|
|c2 |v2   |content/input/file_20241123004711.csv|
|c3 |v3   |content/input/file_20241123004711.csv|
|c1 |v1   |content/input/file_20241123004745.csv|
|c2 |v2   |content/input/file_20241123004745.csv|
|c3 |v3   |content/input/file_20241123004745.csv|
+---+-----+-------------------------------------+



In [168]:
query.stop()

In [157]:
query.isActive

False