## Import Libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, expr, lit
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from delta import configure_spark_with_delta_pip

## Start Spark Session

In [None]:
# Build the session through Delta's pip helper so that the Delta Lake jars
# matching the installed `delta-spark` package are added to the classpath.
# Without them the Delta extension/catalog configured below cannot be loaded
# and the "delta" format is unavailable.
# The helper sets "spark.jars.packages" itself, so the Kafka connector is
# handed over via `extra_packages` instead of a separate .config(...) call.
# NOTE(review): the connector's Scala suffix (_2.13) and version (3.3.0) must
# match the Spark build actually in use — confirm against `spark.version`.
builder = (
    SparkSession.builder
    .appName("KafkaStreaming")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.spark:spark-sql-kafka-0-10_2.13:3.3.0"],
).getOrCreate()

## Check the Spark version and define the schema

In [None]:
# Display the version string of the active Spark session.
spark.version

In [None]:
# Layout applied to the JSON payload of each Kafka message.
# Every field is nullable (the StructField default).
schema = StructType([
    StructField("user_id", StringType()),
    StructField("user_name", StringType()),
    StructField("event_type", StringType()),
    StructField("page_id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("device_info", StringType()),
])

## Read from Kafka

In [None]:
# Streaming source settings: broker address, topic subscription and the
# offsets to begin reading from.
kafka_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "subscribe": "my-first-topic",
    "startingOffsets": "earliest",
}

# Unparsed Kafka records as a streaming DataFrame.
raw_df = spark.readStream.format("kafka").options(**kafka_options).load()

## Parse the dataset

In [None]:
# The Kafka `value` column is binary; convert it to text so it can be parsed.
json_df = raw_df.select(col("value").cast("string").alias("json_value"))

# Apply the schema to the JSON text, then flatten the resulting struct so
# each field becomes a top-level column.
parsed_struct = from_json(col("json_value"), schema).alias("data")
parsed_df = json_df.select(parsed_struct).select("data.*")

In [None]:
# Add a constant column named "testing" whose value is always 1.
new_df = parsed_df.withColumn("testing", lit(1))

# Bare expression so the notebook renders the DataFrame.
new_df

In [None]:
# Test sink: stream the parsed events to the console in append mode.
console_writer = parsed_df.writeStream.outputMode("append").format("console")
query = console_writer.start()

# Block this cell until the streaming query terminates.
query.awaitTermination()

## Writing into Delta Table

In [None]:
# Continuously append the parsed events to the Delta table `user_events`.
# A Delta streaming sink needs a destination: .start() with neither a path nor
# a table has nowhere to write. .toTable(...) starts the query and registers
# the output under the table name that the notebook inspects afterwards with
# `describe detail user_events`.
# The checkpoint location holds the stream's progress so that a restarted
# query can resume instead of reprocessing from scratch.
query = (
    parsed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Users/sahilnagpal/Desktop/byte-building/AMAZON_USER_BEHAVIOR_TRACKING/delta_lake/raw/user_events")
    .toTable("user_events")
)

In [None]:
# Fetch the Delta table's metadata and print it without truncating columns.
table_detail = spark.sql("describe detail user_events")
table_detail.show(truncate=False)