In [1]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells (confirm).
print("hello")

hello


In [2]:
# Install the Cassandra driver if it is not already installed.
# Use %pip (rather than !pip) so the package is installed into the running kernel's environment.
# %pip install cassandra-driver

' istall the driver if does not exist '

In [3]:
# Create the 'spark_streams' keyspace and the 'created_users' table used for
# data loading, if they do not exist yet.

from cassandra.cluster import Cluster

cluster = Cluster(['cassandra_db'])  # Update with your Cassandra host
try:
    session = cluster.connect()

    # Create the keyspace
    keyspace_query = """
CREATE KEYSPACE IF NOT EXISTS spark_streams 
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
"""
    session.execute(keyspace_query)
    print("Keyspace 'spark_streams' created (if not exists).")

    # Use the keyspace for the statements that follow
    session.set_keyspace('spark_streams')

    # Create the table; rows are partitioned by (first_name, last_name) and
    # clustered by email, as declared in the PRIMARY KEY clause.
    table_query = """
CREATE TABLE IF NOT EXISTS created_users (
    first_name text,
    last_name text,
    gender text,
    address text,
    postcode text,
    email text,
    username text,
    dob text,
    registered text,
    phone text,
    picture text,
    PRIMARY KEY ((first_name, last_name), email)
);
"""
    session.execute(table_query)
    print("Table 'created_users' created (if not exists).")
finally:
    # Always close the connection, even when connecting or a statement fails,
    # so the driver's connections are not left open.
    cluster.shutdown()

Keyspace 'spark_streams' created (if not exists).
Table 'created_users' created (if not exists).


In [4]:
    # .config('spark.jars', '/home/jovyan/.ivy2/jars/com.datastax.spark_spark-cassandra-connector-driver_2.12-3.3.0.jar')
    # .config('spark.jars', '/home/jovyan/.ivy2/jars/postgresql-42.2.20.jar')
    #.config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.3.0")

In [5]:
# Create the Spark Session
# from pyspark.sql import SparkSession

# spark = (
#     SparkSession 
#     .builder 
#     .appName("Streaming from Kafka") 
#     .config("spark.streaming.stopGracefullyOnShutdown", True) 
#     .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
#     .config("spark.jars.packages", 'com.datastax.spark:spark-cassandra-connector_2.12:3.3.0')
#     .config("spark.sql.shuffle.partitions", 4)
#     .master("local[*]") 
#     .getOrCreate()
# )

# spark
# from pyspark.sql import SparkSession

# spark = (
#     SparkSession 
#     .builder 
#     .appName("Writing to Multiple Sinks") 
#     .config("spark.streaming.stopGracefullyOnShutdown", True) 
#     .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
#     .config('spark.jars', '/home/jovyan/.ivy2/jars/com.datastax.spark_spark-cassandra-connector-driver_2.12-3.3.0.jar,/home/jovyan/.ivy2/jars/postgresql-42.2.20.jar')
#     .config("spark.cassandra.connection.host", "cassandra_db")  # Docker hostname
#     .config("spark.cassandra.connection.port", "9042")       # Default port
#     .config("spark.cassandra.auth.username", "cassandra")    # Credentials from your Docker setup
#     .config("spark.cassandra.auth.password", "cassandra")
#     .config("spark.sql.shuffle.partitions", 8)
#     .master("local[*]") 
#     .getOrCreate()
# )

# spark
import os

from pyspark.sql import SparkSession

# Maven coordinates passed to spark.jars.packages as one comma-separated string:
# the Kafka source, the Cassandra connector and the PostgreSQL JDBC driver.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.3.0",
    "org.postgresql:postgresql:42.2.20",
])

# Cassandra credentials are taken from the environment when provided, so real
# secrets do not have to be stored in the notebook. When the variables are not
# set, the previous hard-coded values are used, so existing setups keep working.
cassandra_username = os.environ.get("CASSANDRA_USERNAME", "cassandra")
cassandra_password = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession 
    .builder 
    .appName("Writing to Multiple Sinks") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config("spark.jars.packages", spark_packages)
    .config("spark.cassandra.connection.host", "cassandra_db")  # Docker hostname
    .config("spark.cassandra.connection.port", "9042")       # Default port
    .config("spark.cassandra.auth.username", cassandra_username)
    .config("spark.cassandra.auth.password", cassandra_password)
    .config("spark.sql.shuffle.partitions", 8)
    .master("local[*]") 
    .getOrCreate()
)
spark

In [6]:
# Streaming source: subscribe to the 'users_created' topic on the Kafka broker,
# starting from the earliest available offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": "ed-kafka:29092",
    "subscribe": "users_created",
    "startingOffsets": "earliest",
}

kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [7]:
# View schema for raw kafka_df.
# Prints the column names and types of the Kafka source DataFrame as a tree.
kafka_df.printSchema()
#kafka_df.show()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [8]:
# Parse value from binary to string into kafka_json_df
from pyspark.sql.functions import expr

# Replace the 'value' column with its string cast; all other columns are kept as they are.
kafka_json_df = kafka_df.withColumn("value", kafka_df["value"].cast("string"))

In [9]:
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType

# Names of the payload fields, in schema order. Every field is declared as a
# nullable string.
user_field_names = [
    "first_name",
    "last_name",
    "gender",
    "address",
    "postcode",
    "email",
    "username",
    "dob",
    "registered",
    "phone",
    "picture",
]

json_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in user_field_names]
)

In [10]:
# Apply the schema to payload to read the data
from pyspark.sql.functions import from_json,col

# Parse the JSON text in 'value' with json_schema, then keep only the parsed
# struct's fields as top-level columns.
parsed_payload = from_json(col("value"), json_schema).alias("values_json")
streaming_df = kafka_json_df.select(parsed_payload).select("values_json.*")

In [11]:
# To check the schema of the data, place a sample JSON file and change readStream to read
from pyspark.sql.functions import to_timestamp

# Convert the 'dob' and 'registered' columns to timestamps, in that order.
for timestamp_column in ("dob", "registered"):
    streaming_df = streaming_df.withColumn(timestamp_column, to_timestamp(timestamp_column))

# Show the resulting schema to confirm the new column types.
streaming_df.printSchema()
#streaming_df.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- email: string (nullable = true)
 |-- username: string (nullable = true)
 |-- dob: timestamp (nullable = true)
 |-- registered: timestamp (nullable = true)
 |-- phone: string (nullable = true)
 |-- picture: string (nullable = true)



In [12]:
# Python function to write to multiple sinks
def device_data_output(df, batch_id):
    """Write one micro-batch to Parquet, PostgreSQL and Cassandra, then display it.

    Meant to be passed to ``writeStream.foreachBatch``.

    The batch is consumed by several actions (three writes and ``show``), so it
    is cached for the duration of the call to avoid recomputing it for each
    sink. The cache is released even if one of the writes fails.

    Args:
        df: The micro-batch DataFrame to write.
        batch_id: Identifier of the micro-batch; only used for the log line.

    Raises:
        Any exception raised by a sink write or by ``show`` propagates to the
        caller. Sinks that were already written for this batch are not rolled
        back.
    """
    print("Batch id: "+ str(batch_id))

    # Cache once, write many: without this every sink below re-evaluates the batch.
    df.persist()
    try:
        # Write to parquet
        df.write.format("parquet").mode("append").save("data/output/device_data.parquet/")

        # Write to JDBC Postgres
        # NOTE(review): connection credentials are hard-coded here; consider
        # supplying them from configuration instead.
        (
            df.write
            .mode("append")
            .format("jdbc")
            .option("driver", "org.postgresql.Driver")
            .option("url", "jdbc:postgresql://postgres:5432/airflow")
            .option("dbtable", "loaded_users_data")
            .option("user", "airflow")
            .option("password", "airflow")
            .save()
        )

        # Write to Cassandra
        (
            df.write
            .format("org.apache.spark.sql.cassandra")
            .mode("append")
            .option("table", "created_users")
            .option("keyspace", "spark_streams")
            .option("spark.cassandra.connection.host", "cassandra_db")
            .option("spark.cassandra.connection.port", "9042")
            .save()
        )

        # Display
        df.show()
    finally:
        # Release the cached batch whether or not every sink succeeded.
        df.unpersist()

In [None]:
# Start the streaming query: each micro-batch (triggered every 10 seconds) is
# handed to device_data_output, which writes it to the configured sinks.
users_stream_writer = (
    streaming_df.writeStream
    .foreachBatch(device_data_output)
    .trigger(processingTime='10 seconds')
    .option("checkpointLocation", "checkpoint_dir_kafka")
)

users_stream_query = users_stream_writer.start()

# Block this cell until the query stops or fails.
users_stream_query.awaitTermination()


Batch id: 0
+----------+-----------+------+--------------------+--------+--------------------+------------------+--------------------+--------------------+--------------+--------------------+
|first_name|  last_name|gender|             address|postcode|               email|          username|                 dob|          registered|         phone|             picture|
+----------+-----------+------+--------------------+--------+--------------------+------------------+--------------------+--------------------+--------------+--------------------+
|    Connor|      Meyer|  male|3476 York Road, K...|   20159|connor.meyer@exam...|      purplecat254|1969-11-28 07:57:...|2006-07-19 06:10:...|  021-198-8350|https://randomuse...|
|  Danielle|       Beck|female|5946 Park Lane, W...|   53098|danielle.beck@exa...|   goldenrabbit747|1967-10-23 00:57:...|2004-10-14 10:54:...|  041-455-6813|https://randomuse...|
|  Prvoslav|  Todorović|  male|7006 Belopoljska,...|   26941|prvoslav.todorovi...| beaut