In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
from pyspark.sql import DataFrame
import time
import logging
from datetime import datetime


#Variable declaration

#cassandra database detail
KEYSPACE = "ineuron"
TABLE = "employee"

#kafka server detail
KAFKA_BOOTSTRAP_SERVER = "kafka:9092"
KAFKA_TOPIC = "employee"

#Cassandra database connectivity credentials
#An environment variable with the same name takes precedence over the default given
#here, so real credentials do not have to be stored in the notebook itself.
CASSANDRA_HOST = os.environ.get("CASSANDRA_HOST", "cassandra")
CASSANDRA_USER = os.environ.get("CASSANDRA_USER", "cassandra")
CASSANDRA_PASSWORD = os.environ.get("CASSANDRA_PASSWORD", "cassandra")


#Logging setup
#Log files are kept in a "logs" folder under the current working directory;
#the folder is created when it does not exist yet.
LOG_FILE_DIR = os.path.join(os.getcwd(), "logs")
os.makedirs(LOG_FILE_DIR, exist_ok=True)

#The log file is named after the current timestamp (MMDDYYYY__HHMMSS).
LOG_FILE_NAME = datetime.now().strftime("%m%d%Y__%H%M%S") + ".log"


#Root logger: INFO and above, written to the file above with
#timestamp, line number, logger name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    filename=os.path.join(LOG_FILE_DIR, LOG_FILE_NAME),
)


#create spark session with cassandra configuration
#The Cassandra host and credentials are taken from the CASSANDRA_* constants declared
#in the configuration section, so they are defined in exactly one place.
#NOTE: the variable keeps its original spelling because later cells reference it.
sparkSesison = (SparkSession.builder
                 .config("spark.cassandra.connection.host",CASSANDRA_HOST)
                 .config("spark.cassandra.auth.username",CASSANDRA_USER)
                 .config("spark.cassandra.auth.password",CASSANDRA_PASSWORD)
                 #extra packages resolved at start-up: Kafka integration and the Cassandra connector
                 .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0")
                 .appName("demo")
                 .getOrCreate()
                 )




:: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8a89cf81-ff0e-4d57-ac56-f00a70441a61;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.1 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found com.datastax.spark#spark-cassandra-connector_2.12;3.0.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2

In [2]:

#Load a Cassandra table and hand it back as a Spark DataFrame
def dataFrameFromCassandaDbTable(sparkSession:SparkSession,keyspace:str,table:str)->DataFrame:
    """Read one Cassandra table through the given Spark session.

    Args:
        sparkSession: session whose reader is used to load the table.
        keyspace: name of the Cassandra keyspace that holds the table.
        table: name of the table to load.

    Returns:
        The object returned by the reader's ``load()`` call for that table.
    """
    cassandra_reader = sparkSession.read.format("org.apache.spark.sql.cassandra")
    configured_reader = cassandra_reader.options(table=table, keyspace=keyspace)
    return configured_reader.load()

In [3]:
#Load the Cassandra table TABLE from keyspace KEYSPACE into a Spark DataFrame
df = dataFrameFromCassandaDbTable(
    sparkSession=sparkSesison,
    keyspace=KEYSPACE,
    table=TABLE,
)

In [4]:
#Preview the rows loaded from the Cassandra table
df.show()

+------+----------+---------+-------------+
|emp_id|      city| emp_name|        state|
+------+----------+---------+-------------+
|    13|     Noida|    Rahul|Uttar Pradesh|
|     6|      Pune|   Barton|  Maharashtra|
|    25|    Mumbai|     Iris|  Maharashtra|
|    16|    Nagpur|   Vikash|  Maharashtra|
|    27|      Pune|    Ankit|  Maharashtra|
|    19| Hyderabad| Scarlett|Andra Pradesh|
|    30|     Delhi|     Emma|        Delhi|
|     4|    Mumbai|  Stephen|  Maharashtra|
|    18|      Pune|     Adam|  Maharashtra|
|     7| Hyderabad|  Natasha|Andra Pradesh|
|    17|   Chennai|  Aravind|    Tamilnadu|
|    31|New Mumbai| Abhishek|  Maharashtra|
|    11|    Nagpur|    Krish|  Maharashtra|
|     1| Bengalore|   Avnish|    Karnataka|
|    10|New Mumbai|Sudhanshu|  Maharashtra|
|    24| Hyderabad|  Susmita|Andra Pradesh|
|    20|     Noida|   Robert|Uttar Pradesh|
|    21|     Delhi|   Shivam|        Delhi|
|    14|     Delhi|    Sunny|        Delhi|
|    15|New Mumbai|   Vishal|  M

In [6]:
#Shape every employee row into a key/value record for the Kafka write below:
#  key   -> emp_id cast to a string
#  value -> JSON text built from emp_id, emp_name, city and state
dataframe = df.select(
    col("emp_id").cast(StringType()).alias("key"),
    to_json(
        struct("emp_id","emp_name","city","state")
    ).alias("value"),
)
    

In [7]:
#Print the first two key/value records without truncating the JSON text
dataframe.show(n=2,truncate=False)

+---+-----------------------------------------------------------------------------+
|key|value                                                                        |
+---+-----------------------------------------------------------------------------+
|17 |{"emp_id":17,"emp_name":"Aravind","city":"Chennai","state":"Tamilnadu"}      |
|24 |{"emp_id":24,"emp_name":"Susmita","city":"Hyderabad","state":"Andra Pradesh"}|
+---+-----------------------------------------------------------------------------+
only showing top 2 rows



## Publishing employee data to Kafka, then reading the employee data from the data sink

In [9]:
#Publish every key/value record of `dataframe` to the Kafka topic as a batch write.
#Only the broker list and the target topic are configured: `failOnDataLoss` is an
#option of the Kafka source and has no effect on a write, so it is not passed here.
(dataframe
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers",KAFKA_BOOTSTRAP_SERVER)
    .option("topic",KAFKA_TOPIC)
    .save())

                                                                                

In [13]:
#Read the employee records stored as parquet under /project/employee_data
#NOTE: this rebinds `df`, which earlier held the Cassandra table.
df = (
    sparkSesison
    .read
    .parquet("/project/employee_data")
)

In [14]:
#Preview the rows read from the parquet data
df.show()

+------+--------+----------+-------------+
|emp_id|emp_name|      city|        state|
+------+--------+----------+-------------+
|    24| Susmita| Hyderabad|Andra Pradesh|
|     2|  Avnish| Bengalore|    Karnataka|
|     9|Shashank|     Delhi|        Delhi|
|    11|   Krish|    Nagpur|  Maharashtra|
|    27|   Ankit|      Pune|  Maharashtra|
|    17| Aravind|   Chennai|    Tamilnadu|
|    13|   Rahul|     Noida|Uttar Pradesh|
|     1|  Avnish| Bengalore|    Karnataka|
|    12|    Aman| Hyderabad|Andra Pradesh|
|     8|  Sundar|     Noida|Uttar Pradesh|
|    26|     Pia|   Chennai|    Tamilnadu|
|    25|    Iris|    Mumbai|  Maharashtra|
|    22|  Deepak|New Mumbai|  Maharashtra|
|    21|  Shivam|     Delhi|        Delhi|
|    19|Scarlett| Hyderabad|Andra Pradesh|
|    14|   Sunny|     Delhi|        Delhi|
|     7| Natasha| Hyderabad|Andra Pradesh|
|    30|    Emma|     Delhi|        Delhi|
|    29|   Harry|     Noida|Uttar Pradesh|
|    23|    Amit|    Nagpur|  Maharashtra|
+------+---