# Kafka Producer Test
1. Read the CSV dataset (`/data/dataset.csv`, which has a header row) using Spark.
2. Publish its records to Kafka in a streaming fashion.

### Test dummy (practice)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, to_json, struct

In [2]:
# Spark session for the practice run. The Kafka connector is requested through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("SimpleKafkaProducerTest")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5")
    .getOrCreate()
)

In [3]:
# Set Spark's log level to WARN for this session's SparkContext.
spark.sparkContext.setLogLevel("WARN")

In [4]:
# Synthetic streaming input: the built-in "rate" source, configured to emit
# one row per second.
rate_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

In [5]:
# Pack each generated row into a single JSON string column named "value":
# a constant "message" field plus the rate counter exposed as "id".
test_df = rate_df.select(
    to_json(struct(
        lit("Test Message").alias("message"),
        rate_df["value"].alias("id"),
    )).alias("value")
)

In [6]:
# Connection settings consumed by the Kafka sink in the next cell.
kafka_bootstrap_servers = "kafka:9092"  # passed as the kafka.bootstrap.servers option
topic = "test-topic"  # destination topic for the test messages

In [7]:
# Start streaming the JSON test messages to the Kafka topic in append mode.
# start() returns immediately with a handle to the running query.
query = (
    test_df.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("topic", topic)
    .option("checkpointLocation", "/tmp/kafka_test_checkpoint")
    .outputMode("append")
    .start()
)

In [9]:
# NOTE(review): the blocking wait below is commented out, and nothing visible
# in this notebook calls query.stop(), so the test query started above appears
# to keep running in the background - confirm this is intended.
# print("Simple test streaming producer is running. Check your Kafka consumer for messages.")
# query.awaitTermination()

---

# Kafka_Producer Code

In [10]:
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import LongType, StructField, StructType

In [11]:
# Spark session for the CSV -> stream producer.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists in this kernel (e.g. after the practice section above); the app name
# and jar settings given here may then not take effect - confirm, or restart
# the kernel before running this section.
spark = (
    SparkSession.builder
    .appName("KafkaProducer")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5")
    .config("spark.jars.repositories", "https://repos.spark-packages.org")
    .getOrCreate()
)

In [12]:
# Set Spark's log level to WARN for this session's SparkContext.
spark.sparkContext.setLogLevel("WARN")

In [18]:
# Load the static dataset: the first line supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/dataset.csv")
)
print("CSV Preview:")
df.show(n=5)
print("Static CSV row count:", df.count())

CSV Preview:
+----------------+-------------------+----------------+-----------------------------+------------------+
|       Date/Time|LV ActivePower (kW)|Wind Speed (m/s)|Theoretical_Power_Curve (KWh)|Wind Direction (°)|
+----------------+-------------------+----------------+-----------------------------+------------------+
|01 01 2018 00:00|   380.047790527343|5.31133604049682|             416.328907824861|  259.994903564453|
|01 01 2018 00:10|    453.76919555664|5.67216682434082|             519.917511061494|   268.64111328125|
|01 01 2018 00:20|   306.376586914062|5.21603679656982|             390.900015810951|  272.564788818359|
|01 01 2018 00:30|   419.645904541015|5.65967416763305|             516.127568975674|  271.258087158203|
|01 01 2018 00:40|   380.650695800781|5.57794094085693|             491.702971953588|  265.674285888671|
+----------------+-------------------+----------------+-----------------------------+------------------+
only showing top 5 rows

Static CSV row count: 99

In [19]:
def add_idx_row(row, index):
    """Return a new Row holding every field of ``row`` plus ``row_id=index``.

    Args:
        row: a Row (anything exposing ``asDict()``) to copy fields from.
        index: value stored under the ``row_id`` field.

    Returns:
        A Row built from the original fields with ``row_id`` set to ``index``.
    """
    return Row(**{**row.asDict(), "row_id": index})

In [20]:
# Tag every CSV row with its position (zipWithIndex starts at 0) so it can be
# matched against the rate-source counter later on.
rdd = df.rdd.zipWithIndex().map(lambda pair: add_idx_row(pair[0], pair[1]))

# Build a *new* StructType instead of calling df.schema.add(...).
# StructType.add appends to the StructType in place, and df.schema returns the
# DataFrame's cached schema object, so the previous code silently added row_id
# to df's own schema - and added it again each time this cell was re-run,
# which makes createDataFrame fail on the field-count mismatch.
new_schema = StructType(df.schema.fields + [StructField("row_id", LongType(), True)])
new_df = spark.createDataFrame(rdd, schema=new_schema)

print("Static CSV Input DF Schema:")
new_df.printSchema()

Static CSV Input DF Schema:
root
 |-- Date/Time: string (nullable = true)
 |-- LV ActivePower (kW): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Theoretical_Power_Curve (KWh): double (nullable = true)
 |-- Wind Direction (°): double (nullable = true)
 |-- row_id: long (nullable = true)



In [21]:
# Driver stream for replaying the CSV: the built-in "rate" source, configured
# to emit ten rows per second.
streaming_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 10)
    .load()
)

In [22]:
# Rename the stream's "value" column to "row_id" so it can serve as the join
# key against the indexed CSV rows.
streaming_df = streaming_df.withColumnRenamed("value", "row_id")

In [23]:
# Row count of the indexed static DataFrame (this triggers a Spark job).
print("Static CSV row count:", new_df.count())

Static CSV row count: 99


In [25]:
# Keep only stream rows whose row_id is below the number of CSV rows, i.e.
# those that have a matching static row to join with.
streaming_df = streaming_df.where(col("row_id") < new_df.count())

In [26]:
# Stream-static join on row_id: each surviving stream row picks up the CSV
# columns of the row with the same index. The stream's timestamp column is
# dropped afterwards.
join_type = "left_outer"
joined_df = (
    streaming_df
    .join(new_df, on="row_id", how=join_type)
    .drop("timestamp")
)

print("Joined Streaming DF Schema:")
joined_df.printSchema()

Joined Streaming DF Schema:
root
 |-- row_id: long (nullable = true)
 |-- Date/Time: string (nullable = true)
 |-- LV ActivePower (kW): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Theoretical_Power_Curve (KWh): double (nullable = true)
 |-- Wind Direction (°): double (nullable = true)



In [27]:
# Serialize every column of the joined row into one JSON string column named
# "value".
kafka_output_df = joined_df.select(
    to_json(struct(*(col(name) for name in joined_df.columns))).alias("value")
)

print("Kafka Output DF Schema (JSON):")
kafka_output_df.printSchema()

Kafka Output DF Schema (JSON):
root
 |-- value: string (nullable = true)



In [None]:
# Echo the JSON payloads to the console sink in append mode.
console_query = (
    kafka_output_df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

print("Streaming data to console sink now... (Press Ctrl+C to stop)")
try:
    # Blocks until the query terminates or the cell is interrupted.
    console_query.awaitTermination()
finally:
    # awaitTermination() only waits; interrupting it does not stop the query.
    # Stop it explicitly so it does not keep running in the background after
    # the cell has been interrupted or has failed.
    console_query.stop()

Streaming data to console sink now... (Press Ctrl+C to stop)


# TODO
- Verify the streamed records on the Kafka consumer console.