In [1]:
# Runs before any pyspark import in this notebook.
# NOTE(review): presumably findspark.init() locates the local Spark install
# and makes `pyspark` importable — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
import os

# Ask the PySpark launcher to pull the Kafka SQL connector package.
# This must be set before the SparkSession is created further down.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1 pyspark-shell',
})

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# Create (or reuse) the notebook's SparkSession: local master "local[4]",
# application name "WriteToKafka", and explicit driver/executor memory settings.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("WriteToKafka")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [5]:
# Read the advertising CSV; the "header" option is enabled so the
# column names (ID, TV, Radio, Newspaper, Sales) come from the file.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("D:/Datasets/Advertising.csv")
)

# Preview the first two rows.
df.show(2)

+---+-----+-----+---------+-----+
| ID|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
+---+-----+-----+---------+-----+
only showing top 2 rows



In [8]:
# Replace the "ID" column with a "key" column (appended as the last column);
# it is later written out as the Kafka message key.
# The guard makes this cell safe to re-run: `df` is overwritten in place, so on
# a second execution "ID" is already gone and col("ID") would fail to resolve.
if "ID" in df.columns:
    df = df.withColumn("key", col("ID")).drop("ID")

In [9]:
# Check the result of the rename: "key" should now be the trailing column.
df.show(n=2)

+-----+-----+---------+-----+---+
|   TV|Radio|Newspaper|Sales|key|
+-----+-----+---------+-----+---+
|230.1| 37.8|     69.2| 22.1|  1|
| 44.5| 39.3|     45.1| 10.4|  2|
+-----+-----+---------+-----+---+
only showing top 2 rows



In [10]:
# Build the message payload: the four measurement columns joined by commas
# into a single "value" column, e.g. "230.1,37.8,69.2,22.1".
comma = lit(",")
value_column = concat(
    col("TV"), comma,
    col("Radio"), comma,
    col("Newspaper"), comma,
    col("Sales"),
).alias('value')

# Keep only the two columns that are written to Kafka.
df2 = df.select('key', value_column)

df2.show(2)

+---+--------------------+
|key|               value|
+---+--------------------+
|  1|230.1,37.8,69.2,22.1|
|  2| 44.5,39.3,45.1,10.4|
+---+--------------------+
only showing top 2 rows



In [11]:
# Cast both columns to STRING and write the frame through the "kafka" data
# source to topic "deneme" on the broker at localhost:9092 (batch write).
(
    df2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "deneme")
    .save()
)
