In [1]:
from pyspark import SparkContext                                                                                        
from pyspark.sql import SparkSession                                                                                    
from pyspark.streaming import StreamingContext                                                                          
from pyspark.streaming.kafka import KafkaUtils    
import pandas as pd
import json

In [2]:
# Build (or reuse) the Hive-enabled Spark session used by every later cell.
ss = (
    SparkSession.Builder()
    .appName("SparkStreamingKafka")
    .master("spark://streaming-spark-master:7077")
    # Ship the Kafka 0-8 streaming connector jar located next to the notebook.
    .config("spark.jars", "./spark-streaming-kafka-0-8-assembly_2.11-2.4.1.jar")
    .config("spark.driver.allowMultipleContexts", "true")
    # Point Spark SQL at the external Hive metastore and its HDFS warehouse directory.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/hive")
    .enableHiveSupport()
    .getOrCreate()
)

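# Alternative to the local "spark.jars" path above: let Spark resolve the connector from Maven instead.
# .config('spark.jars.packages', 'org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.4.1') \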

In [3]:
# Reuse the session's SparkContext and create a streaming context on top of it
# with a batch interval of 5 seconds.
sc = ss.sparkContext
ssc = StreamingContext(sparkContext=sc, batchDuration=5)
# Silence INFO-level Spark logging so the per-batch prints stay readable.
sc.setLogLevel('WARN')

In [4]:
def handle_rdd(rdd):
    """Append one micro-batch of trip records to the Hive table ``default.trips``.

    Empty batches are skipped. For a non-empty batch the record count is
    printed, the RDD is turned into a DataFrame with the trip column names
    below, and the rows are appended to the table.

    Args:
        rdd: RDD whose elements are sequences of field values, positionally
            aligned with the column names listed in ``columns``.
            NOTE(review): the alignment depends on whatever produced the
            rows -- confirm upstream emits the fields in this order.

    Returns:
        None. The function is used for its side effects (print + table write).
    """
    if rdd.isEmpty():
        return

    # Count on the executors; collecting the whole batch to the driver just to
    # take its length can exhaust driver memory on large batches.
    print(f"Number of records: {rdd.count()}")

    columns = [
        'ArrivalTime',
        'BusinessLeisure',
        'CabinCategory',
        'CreationDate',
        'CurrencyCode',
        'DepartureTime',
        'Destination',
        'OfficeIdCountry',
        'Origin',
        'TotalAmount',
        'nPAX',
    ]
    # `ss` is the notebook-level SparkSession; it is only read here, so no
    # `global` declaration is needed.
    df = ss.createDataFrame(rdd, schema=columns)
    df.write.saveAsTable(name='default.trips', format='hive', mode='append')

In [7]:
def json_to_list(s):
    """Decode a JSON object string and return its values as a list.

    The values are returned in the order their keys appear in the decoded
    object; the keys themselves are discarded.

    Args:
        s: JSON text whose top-level value is an object.

    Returns:
        list: the object's values, in key order of the parsed document.
    """
    record = json.loads(s)
    return list(record.values())

In [None]:
# Subscribe directly to the 'trips' topic on the two Kafka brokers.
ks = KafkaUtils.createDirectStream(
    ssc, ['trips'], {'metadata.broker.list': 'kafka-broker-1:9093,kafka-broker-2:9093'})
# Each stream element is indexed as a pair; keep only the second item (x[1]),
# which is what gets parsed as JSON below.
lines = ks.map(lambda x: x[1])

# Turn every JSON message into a list of field values and persist each batch.
transform = lines.map(json_to_list)
transform.foreachRDD(handle_rdd)

try:
    ssc.start()
    ssc.awaitTermination()
finally:
    # Without this, interrupting the cell (or a failing batch) leaves the
    # stream running in the background. Keep the SparkContext alive so the
    # session `ss` stays usable in later cells.
    ssc.stop(stopSparkContext=False)

Number of records: 1418
Number of records: 2824
Number of records: 3408
Number of records: 3269
Number of records: 3005
Number of records: 3048
Number of records: 3305
Number of records: 2996
Number of records: 2915
Number of records: 3251
Number of records: 3290
Number of records: 2937
Number of records: 3195
Number of records: 3158
Number of records: 3419
Number of records: 3300
Number of records: 3478
Number of records: 3451
Number of records: 3317
Number of records: 3476
Number of records: 3504
Number of records: 3501
Number of records: 3333
Number of records: 3128
Number of records: 3492
Number of records: 3571
Number of records: 3533
Number of records: 3589
Number of records: 3544
Number of records: 3505
Number of records: 3248
