In [1]:
from pyspark import SparkContext                                                                                        
from pyspark.sql import SparkSession                                                                                    
from pyspark.streaming import StreamingContext                                                                          
from pyspark.streaming.kafka import KafkaUtils    
import pandas as pd
import json

In [2]:
# Build (or reuse) the Hive-enabled SparkSession used by every later cell.
ss = (
    SparkSession.Builder()
    .appName("TEST")
    .master("spark://spark-master:7077")
    # Kafka 0.8 streaming connector (Scala 2.11 build, Spark 2.4.1), fetched at start-up.
    .config('spark.jars.packages', 'org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.4.1')
    .config("spark.driver.allowMultipleContexts", "true")
    # Hive metastore endpoint and the HDFS directory used as the SQL warehouse.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/hive")
    .enableHiveSupport()
    .getOrCreate()
)

# ss = SparkSession.Builder() \
#      .appName("TEST") \
#      .master("spark://spark-master:7077") \
#      .enableHiveSupport() \
#      .getOrCreate()
# Other Kafka connector coordinates (candidates for 'spark.jars.packages'):
#   org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.1
#   org.apache.spark:spark-streaming-kafka-0-8:2.4.1

In [5]:
# Reuse the session's SparkContext for the streaming layer.
sc = ss.sparkContext
# Micro-batch interval of 5 (seconds, per the StreamingContext API).
ssc = StreamingContext(sparkContext=sc, batchDuration=5)
# `sc` is the same object as `ss.sparkContext`, so this sets the session's log level.
sc.setLogLevel('WARN')

In [4]:
def handle_rdd(rdd):
    """Append one micro-batch of trip records to the Hive table ``default.trips``.

    Written to be used as a ``DStream.foreachRDD`` callback. Empty batches
    are skipped so that no empty append is issued.

    Parameters
    ----------
    rdd : pyspark.RDD
        Batch to persist. Each element is expected to be a sequence of 11
        values in the column order listed below (assumed from the schema;
        confirm against the producer's JSON key order).

    Returns
    -------
    None
    """
    if rdd.isEmpty():
        return

    # count() is aggregated on the executors; collect() would ship every
    # record of the batch to the driver only to measure its length.
    print(f"Number of records: {rdd.count()}")

    columns = [
        'ArrivalTime',
        'BusinessLeisure',
        'CabinCategory',
        'CreationDate',
        'CurrencyCode',
        'DepartureTime',
        'Destination',
        'OfficeIdCountry',
        'Origin',
        'TotalAmount',
        'nPAX',
    ]

    # `ss` is the notebook-level SparkSession; it is only read here, so no
    # `global` declaration is required.
    trips = ss.createDataFrame(rdd, schema=columns)
    trips.write.saveAsTable(name='default.trips', format='hive', mode='append')

In [7]:
def json_to_list(s):
    """Decode a JSON object string and return its values as a list.

    The values keep the order in which their keys appear in ``s``; the keys
    themselves are discarded.
    """
    return list(json.loads(s).values())

In [None]:
# Direct stream over the 'trips' topic, served by the broker at kafka:9093.
ks = KafkaUtils.createDirectStream(
    ssc,
    topics=['trips'],
    kafkaParams={'metadata.broker.list': 'kafka:9093'},
)

# Keep element 1 of every Kafka record (the message payload per the
# KafkaUtils API; element 0 is the key).
lines = ks.map(lambda record: record[1])

# Turn each JSON payload into a list of its values, then persist every batch.
transform = lines.map(json_to_list)
transform.foreachRDD(handle_rdd)

# Start the streaming job and block this cell until it is stopped.
ssc.start()
ssc.awaitTermination()

[['1441404544', 'nan', '40', '2457207', 'nan', '1442142720', 'FDF', 'FR', 'LYS', 'nan', '1']]
[['1447067264', 'nan', '40', '2457232', 'nan', '1444564096', 'KWI', 'ES', 'HEL', 'nan', '1'], ['1434222208', 'nan', '40', '2457072', 'nan', '1433783936', 'CUE', 'ES', 'PMI', 'nan', '1'], ['1411671552', 'nan', '40', '2456846', 'nan', '1409718528', 'AKL', 'PE', 'HKG', 'nan', '1'], ['1444364800', 'nan', '10', '2457219', 'nan', '1443363328', 'GIG', 'BR', 'SSA', 'nan', '1'], ['1442012288', 'nan', '40', '2457235', 'nan', '1442806272', 'MMX', 'GB', 'ONT', 'nan', '1']]
[['1425074560', 'nan', '40', '2457052', 'nan', '1427236736', 'CDG', 'US', 'MCO', 'nan', '1'], ['1427076480', 'nan', '40', '2457048', 'nan', '1427292160', 'BOG', 'AR', 'EZE', 'nan', '1'], ['1447250432', 'L', '40', '2457262', 'nan', '1445463680', 'CAN', 'CA', 'YYZ', 'nan', '2'], ['1420607872', 'nan', '20', '2456934', 'nan', '1420845696', 'LHR', 'GB', 'DOU', 'nan', '1'], ['1400953856', 'nan', '40', '2456744', 'nan', '1400681216', 'MRV', 'F

[['1438877440', 'nan', '40', '2457238', 'nan', '1441036032', 'LHR', 'GB', 'MLA', 'nan', '1'], ['1421441408', 'nan', '40', '2457002', 'nan', '1418077184', 'FRA', 'SE', 'ARN', 'nan', '1'], ['1450470784', 'nan', '40', '2457384', 'nan', '1452050816', 'PEK', 'US', 'LAX', 'nan', '1'], ['1433049344', 'nan', '10', '2457129', 'nan', '1429093120', 'OSL', 'IT', 'TOS', 'nan', '1'], ['1452605952', 'B', '40', '2457369', 'nan', '1454342784', 'LPA', 'FR', 'SMF', 'nan', '1']]
[['1411808640', 'nan', '40', '2456933', 'nan', '1411300864', 'BEY', 'EG', 'DMM', 'nan', '1'], ['1432419712', 'nan', '40', '2457159', 'nan', '1432674304', 'HAU', 'SE', 'OSL', 'nan', '1'], ['1424675840', 'nan', '40', '2456975', 'nan', '1424872704', 'FRA', 'FI', 'HEL', 'nan', '1'], ['1455806848', 'nan', '40', '2457311', 'nan', '1450927616', 'DAC', 'SA', 'JED', 'nan', '1'], ['1446126720', 'L', '40', '2457127', 'nan', '1442743936', 'MNL', 'IN', 'IKA', 'nan', '2']]
[['1460415616', 'nan', '40', '2457441', 'nan', '1460874240', 'STR', 'DE'

[['1447915904', 'nan', '40', '2457276', 'nan', '1447405440', 'CCS', 'EE', 'TLL', 'nan', '1'], ['1423169280', 'nan', '40', '2456800', 'nan', '1419806720', 'MIA', 'LU', 'LUX', 'nan', '1'], ['1436158208', 'nan', '40', '2457135', 'PLN', '1436235008', 'DUS', 'PL', 'WAW', '0.0', '2'], ['1440478720', 'nan', '20', '2457161', 'nan', '1437981312', 'HAM', 'FR', 'CDG', 'nan', '1'], ['1442843520', 'nan', '40', '2457210', 'nan', '1442716032', 'LIS', 'ES', 'TNG', 'nan', '1']]
[['1420695680', 'nan', '40', '2457004', 'nan', '1418465152', 'GOT', 'SE', 'CPH', 'nan', '1'], ['1437613952', 'nan', '40', '2457214', 'nan', '1437581184', 'ABJ', 'FR', 'BOD', 'nan', '1'], ['1401281792', 'nan', '40', '2456649', 'nan', '1399481728', 'EWR', 'DE', 'HAM', 'nan', '1'], ['1427311872', 'nan', '40', '2456939', 'nan', '1425254016', 'TXL', 'IL', 'TLV', 'nan', '1'], ['1467714432', 'nan', '40', '2457430', 'AUD', '1465921792', 'LAX', 'AU', 'SYD', '0.0', '1']]
[['1414733952', 'nan', '40', '2456911', 'nan', '1415272064', 'PRG', 