In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, window

In [2]:
# Pass spark-submit the Maven coordinates of the extra JARs this notebook needs:
# the Kafka source for Structured Streaming, the Spark Cassandra connector and
# the Cassandra Java driver. One coordinate per line keeps version bumps readable.
_spark_packages = ','.join([
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1',
    'com.datastax.spark:spark-cassandra-connector_2.12:3.0.0-beta',
    'com.datastax.cassandra:cassandra-driver-core:3.9.0',
])
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + _spark_packages + ' pyspark-shell'

In [3]:
# Kafka source settings: the topic to subscribe to and the broker address
# used as `kafka.bootstrap.servers`.
topic = 'bitcoin'
bootstrap_servers = 'broker:9094'

In [4]:
# Cassandra contact point in host:port form.
# NOTE(review): 7000 is conventionally Cassandra's inter-node port, while client
# connections normally use 9042 - confirm which port this deployment exposes.
cassandra_connection = 'cassandra:7000'

In [5]:
# Build the Spark session for this notebook, or reuse one that already exists,
# under the application name 'StructuredStreamingKafka'.
spark = (
    SparkSession.builder
    .appName('StructuredStreamingKafka')
    .getOrCreate()
)

In [6]:
# Open a streaming DataFrame over the Kafka topic.
# Only Kafka source options are set on this reader. `spark.cassandra.connection.host`
# is a Spark Cassandra Connector setting that the Kafka source does not consume, so
# it is not passed here; Cassandra settings belong on the session configuration or
# on the Cassandra reader/writer itself.
kafka_raw_stream = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', bootstrap_servers)
    .option('subscribe', topic)
    .load()
)

In [7]:
# Keep only the record key and value, each cast to a string
# (the Kafka source exposes both as binary columns).
kafka_stream = kafka_raw_stream.selectExpr(
    'CAST(key as String)',
    'CAST(value as String)',
)

In [8]:
# Schema of the JSON document carried in each Kafka message value.
#
# Field types follow the sample payload kept in the comment after this schema.
# Numbers that arrive quoted (24hVolume, btcPrice, change, marketCap, price and
# every sparkline point) are declared as strings: from_json does not coerce a
# quoted number into DoubleType, nor a plain string into a struct, and a type
# mismatch produces nulls instead of values. Cast to a numeric type downstream
# (e.g. `.cast('double')`) where arithmetic is needed.
coin_schema = StructType(
    [
        StructField('24hVolume', StringType(), True),
        StructField('btcPrice', StringType(), True),
        StructField('change', StringType(), True),
        StructField('coinrankingUrl', StringType(), True),
        StructField('color', StringType(), True),
        StructField('iconUrl', StringType(), True),
        StructField('listedAt', LongType(), True),
        StructField('lowVolume', BooleanType(), True),
        # Quoted in the payload, so it must be read as a string.
        StructField('marketCap', StringType(), True),
        StructField('name', StringType(), True),
        StructField('price', StringType(), True),
        StructField('rank', LongType(), True),
        # The payload carries a flat array of quoted prices, not an array of objects.
        StructField('sparkline', ArrayType(StringType()), True),
        StructField('symbol', StringType(), True),
        StructField('tier', LongType(), True),
        StructField('uuid', StringType(), True),
    ]
)


#  "24hVolume": "45865856021.84056227483486300146", 
#         "btcPrice": "1", 
#         "change": "-2.951245541595327", 
#         "coinrankingUrl": "https://coinranking.com/coin/Qwsogvtv82FCd+bitcoin-btc", 
#         "color": "#f7931A", 
#         "iconUrl": "https://cdn.coinranking.com/bOabBYkcX/bitcoin_btc.svg", 
#         "listedAt": 1330214400, 
#         "lowVolume": false, 
#         "marketCap": "872063459795.82604756063261182578", 
#         "name": "Bitcoin", 
#         "price": "46605.15802439004179893254", 
#         "rank": 1, 
#         "sparkline": [
#           "48121.48483135035029351326", 
#           "47680.53858679600719235244", 
#           "48363.56523278616068368839", 
#           "48547.2249512731716192729", 
#           "48600.68095521068515406288", 
#           "47867.01860299033202525132", 
#           "47313.78751502263220988798", 
#           "47705.20653715832878873714", 
#           "48135.84226539024826131598", 
#           "48248.90108849445470098487", 
#           "48292.63898319552541734948", 
#           "48327.98116447038348587492", 
#           "48469.01320208769617189259", 
#           "49183.96868989059700892347", 
#           "49109.4865405254377944505", 
#           "49059.22910241027652704515", 
#           "49430.5228999390130752474", 
#           "49659.9610417944210137561", 
#           "49467.61462382282509885858", 
#           "49102.70547826240853393093", 
#           "48924.5702112945445245131", 
#           "48763.21645728770164001951", 
#           "48159.55221947218685386965", 
#           "47662.404264526723439679", 
#           "47014.14955772720869414904", 
#           "46783.42960858551741566763", 
#           "46605.15802439004179893254"
#         ], 
#         "symbol": "BTC", 
#         "tier": 1, 
#         "uuid": "Qwsogvtv82FCd"

In [9]:
# Parse the JSON text in `value` against coin_schema into a `coin` struct, then
# flatten that struct so every coin field becomes a column next to `key`.
coins = (
    kafka_stream
    .withColumn('coin', from_json('value', coin_schema))
    .selectExpr('key', 'coin.*')
)

In [10]:
# vehicles_with_ts = vehicles \
#     .selectExpr('key','type','line','brigade','lat','lon','bearing','CAST(time/1000 as Timestamp) as time')

In [11]:
# event_count_5min = vehicles_with_ts \
#     .withWatermark('time','10 seconds') \
#     .groupBy(
#         window('time','1 second')
#     ).count()

In [12]:
# Start the streaming query, printing each micro-batch to the console sink
# with long cell values truncated.
query = (
    coins.writeStream
    .format('console')
    .option('truncate', 'true')
    .start()
)

In [None]:
# Block this cell until the streaming query stops or fails, so the stream keeps
# running for as long as the cell is executing.
query.awaitTermination()