In [1]:
import os
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col

# For use in Chapter 9 - Data Sources
# https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc
packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4"

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages {0} pyspark-shell".format(packages)
)

def logLevel(spark):
    # REF: https://stackoverflow.com/questions/25193488/how-to-turn-off-info-logging-in-spark
    sc = spark.sparkContext
    log4jLogger = sc._jvm.org.apache.log4j
    log4jLogger.Logger.getLogger("org").setLevel(log4jLogger.Level.ERROR)
    log = log4jLogger.LogManager.getLogger(__name__)
    log.warn("Custom Warning")


spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Demo") \
    .getOrCreate()


logLevel(spark)

spark.sparkContext.setLogLevel("ERROR")

In [2]:
print(spark.range(5000).where("id > 500").selectExpr("sum(id)").collect())

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("mode", "FAILFAST") \
    .load("file:" + os.getenv("OPTION3_HOME") + "/data/training_set.csv")

df.show(5)



[Row(sum(id)=12372250)]
+---------+----------+--------+-----------+---------+--------+
|object_id|       mjd|passband|       flux| flux_err|detected|
+---------+----------+--------+-----------+---------+--------+
|      615|59750.4229|       2|-544.810303| 3.622952|       1|
|      615|59750.4306|       1|-816.434326| 5.553370|       1|
|      615|59750.4383|       3|-471.385529| 3.801213|       1|
|      615|59750.4450|       4|-388.984985|11.395031|       1|
|      615|59752.4070|       2|-681.858887| 4.041204|       1|
+---------+----------+--------+-----------+---------+--------+
only showing top 5 rows



In [3]:
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("startingOffsets", "latest") \
    .option("subscribe", "twitter_status_connect") \
    .load()


df.printSchema()

topicSchema = StructType() \
                .add("schema", StringType()) \
                .add("payload", StringType())


tweets = df.select(col("key").cast("string"),
            from_json(col("value").cast("string"), topicSchema))

print(type(tweets))

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
streamQuery = tweets.writeStream\
                    .format("memory")\
                    .queryName("tweets_data")\
                    .outputMode("append")\
                    .start()

In [5]:
print(streamQuery.isActive)

True


In [6]:
for seconds in range(10):
    print("Refreshing....")
    spark.sql("""
      SELECT *
      FROM tweets_data
      """)\
      .show(5)
    time.sleep(2)

print(type(spark.sql(""" SELECT * FROM tweets_data """)))

Refreshing....
+---+------------------------------------+
|key|jsontostructs(CAST(value AS STRING))|
+---+------------------------------------+
+---+------------------------------------+

Refreshing....
+---+------------------------------------+
|key|jsontostructs(CAST(value AS STRING))|
+---+------------------------------------+
+---+------------------------------------+

Refreshing....
+---+------------------------------------+
|key|jsontostructs(CAST(value AS STRING))|
+---+------------------------------------+
+---+------------------------------------+

Refreshing....
+---+------------------------------------+
|key|jsontostructs(CAST(value AS STRING))|
+---+------------------------------------+
+---+------------------------------------+

Refreshing....
+--------------------+------------------------------------+
|                 key|jsontostructs(CAST(value AS STRING))|
+--------------------+------------------------------------+
|{"schema":{"type"...|                [{"type":"struc

In [7]:
df = spark.sql(""" SELECT * FROM tweets_data """)

In [8]:
df.columns

['key', 'jsontostructs(CAST(value AS STRING))']

In [9]:
df.select("jsontostructs(CAST(value AS STRING))").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
streamQuery.stop()
# streamQuery.awaitTermination()

In [11]:
payload = df.toPandas()["jsontostructs(CAST(value AS STRING))"][0].asDict()['payload']

  Unsupported type in conversion to Arrow: StructType(List(StructField(schema,StringType,true),StructField(payload,StringType,true)))
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [12]:
type(payload)

str

In [13]:
import json    # or `import simplejson as json` if on Python < 2.6

json_string = payload
obj = json.loads(json_string)

In [17]:
pp = json.loads(df.toPandas()["jsontostructs(CAST(value AS STRING))"][2].asDict()['payload'])
pp

{'CreatedAt': 1590315405000,
 'Id': 1264500577507012609,
 'Text': 'RT @MoudNsui: .@nsui is not just a frontal organisation of a political party , but it’s an ideology having an amalgamation of patriotism wi…',
 'Source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
 'Truncated': False,
 'InReplyToStatusId': -1,
 'InReplyToUserId': -1,
 'InReplyToScreenName': None,
 'GeoLocation': None,
 'Place': None,
 'Favorited': False,
 'Retweeted': False,
 'FavoriteCount': 0,
 'User': {'Id': 1210460605712224256,
  'Name': 'Prabudha Devkar',
  'ScreenName': 'PrabudhaD',
  'Location': None,
  'Description': 'Be Yourself bcz an original is always worth more then a copy ... #Engg .. \n#DBOSS fan ..',
  'ContributorsEnabled': False,
  'ProfileImageURL': 'http://pbs.twimg.com/profile_images/1227955330992070658/u62IFkPu_normal.jpg',
  'BiggerProfileImageURL': 'http://pbs.twimg.com/profile_images/1227955330992070658/u62IFkPu_bigger.jpg',
  'MiniProfileImageURL': 'http://pbs.tw

In [15]:
obj.keys()

dict_keys(['CreatedAt', 'Id', 'Text', 'Source', 'Truncated', 'InReplyToStatusId', 'InReplyToUserId', 'InReplyToScreenName', 'GeoLocation', 'Place', 'Favorited', 'Retweeted', 'FavoriteCount', 'User', 'Retweet', 'Contributors', 'RetweetCount', 'RetweetedByMe', 'CurrentUserRetweetId', 'PossiblySensitive', 'Lang', 'WithheldInCountries', 'HashtagEntities', 'UserMentionEntities', 'MediaEntities', 'SymbolEntities', 'URLEntities'])

In [16]:
obj

{'CreatedAt': 1590315399000,
 'Id': 1264500549766074373,
 'Text': 'Current thought \n\nOverly nice people with perfect lives very often have something to hide, what is presented as a happy look at me in my perfect world is usually a very sad place!\n\nSomeone with a spark, troubles and tribulations who strives for success is far more appeasing!',
 'Source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
 'Truncated': True,
 'InReplyToStatusId': -1,
 'InReplyToUserId': -1,
 'InReplyToScreenName': None,
 'GeoLocation': None,
 'Place': None,
 'Favorited': False,
 'Retweeted': False,
 'FavoriteCount': 0,
 'User': {'Id': 3427893292,
  'Name': 'BlueBell_369 💋',
  'ScreenName': 'Bluebell_369',
  'Location': 'London',
  'Description': None,
  'ContributorsEnabled': False,
  'ProfileImageURL': 'http://pbs.twimg.com/profile_images/1036157158260391936/g7K96oek_normal.jpg',
  'BiggerProfileImageURL': 'http://pbs.twimg.com/profile_images/1036157158260391936/g7K96oek_bigg

VIDEO

In [133]:
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("startingOffsets", "latest") \
    .option("subscribe", "twitter_status_connect") \
    .load()


df.printSchema()

topicSchema = StructType() \
                .add("schema", StringType()) \
                .add("payload", StringType())


tweets = df.select(col("key").cast("string"),
                    from_json(col("value").cast("string"), topicSchema).alias("data"), 
                    col("timestamp"))\
                    .writeStream\
                    .format("memory")\
                    .queryName("tweets_data")\
                    .outputMode("update")\
                    .start()
print(type(tweets))

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

<class 'pyspark.sql.streaming.StreamingQuery'>


In [134]:
type(tweets)

pyspark.sql.streaming.StreamingQuery

In [135]:
tweets.isActive

True

In [136]:
df = spark.sql(""" SELECT * FROM tweets_data """)

In [137]:
df.select('data', 'timestamp').show(truncate=False)

+----+---------+
|data|timestamp|
+----+---------+
+----+---------+



In [152]:
df.count()

21

In [157]:
df = df.orderBy("timestamp", ascending=False)

In [196]:
df = df.limit(1)
df.show()

+--------------------+--------------------+--------------------+
|                 key|                data|           timestamp|
+--------------------+--------------------+--------------------+
|{"schema":{"type"...|[{"type":"struct"...|2020-05-24 16:42:...|
+--------------------+--------------------+--------------------+



In [197]:
# spark.catalog.clearCache()

In [198]:
df.toPandas()['data'][0].asDict()["payload"]

'{"CreatedAt":1590334979000,"Id":1264582677900005376,"Text":"RT @romanoffnparker: I love spark notes https://t.co/tQI4d70FDN","Source":"<a href=\\"http://twitter.com/download/android\\" rel=\\"nofollow\\">Twitter for Android</a>","Truncated":false,"InReplyToStatusId":-1,"InReplyToUserId":-1,"InReplyToScreenName":null,"GeoLocation":null,"Place":null,"Favorited":false,"Retweeted":false,"FavoriteCount":0,"User":{"Id":2368240250,"Name":"Michael Lewis","ScreenName":"myfanpleasure","Location":"Dallas, TX","Description":"Place to express my fandom.  I love Doctor Who, RP video games, and cosplayers.","ContributorsEnabled":false,"ProfileImageURL":"http://pbs.twimg.com/profile_images/1166211143817469953/3cDs5gcL_normal.jpg","BiggerProfileImageURL":"http://pbs.twimg.com/profile_images/1166211143817469953/3cDs5gcL_bigger.jpg","MiniProfileImageURL":"http://pbs.twimg.com/profile_images/1166211143817469953/3cDs5gcL_mini.jpg","OriginalProfileImageURL":"http://pbs.twimg.com/profile_images/116621114381

In [199]:
payload = json.loads(df.toPandas()["data"][0].asDict()["payload"])

In [200]:
payload.get('Text')

'RT @romanoffnparker: I love spark notes https://t.co/tQI4d70FDN'

In [201]:
geo = payload.get('GeoLocation', None)
print(geo)

None


In [191]:
df.count()

1

In [103]:
type(df.toPandas()["data"])
datadf = df.toPandas()["data"].to_frame()
df.toPandas()["data"][0].asDict()["payload"]

'{"CreatedAt":1590331785000,"Id":1264569281464995844,"Text":"RT @Chadd_McClain: When a nigga spark a mid blunt n da club \\uD83D\\uDE12\\uD83D\\uDEAB\\uD83D\\uDE45\\uD83C\\uDFFD\u200d♂️✌\\uD83C\\uDFFE✋\\uD83C\\uDFFD\\uD83E\\uDD26\\uD83C\\uDFFE\u200d♂️ https://t.co/J9kyqCWt79","Source":"<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>","Truncated":false,"InReplyToStatusId":-1,"InReplyToUserId":-1,"InReplyToScreenName":null,"GeoLocation":null,"Place":null,"Favorited":false,"Retweeted":false,"FavoriteCount":0,"User":{"Id":294347885,"Name":"DJ Chadìco Papìto \\uD83E\\uDD2C\\uD83D\\uDD25\\uD83D\\uDCB8\\uD83D\\uDD0C","ScreenName":"Chadd_McClain","Location":"E/S of Blatlanta ","Description":"Father Of Prince Ayden. Papi Pancho! Envisionist Known by Errbody who matters! #LL21OneOne #LLZoot #LLDMapp #LLRadd","ContributorsEnabled":false,"ProfileImageURL":"http://pbs.twimg.com/profile_images/1102705096008351744/wiswgLYF_normal.jpg","BiggerProfileImageURL"

In [90]:
for i in range(df.count()):
    try:
        payload = json.loads(df.toPandas()["data"][i].asDict()["payload"])
        print(payload)
        if payload["GeoLocation"] != None:
            print(payload["GeoLocation"])
    except Exception as e:
        print(repr(e))
        continue

  Unsupported type in conversion to Arrow: StructType(List(StructField(schema,StringType,true),StructField(payload,StringType,true)))
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


{'CreatedAt': 1590331785000, 'Id': 1264569281464995844, 'Text': 'RT @Chadd_McClain: When a nigga spark a mid blunt n da club 😒🚫🙅🏽\u200d♂️✌🏾✋🏽🤦🏾\u200d♂️ https://t.co/J9kyqCWt79', 'Source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Truncated': False, 'InReplyToStatusId': -1, 'InReplyToUserId': -1, 'InReplyToScreenName': None, 'GeoLocation': None, 'Place': None, 'Favorited': False, 'Retweeted': False, 'FavoriteCount': 0, 'User': {'Id': 294347885, 'Name': 'DJ Chadìco Papìto 🤬🔥💸🔌', 'ScreenName': 'Chadd_McClain', 'Location': 'E/S of Blatlanta ', 'Description': 'Father Of Prince Ayden. Papi Pancho! Envisionist Known by Errbody who matters! #LL21OneOne #LLZoot #LLDMapp #LLRadd', 'ContributorsEnabled': False, 'ProfileImageURL': 'http://pbs.twimg.com/profile_images/1102705096008351744/wiswgLYF_normal.jpg', 'BiggerProfileImageURL': 'http://pbs.twimg.com/profile_images/1102705096008351744/wiswgLYF_bigger.jpg', 'MiniProfileImageURL': 'http://pbs.twimg.co

{'CreatedAt': 1590331808000, 'Id': 1264569377455652872, 'Text': 'spark、なんか知ってるフォロワーさんが散見されてるw', 'Source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Truncated': False, 'InReplyToStatusId': -1, 'InReplyToUserId': -1, 'InReplyToScreenName': None, 'GeoLocation': None, 'Place': None, 'Favorited': False, 'Retweeted': False, 'FavoriteCount': 0, 'User': {'Id': 844907605445173248, 'Name': 'たこ', 'ScreenName': 'takobaseball_52', 'Location': None, 'Description': '埼玉西武ライオンズ/オークランドアスレチックス', 'ContributorsEnabled': False, 'ProfileImageURL': 'http://pbs.twimg.com/profile_images/982653979321487361/09DjslpW_normal.jpg', 'BiggerProfileImageURL': 'http://pbs.twimg.com/profile_images/982653979321487361/09DjslpW_bigger.jpg', 'MiniProfileImageURL': 'http://pbs.twimg.com/profile_images/982653979321487361/09DjslpW_mini.jpg', 'OriginalProfileImageURL': 'http://pbs.twimg.com/profile_images/982653979321487361/09DjslpW.jpg', 'ProfileImageURLHttps': 'https://pbs.twimg.com

{'CreatedAt': 1590331858000, 'Id': 1264569587309273089, 'Text': 'RT @JessiePang0125: "Now this is not the end. It is not even the beginning of the end. But it is, perhaps, the end of the beginning." –Wins…', 'Source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Truncated': False, 'InReplyToStatusId': -1, 'InReplyToUserId': -1, 'InReplyToScreenName': None, 'GeoLocation': None, 'Place': None, 'Favorited': False, 'Retweeted': False, 'FavoriteCount': 0, 'User': {'Id': 1180504142281728000, 'Name': 'swhk', 'ScreenName': 'swhk67216857', 'Location': None, 'Description': 'A HK citizen #FollowbackHongKong  #FightforFreedom #StandwithHongKong #HongKongProtest', 'ContributorsEnabled': False, 'ProfileImageURL': 'http://pbs.twimg.com/profile_images/1261215946472972288/wMqBQei0_normal.jpg', 'BiggerProfileImageURL': 'http://pbs.twimg.com/profile_images/1261215946472972288/wMqBQei0_bigger.jpg', 'MiniProfileImageURL': 'http://pbs.twimg.com/profile_images/126121

{'CreatedAt': 1590331908000, 'Id': 1264569796282265603, 'Text': "@AshleyRoboto I feel you. Between loss of personal motivation and overwork, it's hard to find that joyous spark inside that inspires me to make content.\nEven so, you're still posting motivational and funny content frequently, and that always perks me up! Take your time.", 'Source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'Truncated': True, 'InReplyToStatusId': 1264339373216202752, 'InReplyToUserId': 355995137, 'InReplyToScreenName': 'AshleyRoboto', 'GeoLocation': None, 'Place': None, 'Favorited': False, 'Retweeted': False, 'FavoriteCount': 0, 'User': {'Id': 923585902173151232, 'Name': 'Michela Dee 🖤 Black Eagles 🖤', 'ScreenName': 'micheladlondon', 'Location': None, 'Description': 'Soulsborne enthusiast and ancient cosplayer with a love of history, gaming, cosmetics, and art. Professional and hobbyist content creator. 🏳️\u200d🌈', 'ContributorsEnabled': False, 'ProfileImageUR

KeyError(126)
KeyError(127)
KeyError(128)
KeyError(129)
KeyError(130)
KeyError(131)
KeyError(132)
KeyError(133)
KeyError(134)
KeyError(135)
KeyError(136)
KeyError(137)
KeyError(138)
KeyError(139)
KeyError(140)
KeyError(141)
KeyError(142)
KeyError(143)
KeyError(144)
KeyError(145)
KeyError(146)
KeyError(147)
KeyError(148)
KeyError(149)
KeyError(150)
KeyError(151)
KeyError(152)
KeyError(153)
KeyError(154)
KeyError(155)
KeyError(156)
KeyError(157)
KeyError(158)
KeyError(159)
KeyError(160)
KeyError(161)
KeyError(162)
KeyError(163)
KeyError(164)
KeyError(165)
KeyError(166)
KeyError(167)
KeyError(168)
KeyError(169)
KeyError(170)
KeyError(171)
KeyError(172)
KeyError(173)
KeyError(174)
KeyError(175)
KeyError(176)
KeyError(177)
KeyError(178)
KeyError(179)
KeyError(180)
KeyError(181)
KeyError(182)
KeyError(183)
KeyError(184)
KeyError(185)
KeyError(186)
KeyError(187)
KeyError(188)
KeyError(189)
KeyError(190)
KeyError(191)
KeyError(192)
KeyError(193)
KeyError(194)
KeyError(195)
KeyError(196)
KeyErr

In [202]:
tweets.stop()