In [1]:
import os
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col

# For use in Chapter 9 - Data Sources
# https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc
# NOTE(review): the link above is for sqlite-jdbc, but the artifact requested
# below is the Spark SQL Kafka connector -- confirm which one is intended.
packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4"

# Hand the extra package to the PySpark launcher through its environment
# variable; this is done before the SparkSession is created further down.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
    ("--packages", packages, "pyspark-shell")
)

def logLevel(spark):
    """Quieten JVM-side logging and emit one warning through log4j.

    The log4j logger named ``"org"`` is set to ERROR, then the message
    ``"Custom Warning"`` is logged at WARN level on a logger named after
    this module.

    REF: https://stackoverflow.com/questions/25193488/how-to-turn-off-info-logging-in-spark

    Parameters
    ----------
    spark : SparkSession
        Session whose ``sparkContext._jvm`` gateway is used to reach log4j.
    """
    log4j = spark.sparkContext._jvm.org.apache.log4j
    log4j.Logger.getLogger("org").setLevel(log4j.Level.ERROR)
    log4j.LogManager.getLogger(__name__).warn("Custom Warning")


# Build (or reuse) a local session named "Demo" with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Demo")
    .getOrCreate()
)

# Lower the "org" log4j logger to ERROR via the helper, then set the
# context-wide log level to ERROR as well.
logLevel(spark)
spark.sparkContext.setLogLevel("ERROR")

In [2]:
# Smoke test: sum of the ids 501..4999 produced by spark.range(5000).
print(spark.range(5000).where("id > 500").selectExpr("sum(id)").collect())

# Load the training CSV from the directory named by OPTION3_HOME.
# os.environ[...] is used instead of os.getenv(...) so that a missing variable
# fails with a KeyError naming OPTION3_HOME, rather than a TypeError raised
# by concatenating "file:" with None.
df = (
    spark.read.format("csv")
    .option("header", "true")    # first line holds the column names
    .option("mode", "FAILFAST")  # raise on malformed rows instead of nulling them
    .load("file:" + os.environ["OPTION3_HOME"] + "/data/training_set.csv")
)

df.show(5)

[Row(sum(id)=12372250)]
+---------+----------+--------+-----------+---------+--------+
|object_id|       mjd|passband|       flux| flux_err|detected|
+---------+----------+--------+-----------+---------+--------+
|      615|59750.4229|       2|-544.810303| 3.622952|       1|
|      615|59750.4306|       1|-816.434326| 5.553370|       1|
|      615|59750.4383|       3|-471.385529| 3.801213|       1|
|      615|59750.4450|       4|-388.984985|11.395031|       1|
|      615|59752.4070|       2|-681.858887| 4.041204|       1|
+---------+----------+--------+-----------+---------+--------+
only showing top 5 rows



# Read Stream

In [3]:
# df = spark \
#     .readStream \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "localhost:9092") \
#     .option("startingOffsets", "latest") \
#     .option("subscribe", "twitter_status_connect") \
#     .load()


# df.printSchema()

# topicSchema = StructType() \
#                 .add("schema", StringType()) \
#                 .add("payload", StringType())


# tweets = df.select(col("key").cast("string"),
#             from_json(col("value").cast("string"), topicSchema))

# print(type(tweets))

In [4]:
# streamQuery = tweets.writeStream\
#                     .format("memory")\
#                     .queryName("tweets_data")\
#                     .outputMode("append")\
#                     .start()

In [5]:
# print(streamQuery.isActive)

In [6]:
# for seconds in range(10):
#     print("Refreshing....")
#     spark.sql("""
#       SELECT *
#       FROM tweets_data
#       """)\
#       .show(5)
#     time.sleep(2)

# print(type(spark.sql(""" SELECT * FROM tweets_data """)))

In [7]:
# df = spark.sql(""" SELECT * FROM tweets_data """)

In [8]:
# df.columns

In [9]:
# df.select("jsontostructs(CAST(value AS STRING))").show(truncate=False)

In [10]:
# streamQuery.stop()
# streamQuery.awaitTermination()

In [11]:
# payload = df.toPandas()["jsontostructs(CAST(value AS STRING))"][0].asDict()['payload']

In [12]:
# type(payload)

In [13]:
# import json    # or `import simplejson as json` if on Python < 2.6
#
# json_string = payload
# obj = json.loads(json_string)

In [14]:
# pp = json.loads(df.toPandas()["jsontostructs(CAST(value AS STRING))"][2].asDict()['payload'])
# pp

# Batch Read

In [239]:
# Batch (non-streaming) read of the `twitter_tweets` topic. `endingOffsets`
# bounds the read per partition (here partition "0" up to offset 10), and the
# Kafka message value is cast to a string column named `tweets`.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "twitter_tweets")
    .option("endingOffsets", """{"twitter_tweets":{"0":10}}""")
    .load()
    .selectExpr("CAST(value AS STRING) as tweets")
)

df.printSchema()

root
 |-- tweets: string (nullable = true)



In [240]:
# Preview the raw JSON strings read from Kafka (long values are truncated).
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+



In [241]:
# Schema object of the raw frame: one nullable string column, `tweets`.
df.schema

StructType(List(StructField(tweets,StringType,true)))

In [270]:
import pyspark.sql.functions as psf


def parseJSONCols(df, *cols, sanitize=True):
    """Auto infer the schema of a json column and parse into a struct.

    rdd-based schema inference works if you have well-formatted JSON,
    like ``{"key": "value", ...}``, but breaks if your 'JSON' is just a
    string (``"data"``) or is an array (``[1, 2, 3]``). In those cases you
    can fix everything by wrapping the data in another JSON object
    (``{"key": [1, 2, 3]}``). The ``sanitize`` option (default True)
    automatically performs the wrapping and unwrapping.

    Null values are left out of schema inference (they carry no schema
    information and would otherwise be read as the literal text ``None``);
    they are still present, as nulls, in the returned dataframe.

    The schema inference is based on this
    `SO Post <https://stackoverflow.com/a/45880574>`_.

    Relies on the module-level ``spark`` session and the ``psf`` alias for
    ``pyspark.sql.functions``.

    Parameters
    ----------
    df : pyspark dataframe
        Dataframe containing the JSON cols.
    *cols : string(s)
        Names of the columns containing JSON.
    sanitize : boolean
        Flag indicating whether you'd like to sanitize your records
        by wrapping and unwrapping them in another JSON object layer.

    Returns
    -------
    pyspark dataframe
        A dataframe with the decoded columns. When no column names are
        given, ``df`` itself is returned unchanged.
    """
    res = df
    for name in cols:
        # Wrap each value as {"data": <value>} if requested.
        if sanitize:
            res = res.withColumn(
                name,
                psf.concat(psf.lit('{"data": '), name, psf.lit('}')),
            )

        # Infer the schema from the non-null documents only. A null reaches
        # spark.read.json as the string "None", which is not valid JSON and
        # would add a `_corrupt_record` field to the inferred schema.
        json_strings = (
            res.rdd
            .map(lambda row: row[name])
            .filter(lambda value: value is not None)
        )
        schema = spark.read.json(json_strings).schema
        res = res.withColumn(name, psf.from_json(psf.col(name), schema))

        # Unpack the wrapper object added above, if any.
        if sanitize:
            res = res.withColumn(name, psf.col(name).data)
    return res

In [271]:
# Parse the `tweets` JSON strings into a struct whose schema is inferred from
# the data. sanitize=False: values are parsed as-is, without the
# {"data": ...} wrapper.
res = parseJSONCols(df, 'tweets', sanitize=False)
res.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [272]:
# Tree view of the inferred tweet schema.
res.printSchema()

root
 |-- tweets: struct (nullable = true)
 |    |-- contributors: string (nullable = true)
 |    |-- coordinates: struct (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- display_text_range: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |-- media: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |    |-- description: 

In [273]:
# Keep the inferred schema in a variable; a later cell displays it again.
schema = res.schema
schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [274]:
# Raw (unparsed) JSON strings.
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+
only showing top 20 rows



In [275]:
# Parsed rows: each `tweets` value is now a struct rather than a string.
res.show()

+--------------------+
|              tweets|
+--------------------+
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
+--------------------+
only showing top 20 rows



In [276]:
# Pull two nested fields out of the parsed struct using dotted paths.
res.select("tweets.created_at", "tweets.coordinates").show()

+--------------------+-----------+
|          created_at|coordinates|
+--------------------+-----------+
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
+--------------------+-----------+
only showing top 20 rows



In [277]:
# Select nested fields, including the follower count under `tweets.user`.
res.select("tweets.created_at", "tweets.coordinates", "tweets.user.followers_count").show()

+--------------------+-----------+---------------+
|          created_at|coordinates|followers_count|
+--------------------+-----------+---------------+
|Tue May 26 16:14:...|       null|            696|
|Tue May 26 16:14:...|       null|              9|
|Tue May 26 16:14:...|       null|            845|
|Tue May 26 16:14:...|       null|            334|
|Tue May 26 16:14:...|       null|            526|
|Tue May 26 16:14:...|       null|             44|
|Tue May 26 16:14:...|       null|            103|
|Tue May 26 16:14:...|       null|            568|
|Tue May 26 16:14:...|       null|            192|
|Tue May 26 16:14:...|       null|            807|
|Tue May 26 16:15:...|       null|          14108|
|Tue May 26 16:15:...|       null|            310|
|Tue May 26 16:14:...|       null|             28|
|Tue May 26 16:15:...|       null|              5|
|Tue May 26 16:15:...|       null|             57|
|Tue May 26 16:15:...|       null|            229|
|Tue May 26 16:15:...|       nu

In [278]:
# Tweets without point coordinates whose author has more than 100000 followers.
(
    res.select(
        "tweets.created_at",
        "tweets.coordinates",
        "tweets.user.followers_count",
    )
    .where(col("tweets.coordinates").isNull())
    .where(col("tweets.user.followers_count") > 100000)
    .show()
)

+--------------------+-----------+---------------+
|          created_at|coordinates|followers_count|
+--------------------+-----------+---------------+
|Tue May 26 16:27:...|       null|         133918|
|Tue May 26 19:44:...|       null|         230348|
|Tue May 26 19:45:...|       null|         200857|
|Wed May 27 09:57:...|       null|         447390|
|Wed May 27 09:57:...|       null|         127387|
|Wed May 27 09:58:...|       null|         512790|
|Wed May 27 09:58:...|       null|         203863|
|Wed May 27 09:58:...|       null|         190846|
|Wed May 27 09:58:...|       null|         127387|
|Wed May 27 09:58:...|       null|         286442|
|Wed May 27 09:59:...|       null|         134188|
|Wed May 27 09:59:...|       null|         333014|
|Wed May 27 09:59:...|       null|         134188|
|Wed May 27 09:59:...|       null|         594976|
|Wed May 27 09:59:...|       null|        2593438|
|Wed May 27 09:59:...|       null|         855058|
|Wed May 27 09:59:...|       nu

In [279]:
from pyspark.sql.functions import size

# Add a `size` column holding the number of hashtags on each tweet.
resHash = res.withColumn(
    "size",
    size(col("tweets.entities.hashtags")),
)

# Tweets carrying at least one hashtag.
# NOTE(review): this filtered frame is built but the unfiltered `resHash`
# is what gets displayed below -- confirm which one was meant to be shown.
resHash_filtered = resHash.where(resHash["size"] >= 1)

resHash.show()

+--------------------+----+
|              tweets|size|
+--------------------+----+
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   2|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   1|
|[,, Tue May 26 16...|   2|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   1|
|[,, Tue May 26 16...|   0|
+--------------------+----+
only showing top 20 rows



In [280]:
# Tweets whose `place` carries bounding-box coordinates.
(
    res.select(
        "tweets.created_at",
        "tweets.place.bounding_box.coordinates",
        "tweets.user.followers_count",
    )
    .where(col("tweets.place.bounding_box.coordinates").isNotNull())
    .show()
)

+--------------------+--------------------+---------------+
|          created_at|         coordinates|followers_count|
+--------------------+--------------------+---------------+
|Tue May 26 16:14:...|[[[-76.711521, 39...|             44|
|Tue May 26 16:40:...|[[[-2.319934, 53....|            235|
|Tue May 26 19:42:...|[[[-95.846367, 29...|            120|
|Wed May 27 09:57:...|[[[-77.144435, 38...|           3467|
|Wed May 27 09:57:...|[[[1.373441, 52.6...|            302|
|Wed May 27 09:57:...|[[[0.378141, 52.7...|           1775|
|Wed May 27 09:57:...|[[[-3.062965, 53....|           1464|
|Wed May 27 09:57:...|[[[-1.321481, 53....|            174|
|Wed May 27 09:57:...|[[[-1.568224, 53....|          20406|
|Wed May 27 09:58:...|[[[-2.814317, 52....|             20|
|Wed May 27 09:58:...|[[[0.621509, 51.9...|            162|
|Wed May 27 09:58:...|[[[-63.39386, -41...|           1460|
|Wed May 27 09:58:...|[[[-0.968219, 52....|            314|
|Wed May 27 09:58:...|[[[-4.11587, 55.7.

In [281]:
# Batch read of the `twitter_tweets` topic starting from the earliest offsets
# (no ending bound is set here); the message value is cast to a string column
# named `tweets`.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "twitter_tweets")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS STRING) as tweets")
)

df.printSchema()

root
 |-- tweets: string (nullable = true)



In [282]:
# Parse again, this time with sanitize=True: each value is wrapped in
# {"data": ...} before schema inference and unwrapped afterwards.
res = parseJSONCols(df, 'tweets', sanitize=True)
res.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [283]:
# Number of rows in the parsed DataFrame.
res.count()

9272

In [284]:
# Tweets that do carry a `coordinates` value.
(
    res.select(
        "tweets.created_at",
        "tweets.coordinates",
        "tweets.user.followers_count",
    )
    .where(col("tweets.coordinates").isNotNull())
    .show()
)

+--------------------+--------------------+---------------+
|          created_at|         coordinates|followers_count|
+--------------------+--------------------+---------------+
|Sun May 31 20:25:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:25:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:26:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:29:...|[[-0.1094, 51.514...|           2096|
|Sun May 31 20:30:...|[[0.107, 51.486],...|            824|
|Sun May 31 20:36:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|             20|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            936|
|Sun May 31 20:37:...|[[0.02868891, 51....|            203|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            649|
|Sun May 31 20:37:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            103|
+--------------------+------------------

In [285]:
# Row count of `res`. NOTE(review): repeats the count taken a few cells earlier.
res.count()

9272

TODO: write a pandas UDF to extract the coordinates from the list.

# After watching the VIDEO

In [310]:
# Streaming read of the `twitter_tweets` topic from the earliest offsets;
# the message value is cast to a string column named `tweets`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "twitter_tweets")
    .load()
    .selectExpr("CAST(value AS STRING) as tweets")
)

In [311]:
# The streaming DataFrame exposes a single string column, `tweets`.
df.printSchema()

root
 |-- tweets: string (nullable = true)



In [312]:
# `schema` is not assigned in this section: it is the value saved from
# `res.schema` in an earlier cell, so that cell must have been run first.
schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [313]:
# Start a streaming query that collects the Kafka rows into an in-memory
# table; the table takes the query name, `tweeters`.
# NOTE(review): the Structured Streaming guide lists "append" and "complete"
# as the memory sink's output modes -- confirm that "update" is intended.
tweets = (
    df.writeStream
    .format("memory")
    .queryName("tweeters")
    .outputMode("update")
    .start()
)

In [323]:
# Stop the streaming query held in `tweets`.
# NOTE(review): in top-to-bottom order this cell runs before the cells that
# read the `tweeters` table; its execution count (323) suggests it was
# actually run last. Run it only once the stream is no longer needed.
tweets.stop()

In [314]:
# Query the in-memory `tweeters` table as a regular DataFrame. This rebinds
# `df`, which until now held the streaming DataFrame.
df = spark.sql(""" SELECT * FROM tweeters """)

In [315]:
# Infer the JSON schema from the rows queried from the table and parse
# `tweets` into a struct column.
ddf = parseJSONCols(df, 'tweets', sanitize=True)
ddf.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [316]:
# Tree view of the schema inferred for the streamed tweets.
ddf.printSchema()

root
 |-- tweets: struct (nullable = true)
 |    |-- contributors: string (nullable = true)
 |    |-- coordinates: struct (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- display_text_range: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |-- media: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |    |-- description: 

In [322]:
# True while the streaming query is still running.
tweets.isActive

True

In [268]:
# Read from `tweeters`, the in-memory table the active streaming query writes
# to. `tweets_data` is only created by the commented-out streaming cell near
# the top of the notebook, so querying it fails on a top-to-bottom run.
df = spark.sql(""" SELECT * FROM tweeters """)

In [318]:
# Raw JSON strings currently held in `df`.
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+
only showing top 20 rows



In [319]:
# Parsed rows (`tweets` is a struct column).
ddf.show()

+--------------------+
|              tweets|
+--------------------+
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
+--------------------+
only showing top 20 rows



In [320]:
# Show selected nested fields without truncating the cell contents.
ddf.select("tweets.created_at", "tweets.place.bounding_box.coordinates", "tweets.user.followers_count").show(truncate=False)

+------------------------------+------------------------------------------------------------------------------------------------------+---------------+
|created_at                    |coordinates                                                                                           |followers_count|
+------------------------------+------------------------------------------------------------------------------------------------------+---------------+
|Tue May 26 16:14:27 +0000 2020|null                                                                                                  |696            |
|Tue May 26 16:14:28 +0000 2020|null                                                                                                  |9              |
|Tue May 26 16:14:30 +0000 2020|null                                                                                                  |845            |
|Tue May 26 16:14:34 +0000 2020|null                                                    

In [321]:
# Number of parsed rows.
ddf.count()

9272

In [238]:
# Stop the streaming query held in `tweets` now that the analysis is done.
tweets.stop()