In [1]:
import os
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col

# For use in Chapter 9 - Data Sources
# https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc
packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4"

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages {0} pyspark-shell".format(packages)
)

def logLevel(spark):
    # REF: https://stackoverflow.com/questions/25193488/how-to-turn-off-info-logging-in-spark
    sc = spark.sparkContext
    log4jLogger = sc._jvm.org.apache.log4j
    log4jLogger.Logger.getLogger("org").setLevel(log4jLogger.Level.ERROR)
    log = log4jLogger.LogManager.getLogger(__name__)
    log.warn("Custom Warning")


spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Demo") \
    .getOrCreate()


logLevel(spark)

spark.sparkContext.setLogLevel("ERROR")

In [2]:
print(spark.range(5000).where("id > 500").selectExpr("sum(id)").collect())

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("mode", "FAILFAST") \
    .load("file:" + os.getenv("OPTION3_HOME") + "/data/training_set.csv")

df.show(5)

[Row(sum(id)=12372250)]
+---------+----------+--------+-----------+---------+--------+
|object_id|       mjd|passband|       flux| flux_err|detected|
+---------+----------+--------+-----------+---------+--------+
|      615|59750.4229|       2|-544.810303| 3.622952|       1|
|      615|59750.4306|       1|-816.434326| 5.553370|       1|
|      615|59750.4383|       3|-471.385529| 3.801213|       1|
|      615|59750.4450|       4|-388.984985|11.395031|       1|
|      615|59752.4070|       2|-681.858887| 4.041204|       1|
+---------+----------+--------+-----------+---------+--------+
only showing top 5 rows



# Read Stream

In [3]:
# df = spark \
#     .readStream \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "localhost:9092") \
#     .option("startingOffsets", "latest") \
#     .option("subscribe", "twitter_status_connect") \
#     .load()


# df.printSchema()

# topicSchema = StructType() \
#                 .add("schema", StringType()) \
#                 .add("payload", StringType())


# tweets = df.select(col("key").cast("string"),
#             from_json(col("value").cast("string"), topicSchema))

# print(type(tweets))

In [4]:
# streamQuery = tweets.writeStream\
#                     .format("memory")\
#                     .queryName("tweets_data")\
#                     .outputMode("append")\
#                     .start()

In [5]:
# print(streamQuery.isActive)

In [6]:
# for seconds in range(10):
#     print("Refreshing....")
#     spark.sql("""
#       SELECT *
#       FROM tweets_data
#       """)\
#       .show(5)
#     time.sleep(2)

# print(type(spark.sql(""" SELECT * FROM tweets_data """)))

In [7]:
# df = spark.sql(""" SELECT * FROM tweets_data """)

In [8]:
# df.columns

In [9]:
# df.select("jsontostructs(CAST(value AS STRING))").show(truncate=False)

In [10]:
# streamQuery.stop()
# streamQuery.awaitTermination()

In [11]:
# payload = df.toPandas()["jsontostructs(CAST(value AS STRING))"][0].asDict()['payload']

In [12]:
# type(payload)

In [13]:
# import json    # or `import simplejson as json` if on Python < 2.6
#
# json_string = payload
# obj = json.loads(json_string)

In [14]:
# pp = json.loads(df.toPandas()["jsontostructs(CAST(value AS STRING))"][2].asDict()['payload'])
# pp

# Read

In [239]:
df = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "twitter_tweets") \
    .option("endingOffsets", """{"twitter_tweets":{"0":10}}""") \
    .load() \
    .selectExpr("CAST(value AS STRING) as tweets")

df.printSchema()

root
 |-- tweets: string (nullable = true)



In [240]:
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+



In [241]:
df.schema

StructType(List(StructField(tweets,StringType,true)))

In [270]:
import pyspark.sql.functions as psf


def parseJSONCols(df, *cols, sanitize=True):
    """Auto infer the schema of a json column and parse into a struct.

    rdd-based schema inference works if you have well-formatted JSON,
    like ``{"key": "value", ...}``, but breaks if your 'JSON' is just a
    string (``"data"``) or is an array (``[1, 2, 3]``). In those cases you
    can fix everything by wrapping the data in another JSON object
    (``{"key": [1, 2, 3]}``). The ``sanitize`` option (default True)
    automatically performs the wrapping and unwrapping.

    The schema inference is based on this
    `SO Post <https://stackoverflow.com/a/45880574)/>`_.

    Parameters
    ----------
    df : pyspark dataframe
        Dataframe containing the JSON cols.
    *cols : string(s)
        Names of the columns containing JSON.
    sanitize : boolean
        Flag indicating whether you'd like to sanitize your records
        by wrapping and unwrapping them in another JSON object layer.

    Returns
    -------
    pyspark dataframe
        A dataframe with the decoded columns.
    """
    res = df
    for i in cols:
        # sanitize if requested.
        if sanitize:
            res = (
                res.withColumn(
                    i,
                    psf.concat(psf.lit('{"data": '), i, psf.lit('}'))
                )
            )
        # infer schema and apply it
        schema = spark.read.json(res.rdd.map(lambda x: x[i])).schema
        res = res.withColumn(i, psf.from_json(psf.col(i), schema))

        # unpack the wrapped object if needed
        if sanitize:
            res = res.withColumn(i, psf.col(i).data)
    return res

In [271]:
res = parseJSONCols(df, 'tweets', sanitize=False)
res.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [272]:
res.printSchema()

root
 |-- tweets: struct (nullable = true)
 |    |-- contributors: string (nullable = true)
 |    |-- coordinates: struct (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- display_text_range: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |-- media: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |    |-- description: 

In [273]:
schema = res.schema
schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [274]:
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+
only showing top 20 rows



In [275]:
res.show()

+--------------------+
|              tweets|
+--------------------+
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
+--------------------+
only showing top 20 rows



In [276]:
res.select("tweets.created_at", "tweets.coordinates").show()

+--------------------+-----------+
|          created_at|coordinates|
+--------------------+-----------+
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:14:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
|Tue May 26 16:15:...|       null|
+--------------------+-----------+
only showing top 20 rows



In [277]:
res.select("tweets.created_at", "tweets.coordinates", "tweets.user.followers_count").show()

+--------------------+-----------+---------------+
|          created_at|coordinates|followers_count|
+--------------------+-----------+---------------+
|Tue May 26 16:14:...|       null|            696|
|Tue May 26 16:14:...|       null|              9|
|Tue May 26 16:14:...|       null|            845|
|Tue May 26 16:14:...|       null|            334|
|Tue May 26 16:14:...|       null|            526|
|Tue May 26 16:14:...|       null|             44|
|Tue May 26 16:14:...|       null|            103|
|Tue May 26 16:14:...|       null|            568|
|Tue May 26 16:14:...|       null|            192|
|Tue May 26 16:14:...|       null|            807|
|Tue May 26 16:15:...|       null|          14108|
|Tue May 26 16:15:...|       null|            310|
|Tue May 26 16:14:...|       null|             28|
|Tue May 26 16:15:...|       null|              5|
|Tue May 26 16:15:...|       null|             57|
|Tue May 26 16:15:...|       null|            229|
|Tue May 26 16:15:...|       nu

In [278]:
res.select("tweets.created_at", "tweets.coordinates", "tweets.user.followers_count") \
                .where(col("tweets.coordinates").isNull()) \
                .where(col("tweets.user.followers_count") > 100000) \
                .show()

+--------------------+-----------+---------------+
|          created_at|coordinates|followers_count|
+--------------------+-----------+---------------+
|Tue May 26 16:27:...|       null|         133918|
|Tue May 26 19:44:...|       null|         230348|
|Tue May 26 19:45:...|       null|         200857|
|Wed May 27 09:57:...|       null|         447390|
|Wed May 27 09:57:...|       null|         127387|
|Wed May 27 09:58:...|       null|         512790|
|Wed May 27 09:58:...|       null|         203863|
|Wed May 27 09:58:...|       null|         190846|
|Wed May 27 09:58:...|       null|         127387|
|Wed May 27 09:58:...|       null|         286442|
|Wed May 27 09:59:...|       null|         134188|
|Wed May 27 09:59:...|       null|         333014|
|Wed May 27 09:59:...|       null|         134188|
|Wed May 27 09:59:...|       null|         594976|
|Wed May 27 09:59:...|       null|        2593438|
|Wed May 27 09:59:...|       null|         855058|
|Wed May 27 09:59:...|       nu

In [279]:
from pyspark.sql.functions import size
resHash = res.withColumn("size", size(col("tweets.entities.hashtags")))
resHash_filtered = resHash.filter(col("size") >= 1)
resHash.show()

+--------------------+----+
|              tweets|size|
+--------------------+----+
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   2|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   1|
|[,, Tue May 26 16...|   2|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   0|
|[,, Tue May 26 16...|   1|
|[,, Tue May 26 16...|   0|
+--------------------+----+
only showing top 20 rows



In [280]:
res.select("tweets.created_at", "tweets.place.bounding_box.coordinates", "tweets.user.followers_count") \
                .where(col("tweets.place.bounding_box.coordinates").isNotNull()) \
                .show()

+--------------------+--------------------+---------------+
|          created_at|         coordinates|followers_count|
+--------------------+--------------------+---------------+
|Tue May 26 16:14:...|[[[-76.711521, 39...|             44|
|Tue May 26 16:40:...|[[[-2.319934, 53....|            235|
|Tue May 26 19:42:...|[[[-95.846367, 29...|            120|
|Wed May 27 09:57:...|[[[-77.144435, 38...|           3467|
|Wed May 27 09:57:...|[[[1.373441, 52.6...|            302|
|Wed May 27 09:57:...|[[[0.378141, 52.7...|           1775|
|Wed May 27 09:57:...|[[[-3.062965, 53....|           1464|
|Wed May 27 09:57:...|[[[-1.321481, 53....|            174|
|Wed May 27 09:57:...|[[[-1.568224, 53....|          20406|
|Wed May 27 09:58:...|[[[-2.814317, 52....|             20|
|Wed May 27 09:58:...|[[[0.621509, 51.9...|            162|
|Wed May 27 09:58:...|[[[-63.39386, -41...|           1460|
|Wed May 27 09:58:...|[[[-0.968219, 52....|            314|
|Wed May 27 09:58:...|[[[-4.11587, 55.7.

In [281]:
df = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "twitter_tweets") \
    .option("startingOffsets", "earliest") \
    .load() \
    .selectExpr("CAST(value AS STRING) as tweets")

df.printSchema()

root
 |-- tweets: string (nullable = true)



In [282]:
res = parseJSONCols(df, 'tweets', sanitize=True)
res.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [283]:
res.count()

9272

In [284]:
res.select("tweets.created_at", "tweets.coordinates", "tweets.user.followers_count") \
                .where(col("tweets.coordinates").isNotNull()) \
                .show()

+--------------------+--------------------+---------------+
|          created_at|         coordinates|followers_count|
+--------------------+--------------------+---------------+
|Sun May 31 20:25:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:25:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:26:...|[[-0.12085, 51.46...|            411|
|Sun May 31 20:29:...|[[-0.1094, 51.514...|           2096|
|Sun May 31 20:30:...|[[0.107, 51.486],...|            824|
|Sun May 31 20:36:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|             20|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            936|
|Sun May 31 20:37:...|[[0.02868891, 51....|            203|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            649|
|Sun May 31 20:37:...|[[-0.22537, 51.53...|            411|
|Sun May 31 20:37:...|[[-0.1094, 51.514...|            103|
+--------------------+------------------

In [285]:
res.count()

9272

pandasUDF to get coordinates from list

# After watching the VIDEO

In [310]:
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("startingOffsets", "earliest") \
    .option("subscribe", "twitter_tweets") \
    .load() \
    .selectExpr("CAST(value AS STRING) as tweets")

In [311]:
df.printSchema()

root
 |-- tweets: string (nullable = true)



In [312]:
schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [313]:
tweets = df.writeStream \
                    .format("memory") \
                    .queryName("tweeters") \
                    .outputMode("update") \
                    .start()

In [323]:
tweets.stop()

In [314]:
df = spark.sql(""" SELECT * FROM tweeters """)

In [315]:
ddf = parseJSONCols(df, 'tweets', sanitize=True)
ddf.schema

StructType(List(StructField(tweets,StructType(List(StructField(contributors,StringType,true),StructField(coordinates,StructType(List(StructField(coordinates,ArrayType(DoubleType,true),true),StructField(type,StringType,true))),true),StructField(created_at,StringType,true),StructField(display_text_range,ArrayType(LongType,true),true),StructField(entities,StructType(List(StructField(hashtags,ArrayType(StructType(List(StructField(indices,ArrayType(LongType,true),true),StructField(text,StringType,true))),true),true),StructField(media,ArrayType(StructType(List(StructField(additional_media_info,StructType(List(StructField(description,StringType,true),StructField(embeddable,BooleanType,true),StructField(monetizable,BooleanType,true),StructField(title,StringType,true))),true),StructField(description,StringType,true),StructField(display_url,StringType,true),StructField(expanded_url,StringType,true),StructField(id,LongType,true),StructField(id_str,StringType,true),StructField(indices,ArrayType(Lo

In [316]:
ddf.printSchema()

root
 |-- tweets: struct (nullable = true)
 |    |-- contributors: string (nullable = true)
 |    |-- coordinates: struct (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- display_text_range: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |-- media: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |    |-- description: 

In [322]:
tweets.isActive

True

In [268]:
df = spark.sql(""" SELECT * FROM tweets_data """)

In [318]:
df.show()

+--------------------+
|              tweets|
+--------------------+
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
|{"created_at":"Tu...|
+--------------------+
only showing top 20 rows



In [319]:
ddf.show()

+--------------------+
|              tweets|
+--------------------+
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
|[,, Tue May 26 16...|
+--------------------+
only showing top 20 rows



In [320]:
ddf.select("tweets.created_at", "tweets.place.bounding_box.coordinates", "tweets.user.followers_count").show(truncate=False)

+------------------------------+------------------------------------------------------------------------------------------------------+---------------+
|created_at                    |coordinates                                                                                           |followers_count|
+------------------------------+------------------------------------------------------------------------------------------------------+---------------+
|Tue May 26 16:14:27 +0000 2020|null                                                                                                  |696            |
|Tue May 26 16:14:28 +0000 2020|null                                                                                                  |9              |
|Tue May 26 16:14:30 +0000 2020|null                                                                                                  |845            |
|Tue May 26 16:14:34 +0000 2020|null                                                    

In [321]:
ddf.count()

9272

In [366]:
dd = ddf.select("tweets.created_at", "tweets.coordinates.coordinates", "tweets.user.followers_count") \
                .where(col("tweets.coordinates").isNotNull())

In [332]:
ddf.select("tweets.created_at", "tweets.coordinates.coordinates", "tweets.user.followers_count") \
                .where(col("tweets.coordinates").isNotNull()) \
                .show(truncate=False)

+------------------------------+-------------------------+---------------+
|created_at                    |coordinates              |followers_count|
+------------------------------+-------------------------+---------------+
|Sun May 31 20:25:00 +0000 2020|[-0.12085, 51.4682]      |411            |
|Sun May 31 20:25:35 +0000 2020|[-0.12085, 51.4682]      |411            |
|Sun May 31 20:26:40 +0000 2020|[-0.12085, 51.4682]      |411            |
|Sun May 31 20:29:30 +0000 2020|[-0.1094, 51.5141]       |2096           |
|Sun May 31 20:30:08 +0000 2020|[0.107, 51.486]          |824            |
|Sun May 31 20:36:48 +0000 2020|[-0.22537, 51.530537]    |411            |
|Sun May 31 20:37:14 +0000 2020|[-0.22537, 51.530537]    |411            |
|Sun May 31 20:37:15 +0000 2020|[-0.1094, 51.5141]       |20             |
|Sun May 31 20:37:21 +0000 2020|[-0.1094, 51.5141]       |936            |
|Sun May 31 20:37:30 +0000 2020|[0.02868891, 51.50814147]|203            |
|Sun May 31 20:37:38 +000

In [506]:
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import *

# Declare the function and create the UDF
def multiply_func(a):
    return a * 2

multiply = pandas_udf(multiply_func, returnType=LongType())

# The function for a pandas_udf should be able to execute with local pandas data
x = pd.Series([1, 2, 3])
print(multiply_func(x))
# 0    1
# 1    4
# 2    9
# dtype: int64

# Create a Spark DataFrame, 'spark' is an existing SparkSession
# df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

# Execute function as a Spark vectorized UDF
# df.select(multiply(col("x"), col("x"))).show()

@pandas_udf(ArrayType(FloatType()))
def lat(v):
    return v.apply(lambda x: x[1:])

@pandas_udf(ArrayType(FloatType()))
def long(v):
    return v.apply(lambda x: x[:1])
# lati = pandas_udf(lat, returnType=DoubleType())

@pandas_udf(ArrayType(FloatType()))
def second_pass(v):
    res=[]
    for row in v:
        res.append(row[0])
    return pd.Series(res)

0    2
1    4
2    6
dtype: int64


In [488]:
dd.show()

+--------------------+--------------------+---------------+
|          created_at|         coordinates|followers_count|
+--------------------+--------------------+---------------+
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|
|Sun May 31 20:26:...| [-0.12085, 51.4682]|            411|
|Sun May 31 20:29:...|  [-0.1094, 51.5141]|           2096|
|Sun May 31 20:30:...|     [0.107, 51.486]|            824|
|Sun May 31 20:36:...|[-0.22537, 51.530...|            411|
|Sun May 31 20:37:...|[-0.22537, 51.530...|            411|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|             20|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|            936|
|Sun May 31 20:37:...|[0.02868891, 51.5...|            203|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|            649|
|Sun May 31 20:37:...|[-0.22537, 51.530...|            411|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|            103|
+--------------------+------------------

In [489]:
dd.withColumn("follower_count_times_two", multiply(col("followers_count"))).show()

+--------------------+--------------------+---------------+------------------------+
|          created_at|         coordinates|followers_count|follower_count_times_two|
+--------------------+--------------------+---------------+------------------------+
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|                     822|
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|                     822|
|Sun May 31 20:26:...| [-0.12085, 51.4682]|            411|                     822|
|Sun May 31 20:29:...|  [-0.1094, 51.5141]|           2096|                    4192|
|Sun May 31 20:30:...|     [0.107, 51.486]|            824|                    1648|
|Sun May 31 20:36:...|[-0.22537, 51.530...|            411|                     822|
|Sun May 31 20:37:...|[-0.22537, 51.530...|            411|                     822|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|             20|                      40|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|            936|      

In [490]:
ldd = dd.withColumn("latitude", lat(col("coordinates")))
lldd = ldd.withColumn("longditude", long(col("coordinates")))
lldd.show()

+--------------------+--------------------+---------------+-----------+------------+
|          created_at|         coordinates|followers_count|   latitude|  longditude|
+--------------------+--------------------+---------------+-----------+------------+
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|  [51.4682]|  [-0.12085]|
|Sun May 31 20:25:...| [-0.12085, 51.4682]|            411|  [51.4682]|  [-0.12085]|
|Sun May 31 20:26:...| [-0.12085, 51.4682]|            411|  [51.4682]|  [-0.12085]|
|Sun May 31 20:29:...|  [-0.1094, 51.5141]|           2096|  [51.5141]|   [-0.1094]|
|Sun May 31 20:30:...|     [0.107, 51.486]|            824|   [51.486]|     [0.107]|
|Sun May 31 20:36:...|[-0.22537, 51.530...|            411|[51.530537]|  [-0.22537]|
|Sun May 31 20:37:...|[-0.22537, 51.530...|            411|[51.530537]|  [-0.22537]|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|             20|  [51.5141]|   [-0.1094]|
|Sun May 31 20:37:...|  [-0.1094, 51.5141]|            936|  [51.

In [491]:
lldd_pdf = lldd.toPandas()

In [477]:
lldd_pdf['latitude'].tolist()[0].item()

51.46820068359375

In [493]:
import numpy as np
import pandas as pd

In [492]:
lati_series = lldd_pdf['latitude'].transform(lambda x: x.item())

In [494]:
longd_series = lldd_pdf['longditude'].transform(lambda x: x.item())

In [495]:
pd.DataFrame({'latitude': lati_series, 'longtitude': longd_series})

Unnamed: 0,latitude,longtitude
0,51.468201,-0.12085
1,51.468201,-0.12085
2,51.468201,-0.12085
3,51.514099,-0.1094
4,51.486,0.107
5,51.530537,-0.22537
6,51.530537,-0.22537
7,51.514099,-0.1094
8,51.514099,-0.1094
9,51.508141,0.028689


In [496]:
lldd_pdf['llat'] = lldd_pdf['latitude'].transform(lambda x: x.item())

In [497]:
lldd_pdf['llong'] = lldd_pdf['longditude'].transform(lambda x: x.item())

In [498]:
lldd_pdf.head()

Unnamed: 0,created_at,coordinates,followers_count,latitude,longditude,llat,llong
0,Sun May 31 20:25:00 +0000 2020,"[-0.12085, 51.4682]",411,[51.4682],[-0.12085],51.468201,-0.12085
1,Sun May 31 20:25:35 +0000 2020,"[-0.12085, 51.4682]",411,[51.4682],[-0.12085],51.468201,-0.12085
2,Sun May 31 20:26:40 +0000 2020,"[-0.12085, 51.4682]",411,[51.4682],[-0.12085],51.468201,-0.12085
3,Sun May 31 20:29:30 +0000 2020,"[-0.1094, 51.5141]",2096,[51.5141],[-0.1094],51.514099,-0.1094
4,Sun May 31 20:30:08 +0000 2020,"[0.107, 51.486]",824,[51.486],[0.107],51.486,0.107


In [501]:
import plotly.express as px

fig = px.scatter_mapbox(lldd_pdf, lat="llat", lon="llong", color="followers_count",
                        color_continuous_scale=px.colors.cyclical.IceFire, zoom=9.5, height=600)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [505]:
import plotly.express as px

fig = px.scatter_mapbox(lldd_pdf, lat="llat", lon="llong", color="followers_count",
                        color_continuous_scale=px.colors.cyclical.IceFire, zoom=9.5, height=600)
fig.update_layout(
    mapbox_style="carto-positron"
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [433]:
import pyarrow as pa
print(pa.__version__)

0.17.1


In [434]:
ldd.withColumn("latitude_pop", second_pass(col("latitude"))).show()

Py4JJavaError: An error occurred while calling o5808.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 211.0 failed 1 times, most recent failure: Lost task 1.0 in stage 211.0 (TID 334, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 291, in dump_stream
    batch = _create_batch(series, self._timezone)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 260, in _create_batch
    arrs = [create_array(s, t) for s, t in series]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 260, in <listcomp>
    arrs = [create_array(s, t) for s, t in series]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 258, in create_array
    return pa.Array.from_pandas(s, mask=mask, type=t, safe=False)
  File "pyarrow/array.pxi", line 805, in pyarrow.lib.Array.from_pandas
  File "pyarrow/array.pxi", line 265, in pyarrow.lib.array
  File "pyarrow/array.pxi", line 80, in pyarrow.lib._ndarray_to_array
  File "pyarrow/error.pxi", line 106, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: NumPyConverter doesn't implement <list<item: float>> conversion. 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor59.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 291, in dump_stream
    batch = _create_batch(series, self._timezone)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 260, in _create_batch
    arrs = [create_array(s, t) for s, t in series]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 260, in <listcomp>
    arrs = [create_array(s, t) for s, t in series]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 258, in create_array
    return pa.Array.from_pandas(s, mask=mask, type=t, safe=False)
  File "pyarrow/array.pxi", line 805, in pyarrow.lib.Array.from_pandas
  File "pyarrow/array.pxi", line 265, in pyarrow.lib.array
  File "pyarrow/array.pxi", line 80, in pyarrow.lib._ndarray_to_array
  File "pyarrow/error.pxi", line 106, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: NumPyConverter doesn't implement <list<item: float>> conversion. 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [391]:
ddf.withColumn("follower_count_times_two", multiply(col("tweets.user.`followers_count`"))).show()

+--------------------+------------------------+
|              tweets|follower_count_times_two|
+--------------------+------------------------+
|[,, Tue May 26 16...|                    1392|
|[,, Tue May 26 16...|                      18|
|[,, Tue May 26 16...|                    1690|
|[,, Tue May 26 16...|                     668|
|[,, Tue May 26 16...|                    1052|
|[,, Tue May 26 16...|                      88|
|[,, Tue May 26 16...|                     206|
|[,, Tue May 26 16...|                    1136|
|[,, Tue May 26 16...|                     384|
|[,, Tue May 26 16...|                    1614|
|[,, Tue May 26 16...|                   28216|
|[,, Tue May 26 16...|                     620|
|[,, Tue May 26 16...|                      56|
|[,, Tue May 26 16...|                      10|
|[,, Tue May 26 16...|                     114|
|[,, Tue May 26 16...|                     458|
|[,, Tue May 26 16...|                     254|
|[,, Tue May 26 16...|                  

In [358]:
ddf.withColumn("latitude", lati(col("tweets.coordinates.`coordinates`"))).show()

Py4JJavaError: An error occurred while calling o4424.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 117.0 failed 1 times, most recent failure: Lost task 0.0 in stage 117.0 (TID 153, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 290, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 101, in <lambda>
    return lambda *a: (verify_result_length(*a), arrow_return_type)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 92, in verify_result_length
    result = f(*a)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/pyspark/sql/udf.py", line 189, in wrapper
    return self(*args)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/udf.py", line 169, in __call__
    return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 65, in _to_seq
    cols = [converter(c) for c in cols]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 65, in <listcomp>
    cols = [converter(c) for c in cols]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 53, in _to_java_column
    "function.".format(col, type(col)))
TypeError: Invalid argument, not a string or column: 0       None
1       None
2       None
3       None
4       None
        ... 
2313    None
2314    None
2315    None
2316    None
2317    None
Name: _0, Length: 2318, dtype: object of type <class 'pandas.core.series.Series'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor59.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 290, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 101, in <lambda>
    return lambda *a: (verify_result_length(*a), arrow_return_type)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 92, in verify_result_length
    result = f(*a)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/pyspark/sql/udf.py", line 189, in wrapper
    return self(*args)
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/udf.py", line 169, in __call__
    return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 65, in _to_seq
    cols = [converter(c) for c in cols]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 65, in <listcomp>
    cols = [converter(c) for c in cols]
  File "/usr/local/Cellar/apache-spark/2.4.5/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 53, in _to_java_column
    "function.".format(col, type(col)))
TypeError: Invalid argument, not a string or column: 0       None
1       None
2       None
3       None
4       None
        ... 
2313    None
2314    None
2315    None
2316    None
2317    None
Name: _0, Length: 2318, dtype: object of type <class 'pandas.core.series.Series'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [328]:
# @pandas_udf(ArrayType(LongType()))
# def func(v):
#     return v.apply(lambda x: x[1:])

In [334]:
# df.withColumn('lat', func(df["tweets.coordinates.coordinates"])).show()

AnalysisException: "Can't extract value from tweets#3047: need struct type but got string;"

In [238]:
tweets.stop()