In [1]:
import sys
import os

In [2]:
# Display the JAVA_HOME setting (None when the variable is not defined).
os.getenv('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
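# Optional: uncomment the two lines below if `pyspark` cannot be imported
# directly; findspark locates the Spark installation and adds it to sys.path.
# import findspark
# findspark.init()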

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .master("local[*]")
    .getOrCreate()
)

In [6]:
# Raw device readings, one JSON document per device, kept as plain strings.
device_payloads = [
    '{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }',
    '{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }',
    '{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }',
    '{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }',
    '{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }',
    '{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }',
    '{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }',
]

# Pair every payload with its positional id: [(0, payload0), (1, payload1), ...].
json_data = list(enumerate(device_payloads))

In [11]:
# Distribute the local list of (id, json_string) tuples as an RDD.
spark_context = spark.sparkContext
rdd = spark_context.parallelize(json_data)

In [15]:
# Name the two tuple positions; the column types are inferred from the data.
column_names = ["id", "device"]
df = spark.createDataFrame(rdd, schema=column_names)

In [16]:
# Show the rows; truncate=False keeps the long JSON strings in `device` fully visible.
df.show(truncate=False)

+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |device                                                                                                                                                                                                   |
+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }     |
|1  |{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1

In [25]:
# Print the schema: at this point `device` is still an unparsed string column.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- device: string (nullable = true)



## from_json()
Using the `from_json()` function inside `select()`, we can decode the attributes and values of a JSON string into a struct column whose fields are dictated by a schema. Here the schema lists every attribute of the JSON document that represents a device entity. Once parsed, you can retrieve an individual attribute by name (for example `devices.device_type`) or expand all of them into top-level columns using the `*` notation (`devices.*`).

In [20]:
# Schema handed to from_json(): one nullable field per JSON attribute.
# The field order here determines the field order of the parsed struct.
jsonSchema = StructType([
    StructField("battery_level", LongType()),
    StructField("c02_level", LongType()),
    StructField("cca3", StringType()),
    StructField("cn", StringType()),
    StructField("device_id", LongType()),
    StructField("device_type", StringType()),
    StructField("signal", LongType()),
    StructField("ip", StringType()),
    StructField("temp", LongType()),
    StructField("timestamp", TimestampType()),
])

In [21]:
# Parse the JSON string in `device` into a typed struct column named `devices`.
parsed_device = from_json("device", jsonSchema).alias("devices")
devicesDF = df.select("id", parsed_device)

In [23]:
# Show the parsed rows; each `devices` value is now a struct rather than a JSON string.
devicesDF.show(truncate=False)

+---+--------------------------------------------------------------------------------------------+
|id |devices                                                                                     |
+---+--------------------------------------------------------------------------------------------+
|0  |{8, 917, USA, United States, 0, sensor-ipad, 23, 68.161.225.1, 25, 2016-10-04 13:01:36}     |
|1  |{6, 1413, NOR, Norway, 1, sensor-igauge, 18, 213.161.254.1, 30, 2016-10-04 13:01:38}        |
|2  |{5, 1372, ITA, Italy, 2, sensor-ipad, 25, 88.36.5.1, 18, 2016-10-04 13:01:40}               |
|3  |{1, 1447, USA, United States, 3, sensor-inest, 12, 66.39.173.154, 47, 2016-10-04 13:01:42}  |
|4  |{0, 983, PHL, Philippines, 4, sensor-ipad, 11, 203.82.41.9, 29, 2016-10-04 13:01:44}        |
|5  |{8, 1574, USA, United States, 5, sensor-istick, 16, 204.116.105.67, 50, 2016-10-04 13:01:46}|
|6  |{9, 1249, CHN, China, 6, sensor-ipad, 18, 220.173.179.1, 21, 2016-10-04 13:01:48}           |
+---+-----

In [24]:
# Print the schema to confirm `devices` is a struct carrying the typed fields of jsonSchema.
devicesDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- devices: struct (nullable = true)
 |    |-- battery_level: long (nullable = true)
 |    |-- c02_level: long (nullable = true)
 |    |-- cca3: string (nullable = true)
 |    |-- cn: string (nullable = true)
 |    |-- device_id: long (nullable = true)
 |    |-- device_type: string (nullable = true)
 |    |-- signal: long (nullable = true)
 |    |-- ip: string (nullable = true)
 |    |-- temp: long (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)



In [27]:
# "devices.*" expands every field of the `devices` struct into its own top-level column.
flattened_devices = devicesDF.select("id", "devices.*")
flattened_devices.show(truncate=False)

+---+-------------+---------+----+-------------+---------+-------------+------+--------------+----+-------------------+
|id |battery_level|c02_level|cca3|cn           |device_id|device_type  |signal|ip            |temp|timestamp          |
+---+-------------+---------+----+-------------+---------+-------------+------+--------------+----+-------------------+
|0  |8            |917      |USA |United States|0        |sensor-ipad  |23    |68.161.225.1  |25  |2016-10-04 13:01:36|
|1  |6            |1413     |NOR |Norway       |1        |sensor-igauge|18    |213.161.254.1 |30  |2016-10-04 13:01:38|
|2  |5            |1372     |ITA |Italy        |2        |sensor-ipad  |25    |88.36.5.1     |18  |2016-10-04 13:01:40|
|3  |1            |1447     |USA |United States|3        |sensor-inest |12    |66.39.173.154 |47  |2016-10-04 13:01:42|
|4  |0            |983      |PHL |Philippines  |4        |sensor-ipad  |11    |203.82.41.9   |29  |2016-10-04 13:01:44|
|5  |8            |1574     |USA |United

## get_json_object()
`get_json_object()` extracts a single JSON element from a JSON string based on the JSON path specified. Unlike `from_json()`, it does not need a schema; each call returns the matched value as a string. Below we extract the "device_type", "ip", and "cca3" fields from the JSON string.

In [30]:
# Pull selected attributes straight out of the JSON string by JSON path
# ("$.<field>"), naming each resulting column after its field.
extracted_fields = ["device_type", "ip", "cca3"]
json_df = df.select(
    "id",
    *[
        get_json_object("device", f"$.{field}").alias(field)
        for field in extracted_fields
    ],
)

In [31]:
# Show the three attributes extracted with get_json_object() next to the row id.
json_df.show(truncate=False)

+---+-------------+--------------+----+
|id |device_type  |ip            |cca3|
+---+-------------+--------------+----+
|0  |sensor-ipad  |68.161.225.1  |USA |
|1  |sensor-igauge|213.161.254.1 |NOR |
|2  |sensor-ipad  |88.36.5.1     |ITA |
|3  |sensor-inest |66.39.173.154 |USA |
|4  |sensor-ipad  |203.82.41.9   |PHL |
|5  |sensor-istick|204.116.105.67|USA |
|6  |sensor-ipad  |220.173.179.1 |CHN |
+---+-------------+--------------+----+



## to_json()
This serializes every column of `devicesDF` (the `id` column together with the earlier `devices` struct column) into a single JSON string column named `devices`.

In [33]:
# Wrap every column of devicesDF in one struct, serialize it to a JSON string,
# and expose the result as a single column named "devices".
row_as_json = to_json(struct(col("*")))
stringJsonDF = devicesDF.select(row_as_json.alias("devices"))

In [35]:
# Print the schema: the serialized result is a single string column.
stringJsonDF.printSchema()

root
 |-- devices: string (nullable = true)



In [34]:
# Show the generated JSON strings in full (no truncation).
stringJsonDF.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|devices                                                                                                                                                                                                                      |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"id":0,"devices":{"battery_level":8,"c02_level":917,"cca3":"USA","cn":"United States","device_id":0,"device_type":"sensor-ipad","signal":23,"ip":"68.161.225.1","temp":25,"timestamp":"2016-10-04T13:01:36.000-04:00"}}     |
|{"id":1,"devices":{"battery_level":6,"c02_level":1413,"cca3":"NOR","cn":"Norway","device_id":1,"device_

## json_tuple()
`json_tuple()` extracts the requested fields from a JSON string and creates them as new columns. All resulting columns are of StringType and, unless aliased, are named `c0`, `c1`, … in the order the fields were requested.

In [36]:
# Extract several top-level JSON fields in one pass. No aliases are given, so
# the output columns get positional names (c0, c1, ...) in this field order.
tuple_fields = ("battery_level", "c02_level", "cca3", "cn", "device_id")
jstuple_df = df.select(json_tuple(col("device"), *tuple_fields))

In [37]:
# Show the extracted fields; the headers are positional (c0..c4) because no aliases were set.
jstuple_df.show()

+---+----+---+-------------+---+
| c0|  c1| c2|           c3| c4|
+---+----+---+-------------+---+
|  8| 917|USA|United States|  0|
|  6|1413|NOR|       Norway|  1|
|  5|1372|ITA|        Italy|  2|
|  1|1447|USA|United States|  3|
|  0| 983|PHL|  Philippines|  4|
|  8|1574|USA|United States|  5|
|  9|1249|CHN|        China|  6|
+---+----+---+-------------+---+

