In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

def jsonToDataFrame(json, schema=None):
  """Convenience helper: turn one JSON string into a Spark DataFrame.

  Args:
    json: JSON text; it is wrapped in a one-element RDD before being read.
    schema: optional schema applied to the reader when truthy.

  Returns:
    The DataFrame produced by the JSON reader.
  """
  # SparkSessions are available with Spark 2.0+
  reader = spark.read.schema(schema) if schema else spark.read
  return reader.json(sc.parallelize([json]))

In [2]:
# Using a struct
schema = StructType().add("a", StructType().add("b", IntegerType()))
                          
events = jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

# Dot notation selects nested field "b" of struct column "a".
display(events.select("a.b"))

In [3]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))
                          
events = jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

# The same dot notation is used here, with "a" declared as a map and "b" as the key.
display(events.select("a.b"))

In [4]:
# No schema is passed for this document.
events = jsonToDataFrame("""
{
  "a": {
     "b": 1,
     "c": 2
  }
}
""")

# "a.*" expands column "a" into one output column per nested field.
display(events.select("a.*"))

In [5]:
events = jsonToDataFrame("""
{
  "a": 1,
  "b": 2,
  "c": 3
}
""")

# Build a struct column "x" containing column "a" renamed to "y".
display(events.select(struct(col("a").alias("y")).alias("x")))

In [6]:
events = jsonToDataFrame("""
{
  "a": 1,
  "b": 2
}
""")

# struct("*") packs every column into a single struct column, aliased "x".
display(events.select(struct("*").alias("x")))

In [7]:
events = jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

# getItem(0) picks the element at index 0 of array column "a".
display(events.select(col("a").getItem(0).alias("x")))

In [8]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))

events = jsonToDataFrame("""
{
  "a": {
    "b": 1
  }
}
""", schema)

# getItem("b") looks up key "b" in map column "a".
display(events.select(col("a").getItem("b").alias("x")))

In [9]:
events = jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

# explode() emits one output row per element of array column "a".
display(events.select(explode("a").alias("x")))

In [10]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))

events = jsonToDataFrame("""
{
  "a": {
    "b": 1,
    "c": 2
  }
}
""", schema)

# Exploding a map produces two columns per entry (key and value), aliased "x" and "y".
display(events.select(explode("a").alias("x", "y")))

In [11]:
# The document is a top-level JSON array of two objects.
events = jsonToDataFrame("""
[{ "x": 1 }, { "x": 2 }]
""")

# collect_list() gathers the "x" values into a single array column.
display(events.select(collect_list("x").alias("x")))

In [12]:
# Using an aggregation
events = jsonToDataFrame("""
[{ "x": 1, "y": "a" }, { "x": 2, "y": "b" }]
""")

# One array of "x" values is collected per distinct value of "y".
display(events.groupBy("y").agg(collect_list("x").alias("x")))

In [13]:
# Column "a" is an array whose elements are objects with a "b" field.
events = jsonToDataFrame("""
{
  "a": [
    {"b": 1},
    {"b": 2}
  ]
}
""")

# Dot notation is applied to the array column: "a.b" addresses field "b" of the elements.
display(events.select("a.b"))

In [14]:
events = jsonToDataFrame("""
{
  "a": {
    "b": 1
  }
}
""")

# to_json() serialises column "a" to a JSON string, aliased "c".
display(events.select(to_json("a").alias("c")))

In [15]:
# Column "a" holds JSON text (the inner quotes are escaped inside the outer document).
events = jsonToDataFrame("""
{
  "a": "{\\"b\\":1}"
}
""")

# from_json() parses that text using the schema declared here.
schema = StructType().add("b", IntegerType())
display(events.select(from_json("a", schema).alias("c")))

In [16]:
# Column "a" holds JSON text with two levels of nesting under "b".
events = jsonToDataFrame("""
{
  "a": "{\\"b\\":{\\"x\\":1,\\"y\\":{\\"z\\":2}}}"
}
""")

# Note that "y" is declared as StringType even though the embedded JSON has an object there.
schema = StructType().add("b", StructType().add("x", IntegerType())
                            .add("y", StringType()))
display(events.select(from_json("a", schema).alias("c")))



In [17]:
events = jsonToDataFrame("""
{
  "a": "{\\"b\\":1}"
}
""")

# json_tuple() extracts field "b" from the JSON text stored in column "a".
display(events.select(json_tuple("a", "b").alias("c")))



In [18]:
events = jsonToDataFrame("""
[{ "a": "x: 1" }, { "a": "y: 2" }]
""")

# Capture group 1 of the pattern is the lowercase letter directly before the colon.
display(events.select(regexp_extract("a", "([a-z]):", 1).alias("c")))

In [19]:
%scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

/**
 * Convenience helper: turns one JSON string into a DataFrame.
 *
 * @param json   JSON text to read
 * @param schema schema applied to the reader; skipped when null (the default)
 * @return the DataFrame produced by the JSON reader
 */
def jsonToDataFrame(json: String, schema: StructType = null): DataFrame = {
  // SparkSessions are available with Spark 2.0+
  val jsonReader = spark.read
  if (schema != null) {
    jsonReader.schema(schema)
  }
  jsonReader.json(sc.parallelize(Array(json)))
}

In [20]:
%scala
// Using a struct
val schema = new StructType().add("a", new StructType().add("b", IntegerType))
                          
val events = jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

// Dot notation selects nested field "b" of struct column "a".
display(events.select("a.b"))

In [21]:
%scala
// Using a map
val schema = new StructType().add("a", MapType(StringType, IntegerType))
                          
val events = jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

// The same dot notation is used here, with "a" declared as a map and "b" as the key.
display(events.select("a.b"))

In [22]:
%scala
// No schema is passed for this document.
val events = jsonToDataFrame("""
{
  "a": {
     "b": 1,
     "c": 2
  }
}
""")

// "a.*" expands column "a" into one output column per nested field.
display(events.select("a.*"))

In [23]:
%scala
val events = jsonToDataFrame("""
{
  "a": 1,
  "b": 2,
  "c": 3
}
""")

// Build a struct column "x" containing column "a" renamed to "y".
display(events.select(struct('a as 'y) as 'x))



In [24]:
%scala
val events = jsonToDataFrame("""
{
  "a": 1,
  "b": 2
}
""")

// struct("*") packs every column into a single struct column, aliased "x".
display(events.select(struct("*") as 'x))



In [25]:
%scala
val events = jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

// getItem(0) picks the element at index 0 of array column "a".
display(events.select('a.getItem(0) as 'x))

In [26]:
%scala
// Using a map
val schema = new StructType().add("a", MapType(StringType, IntegerType))

val events = jsonToDataFrame("""
{
  "a": {
    "b": 1
  }
}
""", schema)

// getItem("b") looks up key "b" in map column "a".
display(events.select('a.getItem("b") as 'x))

In [27]:
%scala
val events = jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

// explode() emits one output row per element of array column "a".
display(events.select(explode('a) as 'x))

In [28]:
%scala
// Using a map
val schema = new StructType().add("a", MapType(StringType, IntegerType))

val events = jsonToDataFrame("""
{
  "a": {
    "b": 1,
    "c": 2
  }
}
""", schema)

// Exploding a map produces two columns per entry (key and value), aliased "x" and "y".
display(events.select(explode('a) as (Seq("x", "y"))))



In [29]:
%scala
// The document is a top-level JSON array of two objects.
val events = jsonToDataFrame("""
[{ "x": 1 }, { "x": 2 }]
""")

// collect_list() gathers the "x" values into a single array column.
display(events.select(collect_list('x) as 'x))



In [30]:
%scala
// Using an aggregation
val events = jsonToDataFrame("""
[{ "x": 1, "y": "a" }, { "x": 2, "y": "b" }]
""")

// One array of "x" values is collected per distinct value of "y".
display(events.groupBy("y").agg(collect_list('x) as 'x))

	

In [31]:
%scala
// Column "a" is an array whose elements are objects with a "b" field.
val events = jsonToDataFrame("""
{
  "a": [
    {"b": 1},
    {"b": 2}
  ]
}
""")

// Dot notation is applied to the array column: "a.b" addresses field "b" of the elements.
display(events.select("a.b"))

In [32]:
%scala
val events = jsonToDataFrame("""
{
  "a": {
    "b": 1
  }
}
""")

// to_json() serialises column "a" to a JSON string, aliased "c".
display(events.select(to_json('a) as 'c))

In [33]:
%scala
// Column "a" holds JSON text; the triple-quoted Scala string keeps the backslashes as written.
val events = jsonToDataFrame("""
{
  "a": "{\"b\":1}"
}
""")

// from_json() parses that text using the schema declared here.
val schema = new StructType().add("b", IntegerType)
display(events.select(from_json('a, schema) as 'c))



In [34]:
%scala
// Column "a" holds JSON text with two levels of nesting under "b".
val events = jsonToDataFrame("""
{
  "a": "{\"b\":{\"x\":1,\"y\":{\"z\":2}}}"
}
""")

// Note that "y" is declared as StringType even though the embedded JSON has an object there.
val schema = new StructType().add("b", new StructType().add("x", IntegerType)
  .add("y", StringType))
display(events.select(from_json('a, schema) as 'c))

In [35]:
%scala
val events = jsonToDataFrame("""
{
  "a": "{\"b\":1}"
}
""")

// json_tuple() extracts field "b" from the JSON text stored in column "a".
display(events.select(json_tuple('a, "b") as 'c))

In [36]:
%scala
val events = jsonToDataFrame("""
[{ "a": "x: 1" }, { "a": "y: 2" }]
""")

// Capture group 1 of the pattern is the lowercase letter directly before the colon.
display(events.select(regexp_extract('a, "([a-z]):", 1) as 'c))

In [37]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

def jsonToDataFrame(json, schema=None):
  """Read one JSON string and register the result as the temp view "events".

  Despite its name, this variant returns nothing: the parsed DataFrame is only
  made available to SQL cells through the "events" view, which is created or
  replaced on every call.

  Args:
    json: JSON text; it is wrapped in a one-element RDD before being read.
    schema: optional schema applied to the reader when truthy.
  """
  # SparkSessions are available with Spark 2.0+
  reader = spark.read.schema(schema) if schema else spark.read
  parsed = reader.json(sc.parallelize([json]))
  parsed.createOrReplaceTempView("events")

In [38]:
# Using a struct
schema = StructType().add("a", StructType().add("b", IntegerType()))

jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

# Query the struct-backed view here: the following cell replaces "events" with
# the map-typed variant, so without this query the struct example is never run.
display(spark.sql("select a.b from events"))

In [39]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))
                          
# Calling the helper replaces the "events" view with this map-typed document.
jsonToDataFrame("""
{
  "a": {
     "b": 1
  }
}
""", schema)

In [40]:
%sql
-- Dot notation reads entry "b" of column "a" from the current "events" view.
select a.b from events

In [41]:
# Replace the "events" view with a document whose "a" has two nested fields.
jsonToDataFrame("""
{
  "a": {
     "b": 1,
     "c": 2
  }
}
""")

In [42]:
%sql
-- a.* expands column "a" into one output column per nested field.
select a.* from events

In [43]:
# Replace the "events" view with a flat document of three top-level fields.
jsonToDataFrame("""
{
  "a": 1,
  "b": 2,
  "c": 3
}
""")

In [44]:
%sql
-- named_struct builds a struct with a single field "y" taken from column a.
select named_struct("y", a) as x from events

In [45]:
# Replace the "events" view with a flat document of two top-level fields.
jsonToDataFrame("""
{
  "a": 1,
  "b": 2
}
""")

In [46]:
%sql
-- struct(*) packs every column of the view into one struct column "x".
select struct(*) as x from events

In [47]:
# Replace the "events" view with a document holding an array in "a".
jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

In [48]:
%sql
-- Bracket syntax reads the element at index 0 of array column a.
select a[0] as x from events

In [49]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))

# Replace the "events" view with a document read using the map schema above.
jsonToDataFrame("""
{
  "a": {
    "b": 1
  }
}
""", schema)

In [50]:
%sql
-- Bracket syntax with a string looks up key 'b' in map column a.
select a['b'] as x from events

In [51]:
# Replace the "events" view with a document holding an array in "a".
jsonToDataFrame("""
{
  "a": [1, 2]
}
""")

In [52]:
%sql
-- explode() emits one output row per element of array column a.
select explode(a) as x from events

In [53]:
# Using a map
schema = StructType().add("a", MapType(StringType(), IntegerType()))

# Replace the "events" view with a two-entry map read using the schema above.
jsonToDataFrame("""
{
  "a": {
    "b": 1,
    "c": 2
  }
}
""", schema)

In [54]:
%sql
-- Exploding a map produces two columns per entry, aliased here as (x, y).
select explode(a) as (x, y) from events

In [55]:
# Replace the "events" view with a top-level JSON array of two objects.
jsonToDataFrame("""
[{ "x": 1 }, { "x": 2 }]
""")

In [56]:
%sql
-- collect_list() gathers the x values into a single array column.
select collect_list(x) as x from events

In [57]:
# Replace the "events" view with two objects that differ in both "x" and "y".
jsonToDataFrame("""
[{ "x": 1, "y": "a" }, { "x": 2, "y": "b" }]
""")


In [58]:
%sql
-- One array of x values is collected per distinct value of y.
select y, collect_list(x) as x from events group by y

In [59]:
# Replace the "events" view with a document whose "a" is an array of objects.
jsonToDataFrame("""
{
  "a": [
    {"b": 1},
    {"b": 2}
  ]
}
""")

In [60]:
%sql
-- Dot notation applied to the array column: a.b addresses field b of the elements.
select a.b from events

In [61]:
# Replace the "events" view with a document whose "a" holds JSON text as a string.
jsonToDataFrame("""
{
  "a": "{\\"b\\":1}"
}
""")

In [62]:
%sql
-- json_tuple() extracts field "b" from the JSON text stored in column a.
select json_tuple(a, "b") as c from events

In [63]:
# Replace the "events" view with two rows of "letter: digit" strings.
jsonToDataFrame("""
[{ "a": "x: 1" }, { "a": "y: 2" }]
""")

In [64]:
%sql
-- Capture group 1 of the pattern is the lowercase letter directly before the colon.
select regexp_extract(a, "([a-z]):", 1) as c from events

In [65]:
%sql
-- Sample view for the higher-order-function cells: one row per id from range(5),
-- with "values" (a flat array of five ints) and "nested_values" (an array of two
-- int arrays). Every element is CAST(RAND(seed) * 100 AS INT) with a fixed seed.
CREATE OR REPLACE TEMPORARY VIEW nested_data AS
SELECT   id AS key,
         ARRAY(CAST(RAND(1) * 100 AS INT), CAST(RAND(2) * 100 AS INT), CAST(RAND(3) * 100 AS INT), CAST(RAND(4) * 100 AS INT), CAST(RAND(5) * 100 AS INT)) AS values
         ,
         ARRAY(ARRAY(CAST(RAND(1) * 100 AS INT), CAST(RAND(2) * 100 AS INT)), ARRAY(CAST(RAND(3) * 100 AS INT), CAST(RAND(4) * 100 AS INT), CAST(RAND(5) * 100 AS INT))) AS nested_values
FROM range(5)

In [66]:
%sql SELECT * FROM nested_data
-- Shows every column of the sample view.

In [67]:
%sql
-- TRANSFORM applies the lambda to each array element; here every value is incremented by 1.
SELECT  key,
        values,
        TRANSFORM(values, value -> value + 1) AS values_plus_one
FROM    nested_data

In [68]:
%sql
-- The lambda body also references the row's "key" column, not just the lambda argument.
SELECT  key,
        values,
        TRANSFORM(values, value -> value + key) AS values_plus_key
FROM    nested_data

In [69]:
%sql
-- Nested TRANSFORM: the outer lambda receives each inner array (named "values"),
-- the inner lambda receives each element and adds the row key and the inner array's size.
SELECT   key,
         nested_values,
         TRANSFORM(nested_values,
           values -> TRANSFORM(values,
             value -> value + key + SIZE(values))) AS new_nested_values
FROM     nested_data

In [70]:
%sql
-- Same TRANSFORM expression as the values_plus_key query; only the output alias differs.
SELECT   key,
         values,
         TRANSFORM(values, value -> value + key) transformed_values
FROM     nested_data

In [71]:
%sql
-- EXISTS returns a single boolean per row: true when at least one element
-- satisfies the predicate (here: value modulo 10 equals 1). The alias says so;
-- the previous name "filtered_values" suggested an array result.
SELECT   key,
         values,
         EXISTS(values, value -> value % 10 == 1) value_exists
FROM     nested_data

In [72]:
%sql
-- FILTER keeps only the elements for which the predicate holds (value > 50).
SELECT   key,
         values,
         FILTER(values, value -> value > 50) filtered_values
FROM     nested_data

In [73]:
%sql
-- REDUCE folds the array starting from 0. The merge lambda receives the
-- accumulator first and the current element second, so the parameters are
-- named in that order. The optional fourth argument is a finish lambda applied
-- to the final accumulator; here it is the identity, which is why both columns
-- compute the same sum.
SELECT   key,
         values,
         REDUCE(values, 0, (acc, value) -> acc + value, acc -> acc) summed_values,
         REDUCE(values, 0, (acc, value) -> acc + value) summed_values_simple
FROM     nested_data

In [74]:
%sql
-- Geometric mean via AGGREGATE: the buffer is a struct carrying the running
-- product and the element count N; the merge lambda multiplies in each value and
-- increments N; the finish lambda raises the product to the power 1/N.
SELECT   key,
         values,
         AGGREGATE(values,
           (1.0 AS product, 0 AS N),
           (buffer, value) -> (value * buffer.product, buffer.N + 1),
           buffer -> Power(buffer.product, 1.0 / buffer.N)) geomean
FROM     nested_data

In [75]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Schema of one data-center document: a "dc_id" string plus a "source" map from
# sensor name to a struct describing that sensor (description, ip, id, temp and
# c02_level integer arrays, and a geo struct with lat/long doubles).
schema = (
    StructType()
    .add("dc_id", StringType())
    .add(
        "source",
        MapType(
            StringType(),
            StructType()
            .add("description", StringType())
            .add("ip", StringType())
            .add("id", IntegerType())
            .add("temp", ArrayType(IntegerType()))
            .add("c02_level", ArrayType(IntegerType()))
            .add("geo", StructType().add("lat", DoubleType()).add("long", DoubleType())),
        ),
    )
)

In [76]:
def jsonToDataFrame(json, schema=None):
  """Turn one JSON string into a Spark DataFrame and return it.

  Args:
    json: JSON text; it is wrapped in a one-element RDD before being read.
    schema: optional schema applied to the reader when truthy.

  Returns:
    The DataFrame produced by the JSON reader.
  """
  # SparkSessions are available with Spark 2.0+
  json_reader = spark.read
  if schema:
    json_reader = json_reader.schema(schema)
  single_doc_rdd = sc.parallelize([json])
  return json_reader.json(single_doc_rdd)

In [77]:
# One data-center document ("dc-101") with four sensors keyed by name under "source";
# it is read with the explicit `schema` passed as the second argument.
dataDF = jsonToDataFrame( """{

    "dc_id": "dc-101",
    "source": {
        "sensor-igauge": {
        "id": 10,
        "ip": "68.28.91.22",
        "description": "Sensor attached to the container ceilings",
        "temp":[35,35,35,36,35,35,32,35,30,35,32,35],
        "c02_level": [1475,1476,1473],
        "geo": {"lat":38.00, "long":97.00}                        
      },
      "sensor-ipad": {
        "id": 13,
        "ip": "67.185.72.1",
        "description": "Sensor ipad attached to carbon cylinders",
        "temp": [45,45,45,46,45,45,42,35,40,45,42,45],
        "c02_level": [1370,1371,1372],
        "geo": {"lat":47.41, "long":-122.00}
      },
      "sensor-inest": {
        "id": 8,
        "ip": "208.109.163.218",
        "description": "Sensor attached to the factory ceilings",
        "temp": [40,40,40,40,40,43,42,40,40,45,42,45],
        "c02_level": [1346,1345, 1343],
        "geo": {"lat":33.61, "long":-111.89}
      },
      "sensor-istick": {
        "id": 5,
        "ip": "204.116.105.67",
        "description": "Sensor embedded in exhaust pipes in the ceilings",
        "temp":[30,30,30,30,40,43,42,40,40,35,42,35],
        "c02_level": [1574,1570, 1576],
        "geo": {"lat":35.93, "long":-85.46}
      }
    }
  }""", schema)

display(dataDF)

In [78]:
# Print the schema of the DataFrame read above.
dataDF.printSchema()

In [79]:
# Explode the "source" map while keeping "dc_id" alongside each exploded entry.
explodedDF = dataDF.select("dc_id", explode("source"))
display(explodedDF)

In [80]:
# Flatten each exploded entry: keep dc_id and key, and pull ip, id, c02_level and
# temp out of the "value" struct (id and c02_level are renamed on the way).
devicesDataDF = explodedDF.select(
    "dc_id",
    "key",
    "value.ip",
    col("value.id").alias("device_id"),
    col("value.c02_level").alias("c02_levels"),
    "value.temp",
)
display(devicesDataDF)

In [81]:
# Print the schema of the flattened per-device DataFrame.
devicesDataDF.printSchema()

In [82]:
# Expose the per-device DataFrame to the SQL cells as "data_center_iot_devices".
devicesDataDF.createOrReplaceTempView("data_center_iot_devices")

In [83]:
%sql select * from data_center_iot_devices
-- Shows every column of the per-device view.

In [84]:
%sql describe data_center_iot_devices
-- Lists the columns of the per-device view.

In [85]:
%sql select key, ip, device_id, temp,
     -- Applies t * 9 div 5 + 32 to each reading using integer division
     -- (assumes temp is recorded in Celsius -- TODO confirm).
     transform (temp, t -> ((t * 9) div 5) + 32 ) as fahrenheit_temp
     from data_center_iot_devices

In [86]:
%sql select dc_id, key, ip, device_id, c02_levels, temp, 
     -- transform maps each level to a boolean (level > 1300), one flag per element.
     transform (c02_levels, t -> t > 1300) as high_c02_levels
     from data_center_iot_devices

In [87]:
%sql select dc_id, key, ip, device_id, c02_levels, temp, 
     -- filter keeps only the levels above 1300.
     filter (c02_levels, t -> t > 1300) as high_c02_levels
     from data_center_iot_devices

In [88]:
%sql select dc_id, key, ip, device_id, c02_levels, temp, 
     -- filter keeps only the levels below 1300, so the column is named
     -- low_c02_levels (it was previously mislabelled high_c02_levels).
     filter (c02_levels, t -> t < 1300 ) as low_c02_levels
     from data_center_iot_devices

In [89]:
%sql select dc_id, key, ip, device_id, c02_levels, temp, 
     -- exists yields one boolean per row: true when any temp reading equals 45.
     exists (temp, t -> t = 45 ) as value_exists
     from data_center_iot_devices

In [90]:
%sql select dc_id, key, ip, device_id, c02_levels, temp, 
     -- exists yields one boolean per row: true when any level equals 1570.
     -- The alias reflects that; "high_c02_levels" suggested an array of levels.
     exists (c02_levels, t -> t = 1570 ) as value_exists
     from data_center_iot_devices

In [91]:
%sql select key, ip, device_id, temp,
    -- Sum the readings (the merge lambda receives the accumulator first, then the
    -- element), then the finish lambda turns the sum into an integer average and
    -- applies * 9 div 5 + 32.
    reduce(temp, 0, (acc, t) -> acc + t, acc-> (acc div size(temp) * 9 div 5) + 32 ) as average_f_temp
    from data_center_iot_devices
    -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
    order by average_f_temp desc

In [92]:
%sql select key, ip, device_id, c02_levels,
    -- Sum the levels (accumulator first, element second), then divide by the
    -- array size with integer division in the finish lambda.
    reduce(c02_levels, 0, (acc, t) -> acc + t, acc-> acc div size(c02_levels)) as average_c02_levels
    from data_center_iot_devices
    -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
    order by average_c02_levels desc


In [93]:
%sql select key, ip, device_id, c02_levels,
     -- Geometric mean: the buffer struct carries the running product and the
     -- element count N; the finish lambda takes the N-th root and rounds it.
     aggregate(c02_levels,
               (1.0 as product, 0 as N),
               (buffer, c02) -> (c02 * buffer.product, buffer.N+1),
               buffer -> round(Power(buffer.product, 1.0 / buffer.N))) as c02_geomean
     from data_center_iot_devices
     -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
     order by c02_geomean desc

In [94]:
# Schema of one device record: scalar device_id, device_type, ip and timestamp,
# plus array-valued battery_level, c02_level, signal, temp and cca3.
schema2 = (
    StructType()
    .add("device_id", IntegerType())
    .add("battery_level", ArrayType(IntegerType()))
    .add("c02_level", ArrayType(IntegerType()))
    .add("signal", ArrayType(IntegerType()))
    .add("temp", ArrayType(IntegerType()))
    .add("cca3", ArrayType(StringType()))
    .add("device_type", StringType())
    .add("ip", StringType())
    .add("timestamp", TimestampType())
)

In [95]:
# Nineteen device records supplied as one top-level JSON array and read with `schema2`.
# Each record carries three-element arrays for temp, signal, battery_level and c02_level.
dataDF2 = jsonToDataFrame("""[
  {"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": ["USA", "United States"], "temp": [25,26, 27], "signal": [23,22,24], "battery_level": [8,9,7], "c02_level": [917, 921, 925], "timestamp" :1475600496 }, 
  {"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": ["NOR", "Norway"], "temp": [30, 32,35], "signal": [18,18,19], "battery_level": [6, 6, 5], "c02_level": [1413, 1416, 1417], "timestamp" :1475600498 }, 
  {"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": ["USA", "United States"], "temp":[47, 47, 48], "signal": [12,12,13], "battery_level": [1, 1, 0],  "c02_level": [1447,1446, 1448], "timestamp" :1475600502 }, 
  {"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3":["PHL", "Philippines"], "temp":[29, 29, 28], "signal":[11, 11, 11], "battery_level":[0, 0, 0], "c02_level": [983, 990, 982], "timestamp" :1475600504 },
  {"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": ["USA", "United States"], "temp":[50,51,50], "signal": [16,16,17], "battery_level": [8,8, 8], "c02_level": [1574,1575,1576], "timestamp" :1475600506 }, 
  {"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": ["CHN", "China"], "temp": [21,21,22], "signal": [18,18,19], "battery_level": [9,9,9], "c02_level": [1249,1249,1250], "timestamp" :1475600508 },
  {"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": ["JPN", "Japan"], "temp":[27,27,28], "signal": [15,15,29], "battery_level":[0,0,0], "c02_level": [1531,1532,1531], "timestamp" :1475600512 },
  {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": ["USA", "United States"], "temp":[40,40,41], "signal": [16,16,17], "battery_level":[ 9, 9, 10], "c02_level": [1208,1209,1208], "timestamp" :1475600514},
  {"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": ["ITA", "Italy"], "temp": [19,28,5], "signal": [11, 5, 24], "battery_level": [0,-1,0], "c02_level": [1171, 1240, 1400], "timestamp" :1475600516 },
  {"device_id": 10, "device_type": "sensor-igauge", "ip": "68.28.91.22", "cca3": ["USA", "United States"], "temp": [32,33,32], "signal": [26,26,25], "battery_level": [7,7,8], "c02_level": [886,886,887], "timestamp" :1475600518 },
  {"device_id": 11, "device_type": "sensor-ipad", "ip": "59.144.114.250", "cca3": ["IND", "India"], "temp": [46,45,44], "signal": [25,25,24], "battery_level": [4,5,5], "c02_level": [863,862,864], "timestamp" :1475600520 },
  {"device_id": 12, "device_type": "sensor-igauge", "ip": "193.156.90.200", "cca3": ["NOR", "Norway"], "temp": [18,17,18], "signal": [26,25,26], "battery_level": [8,9,8], "c02_level": [1220,1221,1220], "timestamp" :1475600522 },
  {"device_id": 13, "device_type": "sensor-ipad", "ip": "67.185.72.1", "cca3": ["USA", "United States"], "temp": [34,35,34], "signal": [20,21,20], "battery_level": [8,8,8], "c02_level": [1504,1504,1503], "timestamp" :1475600524 },
  {"device_id": 14, "device_type": "sensor-inest", "ip": "68.85.85.106", "cca3": ["USA", "United States"], "temp": [39,40,38], "signal": [17, 17, 18], "battery_level": [8,8,7], "c02_level": [831,832,831], "timestamp" :1475600526 },
  {"device_id": 15, "device_type": "sensor-ipad", "ip": "161.188.212.254", "cca3": ["USA", "United States"], "temp": [27,27,28], "signal": [26,26,25], "battery_level": [5,5,5], "c02_level": [1378,1376,1378], "timestamp" :1475600528 },
  {"device_id": 16, "device_type": "sensor-igauge", "ip": "221.3.128.242", "cca3": ["CHN", "China"], "temp": [10,10,11], "signal": [24,24,23], "battery_level": [6,5,6], "c02_level": [1423, 1423, 1423], "timestamp" :1475600530 },
  {"device_id": 17, "device_type": "sensor-ipad", "ip": "64.124.180.215", "cca3": ["USA", "United States"], "temp": [38,38,39], "signal": [17,17,17], "battery_level": [9,9,9], "c02_level": [1304,1304,1304], "timestamp" :1475600532 },
  {"device_id": 18, "device_type": "sensor-igauge", "ip": "66.153.162.66", "cca3": ["USA", "United States"], "temp": [26, 0, 99], "signal": [10, 1, 5], "battery_level": [0, 0, 0], "c02_level": [902,902, 1300], "timestamp" :1475600534 },
  {"device_id": 19, "device_type": "sensor-ipad", "ip": "193.200.142.254", "cca3": ["AUT", "Austria"], "temp": [32,32,33], "signal": [27,27,28], "battery_level": [5,5,5], "c02_level": [1282, 1282, 1281], "timestamp" :1475600536 }
  ]""", schema2)

display(dataDF2)

In [96]:
# Print the schema of the device-record DataFrame.
dataDF2.printSchema()

In [97]:
# Expose the device records to the SQL cells as "iot_nested_data".
dataDF2.createOrReplaceTempView("iot_nested_data")

In [98]:
%sql select cca3, device_type, battery_level,
     -- transform maps each battery reading to a boolean (reading > 0).
     transform (battery_level, bl -> bl > 0) as boolean_battery_level
     from iot_nested_data

In [99]:
%sql select cca3,
     -- The lambda may call ordinary SQL functions: lcase/ucase are applied per element.
     transform (cca3, c -> lcase(c)) as lower_cca3,
     transform (cca3, c -> ucase(c)) as upper_cca3
     from iot_nested_data

In [100]:
%sql select cca3, device_type, battery_level,
     -- filter keeps only the battery readings below 5.
     filter (battery_level, bl -> bl < 5) as low_levels
     from iot_nested_data

In [101]:
%sql select cca3, device_type, battery_level,
     -- Sum the readings (accumulator first, element second), then divide by the
     -- array size with integer division in the finish lambda.
     reduce(battery_level, 0, (acc, t) -> acc + t,  acc -> acc div size(battery_level) ) as average_battery_level
     from iot_nested_data
     -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
     order by average_battery_level desc

In [102]:
%sql select cca3, device_type, temp,
     -- Sum the readings (accumulator first, element second), then divide by the
     -- array size with integer division in the finish lambda.
     reduce(temp, 0, (acc, t) -> acc + t,  acc -> acc div size(temp) ) as average_temp
     from iot_nested_data
     -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
     order by average_temp desc

In [103]:
%sql select cca3, device_type, c02_level,
     -- Sum the levels (accumulator first, element second), then divide by the
     -- array size with integer division in the finish lambda.
     reduce(c02_level, 0, (acc, t) -> acc + t,  acc -> acc div size(c02_level) ) as average_c02_level
     from iot_nested_data
     -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
     order by average_c02_level desc

In [104]:
%sql select cca3, device_type, signal, temp, c02_level,
     -- Three independent integer averages. In every merge lambda the accumulator
     -- is the first parameter and the array element the second.
     reduce(signal, 0, (sacc, s) -> sacc + s,  sacc -> sacc div size(signal) ) as average_signal,
     reduce(temp, 0, (tacc, t) -> tacc + t,  tacc -> tacc div size(temp) ) as average_temp,
     reduce(c02_level, 0, (cacc, c) -> cacc + c,  cacc -> cacc div size(c02_level) ) as average_c02_level
     from iot_nested_data
     -- ORDER BY gives a total ordering; SORT BY only orders rows within each partition.
     order by average_signal desc

In [105]:
%scala
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

/**
 * User-defined aggregate function that computes the geometric mean of a
 * double-typed input column.
 *
 * The aggregation buffer holds the number of values seen and their running
 * product; the result is product ^ (1 / count). Null inputs are skipped, and a
 * group in which no non-null value was seen evaluates to null.
 */
class GeometricMean extends UserDefinedAggregateFunction {
  // These are the input fields for the aggregate function: a single double.
  override def inputSchema: org.apache.spark.sql.types.StructType =
    StructType(StructField("value", DoubleType) :: Nil)

  // These are the internal fields kept while aggregating: count at index 0, product at index 1.
  override def bufferSchema: StructType = StructType(
    StructField("count", LongType) ::
    StructField("product", DoubleType) :: Nil
  )

  // This is the output type of the aggregation function.
  override def dataType: DataType = DoubleType

  override def deterministic: Boolean = true

  // Initial buffer: zero values seen, product at the multiplicative identity.
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
    buffer(1) = 1.0
  }

  // Fold one input row into the buffer.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // A null read through getAs[Double] unboxes to 0.0 and would zero the whole
    // product, so null inputs are ignored instead.
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getAs[Long](0) + 1
      buffer(1) = buffer.getAs[Double](1) * input.getAs[Double](0)
    }
  }

  // Combine two partial buffers: counts add, products multiply.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0)
    buffer1(1) = buffer1.getAs[Double](1) * buffer2.getAs[Double](1)
  }

  // Produce the final value from the fully merged buffer.
  override def evaluate(buffer: Row): Any = {
    val count = buffer.getLong(0)
    // With no values the exponent would be 1/0; report null rather than NaN.
    if (count == 0L) null else math.pow(buffer.getDouble(1), 1.0 / count)
  }
}

In [106]:
%scala
// Register an instance of the UDAF under the SQL name "gm".
sqlContext.udf.register("gm", new GeometricMean)

In [107]:
%scala
// Create a DataFrame and Spark SQL views to query.
import org.apache.spark.sql.functions._

// ids 1..19 (the upper bound of range is exclusive).
val ids = sqlContext.range(1, 20)
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
ids.createOrReplaceTempView("ids")
// Bucket the ids into three groups by id modulo 3.
val df = sqlContext.sql("select id, id % 3 as group_id from ids")
df.createOrReplaceTempView("simple")

In [108]:
%sql
-- Group by group_id and call the UDAF through its registered name "gm".
select group_id, gm(id) from simple group by group_id

In [109]:
%scala
// Or use DataFrame syntax to call the aggregate function.

// Create an instance of UDAF GeometricMean.
val gm = new GeometricMean

// Show the geometric mean of values of column "id", applying the instance directly to a column.
df.groupBy("group_id").agg(gm(col("id")).as("GeometricMean")).show()

// Invoke the UDAF by its registered SQL name inside an expression string.
df.groupBy("group_id").agg(expr("gm(id) as GeometricMean")).show()

In [110]:
def squared(s):
  """Return s multiplied by itself."""
  return s * s
# Register the function for SQL under the name "squaredWithPython"; no return type is passed here.
sqlContext.udf.register("squaredWithPython", squared)

In [111]:
from pyspark.sql.types import LongType

def squared_typed(s):
  """Return s multiplied by itself."""
  return s * s

# Re-register "squaredWithPython" with an explicit LongType return type, using the
# function defined in this cell (previously `squared` from another cell was passed,
# leaving squared_typed unused).
sqlContext.udf.register("squaredWithPython", squared_typed, LongType())

In [112]:
# ids 1..19 exposed to SQL as "test". createOrReplaceTempView replaces
# registerTempTable, which is deprecated since Spark 2.0.
sqlContext.range(1, 20).createOrReplaceTempView("test")

In [113]:
%sql select id, squaredWithPython(id) as id_squared from test
-- Calls the Python UDF through its registered name.

In [114]:
%sql select * FROM test
-- Shows the rows of the "test" view.

In [115]:
from pyspark.sql.functions import udf
# Wrap the plain Python function `squared` (defined in an earlier cell) as a
# DataFrame UDF with a LongType return, without going through SQL registration.
squared_udf = udf(squared, LongType())
df = sqlContext.table("test")
display(df.select("id", squared_udf("id").alias("id_squared")))

In [116]:
%scala
// Squares its argument. The parameter is Long because the UDF is applied to the
// "id" column produced by range(), which is a bigint; an Int parameter would
// force a narrowing cast of the input and overflow for large values.
val squared = (s: Long) => {
  s * s
}
// Register the function for SQL under the name "square".
sqlContext.udf.register("square", squared)

In [117]:
%scala
// ids 1..19 exposed to SQL as "test". createOrReplaceTempView replaces
// registerTempTable, which is deprecated since Spark 2.0.
sqlContext.range(1, 20).createOrReplaceTempView("test")

In [118]:
%sql select id, square(id) as id_squared from test
-- Calls the Scala UDF through its registered name "square".