## ICRI Envirosensor Data Analysis

#### Import dependencies

In [1]:
#Import json
import json

#Import ijson to read large json files iteratively
import ijson

#Import Time module
import time

#Import Apache Spark
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql import functions

from pyspark import SparkContext

#Import relevant data types from pyspark schema definition
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

#### Identify data location

In [35]:
from os import path

#On Windows the 'r' prefix marks the file name as a raw string, so the backslashes are kept literally instead of being read as escape sequences
#filename = path.expanduser(r'data\here_east_envirosensors.json')

#Test file
#filename = path.expanduser(r'data\Envirosensor_TEST.json')

#Data file analysed by the rest of the notebook (relative path, Windows-style separator)
#NOTE(review): the saved output of this cell shows ...2018-w20.json while the code names ...2018-w19.json - confirm which week is intended and re-run
filename = path.expanduser(r'data\iotp_kfb22t_envirosensor_2018-w19.json')

#Check that the name has been assigned to the variable correctly
print(filename)

data\iotp_kfb22t_envirosensor_2018-w20.json


#### Check file size in KB to ensure it can be loaded into memory

In [36]:
#Size of the data file in kilobytes (bytes divided by 1024)
path.getsize(filename) / 1024

83830.802734375

## Parse JSON

#### Parse JSON data iteratively with ijson

In [37]:
#Define function to iteratively parse json file
def iterativeParse(json):
    """Collect the ``data`` value of every element of a JSON array file.

    The file is streamed through ijson rather than loaded in one go, using the
    key path ``item.data`` (each element of the top-level array, then its
    ``data`` key).

    Args:
        json: Path of the JSON file to read. The name shadows the ``json``
            module inside this function; it is kept unchanged so existing
            callers continue to work.

    Returns:
        A list holding one ``data`` value per matching array element, in file
        order.
    """
    #Open in binary mode: ijson expects a bytes file object, and this avoids
    #decoding the file with the platform's default text encoding
    with open(json, 'rb') as json_file:
        #ijson.items returns a generator; it must be consumed while the file is still open
        return list(ijson.items(json_file, 'item.data'))

#Run iterativeParse over the chosen file, timing the call, and keep the result in payloads
start = time.time()
payloads = iterativeParse(filename)
end = time.time()

#Report the elapsed wall-clock time
print('\nTime to complete: {:.2f}s\n'.format(end - start))

#Report how many payloads were extracted
print(f'Total Sensor Payloads = {len(payloads)} \n')

#Show the first payload as an example
print('Example Sensor Payload:\n')
print(payloads[0])


Time to complete: 15.09s

Total Sensor Payloads = 171144 

Example Sensor Payload:

{ "DeviceID": "8003", "DeviceType": "Envirosensor", "Event": "event", "Time": "2018-05-19 07:08:07.329146", "Data": { "OPT": "47.88", "TMP": "34.343", "BAT": "34.79", "HDT": "34.60", "BAR": "1024.65", "HDH": "11.68" } }


## Apache Spark

#### Setup Apache Spark

In [38]:
#Request 4g of executor memory. SparkContext.setSystemProperty only takes effect when it
#runs before the SparkContext is instantiated, so it has to precede getOrCreate().
#If a session already exists in this kernel, getOrCreate() reuses it and the setting is not applied.
SparkContext.setSystemProperty('spark.executor.memory', '4g')

#Create (or reuse) the Spark session used by the rest of the notebook
spark = SparkSession.builder.appName("Envirosensor Data Analysis").getOrCreate()

In [39]:
#Check Spark: display the session object as the cell output to confirm it was created
spark

## Parse JSON Directly into Spark Dataframe

#### Read JSON data directly into Apache Spark data frame

In [40]:
#Place JSON into a Spark dataframe and time execution
#multiLine=True asks Spark to parse the file as a JSON document that spans several lines rather than one record per line
start = time.time()
df = spark.read.json(filename, multiLine=True)
end = time.time()

#Report the elapsed wall-clock time of the read call
print(f'\nTime to complete: {end - start:.2f}s\n')


Time to complete: 0.39s



#### Display the dataframe

In [41]:
# Display the first 5 rows of the DataFrame read directly from the file
df.show(5)

+--------------------+--------+------------+---------+------+--------------------+--------------------+
|                data|deviceId|  deviceType|eventType|format|    json_featuretype|           timestamp|
+--------------------+--------+------------+---------+------+--------------------+--------------------+
|{ "DeviceID": "80...|    8003|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-19T07:08:...|
|{ "DeviceID": "80...|    8015|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-17T08:50:...|
|{ "DeviceID": "80...|    8004|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-18T15:23:...|
|{ "DeviceID": "80...|    8018|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-14T01:16:...|
|{ "DeviceID": "80...|    8018|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-14T17:15:...|
+--------------------+--------+------------+---------+------+--------------------+--------------------+
only showing top 5 rows



## Spark Resilient Distributed Datasets (RDDs)

#### Establish sensor data payload schema by loading a sample into a data frame

In [42]:
#Print the first payload (as a one-element list); it is the sample used to infer the schema
print(payloads[0:1])

#Time loading the one-payload sample into an RDD (8 requested as the number of slices)
start = time.time()
sc = spark.sparkContext
testRDD = sc.parallelize(payloads[0:1],8)
end = time.time()

print(f'\nTime to complete: {end - start:.2f}s\n')

#Read the sample RDD as JSON with no schema given, so Spark infers one from the sample
test_df = spark.read.json(testRDD)

#Show the sample row without truncating column values
test_df.show(5, False)

#Print the inferred schema as a tree, then display the StructType object itself
test_df.printSchema()
test_df.schema

['{ "DeviceID": "8003", "DeviceType": "Envirosensor", "Event": "event", "Time": "2018-05-19 07:08:07.329146", "Data": { "OPT": "47.88", "TMP": "34.343", "BAT": "34.79", "HDT": "34.60", "BAR": "1024.65", "HDH": "11.68" } }']

Time to complete: 0.01s

+---------------------------------------------+--------+------------+-----+--------------------------+
|Data                                         |DeviceID|DeviceType  |Event|Time                      |
+---------------------------------------------+--------+------------+-----+--------------------------+
|[1024.65, 34.79, 11.68, 34.60, 47.88, 34.343]|8003    |Envirosensor|event|2018-05-19 07:08:07.329146|
+---------------------------------------------+--------+------------+-----+--------------------------+

root
 |-- Data: struct (nullable = true)
 |    |-- BAR: string (nullable = true)
 |    |-- BAT: string (nullable = true)
 |    |-- HDH: string (nullable = true)
 |    |-- HDT: string (nullable = true)
 |    |-- OPT: string (nullable =

StructType(List(StructField(Data,StructType(List(StructField(BAR,StringType,true),StructField(BAT,StringType,true),StructField(HDH,StringType,true),StructField(HDT,StringType,true),StructField(OPT,StringType,true),StructField(TMP,StringType,true))),true),StructField(DeviceID,StringType,true),StructField(DeviceType,StringType,true),StructField(Event,StringType,true),StructField(Time,StringType,true)))

#### Build and store the data schema

In [43]:
#Build and return a schema to use for the sample data
def build_schema():
    """Return the schema of the sample DataFrame ``test_df``.

    ``test_df`` is a notebook-level variable, so the cell that creates it must
    have been run before this function is called.

    Returns:
        The ``schema`` attribute of ``test_df``.
    """
    return test_df.schema

#### Load payloads into an RDD

In [44]:
#Time execution of loading the full payload list into an RDD (8 requested as the number of slices)
start = time.time()
sc = spark.sparkContext
payloadsRDD = sc.parallelize(payloads,8)
end = time.time()

#Report the elapsed wall-clock time
print(f'\nTime to complete: {end - start:.2f}s\n')


Time to complete: 0.09s



#### Convert RDD to dataframe using data schema

In [45]:
#Read the RDD of JSON payload strings into a DataFrame, applying the schema returned by build_schema() instead of inferring one
processedData = spark.read.json(payloadsRDD, schema=build_schema())

#### Display the Spark Dataframe

In [47]:
#Show the first 5 rows with full, untruncated column values
processedData.show(5, truncate=False)

+---------------------------------------------+--------+------------+-----+--------------------------+
|Data                                         |DeviceID|DeviceType  |Event|Time                      |
+---------------------------------------------+--------+------------+-----+--------------------------+
|[1024.65, 34.79, 11.68, 34.60, 47.88, 34.343]|8003    |Envirosensor|event|2018-05-19 07:08:07.329146|
|[1027.47, 34.52, 11.18, 34.36, 5.70, 34.156] |8015    |Envirosensor|event|2018-05-17 08:50:42.519376|
|[1023.23, 34.77, 6.57, 34.67, 7.15, 34.531]  |8004    |Envirosensor|event|2018-05-18 15:23:13.753004|
|[1017.18, 36.80, 8.17, 36.56, 0.08, 36.250]  |8018    |Envirosensor|event|2018-05-14 01:16:08.350742|
|[1018.68, 34.50, 9.42, 34.31, 84.96, 33.968] |8018    |Envirosensor|event|2018-05-14 17:15:15.889760|
+---------------------------------------------+--------+------------+-----+--------------------------+
only showing top 5 rows



In [48]:
#Count the rows in the parsed DataFrame
processedData.count() 

171144

In [49]:
#Flatten the nested Data struct: keep the identifying columns and promote each sensor reading to a top-level column
#select() is a lazy transformation, so the timing below covers building the query, not processing the rows
#The selected readings are still strings (see the printed schema), so numeric work needs an explicit cast
start = time.time()
flat_df = processedData.select("DeviceType", "DeviceID", "Event", "Time", "Data.BAR", "Data.BAT", "Data.HDH", "Data.HDT", "Data.OPT", "Data.TMP")
#flat_df.show(5,False)
end = time.time()

#Report the elapsed wall-clock time
print(f'\nTime to complete: {end - start:.2f}s\n')


Time to complete: 0.02s



In [50]:
#Count the rows of the flattened DataFrame; it should equal the count of processedData
flat_df.count()

171144

In [51]:
#Print the flattened schema as a tree, then display the StructType object itself
flat_df.printSchema()
flat_df.schema

root
 |-- DeviceType: string (nullable = true)
 |-- DeviceID: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- BAR: string (nullable = true)
 |-- BAT: string (nullable = true)
 |-- HDH: string (nullable = true)
 |-- HDT: string (nullable = true)
 |-- OPT: string (nullable = true)
 |-- TMP: string (nullable = true)



StructType(List(StructField(DeviceType,StringType,true),StructField(DeviceID,StringType,true),StructField(Event,StringType,true),StructField(Time,StringType,true),StructField(BAR,StringType,true),StructField(BAT,StringType,true),StructField(HDH,StringType,true),StructField(HDT,StringType,true),StructField(OPT,StringType,true),StructField(TMP,StringType,true)))

## Display Summary Statistics

In [110]:
#Summary statistics for the identifying columns (all strings, so min/max are in string order)
flat_df.describe('DeviceType', 'DeviceID', 'Event', 'Time').show()

#The sensor readings are stored as strings, and describe() reports min/max of a string
#column in lexicographic order (e.g. "9.42" sorts above "1024.65"). Cast each reading to
#double first so that min and max are numeric; alias() keeps the original column names.
flat_df.select(
    *(flat_df[name].cast('double').alias(name) for name in ('BAR', 'BAT', 'HDH', 'HDT', 'OPT', 'TMP'))
).describe().show()

+-------+------------+------------------+------+--------------------+
|summary|  DeviceType|          DeviceID| Event|                Time|
+-------+------------+------------------+------+--------------------+
|  count|      171144|            171144|171144|              171144|
|   mean|        null|8009.8408299443745|  null|                null|
| stddev|        null| 5.603288143194462|  null|                null|
|    min|Envirosensor|              8001| event|2018-05-11 16:27:...|
|    max|Envirosensor|              8019| event|2018-05-20 23:59:...|
+-------+------------+------------------+------+--------------------+

+-------+------------------+------------------+------------------+------------------+-----------------+------------------+
|summary|               BAR|               BAT|               HDH|               HDT|              OPT|               TMP|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+


## Analyse by Device

#### Filter results by Device

In [113]:
#Keep only the rows for device 8001; DeviceID is a string column, so it is compared with the string "8001"
sensor_df = flat_df.filter(flat_df["DeviceID"]=="8001")

#### Display summary for sensor

In [114]:
#Summary statistics for the identifying columns (all strings, so min/max are in string order)
sensor_df.describe('DeviceType', 'DeviceID', 'Event', 'Time').show()

#The sensor readings are stored as strings, and describe() reports min/max of a string
#column in lexicographic order (e.g. "9.42" sorts above "1024.65"). Cast each reading to
#double first so that min and max are numeric; alias() keeps the original column names.
sensor_df.select(
    *(sensor_df[name].cast('double').alias(name) for name in ('BAR', 'BAT', 'HDH', 'HDT', 'OPT', 'TMP'))
).describe().show()

+-------+------------+--------------------+-----+--------------------+
|summary|  DeviceType|            DeviceID|Event|                Time|
+-------+------------+--------------------+-----+--------------------+
|  count|       10077|               10077|10077|               10077|
|   mean|        null|              8001.0| null|                null|
| stddev|        null|2.280485101424363...| null|                null|
|    min|Envirosensor|                8001|event|2018-05-14 00:00:...|
|    max|Envirosensor|                8001|event|2018-05-20 23:59:...|
+-------+------------+--------------------+-----+--------------------+

+-------+------------------+------------------+------------------+-------------------+------------------+------------------+
|summary|               BAR|               BAT|               HDH|                HDT|               OPT|               TMP|
+-------+------------------+------------------+------------------+-------------------+------------------+-----

#### Display sorted sensor readings

In [115]:
#List the sensor's first 10 readings ordered by the Time column, without truncating values
sensor_df.sort('Time').show(10, truncate=False)

+------------+--------+-----+--------------------------+-------+-----+-----+-----+----+------+
|DeviceType  |DeviceID|Event|Time                      |BAR    |BAT  |HDH  |HDT  |OPT |TMP   |
+------------+--------+-----+--------------------------+-------+-----+-----+-----+----+------+
|Envirosensor|8001    |event|2018-05-14 00:00:02.060920|1017.05|36.27|14.27|35.92|0.64|35.718|
|Envirosensor|8001    |event|2018-05-14 00:01:02.059995|1017.10|36.27|14.27|35.92|2.33|35.718|
|Envirosensor|8001    |event|2018-05-14 00:02:02.066215|1017.10|36.27|14.27|35.92|2.33|35.718|
|Envirosensor|8001    |event|2018-05-14 00:03:02.064938|1017.10|36.27|14.27|35.92|0.80|35.718|
|Envirosensor|8001    |event|2018-05-14 00:04:02.082527|1017.01|36.26|14.27|35.92|0.64|35.718|
|Envirosensor|8001    |event|2018-05-14 00:05:02.123239|1017.10|36.27|14.27|35.92|2.33|35.718|
|Envirosensor|8001    |event|2018-05-14 00:06:02.143987|1017.07|36.26|14.27|35.93|2.33|35.718|
|Envirosensor|8001    |event|2018-05-14 00:07:02.1