## ICRI Envirosensor Data Analysis

#### 1. Download dependencies

In [1]:
#Import json
import json

#Import ijson to read large json files iteratively
import ijson

#Import Time module
import time

#Import Apache Spark
from pyspark.sql import SparkSession

#Import Pandas for data analysis 
import pandas as pd
from pandas import DataFrame
from pandas.io.json import json_normalize

#### 2. Identify data location

In [2]:
from os import path

#In windows the 'r' preceding the file name string indcates that it is a raw sting so that slashes are interpreted correctly
#filename = path.expanduser(r'data\here_east_envirosensors.json')

#Test file
filename = path.expanduser(r'data\Envirosensor_TEST.json')

#Check that the name has been assigned to the variable correctly
print(filename)

data\Envirosensor_TEST.json


#### 3. Check file size in KB to ensure it can be loaded into memory

In [3]:
path.getsize(filename) / (1<<10)

5.390625

#### 4. Parse JSON data iteratively with ijson

In [4]:
#Define function to iteratively parse json file
def iterativeParse(json):
    with open(json, 'r') as json_file:
        #Use the items method in ijson to extract a list of objects specifying the file and key path to to the list
        objects = ijson.items(json_file, 'item.data')
        #The items fuction returns a generator which we turn into a list of payloads with the list function 
        parsedData = list(objects)

    return parsedData

#Time execution of the iterativeParse function and assign the result to payloads variable
start = time.time()
payloads = iterativeParse(filename)
end = time.time()

print(f'\nTime to complete: {end - start:.2f}s\n')

#Check the first item in the payloads list
print(payloads[0])


Time to complete: 0.00s

{ "DeviceID": "8010", "DeviceType": "Envirosensor", "Event": "event", "Time": "2018-06-03 20:40:41.629620", "Data": { "TMP": "36.187", "OPT": "4.17", "BAT": "36.58", "HDT": "36.32", "BAR": "1016.17", "HDH": "22.65" } }


#### 5. Display number of payloads

In [5]:
#Count items in the payloads list
print('Total Sensor Payloads =', len(payloads))

Total Sensor Payloads = 11


#### 6. Loop through payloads list and add each to an empty Pandas dataframe

In [47]:
#Create an empty dataframe
df = DataFrame()

#Define a function to loop through the collection of payloads and add each one to the dataframe
def normaliseDataFrame(x):
    tempdf = DataFrame()
    for i in x:
        row = json_normalize(json.loads(i))
        tempdf = tempdf.append(row, sort=False)
        print(row)
        
    return tempdf

#Time execution of the normaliseDataFrame function and assign the result to payloads variable
start = time.time()
df = normalisedDataFrame(payloads)
end = time.time()

print(f'\nTime to complete: {end - start:.2f}s\n')
    

  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1016.17    36.58    22.65    36.32     4.17   36.187     8010   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-03 20:40:41.629620  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1016.14    35.06    28.68    34.76     0.00   34.562     8017   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-01 04:22:22.881564  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1017.23    35.49    26.23    35.06     6.17   34.781     8008   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-01 13:04:56.152669  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1013.88    34.07    21.58    34.14    73.04   33.843     8019   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-05-29 22:48:18.852665  
  Data.BAR Data.BAT Data.HDH

#### 7. Display dataframe summary information

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 0
Data columns (total 10 columns):
Data.BAR      11 non-null object
Data.BAT      11 non-null object
Data.HDH      11 non-null object
Data.HDT      11 non-null object
Data.OPT      11 non-null object
Data.TMP      11 non-null object
DeviceID      11 non-null object
DeviceType    11 non-null object
Event         11 non-null object
Time          11 non-null object
dtypes: object(10)
memory usage: 968.0+ bytes


In [49]:
df

Unnamed: 0,Data.BAR,Data.BAT,Data.HDH,Data.HDT,Data.OPT,Data.TMP,DeviceID,DeviceType,Event,Time
0,1016.17,36.58,22.65,36.32,4.17,36.187,8010,Envirosensor,event,2018-06-03 20:40:41.629620
0,1016.14,35.06,28.68,34.76,0.0,34.562,8017,Envirosensor,event,2018-06-01 04:22:22.881564
0,1017.23,35.49,26.23,35.06,6.17,34.781,8008,Envirosensor,event,2018-06-01 13:04:56.152669
0,1013.88,34.07,21.58,34.14,73.04,33.843,8019,Envirosensor,event,2018-05-29 22:48:18.852665
0,1017.59,35.05,27.41,34.77,6.67,34.625,8010,Envirosensor,event,2018-06-01 13:19:15.175496
0,1017.23,36.19,18.9,36.06,4.89,35.906,8004,Envirosensor,event,2018-06-02 23:12:12.230449
0,1018.17,36.51,21.66,36.07,8.5,35.812,8008,Envirosensor,event,2018-06-03 07:33:16.774434
0,1013.19,34.93,20.34,35.01,74.0,34.718,8019,Envirosensor,event,2018-05-30 09:25:24.052756
0,1018.45,35.87,22.71,35.52,108.72,35.281,8005,Envirosensor,event,2018-06-03 00:30:56.678239
0,1018.45,37.08,17.05,36.84,0.24,36.5,8018,Envirosensor,event,2018-06-03 09:34:58.436492


## Apache Spark

In [6]:
#Create a Spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [80]:
#Place JSON into a Spark dataframe
df = spark.read.json(filename, multiLine=True)
# Displays the content of the DataFrame to stdout
df.show()

+--------------------+--------+------------+---------+------+--------------------+--------------------+
|                data|deviceId|  deviceType|eventType|format|    json_featuretype|           timestamp|
+--------------------+--------+------------+---------+------+--------------------+--------------------+
|{ "DeviceID": "80...|    8010|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-06-03T20:40:...|
|{ "DeviceID": "80...|    8017|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-06-01T04:22:...|
|{ "DeviceID": "80...|    8008|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-06-01T13:04:...|
|{ "DeviceID": "80...|    8019|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-05-29T22:48:...|
|{ "DeviceID": "80...|    8010|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-06-01T13:19:...|
|{ "DeviceID": "80...|    8004|Envirosensor|    event|  json|iotp_kfb22t_envir...|2018-06-02T23:12:...|
|{ "DeviceID": "80...|    8008|Envirosensor|    event|  json|iot

In [81]:
df.printSchema()

root
 |-- data: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- deviceType: string (nullable = true)
 |-- eventType: string (nullable = true)
 |-- format: string (nullable = true)
 |-- json_featuretype: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [82]:
df.select("data").show()

+--------------------+
|                data|
+--------------------+
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
|{ "DeviceID": "80...|
+--------------------+



In [45]:
columns = ['data']
values = ['1']
df = spark.createDataFrame(values, columns)

#Define a function to loop through the collection of payloads and add each one to the dataframe
def normalisedSparkDataFrame(x):
    for i in x:
        data = json_normalize(json.loads(i))
        row = spark.createDataFrame(data)
        tempdf = row.union(row)
        print(tempdf)
        
    return tempdf

#Time execution of the normaliseDataFrame function and assign the result to payloads variable
start = time.time()
df = normalisedSparkDataFrame(payloads)
end = time.time()

print(f'\nTime to complete: {end - start:.2f}s\n')


TypeError: Can not infer schema for type: <class 'str'>

In [27]:
df.show()

Py4JJavaError: An error occurred while calling o2985.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 5, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 38 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 38 more
