## ICRI Envirosensor Data Analysis

#### 1. Download dependencies

In [11]:
#Import ijson to read large json files iteratively
import ijson

#Import json
import json

#Import Pandas for data analysis 
import pandas as pd
from pandas import DataFrame
from pandas.io.json import json_normalize

#### 2. Identify data location

In [18]:
from os import path

#In windows the 'r' preceding the file name string indcates that it is a raw sting so that slashes are interpreted correctly
filename = path.expanduser(r'data\here_east_envirosensors.json')

#Test file
#filename = path.expanduser(r'data\Envirosensor_TEST.json')

#Check that the name has been assigned to the variable correctly
print(filename)

data\here_east_envirosensors.json


#### 3. Check file size in KB to ensure it can be loaded into memory

In [19]:
path.getsize(filename) / (1<<10)

1596960.076171875

#### 4. Parse JSON data iteratively with ijson

In [20]:
with open(filename, 'r') as json_file:
    #Use the items method in ijson to extract a list of objects specifying the file and key path to to the list
    objects = ijson.items(json_file, 'item.data')
    #The items fuction returns a generator which we turn into a list of payloads with the list function 
    payloads = list(objects)
    #Check the first item in the payloads list
    print(payloads[0])

{ "DeviceID": "8010", "DeviceType": "Envirosensor", "Event": "event", "Time": "2018-06-03 20:40:41.629620", "Data": { "TMP": "36.187", "OPT": "4.17", "BAT": "36.58", "HDT": "36.32", "BAR": "1016.17", "HDH": "22.65" } }


#### 5. Display number of payloads

In [21]:
#Count items in the payloads list
print('Total Sensor Payloads =', len(payloads))

Total Sensor Payloads = 3258298


#### 6. Loop through payloads list and add each to an empty Pandas dataframe

In [16]:
#Create an empty dataframe
df = DataFrame()

#Loop through collection of payloads and add each one to the dataframe
for i in payloads:
    row = json_normalize(json.loads(i))
    df = df.append(row, sort=False)
    print(row)

  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1016.17    36.58    22.65    36.32     4.17   36.187     8010   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-03 20:40:41.629620  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1016.14    35.06    28.68    34.76     0.00   34.562     8017   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-01 04:22:22.881564  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1017.23    35.49    26.23    35.06     6.17   34.781     8008   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-06-01 13:04:56.152669  
  Data.BAR Data.BAT Data.HDH Data.HDT Data.OPT Data.TMP DeviceID  \
0  1013.88    34.07    21.58    34.14    73.04   33.843     8019   

     DeviceType  Event                        Time  
0  Envirosensor  event  2018-05-29 22:48:18.852665  
  Data.BAR Data.BAT Data.HDH

#### 7. Display dataframe summary information

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 0
Data columns (total 10 columns):
Data.BAR      11 non-null object
Data.BAT      11 non-null object
Data.HDH      11 non-null object
Data.HDT      11 non-null object
Data.OPT      11 non-null object
Data.TMP      11 non-null object
DeviceID      11 non-null object
DeviceType    11 non-null object
Event         11 non-null object
Time          11 non-null object
dtypes: object(10)
memory usage: 968.0+ bytes


In [8]:
df

Unnamed: 0,Data.BAR,Data.BAT,Data.HDH,Data.HDT,Data.OPT,Data.TMP,DeviceID,DeviceType,Event,Time
0,1016.17,36.58,22.65,36.32,4.17,36.187,8010,Envirosensor,event,2018-06-03 20:40:41.629620
0,1016.14,35.06,28.68,34.76,0.0,34.562,8017,Envirosensor,event,2018-06-01 04:22:22.881564
0,1017.23,35.49,26.23,35.06,6.17,34.781,8008,Envirosensor,event,2018-06-01 13:04:56.152669
0,1013.88,34.07,21.58,34.14,73.04,33.843,8019,Envirosensor,event,2018-05-29 22:48:18.852665
0,1017.59,35.05,27.41,34.77,6.67,34.625,8010,Envirosensor,event,2018-06-01 13:19:15.175496
0,1017.23,36.19,18.9,36.06,4.89,35.906,8004,Envirosensor,event,2018-06-02 23:12:12.230449
0,1018.17,36.51,21.66,36.07,8.5,35.812,8008,Envirosensor,event,2018-06-03 07:33:16.774434
0,1013.19,34.93,20.34,35.01,74.0,34.718,8019,Envirosensor,event,2018-05-30 09:25:24.052756
0,1018.45,35.87,22.71,35.52,108.72,35.281,8005,Envirosensor,event,2018-06-03 00:30:56.678239
0,1018.45,37.08,17.05,36.84,0.24,36.5,8018,Envirosensor,event,2018-06-03 09:34:58.436492
