# Home sensors

In [103]:
import json
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, DataFrame
from IPython.display import display

# Spark configuration: application name "HomeSensors", master "local[1]".
conf = SparkConf().setAppName("HomeSensors").setMaster("local[1]")

# Entry points used by the rest of the notebook:
#   sc         - SparkContext, used to read the raw text file into an RDD
#   sqlContext - used to register DataFrames as tables and to run SQL queries
#   ss         - SparkSession, used for the CSV reader
# The context is created first because SQLContext is built on top of it.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
ss = SparkSession.builder.getOrCreate()

In [104]:
# Read the raw sensor dump as an RDD with one element per line of text.
data = sc.textFile("./data/sensor_data.txt")
# Mark the RDD for caching, then count it; the count is the cell's output.
data.cache()
data.count()

250

In [105]:
# Decode every text line as a JSON document.
json_data = data.map(json.loads)
# Mark the parsed RDD for caching, then count it; the count is the cell's output.
json_data.cache()
json_data.count()

250

In [106]:
# Every parsed record holds a collection under "data"; flatMap concatenates
# those collections into a single RDD of individual readings, e.g.
# [[1, 2, 3], [4, 5, 6], [7, 8, 9]] -> [1, 2, 3, 4, 5, 6, 7, 8, 9]
sensor_data = json_data.flatMap(lambda record: record["data"])
sensor_data.cache()
sensor_data.count()

717

In [107]:
def to_timestamp(df: DataFrame, field: str):
    """Replace the ``<field>_ms`` column with a timestamp column named ``field``.

    The value of ``<field>_ms`` is passed through ``from_unixtime`` and cast to
    ``TimestampType``; the result is stored under ``field`` and the source
    column is dropped.

    NOTE(review): the source column is suffixed ``_ms`` while ``from_unixtime``
    is documented as taking seconds since the epoch - confirm the unit of the
    incoming values.

    Args:
        df: DataFrame that contains a ``<field>_ms`` column.
        field: Base name of the column, without the ``_ms`` suffix.

    Returns:
        A new DataFrame with ``field`` added and ``<field>_ms`` removed.
    """
    source_column = f'{field}_ms'
    converted = from_unixtime(source_column).cast(TimestampType())
    return df.withColumn(field, converted).drop(source_column)

In [108]:
# creating a DF

from pyspark.sql.types import Row, TimestampType
from pyspark.sql.functions import from_unixtime

# Wrap every flattened reading in a Row (keys become column names) and let
# Spark build the DataFrame from those Rows.
sensor_data_df = sensor_data.map(lambda reading: Row(**dict(reading))).toDF()
# Swap the raw "timestamp_ms" column for a proper "timestamp" column.
sensor_data_df = to_timestamp(sensor_data_df, "timestamp")
display(sensor_data_df)

# Register the frame as the temporary view "sensorData" so the SQL cells can
# query it. Done on the DataFrame itself rather than through the legacy
# SQLContext.registerDataFrameAsTable wrapper.
sensor_data_df.createOrReplaceTempView("sensorData")

device_id,measurement,type,unit,timestamp
temp1,22.671717271141567,temperature,celsius,2020-09-01 03:00:00
temp1,22.427013031545386,temperature,celsius,2020-09-01 03:15:00
temp1,23.21266136616038,temperature,celsius,2020-09-01 03:30:00
temp1,22.280158977203943,temperature,celsius,2020-09-01 03:45:00
temp2,14.348216499635887,temperature,celsius,2020-09-01 03:00:00
temp2,14.968794892912364,temperature,celsius,2020-09-01 03:15:00
temp2,14.271602819920131,temperature,celsius,2020-09-01 03:30:00
temp2,13.578581273189716,temperature,celsius,2020-09-01 03:45:00
temp3,15.4336063013519,temperature,celsius,2020-09-01 03:00:00
temp3,15.73280028139962,temperature,celsius,2020-09-01 03:15:00


In [109]:
# creating a DF, the complicated way
def remove_property(data):
    """Return a copy of ``data`` without its ``"data"`` entry.

    The input mapping is not modified, so the caller's record stays intact
    and can be reused after this call. A record that has no ``"data"`` key
    is returned as an unchanged copy instead of raising ``KeyError``.

    Args:
        data: Mapping that describes one record.

    Returns:
        A new ``dict`` with every item of ``data`` except the ``"data"`` key.
    """
    return {key: value for key, value in data.items() if key != "data"}

# Strip the "data" entry from every parsed record, wrap what is left in a Row
# and build a DataFrame from those Rows.
device_data_df = (
    json_data
    .map(remove_property)
    .map(lambda record: Row(**dict(record)))
    .toDF()
)
# Swap the raw "timestamp_ms" column for a proper "timestamp" column.
device_data_df = to_timestamp(device_data_df, "timestamp")
display(device_data_df)

# Register the frame as the temporary view "deviceData" so the SQL cells can
# query it. Done on the DataFrame itself rather than through the legacy
# SQLContext.registerDataFrameAsTable wrapper.
device_data_df.createOrReplaceTempView("deviceData")


id,timestamp
temp1,2020-09-01 03:00:00
temp2,2020-09-01 03:00:00
temp3,2020-09-01 03:00:00
weather1,2020-09-01 03:00:00
weather2,2020-09-01 03:00:00
weather3,2020-09-01 03:00:00
contact1,2020-09-01 03:00:00
contact2,2020-09-01 03:00:00
air1,2020-09-01 03:00:00
air2,2020-09-01 03:00:00


In [110]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Read the device -> location mapping from CSV with an explicit two-column
# string schema. The first line is treated as a header and rows that do not
# fit the schema are dropped (mode="DROPMALFORMED").
device_locations_df = ss.read.csv(
    "data/device_locations.csv",
    header=True,
    mode="DROPMALFORMED",
    schema=StructType([
        StructField("device_id", StringType(), True),
        StructField("locations", StringType(), True)
    ])
)
# Register the frame as the temporary view "deviceLocations" so the SQL cells
# can query it. Done on the DataFrame itself rather than through the legacy
# SQLContext.registerDataFrameAsTable wrapper.
device_locations_df.createOrReplaceTempView("deviceLocations")
display(device_locations_df)

device_id,locations
temp1,living_room
temp2,master_bedroom
temp3,hallway
weather1,living_room
weather2,bathroom
weather3,master_bedroom
contact1,balcony
contact2,hallway
air1,master_bedroom
air2,living_room


In [111]:
# group by
# Groups deviceData on (id, timestamp) without any aggregate, which leaves one
# row per distinct (id, timestamp) pair. SELECT * is accepted here because the
# table's columns (id, timestamp) are exactly the grouping columns.
# NOTE(review): rows are ordered by id only, so the order of timestamps within
# one id is not fixed by this query.
groupByExample = sqlContext.sql("""
    SELECT *
    FROM deviceData
    GROUP BY id, timestamp
    ORDER  BY id
""")

display(groupByExample)

id,timestamp
air1,2020-09-01 12:00:00
air1,2020-09-01 05:00:00
air1,2020-09-02 01:00:00
air1,2020-09-01 22:00:00
air1,2020-09-01 16:00:00
air1,2020-09-01 14:00:00
air1,2020-09-01 18:00:00
air1,2020-09-01 23:00:00
air1,2020-09-01 19:00:00
air1,2020-09-01 03:00:00


In [112]:
# dataframe joins
# Attaches a location to each sensor reading by matching device_id between
# sensorData and deviceLocations (inner join: readings without a matching
# location row are left out).
# SELECT * keeps the join key from both tables, so the result carries two
# device_id columns.
# NOTE(review): LIMIT 10 is applied without an ORDER BY, so which ten rows are
# returned is not fixed by this query.
joinDFs = sqlContext.sql("""
    SELECT *
    FROM sensorData
    INNER JOIN deviceLocations ON sensorData.device_id = deviceLocations.device_id
    LIMIT 10
""")

display(joinDFs)

device_id,measurement,type,unit,timestamp,device_id.1,locations
temp1,22.671717271141567,temperature,celsius,2020-09-01 03:00:00,temp1,living_room
temp1,22.427013031545386,temperature,celsius,2020-09-01 03:15:00,temp1,living_room
temp1,23.21266136616038,temperature,celsius,2020-09-01 03:30:00,temp1,living_room
temp1,22.280158977203943,temperature,celsius,2020-09-01 03:45:00,temp1,living_room
temp2,14.348216499635887,temperature,celsius,2020-09-01 03:00:00,temp2,master_bedroom
temp2,14.968794892912364,temperature,celsius,2020-09-01 03:15:00,temp2,master_bedroom
temp2,14.271602819920131,temperature,celsius,2020-09-01 03:30:00,temp2,master_bedroom
temp2,13.578581273189716,temperature,celsius,2020-09-01 03:45:00,temp2,master_bedroom
temp3,15.4336063013519,temperature,celsius,2020-09-01 03:00:00,temp3,hallway
temp3,15.73280028139962,temperature,celsius,2020-09-01 03:15:00,temp3,hallway


In [113]:
# Temperature readings of device "temp1", oldest first.
temps = sqlContext.sql("""
    SELECT device_id, type, timestamp, measurement
    FROM sensorData
    WHERE type = 'temperature' AND device_id = 'temp1'
    ORDER BY timestamp
""")

display(temps)

device_id,type,timestamp,measurement
temp1,temperature,2020-09-01 03:00:00,22.671717271141567
temp1,temperature,2020-09-01 03:15:00,22.427013031545386
temp1,temperature,2020-09-01 03:30:00,23.21266136616038
temp1,temperature,2020-09-01 03:45:00,22.280158977203943
temp1,temperature,2020-09-01 04:00:00,22.280158977203943
temp1,temperature,2020-09-01 04:15:00,22.755368861939
temp1,temperature,2020-09-01 04:30:00,22.828357539779773
temp1,temperature,2020-09-01 04:45:00,22.41430333062398
temp1,temperature,2020-09-01 05:00:00,22.531936785837352
temp1,temperature,2020-09-01 05:15:00,22.32209520818083


In [88]:
# Temperature readings of device "temp1", oldest first.
# NOTE(review): this cell repeats the query of the preceding cell verbatim and
# rebinds the same `temps` name; its execution count (In [88]) is out of
# sequence with the cells above. Candidate for removal.
temps = sqlContext.sql("""
    SELECT device_id, type, timestamp, measurement
    FROM sensorData
    WHERE type = 'temperature' AND device_id = 'temp1'
    ORDER BY timestamp
""")

display(temps)

device_id,type,timestamp,measurement
temp1,temperature,2020-09-01 03:00:00,22.671717271141567
temp1,temperature,2020-09-01 03:15:00,22.427013031545386
temp1,temperature,2020-09-01 03:30:00,23.21266136616038
temp1,temperature,2020-09-01 03:45:00,22.280158977203943
temp1,temperature,2020-09-01 04:00:00,22.280158977203943
temp1,temperature,2020-09-01 04:15:00,22.755368861939
temp1,temperature,2020-09-01 04:30:00,22.828357539779773
temp1,temperature,2020-09-01 04:45:00,22.41430333062398
temp1,temperature,2020-09-01 05:00:00,22.531936785837352
temp1,temperature,2020-09-01 05:15:00,22.32209520818083
