# CASAS Smart Home ADS Coursework

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import datetime
import os
import re
import seaborn as sns
from collections import Counter
from datetime import datetime

from keras.preprocessing import sequence

In [3]:
# Sensor identifiers accepted by the loader: M001-M030, D001-D004 and T001-T005,
# in that order. Built from (prefix, count) pairs instead of a hand-typed list.
sensor_codes = [
    '{}{:03d}'.format(prefix, number)
    for prefix, count in (('M', 30), ('D', 4), ('T', 5))
    for number in range(1, count + 1)
]

# Data cleaning

In [4]:
def load_dataset(filename, valid_sensors=None):
    """Parse a whitespace-separated sensor event log into a DataFrame.

    Each line is split on whitespace and read as
    ``<date> <time> <sensor> <value> [<activity words...>]``.
    A row is kept only when the sensor code is accepted, the value is
    ON/OFF or parses as a float, and the timestamp parses. All four fields
    of a row are validated first and appended together, so the output
    columns always have the same length.

    Activity annotations are tracked as a running state: a line whose
    annotation contains ``begin`` opens an activity, a line whose annotation
    contains ``end`` closes it, and every row in between is labelled with the
    open activity. Rows outside any activity get the empty string.

    Parameters
    ----------
    filename : str
        Path of the log file. It is read as bytes and each line is decoded
        with the default codec of ``bytes.decode``.
    valid_sensors : iterable of str, optional
        Sensor codes to keep. Defaults to the module-level ``sensor_codes``.

    Returns
    -------
    pandas.DataFrame
        One row per accepted event, with columns
        ``Timestamps``, ``Sensors``, ``Values`` and ``Activities``.

    Notes
    -----
    Lines that are too short to hold a sensor (or a value for an accepted
    sensor) are printed together with their line index and skipped.
    """
    if valid_sensors is None:
        valid_sensors = sensor_codes
    valid_sensors = set(valid_sensors)

    timestamps = []
    sensors = []
    values = []
    activities = []
    current_activity = ''  # activity currently open; '' when none

    with open(filename, 'rb') as features:
        # Iterate the file lazily instead of materialising every line.
        for i, line in enumerate(features):
            f_info = line.decode().split()

            if len(f_info) < 3:  # no sensor field at all
                print(i, line)
                continue

            sensor = f_info[2]
            if sensor not in valid_sensors:
                continue

            if len(f_info) < 4:  # accepted sensor but no reading
                print(i, line)
                continue

            # --- reading ---------------------------------------------------
            raw_value = f_info[3]
            if 'OFF' in raw_value:
                value = 'OFF'
            elif 'ON' in raw_value:
                value = 'ON'
            else:
                # NOTE(review): any other textual reading (e.g. OPEN/CLOSE)
                # fails float() and the row is dropped - confirm this is
                # intended for the D-prefixed sensors.
                try:
                    value = float(raw_value)
                except ValueError:
                    continue

            # --- timestamp -------------------------------------------------
            stamp = f_info[0] + f_info[1]
            if '.' not in stamp:
                # Some lines omit the fractional seconds required by %f.
                stamp += '.000000'
            try:
                timestamp = datetime.strptime(stamp, "%Y-%m-%d%H:%M:%S.%f")
            except ValueError:
                continue

            # --- activity label --------------------------------------------
            if len(f_info) == 4:  # no annotation on this line
                activity = current_activity
            else:
                des = ' '.join(f_info[4:])
                if 'begin' in des:
                    current_activity = re.sub('begin', '', des).strip()
                    activity = current_activity
                elif 'end' in des:
                    # The closing event still belongs to the activity.
                    activity = current_activity
                    current_activity = ''
                else:
                    # Annotation without begin/end: keep the running label so
                    # the row is not left without an activity entry.
                    activity = current_activity

            # Append all four fields together so the columns stay aligned.
            timestamps.append(timestamp)
            sensors.append(sensor)
            values.append(value)
            activities.append(activity)

    return pd.DataFrame({'Timestamps': timestamps,
                         'Sensors': sensors,
                         'Values': values,
                         'Activities': activities},
                        columns=['Timestamps', 'Sensors', 'Values', 'Activities'])
    

In [5]:
# Parse the raw event log at ./datasets/aruba/data into a DataFrame.
aruba_dataset = load_dataset("./datasets/aruba/data")

In [6]:
# Work on a copy: add_transition_labels writes into the frame it receives,
# so the freshly loaded aruba_dataset is kept untouched.
copy_aruba = aruba_dataset.copy();

In [7]:
def add_transition_labels(aruba_set, max_rows=5000):
    """Label activity-less rows as transitions between neighbouring activities.

    Every row whose ``Activities`` value is the empty string is relabelled
    ``'Transition_<previous>_<next>'``, where ``<previous>`` is the most
    recent non-empty label above it (``''`` if there is none) and ``<next>``
    is the first non-empty label below it. Rows in a trailing gap that has
    no labelled row after it are left empty.

    Rows are addressed by position, so the frame's index may hold any labels.

    Parameters
    ----------
    aruba_set : pandas.DataFrame
        Frame with an ``Activities`` column of strings. It is modified in
        place.
    max_rows : int or None, optional
        Only the first ``max_rows`` rows are relabelled (the look-ahead for
        the next activity may read beyond that point). ``None`` processes
        every row. Defaults to 5000.

    Returns
    -------
    pandas.DataFrame
        The same ``aruba_set`` object, after modification.
    """
    labels = aruba_set['Activities'].tolist()
    total = len(labels)
    stop = total if max_rows is None else min(max_rows, total)

    previous_activity = ''
    transition = ''             # label shared by the rows of the current gap
    no_activity_ahead = False   # set once a gap is found to run to the end

    for i in range(stop):
        entry = labels[i]
        if entry != '':
            previous_activity = entry
            transition = ''
            continue

        if transition == '' and not no_activity_ahead:
            # First row of a gap: look ahead once for the next labelled row.
            j = i + 1
            while j < total and labels[j] == '':
                j += 1
            if j < total:
                transition = 'Transition_' + previous_activity + '_' + labels[j]
            else:
                # Nothing labelled below; avoid rescanning for later rows.
                no_activity_ahead = True

        if transition != '':
            labels[i] = transition

    # A list is assigned positionally, independent of the index labels.
    aruba_set['Activities'] = labels
    return aruba_set

In [8]:
# Peek at rows 49-52 of the activity column before transition labels are added.
copy_aruba['Activities'][49:53]

49                 
50                 
51                 
52    Bed_to_Toilet
Name: Activities, dtype: object

In [9]:
# Fill the unlabeled rows with transition labels. The function writes into
# copy_aruba and returns that same frame, so both names refer to one object.
complete_aruba = add_transition_labels(copy_aruba)

In [None]:
# Inspect the first 60 rows to check the filled-in activity labels.
complete_aruba.head(60)

# Pre-processing

In [24]:
def process_time_data(dataset, limit=50):
    """Print the time of day of the leading rows as whole seconds since midnight.

    For each of the first ``limit`` entries of ``dataset["Timestamps"]`` the
    value ``hour * 3600 + minute * 60 + second`` is printed on its own line;
    sub-second precision is ignored.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``Timestamps`` column whose entries expose ``hour``,
        ``minute`` and ``second`` attributes.
    limit : int or None, optional
        Number of leading rows to print; ``None`` prints every row.
        Defaults to 50.

    Returns
    -------
    None
    """
    for instance in dataset["Timestamps"][:limit]:
        print(instance.hour * 3600 + instance.minute * 60 + instance.second)

In [25]:
# Print seconds-since-midnight for the first 50 events of the labelled frame.
process_time_data(complete_aruba)

230
237
908
1819
1819
2122
2425
2728
3942
4548
5459
6369
6976
8492
8492
9153
9158
9705
10615
12131
13341
13345
13345
13792
13797
14254
14254
15272
15273
15277
15279
16377
16457
16459
16460
16461
16984
18803
19107
20017
20427
20429
20432
20433
20434
20440
20440
20442
20443
20444


In [None]:
# Draw 50-bin histograms of the labelled frame via DataFrame.hist.
complete_aruba.hist(bins = 50)


In [None]:
# Summary statistics of the labelled frame.
complete_aruba.describe()

In [None]:
# Categorical plot of event timestamps (y) grouped by activity label (x).
sns.catplot(x = 'Activities', y = 'Timestamps', data = complete_aruba)