In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

- Data is stored in a nested structure
- Two parent folders, one for phone data, and one for watch data
- Inside each are two more folders, one for accelerometer data, one for gyroscope data


In [3]:
#Path Names
participant_id = list(range(1600,1651))

#Initialize empty lists for each pathname
accel_phone = []
gyro_phone = []
accel_watch = []
gyro_watch = []

for i in participant_id:
    accel_phone.append(f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/accel/data_{i}_accel_phone.txt")
    gyro_phone.append(f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/gyro/data_{i}_gyro_phone.txt")
    accel_watch.append(f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/accel/data_{i}_accel_watch.txt")
    gyro_watch.append(f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/gyro/data_{i}_gyro_watch.txt")
    
#Check to see it worked
print(accel_phone[10])
print(gyro_phone[10])
print(accel_watch[10])
print(gyro_watch[10])

../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/accel/data_1610_accel_phone.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/gyro/data_1610_gyro_phone.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/accel/data_1610_accel_watch.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/gyro/data_1610_gyro_watch.txt


In [4]:
def Clean_Lines(pathnames):
    
    #initialize empty list for data
    master_list = []
    
    #iterate through each pathname
    for i in pathnames:
        infile = open(i, 'r') #open file
        lines = infile.readlines() #read file line by line
        
        #empty list for each participant
        clean_lines = []
        
        #clean up lines
        for line in lines: 
            n_line = line.strip() #get rid of trailing newline characters at the end of the line
            n_line = n_line.strip(';\n') #get rid of trailing newline characters at the end of the line
            n_line = n_line.split(',')  # separates line into a list of items split on commas
            clean_lines.append(n_line) #add to data list
            
        #appened cleaned lines to master list, so that the list is organized by each individual participant
        master_list.append(clean_lines)

    #close the file
    infile.close()
    
    #return clean_lines
    return master_list

In [5]:
#apply function to each of the data folders
accel_phone_data = Clean_Lines(accel_phone)
gyro_phone_data = Clean_Lines(gyro_phone)
accel_watch_data = Clean_Lines(accel_watch)
gyro_watch_data = Clean_Lines(gyro_watch)

In [6]:
gyro_watch_data[50][1000]

['1650', 'A', '2428077847903174', '-0.09590363', '-0.09410573', '0.012912337']

## Shorter Naming Conventions:
- a = accelerometer
- g = gryoscope
- w = watch
- p = phone
- X = independent variable
- Y = dependent variable, targets

In [7]:
def DataCreator(data):
   
    #initialize an empty list for each relevant variable
    master_list = []
    
    #iterate through data and append each data point to it's relevant list for each row
    #also convert data types from string
    for user in data:
        timestamp = []
        X = []
        Y = []
        Z = []
        for i in user:
            timestamp.append(np.array(int(i[2])))
            X.append(np.array(float(i[3])))
            Y.append(np.array(float(i[4])))
            Z.append(np.array(float(i[5])))
        master_list.append(np.array([timestamp, X, Y, Z]))
    
    #convert list of lits to  numpy array for faster calculations and easier data access
    return master_list

In [8]:
#apply function to each data subdivision
ap_data_X = DataCreator(accel_phone_data)
gp_data_X = DataCreator(gyro_phone_data)
aw_data_X = DataCreator(accel_watch_data)
gw_data_X = DataCreator(gyro_watch_data)

In [9]:
#check to see that data is orgnaized by participant
len(ap_data_X)

51

In [10]:
#check shape of individual participant data
ap_data_X[0].shape

(4, 64311)

# Labels

In [16]:
def LabelCreator(data):
    
    #intialize empty lists
    master_list = []
    
    #iterate and append relevant information by row
    for user in data:
        timestamp = []
        activity = []
        participant = []
        
        for i in user:
            timestamp.append(int(i[2]))
            activity.append(i[1])
            participant.append(i[0])
        master_list.append(np.array([timestamp, activity, participant]))
    #return numpy array
    return master_list

In [17]:
ap_labels_Y = LabelCreator(accel_phone_data)
gp_labels_Y = LabelCreator(gyro_phone_data)
aw_labels_Y = LabelCreator(accel_watch_data)
gw_labels_Y = LabelCreator(gyro_watch_data)

# Padding
- Although each participant performed each activity for ~3 minutes, the exact number of data points differ slightly
- The length of the arrays needs to be standardized to feed into the model
- This can be done through padding them with zeros

In [18]:
def Length(data):
    data_length = []
    for user in data:
        data_length.append(len(user[0]))
    return data_length

In [19]:
#Check maximum lengths for each type of data (gyroscope, accelerometer, watch, phone)
length_ap = Length(ap_data_X)
length_ap.sort()
print("AP max length: ", length_ap[-1])

length_gp = Length(gp_data_X)
length_gp.sort()
print("GP max length: ", length_gp[-1])

length_aw = Length(aw_data_X)
length_aw.sort()
print("AW max length: ", length_aw[-1])

length_gw = Length(gw_data_X)
length_gw.sort()
print("GW max length: ", length_gw[-1])

AP max length:  163768
GP max length:  128500
AW max length:  160799
GW max length:  129738


## Max Length
- 163,768 is the maximum number of entries out of all of the data types and participants
- It is the number we will used to pad the length of each entry

## Reformatting Data to Extract only Relevant Details
- We want the X, Y, Z data from each data type's independent variable data
- And we want the activity label class from the dependent variable
- Because each participant's 54 minutes of data (~3 minutes per activity) will be padded to idnentical seqeunce lenghts with their associated labe;s, the time stamps become irrelevant
- the timestamps in this data set were not synchronous among participants, though the amount of time spent performing each activity was roughly the same

In [20]:
def DataExtractor(data):
    master_list = []
    
    for user in data:
        master_list.append(user[1:4]) # The X, Y, Z columns
    return master_list

In [21]:
X_ap = DataExtractor(ap_data_X)
X_gp = DataExtractor(gp_data_X)
X_aw = DataExtractor(aw_data_X)
X_gw = DataExtractor(gw_data_X)

In [22]:
def Padding(data):
    master_list = []
    
    for user in data:
        pad = keras.preprocessing.sequence.pad_sequences(user, padding = "post", maxlen = 163768, dtype='float32')
        master_list.append(pad)
    return master_list

In [23]:
X_ap_clean = Padding(X_ap)
X_gp_clean = Padding(X_gp)
X_aw_clean = Padding(X_aw)
X_gw_clean = Padding(X_gw)

In [24]:
len(X_gw_clean[0][1])

163768

## Labels

In [25]:
#Get labels for each time stamp
def LabelExtractor(data):
    master_list = []
    
    for user in data:
        master_list.append(user[1]) # Activity Label Column
    return master_list

In [26]:
Y_ap = LabelExtractor(ap_labels_Y)
Y_gp = LabelExtractor(gp_labels_Y)
Y_aw = LabelExtractor(aw_labels_Y)
Y_gw = LabelExtractor(gw_labels_Y)

In [29]:
Y_ap[0]

array(['A', 'A', 'A', ..., 'S', 'S', 'S'], dtype='<U21')

In [None]:
test = keras.preprocessing.sequence.pad_sequences(Y_ap[0], padding = "post", maxlen = 163768, dtype='str')

In [28]:
def Padding_Labels(data):
    master_list = []
    
    for user in data:
        pad = keras.preprocessing.sequence.pad_sequences(user, padding = "post", maxlen = 163768, dtype='str')
        master_list.append(pad)
        
    return master_list

In [None]:
Y_ap_pad = Padding_Labels(Y_ap)

In [None]:
Y_ap_pad

In [1]:
Y_ap_pad = Padding_Labels(Y_ap)
Y_gp_pad = Padding_Labels(Y_gp)
Y_aw_pad = Padding_Labels(Y_aw)
Y_gp_pad = Padding_Labels(Y_gp)

NameError: name 'Padding' is not defined

In [181]:
len(Y_ap[0])

64311

In [234]:
def OHE(data):
    ohe = OneHotEncoder()
    master_list = []
    for user in data:
        sparse_matrix = ohe.fit_transform(Y_ap[0].reshape(-1, 1))
        master_list.append(sparse_matrix)
    return master_list

In [41]:
test3 = OHE(Y_ap)

NameError: name 'OHE' is not defined

In [40]:
len(test3[0].todense())

NameError: name 'test3' is not defined

In [33]:
ohe = OneHotEncoder(sparse = False)

In [34]:
test = ohe.fit_transform(test.reshape(-1, 1))

In [39]:
len(test[0])

19

In [38]:
test.categories_

AttributeError: 'numpy.ndarray' object has no attribute 'categories_'

In [214]:
test.transform

<bound method OneHotEncoder.transform of OneHotEncoder()>

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
OneHotEncoder(handle_unknown='ignore')
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
array([[1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])
>>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
array([['Male', 1],
       [None, 2]], dtype=object)
>>> enc.get_feature_names_out(['gender', 'group'])
array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)