In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
import pickle

- Data is stored in a nested structure
- Two parent folders, one for phone data, and one for watch data
- Inside each are two more folders, one for accelerometer data, one for gyroscope data


# Naming Conventions:
- a = accelerometer
- g = gyroscope
- w = watch
- p = phone
- X = independent variable
- Y = dependent variable, targets

In [2]:
# Path names
participant_id = list(range(1600, 1651))

# One list of file paths per device/sensor combination, ordered by participant ID
accel_phone = [
    f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/accel/data_{pid}_accel_phone.txt"
    for pid in participant_id
]
gyro_phone = [
    f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/gyro/data_{pid}_gyro_phone.txt"
    for pid in participant_id
]
accel_watch = [
    f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/accel/data_{pid}_accel_watch.txt"
    for pid in participant_id
]
gyro_watch = [
    f"../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/gyro/data_{pid}_gyro_watch.txt"
    for pid in participant_id
]

# Sanity check: print the path generated at index 10 of each list
for sample_paths in (accel_phone, gyro_phone, accel_watch, gyro_watch):
    print(sample_paths[10])

../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/accel/data_1610_accel_phone.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/phone/gyro/data_1610_gyro_phone.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/accel/data_1610_accel_watch.txt
../Accelerometer-Action-Dectection/wisdm-dataset/raw/watch/gyro/data_1610_gyro_watch.txt


In [3]:
def Clean_Lines(pathnames):
    """Read raw sensor text files and split every record into its comma-separated fields.

    Parameters
    ----------
    pathnames : iterable of str
        Paths of the text files to read; the output keeps the same order.

    Returns
    -------
    list of list of list of str
        One entry per path. Each entry holds that file's records, and each
        record is the list of string fields obtained by splitting the line on
        commas. An empty ``pathnames`` yields an empty list.

    Notes
    -----
    Lines that are empty once whitespace and the trailing ';' are removed are
    skipped, so they never produce a one-field record.
    """
    # one entry per file, so the result stays organised by participant
    master_list = []

    for path in pathnames:
        clean_lines = []

        # The context manager closes every file (not just the last one),
        # even if reading fails part-way through.
        with open(path, 'r') as infile:
            for line in infile:
                # drop surrounding whitespace, then the ';' record terminator
                n_line = line.strip().strip(';\n')
                if not n_line:
                    continue  # blank line: nothing to record
                clean_lines.append(n_line.split(','))

        master_list.append(clean_lines)

    return master_list

In [4]:
# Parse the files of every device/sensor combination with Clean_Lines
ap_data, gp_data, aw_data, gw_data = (
    Clean_Lines(paths)
    for paths in (accel_phone, gyro_phone, accel_watch, gyro_watch)
)

In [5]:
# Inspect the parsed records from the first phone-accelerometer file
ap_data[0]

[['1600', 'A', '252207666810782', '-0.36476135', '8.793503', '1.0550842'],
 ['1600', 'A', '252207717164786', '-0.8797302', '9.768784', '1.0169983'],
 ['1600', 'A', '252207767518790', '2.0014954', '11.10907', '2.619156'],
 ['1600', 'A', '252207817872794', '0.45062256', '12.651642', '0.18455505'],
 ['1600', 'A', '252207868226798', '-2.1643524', '13.928436', '-4.4224854'],
 ['1600', 'A', '252207918580802', '-4.332779', '13.361191', '-0.7188721'],
 ['1600', 'A', '252207968934806', '-0.31944275', '13.318359', '-0.23202515'],
 ['1600', 'A', '252208019288809', '1.566452', '9.515274', '-0.01777649'],
 ['1600', 'A', '252208069642813', '-0.32374573', '5.262665', '0.32234192'],
 ['1600', 'A', '252208119996817', '-1.811676', '3.7105103', '1.3739319'],
 ['1600', 'A', '252208170350821', '-1.1340485', '4.538269', '2.2975464'],
 ['1600', 'A', '252208220704825', '0.09291077', '6.7061005', '1.8996277'],
 ['1600', 'A', '252208271058829', '-1.0363159', '15.611984', '2.6415253'],
 ['1600', 'A', '2522083214

In [6]:
# Helper that rescales and rounds a raw timestamp string
def TimestampClean(cell):
    """Insert a decimal point before the last eight characters of ``cell`` and round.

    The resulting text is parsed as a float and rounded to zero decimal
    places, so the return value is a whole-number float.
    """
    leading_digits, trailing_digits = cell[:-8], cell[-8:]
    scaled = float(f"{leading_digits}.{trailing_digits}")
    return round(scaled, 0)

In [7]:
def MakeDataFrame(data):
    """Build a frame from one file's parsed records, one row per cleaned timestamp.

    Parameters
    ----------
    data : list of list of str
        Records with six fields each: participant ID, activity label,
        timestamp, X, Y, Z.

    Returns
    -------
    pandas.DataFrame
        Columns ``Timestamp``, ``Participant ID``, ``Activity Label``, ``X``,
        ``Y``, ``Z``. Records sharing a cleaned timestamp are collapsed: the
        first ID and label are kept and X/Y/Z are averaged.
    """
    column_names = ['Participant ID', 'Activity Label', 'Timestamp', 'X', 'Y', 'Z']
    frame = pd.DataFrame(data, columns=column_names)

    frame["Timestamp"] = frame["Timestamp"].apply(TimestampClean)
    for axis_name in ('X', 'Y', 'Z'):
        frame[axis_name] = frame[axis_name].astype('float')

    # how each column is reduced when several records share a timestamp
    reducers = {
        'Participant ID': 'first',
        'Activity Label': 'first',
        'X': 'mean',
        'Y': 'mean',
        'Z': 'mean',
    }
    ordered = frame.sort_values(by=['Timestamp'])
    collapsed = ordered.groupby(['Timestamp']).agg(reducers)
    return collapsed.reset_index()

In [8]:
def DataMerge(df_ap, df_gp, df_aw, df_gw):
    """Combine one participant's four sensor frames into a single wide frame.

    Parameters
    ----------
    df_ap, df_gp : pandas.DataFrame
        Phone accelerometer / gyroscope frames.
    df_aw, df_gw : pandas.DataFrame
        Watch accelerometer / gyroscope frames.
        All four need the columns ``Timestamp``, ``Participant ID``,
        ``Activity Label``, ``X``, ``Y`` and ``Z``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Participant ID_ap``, ``Activity Label_ap`` and the X/Y/Z
        readings suffixed ``_ap``, ``_gp``, ``_aw``, ``_gw``. No ``Timestamp``
        column is kept. Gaps are forward-filled.
    """
    # Accelerometer and gyroscope are joined on Timestamp within each device.
    phone = df_ap.merge(df_gp, left_on='Timestamp', right_on='Timestamp',
                        suffixes=('_ap', '_gp'))
    watch = df_aw.merge(df_gw, left_on='Timestamp', right_on='Timestamp',
                        suffixes=('_aw', '_gw'))

    # NOTE(review): concat(axis=1) lines phone and watch up by row index, not by
    # Timestamp, so the two devices are paired positionally.
    combined = pd.concat([phone, watch], axis=1)

    # Keep one ID/label pair (the phone accelerometer's). Both Timestamp columns
    # are removed: the previous drop by position resolved to the 'Timestamp'
    # label and therefore already removed both of them.
    combined = combined.drop(columns=[
        'Participant ID_gp', 'Participant ID_aw', 'Participant ID_gw',
        'Activity Label_gp', 'Activity Label_aw', 'Activity Label_gw',
        'Timestamp',
    ])

    # The shorter device leaves NaN rows at the end; carry the last reading
    # forward. ffill() replaces the deprecated fillna(method='ffill').
    return combined.ffill()

In [9]:
# One index per participant, derived from the parsed data instead of a
# hard-coded count so it cannot drift from the number of files read.
participant_number = range(len(ap_data))

# Build one merged phone+watch frame per participant, in participant order
participant_list = []
for i in participant_number:
    df_ap = MakeDataFrame(ap_data[i])
    df_gp = MakeDataFrame(gp_data[i])
    df_aw = MakeDataFrame(aw_data[i])
    df_gw = MakeDataFrame(gw_data[i])

    participant_list.append(DataMerge(df_ap, df_gp, df_aw, df_gw))

In [10]:
# Row count of the merged frame at index 50 of participant_list
len(participant_list[50])

32391

In [11]:
# Display the merged frame at index 50 of participant_list
participant_list[50]

Unnamed: 0,Participant ID_ap,Activity Label_ap,X_ap,Y_ap,Z_ap,X_gp,Y_gp,Z_gp,X_aw,Y_aw,Z_aw,X_gw,Y_gw,Z_gw
0,1650,D,0.433978,0.442121,9.951307,0.000710,-0.001065,0.000000,-2.230049,-5.131672,8.090007,-0.107734,-0.114160,0.002739
1,1650,D,0.422002,0.446432,9.959450,0.002131,0.001598,0.000000,-1.738041,-4.497208,8.309078,-0.157802,-0.254775,0.025642
2,1650,D,0.429667,0.450264,9.956096,0.001775,0.001420,0.000000,-2.031331,-4.427777,8.991425,0.240607,-0.128008,-0.021230
3,1650,D,-0.572889,0.250519,9.975736,0.004261,0.003728,0.194411,-2.038513,-4.734235,8.270770,0.051523,0.025390,-0.115506
4,1650,D,-0.562231,-0.536484,9.989028,0.015446,0.022903,1.261273,-1.914015,-4.437354,8.441955,-0.117322,0.040303,-0.001523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32386,1650,P,1.195594,-9.047905,2.391188,0.022726,-0.174348,0.072083,-4.435408,-4.841974,-3.186682,1.635164,-0.938315,-0.102149
32387,1650,P,1.685615,-9.085746,2.091331,0.069242,-0.433563,0.191748,-4.435408,-4.841974,-3.186682,1.635164,-0.938315,-0.102149
32388,1650,P,1.638672,-9.268247,1.665018,0.162985,-0.286911,0.080250,-4.435408,-4.841974,-3.186682,1.635164,-0.938315,-0.102149
32389,1650,P,0.746288,-9.263457,2.150728,0.213053,-0.023968,0.056459,-4.435408,-4.841974,-3.186682,1.635164,-0.938315,-0.102149


In [59]:
# Keep only the sensor readings, then check the transposed array's shape
non_feature_columns = ['Activity Label_ap', 'Participant ID_ap']
X_test = participant_list[50].drop(columns=non_feature_columns)
np.array(X_test).T.shape

(12, 32391)

## Reformatting Data to Extract only Relevant Details
- We want the X, Y, Z data from each type of sensor for each participant (12 different columns)
- And we want the activity label class from the dependent variable
- Because each participant's 54 minutes of data (~3 minutes per activity) will be padded to identical sequence lengths with their associated labels, the timestamps become irrelevant
- the timestamps in this data set were not synchronous among participants, though the amount of time spent performing each activity was roughly the same

In [145]:
def _as_object_array(arrays):
    """Wrap a list of arrays in a 1-D object array without stacking them."""
    wrapped = np.empty(len(arrays), dtype=object)
    for idx, arr in enumerate(arrays):
        wrapped[idx] = arr
    return wrapped


def DataCreator(data):
    """Split each participant frame into a feature array and a label array.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Frames containing ``Participant ID_ap``, ``Activity Label_ap`` and the
        sensor reading columns.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two 1-D object arrays with one element per participant: the feature
        matrix (rows x remaining columns) and the activity-label vector.

    Notes
    -----
    Participants have different row counts, so the per-participant arrays
    cannot be stacked into one rectangular array. Calling ``np.array`` on such
    a ragged list raises ``ValueError`` on recent NumPy versions, so the
    arrays are stored element-wise in an object array instead.
    """
    X_list = []
    Y_list = []
    for participant in data:
        X = participant.drop(columns=['Activity Label_ap', 'Participant ID_ap'])
        # select the label by name, matching the drop above, rather than by position
        Y = participant['Activity Label_ap']
        X_list.append(np.array(X))
        Y_list.append(np.array(Y))

    return _as_object_array(X_list), _as_object_array(Y_list)

In [146]:
# Split features from labels, then transpose each participant's feature matrix
X, Y = DataCreator(participant_list)
X = [features.T for features in X]

In [147]:
# Shape of the first participant's transposed feature matrix
X[0].shape

(12, 32372)

## Padding
- Although each participant performed each activity for ~3 minutes, the exact number of data points differ slightly
- The length of the arrays needs to be standardized to feed into the model
- This can be done through padding them with zeros

In [148]:
# Row counts of every merged frame, longest first; the longest sets the padding length
length_list = sorted(
    (len(participant_list[idx]) for idx in participant_number),
    reverse=True,
)
print("Max length to pad: ", length_list[0])

Max length to pad:  32409


## Max Length
- 32,409 is the maximum number of entries out of all of the data types and participants
- It is the number we will use to pad the length of each entry

In [149]:
def Padding(data, maxlen=32409):
    """Pad every participant's sequences at the end to a common length.

    Parameters
    ----------
    data : iterable
        One entry per participant; each entry is passed to
        ``keras.preprocessing.sequence.pad_sequences`` as a batch of sequences.
    maxlen : int, optional
        Length every sequence is padded to. Defaults to 32409, the longest
        participant recording found in this notebook.

    Returns
    -------
    list
        One padded ``float32`` array per participant, in input order.
    """
    return [
        keras.preprocessing.sequence.pad_sequences(
            user, padding="post", maxlen=maxlen, dtype='float32'
        )
        for user in data
    ]

In [170]:
# Pad every participant's feature matrix to the common length; X_pad is the feature data saved for modelling
X_pad = Padding(X)

## Convert to Numeric Sequence & Pad Sequence

In [151]:
def Label_Prep(data, maxlen=32409, pad_value=18):
    """Integer-encode each participant's activity labels and pad them to a common length.

    Parameters
    ----------
    data : sequence
        One label sequence per participant.
    maxlen : int, optional
        Length every encoded sequence is padded to (default 32409, the longest
        participant recording found in this notebook).
    pad_value : int, optional
        Integer written into the padded positions (default 18).
        NOTE(review): presumably chosen because the activity classes encode to
        0-17, making 18 a distinct "padding" class. Confirm against the label set.

    Returns
    -------
    list
        One ``int64`` array of shape ``(1, maxlen)`` per participant, in input
        order. Empty input returns an empty list.
    """
    output = []
    if len(data) == 0:
        return output

    # Fit on the labels of ALL participants. Fitting on the first participant
    # only makes transform() raise for any label that participant never performed.
    all_labels = np.concatenate([np.asarray(user) for user in data])
    label_encoder = LabelEncoder().fit(all_labels)

    for user in data:
        integer_encoded = label_encoder.transform(np.asarray(user))
        # pad_sequences expects a batch, so wrap the single sequence as one row
        pad = keras.preprocessing.sequence.pad_sequences(
            integer_encoded.reshape(1, integer_encoded.shape[0]),
            padding="post", maxlen=maxlen, dtype='int64', value=pad_value,
        )
        output.append(pad)
    return output

In [169]:
# Integer-encode each participant's labels and pad them to the max length
Y_prep = Label_Prep(Y)

## One Hot Encoding into Sparse Matrices

In [154]:
def OHE(data):
    """One-hot encode each participant's integer label sequence into a sparse matrix.

    Parameters
    ----------
    data : sequence of numpy.ndarray
        One integer label array per participant.

    Returns
    -------
    list
        One sparse matrix per participant, in input order, with one row per
        label. Every matrix has the same columns in the same order. Empty
        input returns an empty list.
    """
    if len(data) == 0:
        return []

    # Fit once on the categories seen across ALL participants. Refitting per
    # participant gives matrices whose column count and meaning differ whenever
    # someone lacks a class (e.g. a full-length sequence with no padding value).
    categories = np.unique(np.concatenate([np.unique(user) for user in data]))
    ohe = OneHotEncoder().fit(categories.reshape(-1, 1))

    return [ohe.transform(user.reshape(-1, 1)) for user in data]

In [168]:
# One-hot encode every participant's padded label sequence; Y_ohe is the label data saved for modelling
Y_ohe = OHE(Y_prep)

# Save the Cleaned Data!
- Now that the data has been cleaned and preprocessed, save it in a pickle file for easy recall in the modelling notebook

In [171]:
# Persist the padded features (X) and the one-hot labels (Y)
for pickle_path, payload in (('X.pkl', X_pad), ('Y.pkl', Y_ohe)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

## Notes
- need to pad everything out still
- and then split into train/test
- OHE Categorical Variables
- convert to array again for plugging into model?
- Adapt the older functions?

- down the line need to get visuals/beautiful AI presentation

## Note: 
The sensors are not in sync on the same time scale. For instance, note how the watch gyroscope data and the phone accelerometer data change from the first activity to the second at different times.