# Identifying sedimentary and active activities using a smart watch

In [68]:
import pandas as pd
import glob 
import os
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [4]:
df_phone_acc = pd.read_csv('./wisdm-dataset/raw/phone/accel/data_1601_accel_phone.txt', sep=',',names=['id','ac','timestamp','x','y','z'])

In [5]:
df_phone_acc

Unnamed: 0,id,ac,timestamp,x,y,z
0,1601,A,265073308304101,4.703409,9.127296,0.06404489;
1,1601,A,265073348330612,5.354632,15.635334,-0.6290765;
2,1601,A,265073388368581,6.399701,12.926893,0.45010993;
3,1601,A,265073428111445,10.532093,13.207614,-1.0247183;
4,1601,A,265073468081082,16.129736,2.683301,1.1426327;
...,...,...,...,...,...,...
81452,1601,S,258908699056416,2.015319,9.988011,0.74639237;
81453,1601,S,258908738947822,1.681927,10.074801,1.7262194;
81454,1601,S,258908778855321,1.148020,9.127296,1.4921862;
81455,1601,S,258908818435165,1.417966,9.126099,1.0779893;


<h3> Combining files for accelerator and gyroscope data </h3>

In [2]:
#Combine all files for accel
path="./wisdm-dataset/raw/watch/accel"                    
all_files = [i for i in glob.glob(os.path.join(path, "*.txt"))]  
accel_df = pd.concat([pd.read_csv(f,sep=',',names=['id','ac','timestamp','a-x','a-y','a-z']) for f in all_files], ignore_index=True)
accel_df.head()

Unnamed: 0,id,ac,timestamp,a-x,a-y,a-z
0,1600,A,90426708196641,7.091625,-0.591667,8.195502;
1,1600,A,90426757696641,4.972757,-0.158317,6.6967316;
2,1600,A,90426807196641,3.25372,-0.191835,6.107758;
3,1600,A,90426856696641,2.801216,-0.155922,5.997625;
4,1600,A,90426906196641,3.770868,-1.051354,7.731027;


In [3]:
#Combine all files for gyro
path="./wisdm-dataset/raw/watch/gyro"                    
all_files = [i for i in glob.glob(os.path.join(path, "*.txt"))]  
gyro_df = pd.concat([pd.read_csv(f,sep=',',names=['id','ac','timestamp','g-x','g-y','g-z']) for f in all_files], ignore_index=True)
gyro_df.head()

Unnamed: 0,id,ac,timestamp,g-x,g-y,g-z
0,1600,A,90426757696641,0.314944,-1.022277,-0.3099616;
1,1600,A,90426807196641,0.387382,-0.618541,-0.048971802;
2,1600,A,90426856696641,0.070999,-0.20948,-0.1959783;
3,1600,A,90426906196641,0.037975,0.254976,-0.1565635;
4,1600,A,90426955696641,0.073129,0.719431,-0.0010349044;


In [36]:
#Check for missing values -- there are no missing values
#accel_df_clean[accel_df_clean.isnull().any(axis=1)]
#gyro_df_clean[gyro_df_clean.isnull().any(axis=1)]

Unnamed: 0,ac,g-x,g-y,g-z


In [7]:
combined_df = accel_df.merge(gyro_df, on='timestamp', how='inner')
combined_df.head()

Unnamed: 0,id_x,ac_x,timestamp,a-x,a-y,a-z,id_y,ac_y,g-x,g-y,g-z
0,1600,A,90426757696641,4.972757,-0.158317,6.6967316;,1600,A,0.314944,-1.022277,-0.3099616;
1,1600,A,90426807196641,3.25372,-0.191835,6.107758;,1600,A,0.387382,-0.618541,-0.048971802;
2,1600,A,90426856696641,2.801216,-0.155922,5.997625;,1600,A,0.070999,-0.20948,-0.1959783;
3,1600,A,90426906196641,3.770868,-1.051354,7.731027;,1600,A,0.037975,0.254976,-0.1565635;
4,1600,A,90426955696641,4.661511,0.169689,9.684695;,1600,A,0.073129,0.719431,-0.0010349044;


<h3> Check if labels and ids match across gyroscope and accelerator </h3>

In [96]:
#Check if labels match across gyroscope and accelerator 
label_check=[]
for r in tqdm(np.arange(len(combined_df))):
    if (combined_df.ac_x[r] == combined_df.ac_y[r]):
        label_check.append('True')
    else:
        label_check.append('False')

100%|██████████████████████████████| 3370861/3370861 [08:22<00:00, 6708.43it/s]


In [99]:
label_check.count('True')

3368542

In [100]:
label_check.count('False')

2319

In [105]:
(2319/3368542)*100

0.06884284061175429

0.07% of the data is labelled incorrectly (most likely due to computer error), this should not make a significant difference to the results

In [15]:
#Check if ids match across gyroscope and accelerator 
id_check=[]
for r in tqdm(np.arange(len(combined_df))):
    if (combined_df.id_x[r] == combined_df.id_y[r]):
        id_check.append('True')
    else:
        id_check.append('False')

100%|██████████████████████████████| 3370861/3370861 [08:14<00:00, 6818.10it/s]


In [16]:
id_check.count('True')

3370861

In [17]:
id_check.count('False')

0

<h3> Clean up the combined table and give binary labels </h3>
<p>The activities will be split into two categories: Sedimentry and active. <br>
    Sedimentary activies are: sitting, typing, writing (D, F, Q)<br>
    Active activities are: Walking, jogging, stairs, standing, kicking (football), playing catch w/tennis ball, dribbling, folding clothes (A, B, C, E, M, O, P, S) </p>
    

In [80]:
clean_combined_df=combined_df.copy()
clean_combined_df=clean_combined_df[['ac_x','timestamp','a-x','a-y','a-z','g-x','g-y','g-z']]
clean_combined_df.head()

Unnamed: 0,ac_x,timestamp,a-x,a-y,a-z,g-x,g-y,g-z
0,A,90426757696641,4.972757,-0.158317,6.6967316;,0.314944,-1.022277,-0.3099616;
1,A,90426807196641,3.25372,-0.191835,6.107758;,0.387382,-0.618541,-0.048971802;
2,A,90426856696641,2.801216,-0.155922,5.997625;,0.070999,-0.20948,-0.1959783;
3,A,90426906196641,3.770868,-1.051354,7.731027;,0.037975,0.254976,-0.1565635;
4,A,90426955696641,4.661511,0.169689,9.684695;,0.073129,0.719431,-0.0010349044;


In [81]:
#delete semicolon and convert to integer
clean_combined_df['a-z'] = clean_combined_df['a-z'].str.replace(';', '').astype(float)
clean_combined_df['g-z'] = clean_combined_df['g-z'].str.replace(';', '').astype(float)

In [82]:
cat_combined_df = clean_combined_df[clean_combined_df['ac_x'].str.contains('A|B|C|D|E|F|M|O|P|Q|S')]
cat_combined_df.head()

Unnamed: 0,ac_x,timestamp,a-x,a-y,a-z,g-x,g-y,g-z
0,A,90426757696641,4.972757,-0.158317,6.696732,0.314944,-1.022277,-0.309962
1,A,90426807196641,3.25372,-0.191835,6.107758,0.387382,-0.618541,-0.048972
2,A,90426856696641,2.801216,-0.155922,5.997625,0.070999,-0.20948,-0.195978
3,A,90426906196641,3.770868,-1.051354,7.731027,0.037975,0.254976,-0.156563
4,A,90426955696641,4.661511,0.169689,9.684695,0.073129,0.719431,-0.001035


In [83]:
label_combined_df=cat_combined_df.copy()
label_combined_df['cat label']=(label_combined_df['ac_x'].str.contains('A|B|C|E|M|O|P|S'))*1
print(label_combined_df['cat label'])

0          1
1          1
2          1
3          1
4          1
          ..
3370856    1
3370857    1
3370858    1
3370859    1
3370860    1
Name: cat label, Length: 2057809, dtype: int32


<h3>Data classification using desision trees</h3> 

In [84]:
y=label_combined_df[['cat label']].copy()

Use accelerator and gyroscope features to predict the category of activity

In [85]:
features=['a-x','a-y','a-z','g-x','g-y','g-z']

In [86]:
X = label_combined_df[features].copy()

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)

Training phase

In [88]:
activities_classifier = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)
activities_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

Testing phase

In [89]:
predictions = activities_classifier.predict(X_test)

Accuracy test

In [93]:
accuracy_score(y_true = y_test, y_pred = predictions)

0.9299460885878921

We can distinguish between sedimentary and active activities to a 93% accuracy