In [1]:
import pandas as pd
import numpy as np
import tsfel
import util
import zipfile

In [2]:
# Load the dataset from online repository
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip

# Unzip the dataset
zip_ref = zipfile.ZipFile("UCI HAR Dataset.zip", 'r')
zip_ref.extractall()
zip_ref.close()

# Store the dataset as a Pandas dataframe.
x_train_sig = np.loadtxt('UCI HAR Dataset/train/Inertial Signals/total_acc_x_train.txt', dtype='float32')
X_train_sig = pd.DataFrame(np.hstack(x_train_sig), columns=["total_acc_x"])

--2022-01-12 08:50:45--  https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60999314 (58M) [application/x-httpd-php]
Saving to: ‘UCI HAR Dataset.zip.1’


2022-01-12 08:50:49 (19.5 MB/s) - ‘UCI HAR Dataset.zip.1’ saved [60999314/60999314]



In [3]:
# Read the per-subject metadata table from a CSV file under the local "input" directory.
subject_info = pd.read_csv("input/data_subjects_info.csv")

In [4]:
# Show the loaded subject table (the cell's last expression is rendered as its output).
subject_info

Unnamed: 0,code,weight,height,age,gender
0,1,102,188,46,1
1,2,72,180,28,1
2,3,48,161,28,0
3,4,90,176,31,1
4,5,48,164,23,0
5,6,76,180,28,1
6,7,62,175,30,0
7,8,52,161,24,0
8,9,93,190,32,1
9,10,72,164,31,0


In [5]:
# Activity short codes, and for each activity the trial numbers associated with it.
ACT_LABELS = ["dws", "ups", "wlk", "jog", "std", "sit"]
TRIAL_CODES = {
    "dws": [1, 2, 11],
    "ups": [3, 4, 12],
    "wlk": [7, 8, 15],
    "jog": [9, 16],
    "std": [6, 14],
    "sit": [5, 13],
}

# Parameters for building a labelled time series from the "(A)DeviceMotion_data" dataset.
# Available sensor groups: attitude (roll, pitch, yaw); gravity (x, y, z);
# rotationRate (x, y, z); userAcceleration (x, y, z).
sdt = ["attitude", "userAcceleration"]
print(f"[INFO] -- Selected sensor data types: {sdt}")

# Keep only the first four activities of ACT_LABELS.
act_labels = ACT_LABELS[:4]
print(f"[INFO] -- Selected activites: {act_labels}")

# One list of trial numbers per selected activity, in the same order as act_labels.
trial_codes = [TRIAL_CODES[label] for label in act_labels]

# Column names per sensor group, then the assembled dataset (both built by the local util module).
dt_list = util.set_data_types(sdt)
dataset = util.creat_time_series(
    dt_list,
    act_labels,
    trial_codes,
    mode="raw",
    labeled=True,
)
print(f"[INFO] -- Shape of time-Series dataset:{dataset.shape}")
dataset.head()

[INFO] -- Selected sensor data types: ['attitude', 'userAcceleration']
[INFO] -- Selected activites: ['dws', 'ups', 'wlk', 'jog']
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series
[INFO] -- Shape of time-Series dataset:(767660, 13)


Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,id,weight,height,age,gender,trial
0,1.528132,-0.733896,0.696372,0.294894,-0.184493,0.377542,0.0,0.0,102.0,188.0,46.0,1.0,1.0
1,1.527992,-0.716987,0.677762,0.219405,0.035846,0.114866,0.0,0.0,102.0,188.0,46.0,1.0,1.0
2,1.527765,-0.706999,0.670951,0.010714,0.134701,-0.167808,0.0,0.0,102.0,188.0,46.0,1.0,1.0
3,1.516768,-0.704678,0.675735,-0.008389,0.136788,0.094958,0.0,0.0,102.0,188.0,46.0,1.0,1.0
4,1.493941,-0.703918,0.672994,0.199441,0.353996,-0.044299,0.0,0.0,102.0,188.0,46.0,1.0,1.0


In [6]:
# Show the column-name groups returned by util.set_data_types for the selected sensors.
dt_list

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'],
 ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]

In [7]:
# Take the activity column of the dataset as integers (it is stored as float there).
labels = dataset['act'].astype(int)
# Display the resulting Series.
labels

0         0
1         0
2         0
3         0
4         0
         ..
767655    3
767656    3
767657    3
767658    3
767659    3
Name: act, Length: 767660, dtype: int64

In [8]:
# `df` is a second name for the same object as `dataset` (no copy is made),
# so the casts below also change `dataset`.
df = dataset

# Identifier-like columns are cast to plain integers, one column at a time.
for int_col in ("id", "age", "act", "trial", "gender"):
    df[int_col] = df[int_col].astype(int)

In [9]:
# Display the frame after the integer casts applied in the previous cell.
df

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,id,weight,height,age,gender,trial
0,1.528132,-0.733896,0.696372,0.294894,-0.184493,0.377542,0,0,102.0,188.0,46,1,1
1,1.527992,-0.716987,0.677762,0.219405,0.035846,0.114866,0,0,102.0,188.0,46,1,1
2,1.527765,-0.706999,0.670951,0.010714,0.134701,-0.167808,0,0,102.0,188.0,46,1,1
3,1.516768,-0.704678,0.675735,-0.008389,0.136788,0.094958,0,0,102.0,188.0,46,1,1
4,1.493941,-0.703918,0.672994,0.199441,0.353996,-0.044299,0,0,102.0,188.0,46,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
767655,-1.553643,-1.207026,-2.015848,-0.090358,0.364632,1.418838,3,23,74.0,173.0,18,0,16
767656,-1.533934,-1.243476,-2.010537,0.775085,-0.704872,-1.384102,3,23,74.0,173.0,18,0,16
767657,-1.301587,-1.362634,-1.783378,0.862655,0.054028,-1.188137,3,23,74.0,173.0,18,0,16
767658,-0.524913,-1.486641,-0.952944,0.660700,0.977416,-1.382904,3,23,74.0,173.0,18,0,16


In [48]:
# Rows of one recording (id 0, trial 1, act 0), with the three selector
# columns removed so only the remaining columns are kept.
df_sub_0 = (
    df.loc[(df["id"] == 0) & (df["trial"] == 1) & (df["act"] == 0)]
    .drop(columns=["id", "act", "trial"])
)

In [50]:
# Load the TSFEL feature configuration from the local "features.json" file.
cfg = tsfel.get_features_by_domain(json_path="features.json")

# Extract features
# NOTE(review): df_sub_0 still carries the subject columns (weight, height, age,
# gender), which are constant within one recording, so features are computed for
# them too -- confirm whether they should be dropped first.
# Presumably fs is the sampling rate in Hz and window_size a sample count -- TODO confirm.
X = tsfel.time_series_features_extractor(cfg, df_sub_0, fs=50, window_size=7)

*** Feature extraction started ***



*** Feature extraction finished ***


In [51]:
# Display the extracted feature table.
X

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,0_ECDF_1,...,9_Wavelet variance_0,9_Wavelet variance_1,9_Wavelet variance_2,9_Wavelet variance_3,9_Wavelet variance_4,9_Wavelet variance_5,9_Wavelet variance_6,9_Wavelet variance_7,9_Wavelet variance_8,9_Zero crossing rate
0,38.612340,0.281982,38.612340,0.059135,1.0,5.0,-2.384464,-2.340799,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.304400,0.277439,37.304400,0.061019,1.0,5.0,-2.448825,-2.288794,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28.220228,0.239270,28.220228,0.047715,1.0,5.0,-2.370140,-1.881167,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.453427,0.108571,6.453427,0.052676,1.0,5.0,-1.237540,-0.802936,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,35.784325,0.272415,35.784325,0.068953,1.0,5.0,-2.498675,-2.225172,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,51.863856,0.323605,51.863856,0.055296,1.0,5.0,-3.051735,-2.615604,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
502,53.976588,0.331502,53.976588,0.065286,1.0,5.0,-3.091725,-2.606126,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
503,37.925732,0.265916,37.925732,0.065309,1.0,5.0,-2.829900,-2.091278,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
504,62.645446,0.359298,62.645446,0.059340,1.0,5.0,-3.052120,-2.981562,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# List the feature column names produced by the extractor.
X.columns

Index(['0_Absolute energy', '0_Area under the curve', '0_Autocorrelation',
       '0_Centroid', '0_ECDF Percentile Count_0', '0_ECDF Percentile Count_1',
       '0_ECDF Percentile_0', '0_ECDF Percentile_1', '0_ECDF_0', '0_ECDF_1',
       ...
       '9_Wavelet variance_0', '9_Wavelet variance_1', '9_Wavelet variance_2',
       '9_Wavelet variance_3', '9_Wavelet variance_4', '9_Wavelet variance_5',
       '9_Wavelet variance_6', '9_Wavelet variance_7', '9_Wavelet variance_8',
       '9_Zero crossing rate'],
      dtype='object', length=1331)

In [37]:
# Number of feature columns whose standard deviation exceeds 0.5.
(X.std() > 0.5).sum()

242

In [53]:
def get_features(df, fs=50, window_size=7):
    """Extract TSFEL features from one recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples of a single recording, e.g. one group of
        ``groupby(['id', 'act', 'trial'])``.
    fs : int, optional
        Forwarded to the extractor as ``fs``. Defaults to 50, the value
        previously hard-coded here.
    window_size : int, optional
        Forwarded to the extractor as ``window_size``. Defaults to 7, the
        value previously hard-coded here.

    Returns
    -------
    The object returned by ``tsfel.time_series_features_extractor`` for
    this recording.
    """
    cfg = tsfel.get_features_by_domain(json_path="features.json")

    # Use the frame that was passed in. The previous version read the global
    # `df_sub_0`, so every group produced the features of that one recording.
    # The grouping keys are dropped the same way `df_sub_0` was built;
    # errors="ignore" keeps this working when the keys are not in the frame.
    signals = df.drop(columns=["id", "act", "trial"], errors="ignore")

    # Extract features
    X = tsfel.time_series_features_extractor(
        cfg, signals, fs=fs, window_size=window_size, verbose=0
    )
    return X

In [55]:
# Call get_features once per (id, act, trial) group of df and combine the per-group results.
X_features = df.groupby(['id', 'act', 'trial']).apply(get_features)

*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***
