In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

### Import data

In [341]:
# read data

import os

root_folder_path = 'data'

# DataFrame for all the data
all_data = pd.DataFrame(columns=['X', 'Y', 'Z', 'target'])

# look for every folder in root folder
for folder in os.listdir(root_folder_path):
    folder_path = os.path.join(root_folder_path, folder)

    # look for every file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # read data from the file, skipping titles
        data = pd.read_csv(file_path, skiprows=0)
        # add columns 
        data['target'] = folder
        data.columns = ['X', 'Y', 'Z', 'target']

        #add data to the general DataFrame
        all_data = pd.concat([all_data, data], ignore_index=True)

  all_data = pd.concat([all_data, data], ignore_index=True)


In [342]:
all_data.head()

Unnamed: 0,X,Y,Z,target
0,1.000776,4.616021,8.576031,idle
1,0.718261,4.209007,8.446744,idle
2,-0.909797,-0.282516,9.203311,idle
3,5.09965,0.148441,8.418014,idle
4,1.762132,-0.162806,9.251195,idle


In [343]:
all_data.shape

(193860, 4)

### Data Normalization

In [346]:
# data normalization

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
all_data[['X', 'Y', 'Z']] = scaler.fit_transform(all_data[['X', 'Y', 'Z']])

### Split data into train dataset and test dataset

In [347]:
from sklearn.model_selection import train_test_split

X = all_data[['X', 'Y', 'Z']]
y = all_data['target']

train_indices, test_indices = train_test_split(np.arange(len(all_data) // 30), test_size=0.2, random_state=42)

X_train = pd.concat([X.iloc[i*30:(i+1)*30] for i in train_indices], ignore_index=True)
y_train = pd.concat([y.iloc[i*30:(i+1)*30] for i in train_indices], ignore_index=True)

X_test = pd.concat([X.iloc[i*30:(i+1)*30] for i in test_indices], ignore_index=True)
y_test = pd.concat([y.iloc[i*30:(i+1)*30] for i in test_indices], ignore_index=True)


In [348]:
X_train.shape, X_test.shape

((155070, 3), (38790, 3))

In [349]:
all_data = pd.concat((X_train, y_train), axis = 1)
all_data

Unnamed: 0,X,Y,Z,target
0,-0.900561,0.638839,-0.167744,running
1,0.657052,0.445369,-0.681104,running
2,3.184897,1.776248,0.061303,running
3,-0.519989,1.173571,-0.846897,running
4,0.270212,-1.690103,-0.597208,running
...,...,...,...,...
155065,-0.198097,-0.126982,1.108662,idle
155066,-0.196958,-0.131973,1.110660,idle
155067,-0.196958,-0.131589,1.107331,idle
155068,-0.196958,-0.133892,1.107996,idle


### Create new features for train dataset

In [350]:
mean_x = []
mean_y = []
mean_z = []
target = []
variance_x = []
variance_y = []
variance_z = []
std_x = []
std_y = []
std_z = []
root_mean_square_x = []
root_mean_square_y = []
root_mean_square_z = []
min_x = []
min_y = []
min_z = []
max_x = []
max_y = []
max_z = []
median_x = []
median_y = []
median_z = []

fft_power_x = []
fft_power_y = []
fft_power_z = []
fft_energy_x = []
fft_energy_y = []
fft_energy_z = []
fft_magnitude_x = []
fft_magnitude_y = []
fft_magnitude_z = []
fft_area_x = []
fft_area_y = []
fft_area_z = []
fft_max_amplitude_x = []
fft_max_amplitude_y = []
fft_max_amplitude_z = []
fft_min_amplitude_x = []
fft_min_amplitude_y = []
fft_min_amplitude_z = []
fft_min_index_x = []
fft_min_index_y = []
fft_min_index_z = []
fft_max_index_x = []
fft_max_index_y = []
fft_max_index_z = []
fft_entropy_x = []
fft_entropy_y = []
fft_entropy_z = []
fft_skewness_x = []
fft_skewness_y = []
fft_skewness_z = []
fft_kurtosis_x = []
fft_kurtosis_y = []
fft_kurtosis_z = []
fft_interquartile_range_x = []
fft_interquartile_range_y = []
fft_interquartile_range_z = []
fft_mean_absolute_deviation_x = []
fft_mean_absolute_deviation_y = []
fft_mean_absolute_deviation_z = []

In [351]:
from scipy.stats import entropy, skew, kurtosis

for column in all_data.columns:
    for i in range(0, len(all_data['X']), 30):

        segment = all_data[column].loc[i:i+30]

        if column == 'X':

            mean_x.append(segment.mean())
            min_x.append(segment.min())
            max_x.append(segment.min())
            median_x.append(np.median(segment))
            variance_x.append(np.var(segment))
            std_x.append(np.std(segment))
            root_mean_square_x.append(np.sqrt(np.mean(np.square(segment))))

            x_fft = np.fft.fft(segment, n=len(segment))

            fft_power_x.append((np.mean(np.abs(x_fft) ** 2) / (2 * np.pi)))
            fft_energy_x.append(np.sum(np.abs(x_fft) ** 2) / len(x_fft))

            magnitude_x = np.abs(x_fft)
            fft_magnitude_x.append(np.mean(magnitude_x))
            fft_area_x.append(np.trapz(magnitude_x))

            fft_max_amplitude_x.append(np.max(np.abs(x_fft)))
            fft_min_amplitude_x.append(np.min(np.abs(x_fft)))

            fft_max_index_x.append(np.argmax(np.abs(x_fft)))
            fft_min_index_x.append(np.argmin(np.abs(x_fft)))

            fft_entropy_x = entropy(np.abs(x_fft))
            fft_skewness_x = skew(np.abs(x_fft))
            fft_kurtosis_x = kurtosis(np.abs(x_fft))

            fft_interquartile_range_x = np.percentile(np.abs(x_fft), 75) - np.percentile(np.abs(x_fft), 25)
            fft_mean_absolute_deviation_x = np.mean(np.abs(np.abs(x_fft) - np.mean(np.abs(x_fft))))

        if column == 'Y':

            mean_y.append(segment.mean())
            min_y.append(segment.min())
            max_y.append(segment.min())
            median_y.append(np.median(segment))
            variance_y.append(np.var(segment))
            std_y.append(np.std(segment))
            root_mean_square_y.append(np.sqrt(np.mean(np.square(segment))))

            y_fft = np.fft.fft(segment, n=len(segment))

            fft_power_y.append((np.mean(np.abs(y_fft) ** 2) / (2 * np.pi)))
            fft_energy_y.append(np.sum(np.abs(y_fft) ** 2) / len(y_fft))

            magnitude_y = np.abs(y_fft)
            fft_magnitude_y.append(np.mean(magnitude_y))
            fft_area_y.append(np.trapz(magnitude_y))

            fft_max_amplitude_y.append(np.max(np.abs(y_fft)))
            fft_min_amplitude_y.append(np.min(np.abs(y_fft)))

            fft_max_index_y.append(np.argmax(np.abs(y_fft)))
            fft_min_index_y.append(np.argmin(np.abs(y_fft)))

            fft_entropy_y = entropy(np.abs(y_fft))
            fft_skewness_y = skew(np.abs(y_fft))
            fft_kurtosis_y = kurtosis(np.abs(y_fft))

            fft_interquartile_range_y = np.percentile(np.abs(y_fft), 75) - np.percentile(np.abs(y_fft), 25)
            fft_mean_absolute_deviation_y = np.mean(np.abs(np.abs(y_fft) - np.mean(np.abs(y_fft))))

        if column == 'Z':

            mean_z.append(segment.mean())
            min_z.append(segment.min())
            max_z.append(segment.min())
            median_z.append(np.median(segment))
            variance_z.append(np.var(segment))
            std_z.append(np.std(segment))
            root_mean_square_z.append(np.sqrt(np.mean(np.square(segment))))

            z_fft = np.fft.fft(segment, n=len(segment))

            fft_power_z.append((np.mean(np.abs(z_fft) ** 2) / (2 * np.pi)))
            fft_energy_z.append(np.sum(np.abs(z_fft) ** 2) / len(z_fft))

            magnitude_z = np.abs(z_fft)
            fft_magnitude_z.append(np.mean(magnitude_z))
            fft_area_z.append(np.trapz(magnitude_z))

            fft_max_amplitude_z.append(np.max(np.abs(z_fft)))
            fft_min_amplitude_z.append(np.min(np.abs(z_fft)))

            fft_max_index_z.append(np.argmax(np.abs(z_fft)))
            fft_min_index_z.append(np.argmin(np.abs(z_fft)))

            fft_entropy_z = entropy(np.abs(z_fft))
            fft_skewness_z = skew(np.abs(z_fft))
            fft_kurtosis_z = kurtosis(np.abs(z_fft))

            fft_interquartile_range_z = np.percentile(np.abs(z_fft), 75) - np.percentile(np.abs(z_fft), 25)
            fft_mean_absolute_deviation_z = np.mean(np.abs(np.abs(z_fft) - np.mean(np.abs(z_fft))))

        if column == 'target':
            
            target.append(all_data[column].loc[i])

In [352]:
data = {
    'target': target,
    'mean_x': mean_x,
    'mean_y': mean_y,
    'mean_z': mean_z,
    'variance_x': variance_x,
    'variance_y': variance_y,
    'variance_z': variance_z,
    'std_x': std_x,
    'std_y': std_y,
    'std_z': std_z,
    'root_mean_square_x': root_mean_square_x,
    'root_mean_square_y': root_mean_square_y,
    'root_mean_square_z': root_mean_square_z,
    'min_x': min_x,
    'min_y': min_y,
    'min_z': min_z,
    'max_x': max_x,
    'max_y': max_y,
    'max_z': max_z,
    'median_x': median_x,
    'median_y': median_y,
    'median_z': median_z,
    'fft_power_x': fft_power_x,
    'fft_power_y': fft_power_y,
    'fft_power_z': fft_power_z,
    'fft_energy_x': fft_energy_x,
    'fft_energy_y': fft_energy_y,
    'fft_energy_z': fft_energy_z,
    'fft_magnitude_x': fft_magnitude_x,
    'fft_magnitude_y': fft_magnitude_y,
    'fft_magnitude_z': fft_magnitude_z,
    'fft_area_x': fft_area_x,
    'fft_area_y': fft_area_y,
    'fft_area_z': fft_area_z,
    'fft_max_amplitude_x': fft_max_amplitude_x,
    'fft_max_amplitude_y': fft_max_amplitude_y,
    'fft_max_amplitude_z': fft_max_amplitude_z,
    'fft_min_amplitude_x': fft_min_amplitude_x,
    'fft_min_amplitude_y': fft_min_amplitude_y,
    'fft_min_amplitude_z': fft_min_amplitude_z,
    'fft_min_index_x': fft_min_index_x,
    'fft_min_index_y': fft_min_index_y,
    'fft_min_index_z': fft_min_index_z,
    'fft_max_index_x': fft_max_index_x,
    'fft_max_index_y': fft_max_index_y,
    'fft_max_index_z': fft_max_index_z,
    'fft_entropy_x': fft_entropy_x,
    'fft_entropy_y': fft_entropy_y,
    'fft_entropy_z': fft_entropy_z,
    'fft_skewness_x': fft_skewness_x,
    'fft_skewness_y': fft_skewness_y,
    'fft_skewness_z': fft_skewness_z,
    'fft_kurtosis_x': fft_kurtosis_x,
    'fft_kurtosis_y': fft_kurtosis_y,
    'fft_kurtosis_z': fft_kurtosis_z,
    'fft_interquartile_range_x': fft_interquartile_range_x,
    'fft_interquartile_range_y': fft_interquartile_range_y,
    'fft_interquartile_range_z': fft_interquartile_range_z,
    'fft_mean_absolute_deviation_x': fft_mean_absolute_deviation_x,
    'fft_mean_absolute_deviation_y': fft_mean_absolute_deviation_y,
    'fft_mean_absolute_deviation_z': fft_mean_absolute_deviation_z
}

In [353]:
df = pd.DataFrame(data)
df

Unnamed: 0,target,mean_x,mean_y,mean_z,variance_x,variance_y,variance_z,std_x,std_y,std_z,...,fft_skewness_z,fft_kurtosis_x,fft_kurtosis_y,fft_kurtosis_z,fft_interquartile_range_x,fft_interquartile_range_y,fft_interquartile_range_z,fft_mean_absolute_deviation_x,fft_mean_absolute_deviation_y,fft_mean_absolute_deviation_z
0,running,0.340361,0.533312,0.132526,1.224302,1.145717,2.170678,1.106482,1.070382,1.473322,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
1,running,1.268214,0.446273,-0.189889,2.816933,2.107026,0.495583,1.678372,1.451560,0.703976,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
2,running,0.325714,0.382042,0.039373,0.659337,1.113117,1.311203,0.811996,1.055044,1.145078,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
3,walking,-0.683130,-0.978073,-0.593428,0.212022,0.289191,0.676491,0.460459,0.537765,0.822491,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
4,stairs,0.071766,-1.003928,-0.465287,0.132404,0.314774,1.706477,0.363873,0.561047,1.306322,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,running,0.412274,0.436490,-0.023494,0.843319,1.088478,0.514717,0.918324,1.043302,0.717438,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
5165,walking,-0.454967,-0.858305,-0.444002,0.111146,0.158627,0.141397,0.333385,0.398281,0.376028,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
5166,walking,-0.174628,-1.028100,-0.433499,0.377781,0.405404,1.030450,0.614639,0.636714,1.015111,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642
5167,walking,-0.428356,-0.916913,-0.413545,0.201396,0.287543,1.586627,0.448772,0.536230,1.259614,...,5.199445,25.004007,25.014514,25.034322,0.029413,0.020257,0.017771,0.374815,0.245728,2.140642


In [354]:
all_data.shape, df.shape

((155070, 4), (5169, 61))

### Select better features

In [387]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train = df.iloc[:, 1:]
y_train = df['target']

info_classif_selector = SelectKBest(score_func=mutual_info_classif, k='all')
X_info_classif = info_classif_selector.fit_transform(X_train, y_train)
info_classif_scores = info_classif_selector.scores_

print("Mutual Information (InfoGain) scores:")
for column_number, score in enumerate(info_classif_scores):
    print(f"Feature {column_number}: {score}")

Mutual Information (InfoGain) scores:
Feature 0: 0.6926372552430495
Feature 1: 0.877487199295405
Feature 2: 0.6764534183230491
Feature 3: 0.827060250177643
Feature 4: 0.8438639862913042
Feature 5: 0.4226352091057477
Feature 6: 0.8269745271299043
Feature 7: 0.84387172429162
Feature 8: 0.4226454054832067
Feature 9: 0.7374700306301183
Feature 10: 0.5119978531126965
Feature 11: 0.1481020180966921
Feature 12: 0.7994691349567427
Feature 13: 0.8305112267793269
Feature 14: 0.9052686699617867
Feature 15: 0.8021128257167931
Feature 16: 0.8291485659768947
Feature 17: 0.9044728783777396
Feature 18: 0.8162975855967645
Feature 19: 0.9324163368893756
Feature 20: 0.7303713543643129
Feature 21: 0.7374862797639505
Feature 22: 0.5119012764600086
Feature 23: 0.14798891774936274
Feature 24: 0.7374862797639505
Feature 25: 0.5119012764600086
Feature 26: 0.14798891774936274
Feature 27: 0.8052496189124834
Feature 28: 0.7455394663362125
Feature 29: 0.3561578779379364
Feature 30: 0.8181362423580718
Feature 31: 0

In [388]:
top_features = info_classif_scores.argsort()[::-1]
features_indexed = []
for column_number, index in enumerate(top_features):
    if info_classif_scores[index] > 0.80:
        features_indexed.append(index)
        #print(f"Feature {index} {X_train.columns[index]}: Importance {info_classif_scores[index]}")

print(X_train.columns[features_indexed])

Index(['median_y', 'min_z', 'max_z', 'mean_y', 'std_y', 'variance_y',
       'fft_max_amplitude_y', 'min_y', 'max_y', 'variance_x', 'std_x',
       'fft_area_x', 'median_x', 'fft_magnitude_x', 'max_x'],
      dtype='object')


In [389]:
features_list = X_train.columns[features_indexed]
X_train = X_train[features_list]
X_train.shape, y_train.shape

((5169, 15), (5169,))

### Create features for test dataset

In [390]:
X_train.columns

Index(['median_y', 'min_z', 'max_z', 'mean_y', 'std_y', 'variance_y',
       'fft_max_amplitude_y', 'min_y', 'max_y', 'variance_x', 'std_x',
       'fft_area_x', 'median_x', 'fft_magnitude_x', 'max_x'],
      dtype='object')

In [391]:
max_x = []
max_y = []
max_z = []
#min_x = []
min_y = []
min_z = []
std_x = []
std_y = []
mean_y = []
median_x = []
median_y = []
variance_x = []
variance_y = []
fft_magnitude_x = []
fft_max_amplitude_y = []
fft_area_x = []
target = []

In [392]:
test_data = pd.concat((X_test, y_test), axis=1)
test_data.head()

Unnamed: 0,X,Y,Z,target
0,-0.763828,-0.959213,-0.036575,running
1,-0.173029,0.58932,0.689852,running
2,-3.055809,2.246873,-1.679191,running
3,-1.010517,-0.775339,-0.389468,running
4,-0.480678,1.335948,-0.009276,running


In [393]:
for column in test_data.columns:
    for i in range(0, len(test_data['target']), 30):

        segment = test_data[column].loc[i:i+30]

        if column == 'X':

            #min_x.append(segment.min())
            max_x.append(segment.min())
            std_x.append(np.std(segment))
            median_x.append(np.median(segment))
            variance_x.append(np.var(segment))

            x_fft = np.fft.fft(segment, n=len(segment))

            magnitude_x = np.abs(x_fft)
            fft_magnitude_x.append(np.mean(magnitude_x))
            fft_area_x.append(np.trapz(magnitude_x))

        if column == 'Y':

            mean_y.append(segment.mean())
            min_y.append(segment.min())
            max_y.append(segment.min())
            median_y.append(np.median(segment))
            variance_y.append(np.var(segment))
            std_y.append(np.std(segment))

            y_fft = np.fft.fft(segment, n=len(segment))

            fft_max_amplitude_y.append(np.max(np.abs(y_fft)))

        if column == 'Z':

            min_z.append(segment.min())
            max_z.append(segment.min())

        if column == 'target':
            
            target.append(test_data[column].loc[i])

In [394]:
data_test = {
    'target': target,
    'max_x': max_x,
    'max_y': max_y,
    'max_z': max_z,
    #'min_x': min_x,
    'min_y': min_y,
    'min_z': min_z,
    'std_x': std_x,
    'std_y': std_y,
    'mean_y': mean_y,
    'median_x': median_x,
    'median_y': median_y,
    'variance_x': variance_x,
    'variance_y': variance_y,
    'fft_magnitude_x': fft_magnitude_x,
    'fft_max_amplitude_y': fft_max_amplitude_y,
    'fft_area_x': fft_area_x
}

df_test = pd.DataFrame(data_test)

In [395]:
df_test.columns

Index(['target', 'max_x', 'max_y', 'max_z', 'min_y', 'min_z', 'std_x', 'std_y',
       'mean_y', 'median_x', 'median_y', 'variance_x', 'variance_y',
       'fft_magnitude_x', 'fft_max_amplitude_y', 'fft_area_x'],
      dtype='object')

In [396]:
X_test = df_test.iloc[:, 1:]
y_test = df_test['target']

In [401]:
X_test = X_test.sort_index(axis=1)
X_train = X_train.sort_index(axis=1)

In [402]:
X_test.shape, X_train.shape, y_test.shape, y_train.shape

((1293, 15), (5169, 15), (1293,), (5169,))

### Support Vector Machine (SVM)

In [403]:
from sklearn.svm import SVC

In [404]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_prediction = svm_classifier.predict(X_test)

### Random Forest

In [406]:
from sklearn.ensemble import RandomForestClassifier

In [407]:
random_forest_classifier = RandomForestClassifier()
random_forest_classifier.fit(X_train, y_train)
random_forest_prediction = random_forest_classifier.predict(X_test)

### Classification Report

In [411]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [419]:
print('SVM')
print(confusion_matrix(y_test, svm_prediction))
print('Random Forest')
print(confusion_matrix(y_test, random_forest_prediction))

SVM
[[217   3   0   0]
 [  3 683   0   3]
 [  0   0   0  30]
 [  0  10   0 344]]
Random Forest
[[220   0   0   0]
 [  0 689   0   0]
 [  0   0  25   5]
 [  0   0   2 352]]


In [410]:
print('SVM')
print(classification_report(y_test, svm_prediction, zero_division=1))
print('Random Forest')
print(classification_report(y_test, random_forest_prediction, zero_division=1))

SVM
              precision    recall  f1-score   support

        idle       0.99      0.99      0.99       220
     running       0.98      0.99      0.99       689
      stairs       1.00      0.00      0.00        30
     walking       0.91      0.97      0.94       354

    accuracy                           0.96      1293
   macro avg       0.97      0.74      0.73      1293
weighted avg       0.96      0.96      0.95      1293

Random Forest
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       220
     running       1.00      1.00      1.00       689
      stairs       0.93      0.83      0.88        30
     walking       0.99      0.99      0.99       354

    accuracy                           0.99      1293
   macro avg       0.98      0.96      0.97      1293
weighted avg       0.99      0.99      0.99      1293

