In [48]:
# This is the template for the submission. If you want, you can develop your algorithm in a regular Python script and copy the code here for submission.

# Team members (e-mail, legi):
# zhisun@ethz.ch, 22-958-227
# enjcao@ethz.ch, 22-942-700
# yifzhou@ethz.ch, 22-940-381

In [None]:
%pip install xgboost 

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import entropy
from scipy.signal import welch
from scipy.fftpack import fft

from Lilygo.Recording import Recording, data_integrity
from Lilygo.Dataset import Dataset

import joblib
import math

import xgboost as xgb
from xgboost import XGBClassifier

## Filtering and Feature Extraction

In [7]:
# data pre-processing
# This function aims to find the component caused by gravity from data, which means the signal around 0 Hz
def get_gravity(data):
    filtered_data = np.zeros_like(data)
    # Parameters in IIR filter
    alpla = [1, -1.979133761292768, 0.979521463540373]
    beta = [0.000086384997973502, 0.00012769995947004, 0.000086384997973502]
    # Formula of IIR filter
    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

def get_highpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.905384612118461, 0.910092542787947]
    beta = [0.953986986993339, -1.907503180919730, 0.953986986993339]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

# This funciton aims to realize a high-pass filter with cutoff frequency = 5 Hz. Because according to massive amounts of data, the general 
# maximum frequency of human walking is about 5 Hz
def get_lowpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.80898117793047, 0.827224480562408]
    beta = [0.096665967120306, -0.172688631608676, 0.095465967120306]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

def pre_process(data):
    # Find the component caused by gravity from data and remove it from the singanl
    data_gravity = get_gravity(data)
    data_user = data - data_gravity
    # Get user's acceleration along the gravity direction by dot product
    data_acc = data_user * data_gravity
    # Add low pass and high pass filter to reduce noise in signal (possible human walking rate:1 - 5Hz)
    data_filtered = get_highpass(data_acc)
    data_filtered = get_lowpass(data_filtered)
    return data_filtered

def preprocess_and_extract_features(trace, window_size=15, sampling_rate=200):
    """
    Preprocess the data and extract features from the 3D accelerometer, gyroscope, and magnetometer data.

    Args:
    trace (Lilygo.Recording.Recording): Object containing the raw data with accelerometer data stored in lists (e.g. trace.data['ax'])
    window_size (int): The window size in seconds for splitting the data
    sampling_rate (int): The sampling rate of the data in Hz

    Returns:
    pd.DataFrame: A DataFrame with the extracted features and location labels
    """
    # Read data from trace
    ax = pre_process(trace.data['ax'].values)
    ay = pre_process(trace.data['ay'].values)
    az = pre_process(trace.data['az'].values)
    
    gx = pre_process(trace.data['gx'].values)
    gy = pre_process(trace.data['gy'].values)
    gz = pre_process(trace.data['gz'].values)
    
    '''mx = trace.data['mx'].values
    my = trace.data['my'].values
    mz = trace.data['mz'].values'''
    
    # Compute the length of each window in samples
    window_samples = window_size * sampling_rate
    
    # Compute the number of windows in the recording
    num_windows = len(ax) // window_samples

    # Initialize lists for storing extracted features and location labels
    features = []
    loc_labels = []

    # Helper function to compute the magnitude of a vector
    magnitude = lambda vec: np.sqrt(np.sum(vec**2, axis=1))

    for i in range(num_windows):
       
        # Extract the accelerometer, gyroscope, and magnetometer data for the current window
        acc_data = np.array([ax[i*window_samples:(i+1)*window_samples],
                             ay[i*window_samples:(i+1)*window_samples],
                             az[i*window_samples:(i+1)*window_samples]]).T
        gyro_data = np.array([gx[i*window_samples:(i+1)*window_samples],
                              gy[i*window_samples:(i+1)*window_samples],
                              gz[i*window_samples:(i+1)*window_samples]]).T
        '''mag_data = np.array([mx[i*window_samples:(i+1)*window_samples],
                             my[i*window_samples:(i+1)*window_samples],
                             mz[i*window_samples:(i+1)*window_samples]]).T'''
        
        '''figure,ax = plt.subplots(3, 1, figsize=(10, 6))
        ax[0].plot(acc_data[:,0])
        ax[0].set_ylabel('ax')
        ax[1].plot(acc_data[:,1])
        ax[1].set_ylabel('ay')
        ax[2].plot(acc_data[:,2])
        ax[2].set_ylabel('az')'''

        # Compute magnitudes
        acc_magnitude = np.sqrt(np.sum(acc_data**2, axis=1))
        acc_mag_mean = abs(np.mean(acc_magnitude))
        acc_mag_std = np.std(acc_magnitude)
        #gyro_magnitude = magnitude(gyro_data)
        #mag_magnitude = magnitude(mag_data)


        # ----ACCELERATOR TIME DOMAIN----
        ax_mean = abs(np.mean(acc_data[:,0]))
        ay_mean = abs(np.mean(acc_data[:,1]))
        az_mean = abs(np.mean(acc_data[:,2]))
        a_mean_list = [ax_mean, ay_mean, az_mean]
        a_mean_list.sort() # Sorting list of numbers in ascending
        Am = a_mean_list[2] # Feature Am: the maximum mean among all dimensions (represents motion range for location)
        Bm = a_mean_list[2]/a_mean_list[1] # Feature Bm and Cm: ratio of the maximum mean in different axes (represents DoF in movement for location)
        Cm = a_mean_list[2]/a_mean_list[0] 

        ax_range = acc_data[np.argmax(acc_data[:,0]), 0] - acc_data[np.argmin(acc_data[:,0]), 0]
        ay_range = acc_data[np.argmax(acc_data[:,1]), 1] - acc_data[np.argmin(acc_data[:,1]), 1]
        az_range = acc_data[np.argmax(acc_data[:,2]), 2] - acc_data[np.argmin(acc_data[:,2]), 2]
        a_range_list = [ax_range, ay_range, az_range]
        a_range_list.sort() # Sorting list of numbers in ascending
        A = a_range_list[2] # Feature A: the maximum range among all dimensions (represents motion range for location)
        B = a_range_list[2]/a_range_list[1] # Feature B and C: ratio of the maximum ranges in different axes (represents DoF in movement for location)
        C = a_range_list[2]/a_range_list[0] 
        
        
        # ----GYROSCOPE TIME DOMAIN----
        gx_mean = abs(np.mean(gyro_data[:,0]))
        gy_mean = abs(np.mean(gyro_data[:,1]))
        gz_mean = abs(np.mean(gyro_data[:,2]))
        g_mean_list = [gx_mean, gy_mean, gz_mean]
        g_mean_list.sort() # Sorting list of numbers in ascending
        Gm = g_mean_list[2] 
        Hm = g_mean_list[2]/g_mean_list[1] 
        Im = g_mean_list[2]/g_mean_list[0] 

        gx_range = gyro_data[np.argmax(gyro_data[:,0]), 0] - gyro_data[np.argmin(gyro_data[:,0]), 0]
        gy_range = gyro_data[np.argmax(gyro_data[:,1]), 1] - gyro_data[np.argmin(gyro_data[:,1]), 1]
        gz_range = gyro_data[np.argmax(gyro_data[:,2]), 2] - gyro_data[np.argmin(gyro_data[:,2]), 2]
        g_range_list = [gx_range, gy_range, gz_range]
        g_range_list.sort() # Sorting list of numbers in ascending
        G = g_range_list[2] 
        H = g_range_list[2]/g_range_list[1] 
        I = g_range_list[2]/g_range_list[0]


        # ----ACCELERATOR FREQUENCY DOMAIN----
        freq, Pxx = welch(acc_magnitude, fs=sampling_rate) # use of the fast Fourier transform for the estimation of power spectra
        freq_band = np.logical_and(freq >= 0.3, freq <= 15) 
        power_in_band = Pxx[freq_band] 
        freq_in_band = freq[freq_band] 
        #plt.plot(freq,Pxx)

        # D and F reflects impact of strides on acceleration
        D = np.max(power_in_band) # Feature D: the maximum energy captured by the accelerator, 
        total_power = np.sum(power_in_band) # Feature F: total power in the frequencies between 0.3 and 15 Hz:

        norm_Pxx = Pxx / total_power # normalize the power spectrum
        E = entropy(norm_Pxx) # Feature E: normalized information entropy of the discrete FFT component magnitudes

        

        sorted_idx = np.argsort(power_in_band)[::-1] 
        first_freq = freq_in_band[sorted_idx[0]] 
        second_freq = freq_in_band[sorted_idx[1]] 
        first_power = power_in_band[sorted_idx[0]] 
        second_power = power_in_band[sorted_idx[1]]

        R1 = np.sum(power_in_band[freq_in_band  < 3]) / total_power
        R3 = np.sum(Pxx[(freq >= 1.5) & (freq <= 2.5)]) / total_power 

        

        # ----MOVING VS: STANDING----
        moving = False
        # Append the features to the list
        if acc_mag_mean > 0.1 and total_power >0.0001:
            moving = True
        
        # Calculate the timestamp for the current window as the median of the timestamps (not necessary for location)
        # timestamp = np.median(trace.timestamp[i*window_samples:(i+1)*window_samples]) 
        try:
            # Determine the location label for the current window based on the timestamp
            loc_label = trace.labels.get('board_loc')
            # Append the label to the labels list
            loc_labels.append(loc_label)
        except Exception as error:
            #print("!-This might be testing trace and does not have labels. Error: ",error)
            pass
        features.append([moving, A, B, C, Am,Bm,Cm, acc_mag_mean , 
                         D, E, total_power, first_freq, first_power,
                         G,H,I,Gm,Hm,Im,acc_mag_std])

    # Create a DataFrame with the extracted features and location labels
    features_df = pd.DataFrame(features, columns=['moving','A', 'B', 'C','Am', 'Bm', 'Cm','acc_mag' ,
                                                  'D', 'E', 'total_power', 'first_freq', 'first_power',
                                                  'G','H','I','Gm','Hm','Im','acc_std'])
    features_df['loc_label'] = loc_labels
    #print(features_df)

    return features_df

In [53]:
# path_training = '/kaggle/input/mobile-health-2023-path-detection/data/train'
path_training = "./data/train/"
traceNames=os.listdir(path_training)

'''# Get features of training traces
X = []
y = []
for traceName in traceNames[3:20]:
    if traceName[-5:] == '.json':
        trace = Recording(path_training+'/'+ traceName, no_labels=False, mute=True)

        figure, ax = plt.subplots(6,1,figsize=(60, 5))
        figure.suptitle(str(trace.labels))
        ax[0].plot(get_lowpass(get_highpass(trace.data['ax'].values)))
        ax[0].plot(get_lowpass(get_highpass(trace.data['ay'].values)))
        ax[0].plot(get_lowpass(get_highpass(trace.data['az'].values)))
        
        
        # Get features of data
        features_df = preprocess_and_extract_features(trace)
        ax[1].plot(features_df['acc_mag'])
        ax[1].fill_between(np.arange(len(features_df['acc_mag'])), 0, 
                           features_df['acc_mag'], 
                           where=features_df['moving']==True, 
                           color='red', alpha=0.3)

        ax[2].plot(features_df['A'])
        ax[2].plot(features_df['B'])
        ax[2].plot(features_df['C'])
        ax[2].legend(['A: Acc-Max_mag','B: Ratio','C: Ratio'])

        ax[3].plot(features_df['D'])
        #ax[3].plot(features_df['E']) # Entropy makes no difference for location
        ax[3].plot(features_df['F'])
        ax[3].legend(['D: Max_energy','F: Overall energy'])

        ax[4].plot(features_df['G'])
        ax[4].plot(features_df['H'])
        ax[4].plot(features_df['I'])
        ax[4].legend(['G: Gyro-Max_mag','B: Ratio','C: Ratio'])

        ax[5].plot(features_df['acc_std'])'''

"# Get features of training traces\nX = []\ny = []\nfor traceName in traceNames[3:20]:\n    if traceName[-5:] == '.json':\n        trace = Recording(path_training+'/'+ traceName, no_labels=False, mute=True)\n\n        figure, ax = plt.subplots(6,1,figsize=(60, 5))\n        figure.suptitle(str(trace.labels))\n        ax[0].plot(get_lowpass(get_highpass(trace.data['ax'].values)))\n        ax[0].plot(get_lowpass(get_highpass(trace.data['ay'].values)))\n        ax[0].plot(get_lowpass(get_highpass(trace.data['az'].values)))\n        \n        \n        # Get features of data\n        features_df = preprocess_and_extract_features(trace)\n        ax[1].plot(features_df['acc_mag'])\n        ax[1].fill_between(np.arange(len(features_df['acc_mag'])), 0, \n                           features_df['acc_mag'], \n                           where=features_df['moving']==True, \n                           color='red', alpha=0.3)\n\n        ax[2].plot(features_df['A'])\n        ax[2].plot(features_df['B']

## Load training traces and train

In [8]:
# path_training = '/kaggle/input/mobile-health-2023-path-detection/data/train'
# path_training = "./data/train/"
dir_data = 'E:\\Sunzhichao\\ETHz\\2223Spring\\Mobile_Health\\data\\'
path_training = dir_data + 'train\\'
traceNames=os.listdir(path_training)

# Get features of training traces
X = pd.DataFrame()
y = pd.DataFrame()
for traceName in traceNames[0:220]:
    if traceName[-5:] == '.json':
        trace = Recording(path_training+'/'+ traceName, no_labels=False, mute=True)

        features_df = preprocess_and_extract_features(trace)
        features_df = features_df[features_df['moving']] # Keep only moving windows
        
        # Prepare data for classification
        # X_trace = features_df.drop('loc_label', axis=1)
        X_trace = features_df.drop(['loc_label', 'Gm', 'Hm', 'Im'], axis=1)
        y_trace = features_df['loc_label']
        X = pd.concat([X, X_trace], axis=0).reset_index(drop=True)
        y = pd.concat([y, y_trace], axis=0).reset_index(drop=True)
        
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Using features of all training traces as training set
X_train = X_scaled
y_train = y
#X_train, _, y_train, _ = train_test_split(X_scaled, y, test_size=0.01, random_state=42)

# Check the number of classes in y_train
'''if len(set(y_train)) < 2:
    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))'''
  

'if len(set(y_train)) < 2:\n    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))'

### Train XGBoost with determined hyperparameters

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.3, shuffle = True)

# Create the XGBoost DMatrix object
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set the parameters for the XGBoost model
params = {'eta': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'num_class': 3, 'objective': 'multi:softmax'}

# Train the model
xgb_model = xgb.train(params, dtrain)
print("Training XGBoost classifier for activity")

# Create the XGBoost DMatrix object for the test data
dtest = xgb.DMatrix(X_test)
y_pred = xgb_model.predict(dtest)

print("accuracy: ", accuracy_score(y_test, y_pred))

# Save the trained xgboost model
joblib.dump(xgb_model, './trained_models/location_xgboost_model.joblib')


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Training XGBoost classifier for activity
accuracy:  0.9548872180451128


['./trained_models/location_xgboost_model.joblib']

### Train SVM

In [None]:
# Train the SVM classifier with RBF kernel
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
grid.fit(X_train, np.ravel(y_train))

# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid.best_params_)
print("Best Accuracy Score: ", grid.best_score_)

# Save the trained SVM model
joblib.dump(grid.best_estimator_, './trained_models/location_svm_model.joblib')#
# joblib.dump(grid, './trained_models/location_svm_model.joblib')

In [55]:
model = 'xgBoost'

if model == 'SVM':
    # Train the SVM classifier with RBF kernel
    param_grid = {
        'C': [0.1, 1, 10, 100], 
        'gamma': [1, 0.1, 0.01, 0.001]
        }
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
    grid.fit(X_train, np.ravel(y_train))
    
    # Save the trained SVM model
    joblib.dump(grid.best_estimator_, './trained_models/location_svm_model.joblib')#
    # joblib.dump(grid, './trained_models/location_svm_model.joblib')
    
elif model == 'xgBoost':
    # Define the XGBoost DMatrix object
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Define the hyperparameters to be tuned
    params = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.05, 0.01],
        'eta': [0.3, 0.2, 0.1, 0.05, 0.01],
        'n_estimators': [50, 100, 500, 1000],
        'objective': ['multi:softmax'],
        'num_class': [3], #Best Hyperparameters:  {'eta': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'num_class': 3, 'objective': 'multi:softmax'} Best Accuracy Score:  0.916336131918875
    }
    '''params = {
        'max_depth': [3, 7],
        'learning_rate': [0.1, 0.01],
        'eta': [ 0.2, 0.1],
        'n_estimators': [50, 100],
        'objective': ['multi:softmax'],
        'num_class': [3],
    } ''' #Best Hyperparameters:  {'eta': 0.2, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'num_class': 3, 'objective': 'multi:softmax'} Best Accuracy Score:  0.9072221049713012

    # Define the XGBoost classifier
    xgb_classifier = xgb.XGBClassifier()

    # Define the GridSearchCV object
    grid = GridSearchCV(estimator = xgb_classifier, param_grid = params, cv=5, n_jobs=-1, scoring='accuracy')

    # Fit the GridSearchCV object to the training data
    grid.fit(X_train, y_train)

    
    # Save the trained xgboost model
    joblib.dump(grid, './trained_models/location_xgboost_model.joblib')


# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid.best_params_)
print("Best Accuracy Score: ", grid.best_score_)

Best Hyperparameters:  {'eta': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'num_class': 3, 'objective': 'multi:softmax'}
Best Accuracy Score:  0.916336131918875


## Predict on testing traces with trained model and Evaluation predictions

In [54]:

plot_wrong_prediton = False
model = 'xgBoost'

if model == 'SVM':
    loaded_model = joblib.load('./trained_models/location_svm_model.joblib')
elif model == 'xgBoost':
    loaded_model = joblib.load('./trained_models/location_xgboost_model.joblib')



y_labels = [] # labels of all testing traces
y_overalls = [] # final overall predictions for all testing traces

for traceName in traceNames[221:-1]:
    if traceName[-5:] == '.json':
        # Load trace and extract features
        trace = Recording(path_training+'/'+ traceName, no_labels=False, mute=True)
        features_df = preprocess_and_extract_features(trace)
        features_df = features_df[features_df['moving']] # Keep only moving windows
    
        # Prepare data for classification
        y_trace = features_df['loc_label'] # loc labels for every window in a trace (the same loc for one trace)
        y_label = np.argmax(np.bincount(y_trace.astype(int))) # Squeeze the same loc labels for every window of a trace into the only label for the whole trace
        y_labels.append(y_label)

        X_test = features_df.drop(['loc_label', 'Gm', 'Hm', 'Im'], axis=1)
        assert all(X_test['moving']), "Not all features are extracted from moving part"
        X_test = scaler.fit_transform(X_test)
        if model == 'xgBoost':
            dtest = xgb.DMatrix(X_test) # Create the XGBoost DMatrix object for the test data
        
        # Predict the location with loaded model
        y_pred = loaded_model.predict(dtest)
        y_pred = np.squeeze(y_pred)
        y_overall = np.argmax(np.bincount(y_pred.astype(int))) # Get the mode of predictions of every windows in a trace of the final overall predicition for the whole trace
        y_overalls.append(y_overall)
        print('(figure above, if pred wrong) label:',y_label,'prediction:',y_overall)
        
        if y_overall != y_label and plot_wrong_prediton:
            figure, ax = plt.subplots(4,1,figsize=(60, 5))
            figure.suptitle(str(trace.labels))
            ax[0].plot(get_lowpass(get_highpass(trace.data['ax'].values)))
            ax[0].plot(get_lowpass(get_highpass(trace.data['ay'].values)))
            ax[0].plot(get_lowpass(get_highpass(trace.data['az'].values)))

            ax[1].plot(features_df['acc_mag'])
        
            ax[2].plot(features_df['A'])
            ax[2].plot(features_df['B'])
            ax[2].plot(features_df['C'])
            ax[2].legend(['A: Acc-Max_mag','B: Ratio','C: Ratio'])

            ax[3].plot(y_pred)
            plt.show()
        

# Convert list to array    
y_labels = np.array(y_labels)
y_overalls = np.array(y_overalls)

# Evaluate the classifier
print("Confusion Matrix:\n", confusion_matrix(y_labels, y_overalls))
print("Classification Report:\n", classification_report(y_labels, y_overalls))
print("Accuracy Score:", accuracy_score(y_labels, y_overalls))
        

(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 2 prediction: 2
(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 2
(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 0
(figure above, if pred wrong) label: 2 prediction: 2
(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 2
(figure above, if pred wrong) label: 2 prediction: 0
(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 2 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 0
(figure above, if pred wrong) label: 1 prediction: 1
(figure above, if pred wrong) label: 2 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 0
(figure above, if pred wrong) label: 1 prediction: 0
(figure above, if pred wrong) label: 0 prediction: 0
(figure above, if pred wrong) label: 1 predict