In [48]:
# This is the template for the submission. If you want, you can develop your algorithm in a regular Python script and copy the code here for submission.

# Team members (e-mail, legi):
# zhisun@ethz.ch, 22-958-227
# enjcao@ethz.ch, 22-942-700
# yifzhou@ethz.ch, 22-940-381

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os import listdir
from os.path import isfile, join

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import entropy
from scipy.signal import welch
from scipy.fftpack import fft

from Lilygo.Recording import Recording, data_integrity
from Lilygo.Dataset import Dataset

import joblib

# Path

In [4]:
# Get the path of all traces
dir_data = 'E:\\Sunzhichao\\ETHz\\2223Spring\\Mobile_Health\\data\\'
dir_traces_train = dir_data + 'train\\'
dir_traces_test = dir_data + 'test\\'
dir_labels = dir_data + 'labels\\'
dir_load = dir_data + 'Loaded_data\\'

## Filtering and Feature Extraction

In [34]:
def get_highpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.905384612118461, 0.910092542787947]
    beta = [0.953986986993339, -1.907503180919730, 0.953986986993339]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

# This funciton aims to realize a high-pass filter with cutoff frequency = 5 Hz. Because according to massive amounts of data, the general 
# maximum frequency of human walking is about 5 Hz
def get_lowpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.80898117793047, 0.827224480562408]
    beta = [0.096665967120306, -0.172688631608676, 0.095465967120306]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

def preprocess_and_extract_features(trace, window_size=60, sampling_rate=50):
    """
    Preprocess the data and extract features from the 3D accelerometer, gyroscope, and magnetometer data.

    Args:
    trace (Lilygo.Recording.Recording): Object containing the raw data with accelerometer data stored in lists (e.g. trace.data['ax'])
    window_size (int): The window size in seconds for splitting the data
    sampling_rate (int): The sampling rate of the data in Hz

    Returns:
    pd.DataFrame: A DataFrame with the extracted features and location labels
    """
    # Read data from trace
    # To-Do: filter raw data with implemented function
    ax = get_lowpass(get_highpass(trace.data['ax'].values))
    ay = get_lowpass(get_highpass(trace.data['ay'].values))
    az = get_lowpass(get_highpass(trace.data['az'].values))
    
    '''gx = trace.data['gx'].values
    gy = trace.data['gy'].values
    gz = trace.data['gz'].values
    
    mx = trace.data['mx'].values
    my = trace.data['my'].values
    mz = trace.data['mz'].values'''
    
    # Compute the length of each window in samples
    window_samples = window_size * sampling_rate
    
    # Compute the number of windows in the recording
    num_windows = len(ax) // window_samples

    # Initialize lists for storing extracted features and location labels
    features = []
    loc_labels = []

    # Helper function to compute the magnitude of a vector
    magnitude = lambda vec: np.sqrt(np.sum(vec**2, axis=1))

    for i in range(num_windows):
       
        # Extract the accelerometer, gyroscope, and magnetometer data for the current window
        acc_data = np.array([ax[i*window_samples:(i+1)*window_samples],
                             ay[i*window_samples:(i+1)*window_samples],
                             az[i*window_samples:(i+1)*window_samples]]).T
        '''gyro_data = np.array([gx[i*window_samples:(i+1)*window_samples],
                              gy[i*window_samples:(i+1)*window_samples],
                              gz[i*window_samples:(i+1)*window_samples]]).T
        mag_data = np.array([mx[i*window_samples:(i+1)*window_samples],
                             my[i*window_samples:(i+1)*window_samples],
                             mz[i*window_samples:(i+1)*window_samples]]).T'''
        
        '''figure,ax = plt.subplots(3, 1, figsize=(10, 6))
        ax[0].plot(acc_data[:,0])
        ax[0].set_ylabel('ax')
        ax[1].plot(acc_data[:,1])
        ax[1].set_ylabel('ay')
        ax[2].plot(acc_data[:,2])
        ax[2].set_ylabel('az')'''

        # Compute magnitudes
        acc_magnitude = np.sqrt(np.sum(acc_data**2, axis=1))
        #gyro_magnitude = magnitude(gyro_data)
        #mag_magnitude = magnitude(mag_data)

        # Calculate features (according to https://www.sciencedirect.com/science/article/pii/S1574119211001222)
        '''ax_amp = acc_data[np.argmax(acc_data[:,0]), 0] - acc_data[np.argmin(acc_data[:,0]), 0]
        ay_amp = acc_data[np.argmax(acc_data[:,1]), 1] - acc_data[np.argmin(acc_data[:,1]), 1]
        az_amp = acc_data[np.argmax(acc_data[:,2]), 2] - acc_data[np.argmin(acc_data[:,2]), 2]'''
        
        ax_amp = abs(np.mean(acc_data[:,0]))
        ay_amp = abs(np.mean(acc_data[:,1]))
        az_amp = abs(np.mean(acc_data[:,2]))
        a_amp_list = [ax_amp, ay_amp, az_amp]
        a_amp_list.sort() # Sorting list of numbers in ascending
        #print('ax_amp:',ax_amp,'ay_amp:',ay_amp,'az_amp',az_amp)
        A = a_amp_list[2] #Feature A: the maximum amplitude among all dimensions (represents motion range for location)
        B = a_amp_list[2]/a_amp_list[1] # Feature B and C: ratio of the maximum amplitudes in different axes (represents DoF in movement for location)
        C = a_amp_list[2]/a_amp_list[0]
        #print('A:',A,'B:',B,'C',C)

        # Calculate the energy and entropy of acc_mag in the frequency domain (D and F)
        freq, Pxx = welch(acc_magnitude, fs=sampling_rate) # use of the fast Fourier transform for the estimation of power spectra
        #plt.plot(freq,Pxx)
        
        D = np.max(Pxx) # Feature D: the maximum energy captured by the accelerator
        F = np.sum(Pxx) # Feature F: the overall energy captured by the accelerator
        norm_Pxx = Pxx / F # normalize the power spectrum
        E = entropy(norm_Pxx) # Feature E: normalized information entropy of the discrete FFT component magnitudes

   
        # Append the features to the list
        features.append([A, B, C, D, E, F])
        
        # Calculate the timestamp for the current window as the median of the timestamps (not necessary for location)
        # timestamp = np.median(trace.timestamp[i*window_samples:(i+1)*window_samples]) 

        # Determine the location label for the current window based on the timestamp
        # loc_label = trace.labels.get('board_loc')

        # Append the label to the labels list
        # loc_labels.append(loc_label)
    #plt.show()

    # Create a DataFrame with the extracted features and location labels
    features_df = pd.DataFrame(features, columns=['A', 'B', 'C', 'D', 'E', 'F'])
    # features_df['loc_label'] = loc_labels
    #print(features_df)

    return features_df

## Load training traces

In [20]:
# TODO: CHANGE THIS PATH FOR KAGGLE TO LOCAL PATH
traceNames = [join(dir_traces_train, f) for f in listdir(dir_traces_train) if (isfile(join(dir_traces_train, f)) and f[-5:] == '.json')]
traceNames.sort()

# Get features

In [33]:
X = pd.DataFrame()
y = pd.DataFrame()
for i,traceName in enumerate(traceNames):
    if i%10 == 0:
        print("Processing data: ", i, '/', len(traceNames))
    trace = Recording(traceName, no_labels=False, mute=True)
    
    # Get features of data
    features_df = preprocess_and_extract_features(trace)

    # Prepare data for classification
    X_trace = features_df.drop('loc_label', axis=1)
    y_trace = features_df['loc_label']
    X = pd.concat([X, X_trace], axis=0).reset_index(drop=True)
    y = pd.concat([y, y_trace], axis=0).reset_index(drop=True)

            
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Processing data:  0 / 263


KeyboardInterrupt: 

In [28]:
# save X_scaled and y for later use
np.save(dir_load + 'sw_x.npy', np.array(X))
np.save(dir_load + 'sw_x_scaled.npy', np.array(X_scaled))
np.save(dir_load + 'sw_y_gt.npy', np.array(y))

## Train with SVM

In [None]:
# Using features of all training traces as training set
X_train = X_scaled
y_train = y
#X_train, _, y_train, _ = train_test_split(X_scaled, y, test_size=0.01, random_state=42)

# Check the number of classes in y_train
'''if len(set(y_train)) < 2:
    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))
'''
# Train the SVM classifier with RBF kernel
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
grid.fit(X_train, np.ravel(y_train))

# Save the trained SVM model
joblib.dump(grid, './trained_models/location_svm_model.joblib')

## Train with XGboost

In [30]:
import xgboost as xgb
from xgboost import XGBClassifier
# Using features of all training traces as training set
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Check the number of classes in y_train
'''if len(set(y_train)) < 2:
    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))
'''
# Create the XGBoost DMatrix object
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set the parameters for the XGBoost model
params = {'max_depth': 3, 'eta': 0.1, 'objective': 'multi:softmax', 'num_class': 3}

# Train the model
xgb_model = xgb.train(params, dtrain)
print("Training XGBoost classifier for smartwatch location")

# Create the XGBoost DMatrix object for the test data
dval = xgb.DMatrix(X_val)

# Make predictions on the test set and evaluate the model
y_pred = xgb_model.predict(dval)
accuracy = accuracy_score(y_pred, y_val)
# Evaluate the model on the testing data
print("Validation acc:", accuracy)

Training XGBoost classifier for smartwatch location
Validation acc: 0.8482188951987609


## Predict on testing traces with trained model and Evaluation predictions

In [None]:
# Load the trained model for prediction
loaded_model = joblib.load('./trained_models/location_svm_model.joblib')

# Load testing data
# TODO: CHANGE THIS PATH FOR KAGGLE TO LOCAL PATH
path_testing = '/kaggle/input/mobile-health-2023-path-detection/data/test'
traceNames_test = os.listdir(path_testing)

# labels and final prediction for the whole traces
y_labels = []
y_finals = []
for traceName in traceNames_test[0:3]:
    if traceName[-5:] == '.json':
        trace = Recording(path_testing + '/'+ traceName, no_labels=True, mute=True)
        
        # Get features of data
        features_df = preprocess_and_extract_features(trace)

        # Prepare data for classification
        X_test = scaler.fit_transform(features_df)

        # Predict the location with loaded model
        y_pred = loaded_model.predict(X_test)
        y_pred = np.squeeze(y_pred)
        y_final = np.argmax(np.bincount(y_pred.astype(int)))
        
        y_finals.append(y_final)

y_finals = np.array(y_finals)


In [5]:
y_gt = np.load(dir_load + 'train_loaction_label.npy')
# check training data balance
loaction_1 = np.count_nonzero(y_gt == 1)
loaction_2 = np.count_nonzero(y_gt == 2)
location_0 = np.shape(y_gt)[0] - loaction_1 - loaction_2
print("number of traces in training for loaction 0: ", location_0, ", loaction 1: ", loaction_1, ", loaction 2: ", loaction_2)

number of traces in training for loaction 0:  84 , loaction 1:  82 , loaction 2:  97


In [58]:
# predict using XGBoost
traceNames_test = [join(dir_traces_test, f) for f in listdir(dir_traces_test) if (isfile(join(dir_traces_test, f)) and f[-5:] == '.json')]
traceNames_test.sort()
# labels and final prediction for the whole traces
y_labels = []
y_finals = []
for i, traceName in enumerate(traceNames_test):
    trace = Recording(traceName, no_labels=True, mute=True)
    
    # Get features of data
    features_df = preprocess_and_extract_features(trace)

    # Prepare data for classification
    X_test = scaler.fit_transform(features_df)
    # Create the XGBoost DMatrix object for the test data
    dtest = xgb.DMatrix(X_test)
    
    # Make predictions on the test set and evaluate the model
    y_pred = xgb_model.predict(dtest)
    print("predicted window label: ", y_pred)
    y_pred = np.squeeze(y_pred)
    y_final = np.argmax(np.bincount(y_pred.astype(int)))
    print("predicted trace label: ", y_final, " ground truth: ", y_gt[i])
    y_finals.append(y_final)

y_finals = np.array(y_finals)



predicted window label:  [0. 0. 2. 0. 1. 0. 0. 0. 0. 2. 2. 1. 2. 2. 2. 2. 0. 0. 0. 0. 0. 0. 0. 2.
 2. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1.]
predicted trace label:  0  ground truth:  2
predicted window label:  [2. 0. 1. 2. 2. 1. 2. 1. 2. 2. 2. 1. 1. 1. 1. 2. 1. 1. 0. 0. 0. 0. 2. 2.
 1. 2. 0. 2. 0. 2. 2. 2. 2. 2. 2. 0. 0.]
predicted trace label:  2  ground truth:  2
predicted window label:  [1. 1. 0. 2. 1. 0. 0. 2. 0. 0. 1. 0. 2. 2. 2. 0. 0. 0. 0. 1. 1. 2. 2. 2.
 0. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2. 1. 2. 2. 2. 2.]
predicted trace label:  2  ground truth:  2
predicted window label:  [2. 2. 0. 2. 2. 2. 2. 2. 2. 0. 0. 2. 2. 0. 2. 2. 0. 0. 2. 2. 2. 1. 2. 1.
 2. 2. 2. 2. 2. 2. 0. 0. 2. 0. 0.]
predicted trace label:  2  ground truth:  2
predicted window label:  [0. 0. 2. 0. 2. 2. 0. 1. 1. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1.
 2. 1. 0. 1. 1. 2. 2. 0.]
predicted trace label:  2  ground truth:  1
predicted window label:  [2. 1. 2. 0. 0. 0. 0. 2. 2. 2. 0. 2. 0. 2. 0. 1. 0. 1. 2. 1. 2. 0. 1. 0.

KeyboardInterrupt: 