In [1]:
# This is the template for the submission. If you want, you can develop your algorithm in a regular Python script and copy the code here for submission.

# Team members (e-mail, legi):
# zhisun@ethz.ch, 22-958-227
# enjcao@ethz.ch, 22-942-700
# yifzhou@ethz.ch, 22-940-381

In [16]:
import numpy as np
import pandas as pd

from Lilygo.Recording import Recording
from Lilygo.Dataset import Dataset

import os
from os import listdir
from os.path import isfile, join


import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from scipy.stats import entropy
from scipy.signal import welch
from scipy.fftpack import fft

from Lilygo.Recording import Recording, data_integrity
from Lilygo.Dataset import Dataset

import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import joblib

# Local dirs

In [3]:
# Get the path of all traces
# The original author's local data folder stays the default, so the notebook
# behaves as before on that machine; set the MHEALTH_DATA_DIR environment
# variable to point the notebook at another data folder without editing code.
dir_data = os.environ.get(
    'MHEALTH_DATA_DIR',
    'E:\\Sunzhichao\\ETHz\\2223Spring\\Mobile_Health\\data\\',
)
# os.path.join(..., '') appends the trailing separator that the code below
# relies on, whether or not dir_data itself ends with one.
dir_traces_train = os.path.join(dir_data, 'train', '')
dir_traces_test = os.path.join(dir_data, 'test', '')
dir_labels = os.path.join(dir_data, 'labels', '')
dir_loaded = os.path.join(dir_data, 'Loaded_data', '')
dir_models = os.path.join(dir_data, 'models', '')
# recorded (relative to the current working directory)
dir_recorded = 'data/recorded'

In [4]:
# Collect the full path of every .json trace in the selected folder, sorted by path
# dir_traces = '/kaggle/input/mobile-health-2023-path-detection/data/test'
dir_traces = dir_traces_test
filenames = sorted(
    join(dir_traces, entry)
    for entry in listdir(dir_traces)
    if entry.endswith('.json') and isfile(join(dir_traces, entry))
)

# Data Pre-processing

In [29]:
# This function aims to find the component caused by gravity from data, which means the signal around 0 Hz
def get_gravity(data):
    """
    Filter a signal with a fixed second-order IIR filter to estimate its gravity component.

    Args:
    data (sequence of numbers): Samples to filter (list, NumPy array, ...), processed along the first axis

    Returns:
    np.ndarray: Float array shaped like data. The first two entries stay 0 because the
    recursion needs two past samples before it can start.
    """
    samples = np.asarray(data, dtype=float)
    # The output buffer must be float: np.zeros_like(data) would inherit an integer
    # dtype from integer input and silently truncate every filtered sample.
    filtered_data = np.zeros_like(samples)
    # Parameters in IIR filter (alpha: feedback coefficients, beta: feed-forward coefficients)
    alpha = [1, -1.979133761292768, 0.979521463540373]
    beta = [0.000086384997973502, 0.00012769995947004, 0.000086384997973502]
    # Formula of IIR filter
    for i in range(2, len(samples)):
        filtered_data[i] = alpha[0] * (samples[i] * beta[0] + samples[i-1] * beta[1] + samples[i-2] * beta[2] - filtered_data[i-1] * alpha[1] - filtered_data[i-2] * alpha[2])
    return filtered_data

# This function aims to realize a high-pass filter. The original author's note gives a cutoff frequency of 1 Hz,
# chosen because the general minimum frequency of human walking is about 1 Hz.
def get_highpass(data):
    """
    Filter a signal with a fixed second-order IIR high-pass filter.

    Args:
    data (sequence of numbers): Samples to filter (list, NumPy array, ...), processed along the first axis

    Returns:
    np.ndarray: Float array shaped like data. The first two entries stay 0 because the
    recursion needs two past samples before it can start.
    """
    samples = np.asarray(data, dtype=float)
    # The output buffer must be float: np.zeros_like(data) would inherit an integer
    # dtype from integer input and silently truncate every filtered sample.
    filtered_data = np.zeros_like(samples)
    # alpha: feedback coefficients, beta: feed-forward coefficients
    alpha = [1, -1.905384612118461, 0.910092542787947]
    beta = [0.953986986993339, -1.907503180919730, 0.953986986993339]
    for i in range(2, len(samples)):
        filtered_data[i] = alpha[0] * (samples[i] * beta[0] + samples[i-1] * beta[1] + samples[i-2] * beta[2] - filtered_data[i-1] * alpha[1] - filtered_data[i-2] * alpha[2])
    return filtered_data

# This function aims to realize a low-pass filter. The original author's note gives a cutoff frequency of 5 Hz,
# chosen because the general maximum frequency of human walking is about 5 Hz.
def get_lowpass(data):
    """
    Filter a signal with a fixed second-order IIR low-pass filter.

    Args:
    data (sequence of numbers): Samples to filter (list, NumPy array, ...), processed along the first axis

    Returns:
    np.ndarray: Float array shaped like data. The first two entries stay 0 because the
    recursion needs two past samples before it can start.
    """
    samples = np.asarray(data, dtype=float)
    # The output buffer must be float: np.zeros_like(data) would inherit an integer
    # dtype from integer input and silently truncate every filtered sample.
    filtered_data = np.zeros_like(samples)
    # alpha: feedback coefficients, beta: feed-forward coefficients
    alpha = [1, -1.80898117793047, 0.827224480562408]
    # NOTE(review): beta[0] and beta[2] differ (0.0966... vs 0.0954...) and beta[1] is negative;
    # confirm these coefficients against the intended filter design.
    beta = [0.096665967120306, -0.172688631608676, 0.095465967120306]
    for i in range(2, len(samples)):
        filtered_data[i] = alpha[0] * (samples[i] * beta[0] + samples[i-1] * beta[1] + samples[i-2] * beta[2] - filtered_data[i-1] * alpha[1] - filtered_data[i-2] * alpha[2])
    return filtered_data

In [32]:
# compute energy
def compute_energy(data):
    """
    Return the spectral energy of one window of samples.

    The window is tapered with a Hanning function and transformed with the FFT; the
    squared magnitudes of bins 1 .. len(data)//2 - 1 (DC excluded) are summed and
    divided by the window length.

    Args:
    data (np.ndarray): Samples of one window

    Returns:
    float: The normalized sum of squared FFT magnitudes
    """
    n_samples = len(data)
    # Taper the window to reduce spectral leakage before the transform
    tapered = data * np.hanning(n_samples)
    # Keep only the bins after DC, up to (but excluding) index n_samples // 2
    half_spectrum = np.fft.fft(tapered)[1:n_samples // 2]
    return np.sum(np.abs(half_spectrum)**2) / n_samples

# compute entropy
def compute_entropy(data):
    """
    Return the frequency-domain entropy (in bits) of one window of samples.

    The window is tapered with a Hanning function and transformed with the FFT; the
    magnitudes of bins 1 .. len(data)//2 - 1 (DC excluded) are normalized to sum to 1
    and the Shannon entropy of that distribution is returned.

    Args:
    data (np.ndarray): Samples of one window (a single axis or the magnitude)

    Returns:
    float: The entropy of the normalized magnitude spectrum; 0.0 when the spectrum
    carries no energy at all (e.g. an all-zero or too short window)
    """
    window_length = len(data)  # length of the window
    # Apply the window function to the data
    windowed_data = data * np.hanning(window_length)
    # Magnitudes of the FFT components (excluding DC)
    mag = np.abs(np.fft.fft(windowed_data)[1:window_length // 2])
    total = np.sum(mag)
    if total == 0:
        # Nothing to normalize: dividing by zero would turn the feature into NaN
        return 0.0
    # Normalize the magnitudes to obtain a probability distribution
    mag_norm = mag / total
    # Empty bins contribute nothing to the entropy (the limit of p*log2(p) at 0 is 0);
    # evaluating 0 * log2(0) directly would give NaN. NaN bins are kept on purpose
    # so invalid input still surfaces as NaN.
    occupied = mag_norm[mag_norm != 0]
    spectral_entropy = -np.sum(occupied * np.log2(occupied))
    return spectral_entropy

def get_features(data):
    """
    Compute the three features of one sliding window.

    Args:
    data (sequence of numbers): Samples of one window

    Returns:
    tuple: (mean of the window, spectral energy, spectral entropy)
    """
    window = np.asarray(data)
    return np.mean(data), compute_energy(window), compute_entropy(window)

In [30]:
def preprocess_and_extract_features(trace, window_size=60, sampling_rate=50):
    """
    Band-pass the 3D accelerometer data and extract six features per window.

    Only the accelerometer axes are used. The recording is cut into consecutive,
    non-overlapping windows; samples after the last complete window are ignored.

    Args:
    trace (Lilygo.Recording.Recording): Object containing the raw data with accelerometer data stored in lists (e.g. trace.data['ax'])
    window_size (int): The window size in seconds for splitting the data
    sampling_rate (int): The sampling rate of the data in Hz

    Returns:
    pd.DataFrame: One row per complete window with the feature columns 'A' to 'F'
    (no label column is added)
    """
    # Read data from trace and keep the band passed by the high-pass and low-pass filters
    ax = get_lowpass(get_highpass(trace.data['ax'].values))
    ay = get_lowpass(get_highpass(trace.data['ay'].values))
    az = get_lowpass(get_highpass(trace.data['az'].values))

    # Compute the length of each window in samples
    window_samples = window_size * sampling_rate

    # Compute the number of complete windows in the recording
    num_windows = len(ax) // window_samples

    # One [A, B, C, D, E, F] row per window
    features = []

    for i in range(num_windows):
        start, stop = i * window_samples, (i + 1) * window_samples

        # Accelerometer data of the current window, one column per axis
        acc_data = np.array([ax[start:stop], ay[start:stop], az[start:stop]]).T

        # Compute magnitudes
        acc_magnitude = np.sqrt(np.sum(acc_data**2, axis=1))

        # Calculate features (according to https://www.sciencedirect.com/science/article/pii/S1574119211001222)
        # NOTE(review): the "amplitude" of an axis is the absolute value of its mean over the
        # window; an earlier (disabled) variant used max - min instead - confirm which is intended.
        ax_amp = abs(np.mean(acc_data[:,0]))
        ay_amp = abs(np.mean(acc_data[:,1]))
        az_amp = abs(np.mean(acc_data[:,2]))
        a_amp_list = [ax_amp, ay_amp, az_amp]
        a_amp_list.sort() # Sorting list of numbers in ascending order
        A = a_amp_list[2] # Feature A: the maximum amplitude among all dimensions (represents motion range for location)
        B = a_amp_list[2]/a_amp_list[1] # Feature B and C: ratio of the maximum amplitudes in different axes (represents DoF in movement for location)
        C = a_amp_list[2]/a_amp_list[0]

        # Calculate the energy and entropy of acc_mag in the frequency domain (D, E and F)
        _, Pxx = welch(acc_magnitude, fs=sampling_rate) # power spectral density estimate of the magnitude

        D = np.max(Pxx) # Feature D: the maximum energy captured by the accelerometer
        F = np.sum(Pxx) # Feature F: the overall energy captured by the accelerometer
        norm_Pxx = Pxx / F # normalize the power spectrum
        E = entropy(norm_Pxx) # Feature E: information entropy of the normalized power spectrum

        # Append the features to the list
        features.append([A, B, C, D, E, F])

    # Create a DataFrame with the extracted features
    features_df = pd.DataFrame(features, columns=['A', 'B', 'C', 'D', 'E', 'F'])

    return features_df

## Load trained model

In [44]:
# Load the location model and the activity model from the trained_models folder.
# Both paths are relative, so they resolve against the current working directory.
# (Judging by the file names these are an SVM and an XGBoost model - TODO confirm.)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
location_model = joblib.load('./trained_models/location_svm_model.joblib')
activity_xgb_model = joblib.load('trained_models/activity_xgboost_model_feature.joblib')


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [21]:
def pre_process(data):
    """
    Remove gravity from a signal and band-pass the remaining user motion.

    Args:
    data (sequence of numbers): Raw signal samples

    Returns:
    np.ndarray: The filtered signal, as returned by get_lowpass
    """
    # Estimate the gravity component and subtract it to keep the user's own motion
    gravity = get_gravity(data)
    user_motion = data - gravity
    # Weight the user's motion by the gravity estimate (element-wise product)
    along_gravity = user_motion * gravity
    # High-pass then low-pass to reduce noise (possible human walking rate: 1 - 5 Hz)
    return get_lowpass(get_highpass(along_gravity))

In [15]:
def get_segment(trace, sampling_rate=200, std_win=3):
    """
    Cut the pre-processed accelerometer magnitude of a trace into overlapping windows.

    Args:
    trace (Lilygo.Recording.Recording): Object whose trace.data['ax'], ['ay'] and ['az'] expose the samples through .values
    sampling_rate (int): Sampling rate in Hz used to convert the window length to samples
    std_win (float): Window length in seconds

    Returns:
    list: Windows of round(std_win * sampling_rate) samples each, with 50 % overlap
    between consecutive windows
    """
    ax = np.asarray(trace.data['ax'].values, dtype=float)
    ay = np.asarray(trace.data['ay'].values, dtype=float)
    az = np.asarray(trace.data['az'].values, dtype=float)
    # Use the samples that all three axes have in common, in case one axis is shorter
    common = min(len(ax), len(ay), len(az))
    ax, ay, az = ax[:common], ay[:common], az[:common]
    # Calculate raw magnitude of accelerometer signal.
    # The x axis has to come from 'ax': the earlier version iterated over 'ay' here,
    # which counted the y axis twice and ignored the x axis.
    amagn_acc = np.sqrt(ax**2 + ay**2 + az**2)
    # Pre-process data
    amagn = pre_process(amagn_acc)
    # Calculate window size in samples; consecutive windows start half a window apart
    window_size = round(std_win*sampling_rate)
    step = round(window_size/2)
    segment_trace = [amagn[s:s+window_size] for s in range(0, len(amagn)-window_size, step)]
    return segment_trace


In [25]:
def get_watch_loc(trace):
    """
    Return the watch location label of a trace.

    The model-based prediction below is disabled; every trace gets the fixed label 3.
    """
    # Disabled model-based prediction, kept for reference:
    # # Get features of data
    # features_df = preprocess_and_extract_features(trace)

    # # Prepare data for classification
    # scaler = StandardScaler()
    # X_test = scaler.fit_transform(features_df)

    # # Predict the location with loaded model
    # y_pred = location_model.predict(X_test)
    # y_pred = np.squeeze(y_pred)
    # y_final = np.argmax(np.bincount(y_pred.astype(int)))

    return 3

In [9]:
def get_path_idx(trace):
    """Return the path index of a trace; currently the fixed value 5 for every trace."""
    fixed_path_idx = 5
    return fixed_path_idx

In [10]:
def get_step_count(trace):
    """Return the step count of a trace; currently the fixed value 0 for every trace."""
    fixed_step_count = 0
    return fixed_step_count

In [42]:
def get_activity(trace):
    """
    Detect which activities occur in a trace.

    The trace is cut into windows by get_segment, three features are computed per
    window by get_features, and activity_xgb_model predicts one label per window
    (0: stand, 1: walk, 2: run, 3: cycle). The predictions are then smoothed with a
    majority vote over sliding groups of windows.

    Args:
    trace (Lilygo.Recording.Recording): The recording to classify

    Returns:
    tuple: (stand, walk, run, cycle), each 1 if the activity was predicted at least
    once after smoothing and 0 otherwise
    """
    stand, walk, run, cycle = 0, 0, 0, 0

    # get segment data
    segment_trace = get_segment(trace)
    num_segments = len(segment_trace)
    if num_segments == 0:
        # Trace shorter than one analysis window: nothing to classify, report no activity
        return stand, walk, run, cycle

    # add feature extraction
    num_features = 3
    featured_trace = np.zeros((num_segments, num_features))
    for i, segment in enumerate(segment_trace):
        featured_trace[i,] = get_features(segment)
    # Create the XGBoost DMatrix object for the test data
    dtest = xgb.DMatrix(featured_trace)

    # Predict one label per window
    y_pred = activity_xgb_model.predict(dtest)

    # filter prediction by 60s
    # NOTE(review): get_segment cuts 3 s windows by default, while std_win = 10 here makes
    # each vote span n = 11 predictions - confirm which window length "60s" should be based on.
    std_win = 10
    n = round (60 / std_win * 2 - 1)

    y_pred_60 = y_pred.copy()
    for s in range(0, len(y_pred) - n, int(n/2+1)):
        windowed_label = y_pred[s : s+n]
        # Count the votes of this group only; a counter shared across groups would let
        # earlier groups decide the majority of later ones.
        y_pred_count = np.array([np.count_nonzero(windowed_label == k) for k in range(4)])
        # Overwrite the group only when exactly one label holds the maximum (no tie)
        if np.count_nonzero(y_pred_count == np.max(y_pred_count)) == 1:
            y_pred_60[s : s+n] = np.argmax(y_pred_count)

    # remove duplicated elements
    predicted = set(y_pred_60)
    if 0 in predicted:
        stand = 1
    if 1 in predicted:
        walk = 1
    if 2 in predicted:
        run = 1
    if 3 in predicted:
        cycle = 1

    return stand, walk, run, cycle

In [46]:
# Loop through all traces and calculate the categorization results for each trace
solution_file = []
for idx, filename in enumerate(filenames):
    trace = Recording(filename, no_labels=True, mute=True)
    categorization_results = {'watch_loc': 0, 'path_idx': 0, 'step_count': 0, 'stand': 0, 'walk': 0, 'run': 0, 'cycle': 0}

    #
    # Your algorithm goes here
    # Make sure, you do not use the gps data and are tolerant for missing data (see task set).
    # Your program must not crash when single smartphone data traces are missing.
    #

    # Classify the activities once per trace and reuse the result below; running
    # get_activity a second time only repeated the same feature extraction and prediction.
    stand, walk, run, cycle = get_activity(trace)
    categorization_results['watch_loc'] = get_watch_loc(trace)
    categorization_results['path_idx'] = get_path_idx(trace)
    categorization_results['step_count'] = get_step_count(trace)
    categorization_results['stand'] = stand
    categorization_results['walk'] = walk
    categorization_results['run'] = run
    categorization_results['cycle'] = cycle


    # Append your calculated results and the id of each trace and category to the solution file
    # The trace id is the three characters in front of the '.json' extension
    trace_id = filename[-8:-5]
    for counter_label, category in enumerate(categorization_results):
        solution_file.append([trace_id + f'_{counter_label+1}', categorization_results[category]])
    # show progress
    if (idx+1)%10 == 0:
        print("Process traces: ", idx+1, '/', len(filenames))


Process traces:  10 / 376
Process traces:  20 / 376
Process traces:  30 / 376
Process traces:  40 / 376


In [31]:
# Store the collected results in a .csv file that can then be uploaded to Kaggle
# When cross-checking the .csv file on your computer, use a text editor and NOT excel so that the results are displayed correctly
# IMPORTANT: Do NOT change the name of the columns ('Id' and 'Category') of the .csv file
submission_file_df = pd.DataFrame(
    data=np.asarray(solution_file),
    columns=['Id', 'Category'],
)
submission_file_df.to_csv(
    path_or_buf='results/submission_2.csv',
    header=['Id', 'Category'],
    index=False,
)