In [2]:
import numpy as np
import pandas as pd

from Lilygo.Recording import Recording
from Lilygo.Dataset import Dataset

import os
from os import listdir
from os.path import isfile, join


import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from scipy.stats import entropy
from scipy import signal
from scipy.signal import welch
from scipy.fftpack import fft
from scipy.stats import entropy, kurtosis, skew
from scipy.interpolate import interp1d

from Lilygo.Recording import Recording, data_integrity
from Lilygo.Dataset import Dataset

import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## Functions for step counting

In [3]:
# This function aims to find the component caused by gravity from data, which means the signal around 0 Hz
def get_gravity(data):
    """Estimate the gravity (near-0 Hz) component of a signal.

    Applies a fixed second-order IIR low-pass filter sample by sample. The
    first two output samples are left at zero because the recursion needs two
    previous inputs and outputs.

    Args:
        data: 1-D sequence of samples (list or numpy array).

    Returns:
        numpy.ndarray shaped like ``data`` holding the filtered signal.
    """
    filtered_data = np.zeros_like(data)
    # Feedback (denominator) coefficients of the IIR filter
    alpha = [1, -1.979133761292768, 0.979521463540373]
    # Feed-forward (numerator) coefficients. The middle tap is exactly
    # 2 * beta[0]; the previous value 0.00012769995947004 had lost a digit.
    # NOTE(review): this raises the filter's DC gain (about 0.775 -> 0.891), so
    # thresholds tuned downstream against the old output should be re-validated.
    beta = [0.000086384997973502, 0.000172769995947004, 0.000086384997973502]
    # Difference equation of the IIR filter
    for i in range(2, len(data)):
        filtered_data[i] = alpha[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpha[1] - filtered_data[i-2] * alpha[2])
    return filtered_data

# This function aims to realize a high-pass filter with cutoff frequency = 1 Hz, because according to massive amounts of data, the general
# minimum frequency of human walking is about 1 Hz
def get_highpass(data):
    """Run a fixed second-order IIR high-pass filter over ``data``.

    The coefficients are hard-coded; according to the author's notes they
    target a 1 Hz cutoff (the slowest expected walking cadence). The first two
    output samples stay at zero because the recursion starts at index 2.

    Args:
        data: 1-D sequence of samples (list or numpy array).

    Returns:
        numpy.ndarray shaped like ``data`` holding the filtered signal.
    """
    out = np.zeros_like(data)
    # Denominator terms: overall gain followed by the two feedback taps
    gain, a1, a2 = 1, -1.905384612118461, 0.910092542787947
    # Numerator (feed-forward) taps
    b0, b1, b2 = 0.953986986993339, -1.907503180919730, 0.953986986993339
    for n in range(2, len(data)):
        feed_forward = data[n] * b0 + data[n-1] * b1 + data[n-2] * b2
        out[n] = gain * (feed_forward - out[n-1] * a1 - out[n-2] * a2)
    return out

# This function aims to realize a low-pass filter with cutoff frequency = 5 Hz, because according to massive amounts of data, the general
# maximum frequency of human walking is about 5 Hz
def get_lowpass(data):
    """Run a fixed second-order IIR low-pass filter over ``data``.

    The coefficients are hard-coded; according to the author's notes they
    target a 5 Hz cutoff (the fastest expected walking cadence). The first two
    output samples stay at zero because the recursion starts at index 2.

    Args:
        data: 1-D sequence of samples (list or numpy array).

    Returns:
        numpy.ndarray shaped like ``data`` holding the filtered signal.
    """
    filtered_data = np.zeros_like(data)  # filtered_data
    # Feedback (denominator) coefficients
    alpha = [1, -1.80898117793047, 0.827224480562408]
    # Feed-forward (numerator) coefficients. The outer taps must be equal:
    # with beta[0] == beta[2] == 0.095465967120306 the DC gain
    # sum(beta) / sum(alpha) is 1. The previous beta[0] (0.096665967120306)
    # was a one-digit typo that pushed the pass-band gain to about 1.066.
    beta = [0.095465967120306, -0.172688631608676, 0.095465967120306]
    for i in range(2, len(data)):
        filtered_data[i] = alpha[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpha[1] - filtered_data[i-2] * alpha[2])
    return filtered_data

def pre_process(data):
    """Turn a raw signal into a band-limited, gravity-weighted signal.

    Steps:
      1. estimate the gravity component with ``get_gravity``;
      2. subtract it to obtain the user-induced part;
      3. multiply the two element-wise (the author's stand-in for projecting
         the user acceleration onto the gravity direction);
      4. band-limit the product with ``get_highpass`` followed by
         ``get_lowpass`` (intended walking band: 1 - 5 Hz).

    Args:
        data: 1-D sequence of samples (list or numpy array).

    Returns:
        numpy.ndarray with the filtered signal.
    """
    gravity = get_gravity(data)
    # Gravity-free residual weighted by the gravity estimate, sample by sample
    along_gravity = (data - gravity) * gravity
    # High-pass first, then low-pass, to keep only the walking band
    return get_lowpass(get_highpass(along_gravity))

In [4]:
def get_segment_nonoverlap(data, sampling_rate, std_win):
    """Split ``data`` into consecutive, non-overlapping full-length windows.

    Args:
        data: sliceable sequence of samples.
        sampling_rate: samples per second.
        std_win: window length in seconds.

    Returns:
        list of slices of ``data``, each exactly ``round(std_win * sampling_rate)``
        samples long. A trailing partial window is not returned.

    Raises:
        ValueError: if the computed window size is not positive.
    """
    # Calculate window size
    window_size = round(std_win * sampling_rate)
    if window_size <= 0:
        raise ValueError("window size must be positive; check std_win and sampling_rate")
    # The last valid start is len(data) - window_size (inclusive). The previous
    # exclusive bound silently dropped the final window whenever the length
    # was an exact multiple of the window size.
    last_start = len(data) - window_size
    return [data[s:s + window_size] for s in range(0, last_start + 1, window_size)]

def check_moving(magn_data):
    """Decide whether a window of samples contains movement.

    Args:
        magn_data: window of samples supporting ``abs()`` (e.g. a numpy array).

    Returns:
        bool: True when the mean absolute value is strictly above 0.027.
    """
    # Mean absolute amplitude compared against the fixed activity threshold
    return bool(np.mean(abs(magn_data)) > 0.027)

def normalize(data):
    """Min-max scale ``data`` into the range [-0.5, 0.5].

    Args:
        data: non-empty sequence of numbers.

    Returns:
        list with one scaled value per input element; a list of zeros when all
        inputs are equal (the range would be zero).
    """
    lowest, highest = min(data), max(data)
    if lowest == highest:
        # Constant input: nothing to scale
        return [0] * len(data)
    span = highest - lowest
    return [(value - lowest) / span - 0.5 for value in data]

# This function aims to find peak locations and corresponding values in the signal with the function signal.find_peaks
def get_peaks(input_signal, prominence):
    """Locate peaks of ``input_signal`` with ``scipy.signal.find_peaks``.

    Args:
        input_signal: 1-D numpy array.
        prominence: minimum prominence passed on to ``find_peaks``.

    Returns:
        tuple ``(peak_locations, peak_values)``: the indices of the peaks and
        the signal values at those indices.
    """
    # find_peaks returns (indices, properties); only the indices are needed
    locations = signal.find_peaks(input_signal, prominence=prominence)[0]
    return locations, input_signal[locations]

## Load trained models

In [16]:
# Define the CNN model for magnitude data
class CNN1D(nn.Module):
    """Single-channel 1-D CNN classifier.

    Two ``Conv1d(kernel_size=5) -> ReLU -> MaxPool1d(2)`` stages, dropout, then
    two fully connected layers. ``fc1`` expects 64 * 247 flattened features, so
    the input sequence must be long enough to leave 247 time steps after both
    stages (an input of 1000 samples does).

    The attribute names are part of the serialized model layout and must not
    be renamed if previously saved models are to be loaded.
    """

    def __init__(self, num_classes):
        """Build the layers; ``num_classes`` is the size of the output layer."""
        super().__init__()
        # Stage 1: 1 -> 32 channels
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=5)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool1d(kernel_size=2)
        # Stage 2: 32 -> 64 channels
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool1d(kernel_size=2)
        # Regularisation: drop half of the activations while training
        self.dropout = nn.Dropout(p=0.5)
        # Classifier head
        self.fc1 = nn.Linear(in_features=64 * 247, out_features=255)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=255, out_features=num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, length)`` tensor to ``(batch, num_classes)`` scores."""
        features = self.maxpool1(self.relu1(self.conv1(x)))
        features = self.maxpool2(self.relu2(self.conv2(features)))
        features = self.dropout(features)
        # Flatten channels and time into one feature vector per sample
        flat = features.view(features.size(0), -1)
        hidden = self.relu3(self.fc1(flat))
        return self.fc2(hidden)

In [6]:
# SECURITY: joblib.load and torch.load unpickle their input, which can execute
# arbitrary code. Only load model files that come from a trusted source.
# Location------------------------------------------
# Used by get_watch_loc, which calls .predict() on an xgb.DMatrix.
location_xgboost_model = joblib.load('group00_model_1.joblib')
# activity------------------------------------------
# Used by get_activity, which calls .predict() on an xgb.DMatrix.
activity_xgb_model = joblib.load('group00_model_2.joblib')
# path---------------------------------------------
# Load onto the CPU first so a checkpoint saved from a CUDA device can still be
# opened on a CPU-only host; get_path_idx moves the model to the device it uses.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules such as this one - confirm against the
# installed version and pass weights_only=False if required.
path_cnn_model = torch.load('group00_model_3.pt', map_location='cpu')


In [7]:
def get_segment_overlap(data, sampling_rate, std_win):
    """Pre-process ``data`` and cut it into half-overlapping full-length windows.

    Args:
        data: 1-D sequence of raw samples; it is passed through ``pre_process``
            before segmentation.
        sampling_rate: samples per second.
        std_win: window length in seconds.

    Returns:
        list of slices of the pre-processed signal, each
        ``round(std_win * sampling_rate)`` samples long, with consecutive
        windows starting half a window apart. A trailing partial window is
        not returned.

    Raises:
        ValueError: if the computed window size is not positive.
    """
    # Pre-process data
    data_seg = pre_process(data)
    # Calculate window size
    window_size = round(std_win * sampling_rate)
    if window_size <= 0:
        raise ValueError("window size must be positive; check std_win and sampling_rate")
    # 50 % overlap; never let the hop collapse to 0 for very small windows
    hop = max(1, round(window_size / 2))
    # Inclusive upper bound so the final full window is kept when it ends
    # exactly at the end of the signal (it used to be dropped).
    last_start = len(data_seg) - window_size
    return [data_seg[s:s + window_size] for s in range(0, last_start + 1, hop)]


In [8]:
# remove outliers in bearing
def remove_outliers(data):
    """Replace outliers in a 1-D series by linear interpolation.

    A sample is an outlier when its absolute deviation from the median exceeds
    2.5 times the median absolute deviation (MAD). Outliers, and any NaNs
    already present, are filled by interpolating between the remaining
    samples.

    Args:
        data: 1-D sequence of numbers (ints are accepted and converted).

    Returns:
        numpy.ndarray of floats, a cleaned copy of ``data``. Empty input, or
        input with no valid sample left to interpolate from, is returned
        without interpolation.
    """
    # Work on a float copy: NaN markers cannot be stored in an integer array
    data = np.array(data, dtype=float)
    if data.size == 0:
        return data
    # Calculate the median of the data
    median = np.median(data)
    # Calculate the absolute deviation from the median
    absolute_deviation = np.abs(data - median)
    # Calculate the Median Absolute Deviation (MAD)
    mad = np.median(absolute_deviation)
    # Identify the outliers using a threshold (typically 2.5 or 3).
    # When mad == 0 the ratio is inf for every sample that differs from the
    # median (flagged) and NaN for samples equal to it (kept); this matches the
    # previous behaviour but no longer emits divide-by-zero warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        outliers = absolute_deviation / mad > 2.5
    # Replace outliers with NaN
    data[outliers] = np.nan
    # Interpolate to replace NaNs with reasonable values
    nans = np.isnan(data)
    if nans.all():
        # Nothing valid to interpolate from
        return data
    valid_positions = (~nans).nonzero()[0]
    data[nans] = np.interp(nans.nonzero()[0], valid_positions, data[~nans])
    return data

In [9]:
def get_watch_loc(featured_trace):
    """Predict the watch location of a trace by majority vote.

    Each row of ``featured_trace`` is classified with the module-level
    ``location_xgboost_model``; the class with the most votes wins.

    Args:
        featured_trace: feature matrix accepted by ``xgb.DMatrix``.

    Returns:
        Index (0, 1 or 2) of the most frequent prediction. Every prediction
        that is neither 1 nor 2 counts as a vote for class 0.
    """
    # Wrap the features for XGBoost and classify every row
    predictions = location_xgboost_model.predict(xgb.DMatrix(featured_trace))
    votes_1 = np.count_nonzero(predictions == 1)
    votes_2 = np.count_nonzero(predictions == 2)
    # Everything that is not class 1 or 2 is attributed to class 0
    votes = np.array([len(predictions) - votes_1 - votes_2, votes_1, votes_2])
    return np.argmax(votes)

In [10]:
def get_path_idx(magn_mag):
    """Predict the path index of a trace with the module-level CNN.

    Args:
        magn_mag: 1-D sequence of magnitude samples. It is reshaped to
            ``(1, 1, length)`` before being fed to ``path_cnn_model``; the
            length must be one the model accepts.

    Returns:
        int: index of the highest-scoring class.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # (length,) -> (1, length, 1) -> (1, 1, length): batch, channel, time
    X_trace = torch.tensor(np.expand_dims(magn_mag, axis=(0, -1)), dtype=torch.float32)
    X_trace = X_trace.permute(0, 2, 1).to(device)

    path_cnn_model.to(device)
    # eval() switches dropout off so predictions are deterministic
    path_cnn_model.eval()
    # Pure inference: do not record an autograd graph
    with torch.no_grad():
        scores = path_cnn_model(X_trace)
    path_idx = int(np.argmax(scores.cpu().numpy()))
    return path_idx

In [11]:
def get_step_count(trace):
    """Count steps in one recording from its accelerometer magnitude.

    Pipeline: magnitude of (ax, ay, az) -> ``pre_process`` -> 40-sample moving
    average -> per-window gating with ``check_moving`` and min-max
    normalisation -> peak counting with prominence 0.4. Windows are 1 s long
    at a hard-coded sampling rate of 200 samples per second.

    Args:
        trace: recording whose ``data['ax']``, ``data['ay']`` and ``data['az']``
            entries expose their samples through ``.values``.

    Returns:
        int: number of detected peaks (steps); 0 for an empty recording.
    """
    # Calculate raw magnitude of accelerometer signal. The axes are truncated to
    # their common length so channels of unequal length cannot raise.
    ax = np.asarray(trace.data['ax'].values, dtype=float)
    ay = np.asarray(trace.data['ay'].values, dtype=float)
    az = np.asarray(trace.data['az'].values, dtype=float)
    n_samples = min(len(ax), len(ay), len(az))
    if n_samples == 0:
        # Nothing recorded: no steps (np.convolve would raise on empty input)
        return 0
    amagn = np.sqrt(ax[:n_samples]**2 + ay[:n_samples]**2 + az[:n_samples]**2)
    # Filter the signal to get more accurate results -----------------------------------------------------------
    data_filtered = pre_process(amagn)
    # Use convolution (moving average) to reduce noise in signal again
    filter_window_size = 40
    data_filtered = np.convolve(data_filtered, np.ones((filter_window_size,))/filter_window_size, mode='valid')
    # Segment data into windows --------------------------------------------------------------------------------
    std_win = 1 # length of window in seconds
    sampling_rate = 200
    data_segmented = get_segment_nonoverlap(data_filtered, sampling_rate, std_win)
    # Normalize data in each window ----------------------------------------------------------------------------
    win_size = round(std_win * sampling_rate)
    normalized_data = data_filtered.copy()
    for i, seg in enumerate(data_segmented):
        start = i * win_size
        # Windows without movement are flattened so they cannot produce peaks
        normalized_data[start:start + win_size] = normalize(seg) if check_moving(seg) else 0
    # Handle the samples after the last full window. The tail starts right
    # after the last full window; the previous code indexed it with the stale
    # loop variable, which overwrote the last full window and raised NameError
    # when there was no full window at all.
    tail_start = len(data_segmented) * win_size
    if tail_start < len(data_filtered):
        seg = data_filtered[tail_start:]
        normalized_data[tail_start:] = normalize(seg) if check_moving(seg) else 0

    # Find peaks in the filtered signal and realize our stepcount -----------------------------------------------
    prominence = 0.4
    peak_locations, _ = get_peaks(normalized_data, prominence)
    stepCount = len(peak_locations)

    return stepCount

In [12]:
def get_activity(featured_trace):
    """Detect which activities occur in a trace.

    Every row of ``featured_trace`` is classified with the module-level
    ``activity_xgb_model``. The per-row labels are then smoothed with a sliding
    majority vote before the set of occurring labels is reported.

    Args:
        featured_trace: feature matrix accepted by ``xgb.DMatrix``.

    Returns:
        tuple ``(stand, walk, run, cycle)`` of 0/1 flags; a flag is 1 when the
        corresponding label (0, 1, 2, 3) survives the smoothing.
    """
    # Create the XGBoost DMatrix object for the test data
    dtest = xgb.DMatrix(featured_trace)
    y_pred = activity_xgb_model.predict(dtest)
    # Smooth the predictions over a 60 s sliding window.
    # presumably each prediction covers std_win seconds with 50 % overlap, so
    # n consecutive predictions span 60 s - TODO confirm against the feature code
    std_win = 10
    n = round(60 / std_win * 2 - 1)
    hop = int(n / 2 + 1)
    num_labels = 4
    y_pred_60 = y_pred.copy()
    # Inclusive bound: a window ending exactly at the last prediction is smoothed too
    for s in range(0, len(y_pred) - n + 1, hop):
        windowed_label = y_pred[s : s+n]
        # Count the votes of THIS window only. The counts used to accumulate
        # over all previous windows, so later windows were decided by the
        # running total rather than by their own content.
        votes = np.array([np.count_nonzero(windowed_label == k) for k in range(num_labels)])
        # Overwrite the window only when there is a single, unambiguous winner.
        # The previous check tested the length of the tuple returned by
        # np.where, which is always 1, so ties were never detected.
        if np.count_nonzero(votes == votes.max()) == 1:
            y_pred_60[s : s+n] = np.argmax(votes)

    # Report which labels are present after smoothing
    predicted = set(y_pred_60)
    stand = 1 if 0 in predicted else 0
    walk = 1 if 1 in predicted else 0
    run = 1 if 2 in predicted else 0
    cycle = 1 if 3 in predicted else 0
    return stand, walk, run, cycle

In [13]:
# Directory holding the test traces - change to the test data dir
dir_traces = '/kaggle/input/mobile-health-2023-path-detection/data/test'
# Keep regular files with a .json suffix and list their full paths in sorted order
json_names = [name for name in listdir(dir_traces) if isfile(join(dir_traces, name)) and name.endswith('.json')]
filenames = sorted(join(dir_traces, name) for name in json_names)

In [14]:
def _run_or_default(trace_id, task, default, compute):
    """Return ``compute()``; on any failure report it and return ``default``."""
    try:
        return compute()
    except Exception as err:  # deliberate catch-all: one bad trace must not abort the whole run
        print(f"Warning: trace {trace_id}: {task} failed ({err!r}); using default {default!r}")
        return default


# load saved feature
# SECURITY: allow_pickle=True unpickles arbitrary objects - only load feature
# files that come from a trusted source.
feature_all_trace = np.load("group00_features.npy", allow_pickle=True).item()
# Loop through all traces and calculate the results for each trace
solution_file = []
for idx, filename in enumerate(filenames):
    # The three characters in front of the '.json' suffix identify the trace
    trace_id = filename[-8:-5]

    # Defaults are kept for every category whose computation fails
    categorization_results = {'watch_loc': 0, 'path_idx': 0, 'step_count': 0, 'stand': 0, 'walk': 0, 'run': 0, 'cycle': 0}

    #
    # Your algorithm goes here
    # Make sure, you do not use the gps data and are tolerant for missing data (see task set).
    # Your program must not crash when single smartphone data traces are missing.
    #
    # Each category is computed independently so that a missing feature entry
    # or an unreadable recording only affects that category.
    trace = _run_or_default(trace_id, 'loading recording', None,
                            lambda: Recording(filename, no_labels=True, mute=True))
    categorization_results['watch_loc'] = _run_or_default(
        trace_id, 'watch_loc', 0,
        lambda: get_watch_loc(feature_all_trace[trace_id]["location"]))
    categorization_results['path_idx'] = _run_or_default(
        trace_id, 'path_idx', 0,
        lambda: get_path_idx(feature_all_trace[trace_id]["path_idx"]))
    if trace is not None:
        categorization_results['step_count'] = _run_or_default(
            trace_id, 'step_count', 0,
            lambda: get_step_count(trace))
    categorization_results['stand'], categorization_results['walk'], categorization_results['run'], categorization_results['cycle'] = _run_or_default(
        trace_id, 'activity', (0, 0, 0, 0),
        lambda: get_activity(feature_all_trace[trace_id]["activity"]))

    # Append your calculated results and the id of each trace and category to the solution file.
    # The dict keeps insertion order, which fixes the numeric suffix of each category.
    for counter_label, category in enumerate(categorization_results):
        solution_file.append([trace_id + f'_{counter_label+1}', categorization_results[category]])
    # show progress
    if (idx+1)%10 == 0:
        print("Process traces: ", idx+1, '/', len(filenames))


Process traces:  10 / 376
Process traces:  20 / 376
Process traces:  30 / 376
Process traces:  40 / 376
Process traces:  50 / 376
Process traces:  60 / 376
Process traces:  70 / 376
Process traces:  80 / 376
Process traces:  90 / 376
Process traces:  100 / 376




Process traces:  110 / 376
Process traces:  120 / 376
Process traces:  130 / 376
Process traces:  140 / 376
Process traces:  150 / 376
Process traces:  160 / 376
Process traces:  170 / 376
Process traces:  180 / 376
Process traces:  190 / 376
Process traces:  200 / 376
Process traces:  210 / 376
Process traces:  220 / 376
Process traces:  230 / 376
Process traces:  240 / 376
Process traces:  250 / 376
Process traces:  260 / 376
Process traces:  270 / 376
Process traces:  280 / 376
Process traces:  290 / 376
Process traces:  300 / 376
Process traces:  310 / 376
Process traces:  320 / 376
Process traces:  330 / 376
Process traces:  340 / 376
Process traces:  350 / 376
Process traces:  360 / 376
Process traces:  370 / 376


In [15]:
# Export the collected results as a .csv file that can be uploaded to Kaggle.
# To cross-check the file locally, open it in a text editor rather than Excel so the values are shown unmodified.
# IMPORTANT: the column names ('Id' and 'Category') are required by the submission format - do NOT change them.
submission_columns = ['Id', 'Category']
submission_rows = np.asarray(solution_file)
submission_file_df = pd.DataFrame(submission_rows, columns=submission_columns)
submission_file_df.to_csv('submission.csv', header=submission_columns, index=False)