In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import entropy

from scipy.signal import welch
from scipy.fftpack import fft

from Lilygo.Recording import Recording, data_integrity
from Lilygo.Dataset import Dataset


# Load data


In [1]:
dir_data = 'E:\\Sunzhichao\\ETHz\\2223Spring\\Mobile_Health\\data\\'
dir_traces = dir_data + 'train\\'
dir_labels = dir_data + 'labels\\'
dir_loaded = dir_data + 'Loaded_data\\'

In [5]:

traceNames=os.listdir(dir_traces)
traceNames=['train_trace_092.json','train_trace_063.json','train_trace_188.json']

for traceName in traceNames:
    if traceName[-5:] == '.json':
        trace = Recording(dir_traces + traceName, no_labels=False, mute=True)
        print(trace.labels)
        print(trace.labels.get('board_loc'))

{'board_loc': 0, 'path_idx': 4, 'activities': [1, 2, 1, 0, 1, 0, 2], 'gender': 'm', 'body_height': 174, 'legi': '18-717-827'}
0
{'board_loc': 1, 'path_idx': 1, 'activities': [1, 2, 1], 'gender': 'f', 'body_height': 158, 'legi': '22-912-430'}
1
{'board_loc': 2, 'path_idx': 1, 'activities': [1], 'gender': 'f', 'body_height': 162, 'legi': '19-953-538'}
2


In [6]:
def get_highpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.905384612118461, 0.910092542787947]
    beta = [0.953986986993339, -1.907503180919730, 0.953986986993339]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

# This funciton aims to realize a high-pass filter with cutoff frequency = 5 Hz. Because according to massive amounts of data, the general 
# maximum frequency of human walking is about 5 Hz
def get_lowpass(data):
    filtered_data = np.zeros_like(data)  # filtered_data
    alpla = [1, -1.80898117793047, 0.827224480562408]
    beta = [0.096665967120306, -0.172688631608676, 0.095465967120306]

    for i in range(2, len(data)):
        filtered_data[i] = alpla[0] * (data[i] * beta[0] + data[i-1] * beta[1] + data[i-2] * beta[2] - filtered_data[i-1] * alpla[1] - filtered_data[i-2] * alpla[2])
    return filtered_data

In [7]:
def preprocess_and_extract_features(trace, window_size=60, sampling_rate=50):
    """
    Preprocess the data and extract features from the 3D accelerometer, gyroscope, and magnetometer data.

    Args:
    trace (Lilygo.Recording.Recording): Object containing the raw data with accelerometer data stored in lists (e.g. trace.data['ax'])
    window_size (int): The window size in seconds for splitting the data
    sampling_rate (int): The sampling rate of the data in Hz

    Returns:
    pd.DataFrame: A DataFrame with the extracted features and location labels
    """
    # Read data from trace
    # To-Do: filter raw data with implemented function
    ax = get_lowpass(get_highpass(trace.data['ax'].values))
    ay = get_lowpass(get_highpass(trace.data['ay'].values))
    az = get_lowpass(get_highpass(trace.data['az'].values))
    
    '''gx = trace.data['gx'].values
    gy = trace.data['gy'].values
    gz = trace.data['gz'].values
    
    mx = trace.data['mx'].values
    my = trace.data['my'].values
    mz = trace.data['mz'].values'''
    
    # Compute the length of each window in samples
    window_samples = window_size * sampling_rate
    
    # Compute the number of windows in the recording
    num_windows = len(ax) // window_samples

    # Initialize lists for storing extracted features and location labels
    features = []
    loc_labels = []

    # Helper function to compute the magnitude of a vector
    magnitude = lambda vec: np.sqrt(np.sum(vec**2, axis=1))

    for i in range(num_windows):
       
        # Extract the accelerometer, gyroscope, and magnetometer data for the current window
        acc_data = np.array([ax[i*window_samples:(i+1)*window_samples],
                             ay[i*window_samples:(i+1)*window_samples],
                             az[i*window_samples:(i+1)*window_samples]]).T
        '''gyro_data = np.array([gx[i*window_samples:(i+1)*window_samples],
                              gy[i*window_samples:(i+1)*window_samples],
                              gz[i*window_samples:(i+1)*window_samples]]).T
        mag_data = np.array([mx[i*window_samples:(i+1)*window_samples],
                             my[i*window_samples:(i+1)*window_samples],
                             mz[i*window_samples:(i+1)*window_samples]]).T'''
        
        '''figure,ax = plt.subplots(3, 1, figsize=(10, 6))
        ax[0].plot(acc_data[:,0])
        ax[0].set_ylabel('ax')
        ax[1].plot(acc_data[:,1])
        ax[1].set_ylabel('ay')
        ax[2].plot(acc_data[:,2])
        ax[2].set_ylabel('az')'''

        # Compute magnitudes
        acc_magnitude = np.sqrt(np.sum(acc_data**2, axis=1))
        #gyro_magnitude = magnitude(gyro_data)
        #mag_magnitude = magnitude(mag_data)

        # Calculate features (according to https://www.sciencedirect.com/science/article/pii/S1574119211001222)
        '''ax_amp = acc_data[np.argmax(acc_data[:,0]), 0] - acc_data[np.argmin(acc_data[:,0]), 0]
        ay_amp = acc_data[np.argmax(acc_data[:,1]), 1] - acc_data[np.argmin(acc_data[:,1]), 1]
        az_amp = acc_data[np.argmax(acc_data[:,2]), 2] - acc_data[np.argmin(acc_data[:,2]), 2]'''
        
        ax_amp = abs(np.mean(acc_data[:,0]))
        ay_amp = abs(np.mean(acc_data[:,1]))
        az_amp = abs(np.mean(acc_data[:,2]))
        a_amp_list = [ax_amp, ay_amp, az_amp]
        a_amp_list.sort() # Sorting list of numbers in ascending
        #print('ax_amp:',ax_amp,'ay_amp:',ay_amp,'az_amp',az_amp)
        A = a_amp_list[2] #Feature A: the maximum amplitude among all dimensions (represents motion range for location)
        B = a_amp_list[2]/a_amp_list[1] # Feature B and C: ratio of the maximum amplitudes in different axes (represents DoF in movement for location)
        C = a_amp_list[2]/a_amp_list[0]
        #print('A:',A,'B:',B,'C',C)

        # Calculate the energy and entropy of acc_mag in the frequency domain (D and F)
        freq, Pxx = welch(acc_magnitude, fs=sampling_rate) # use of the fast Fourier transform for the estimation of power spectra
        #plt.plot(freq,Pxx)
        
        D = np.max(Pxx) # Feature D: the maximum energy captured by the accelerator
        F = np.sum(Pxx) # Feature F: the overall energy captured by the accelerator
        norm_Pxx = Pxx / F # normalize the power spectrum
        E = entropy(norm_Pxx) # Feature E: normalized information entropy of the discrete FFT component magnitudes

   
        # Append the features to the list
        features.append([A, B, C, D, E, F])
        
        # Calculate the timestamp for the current window as the median of the timestamps (not necessary for location)
        # timestamp = np.median(trace.timestamp[i*window_samples:(i+1)*window_samples]) 

        # Determine the location label for the current window based on the timestamp
        loc_label = trace.labels.get('board_loc')

        # Append the label to the labels list
        loc_labels.append(loc_label)
    #plt.show()

    # Create a DataFrame with the extracted features and location labels
    features_df = pd.DataFrame(features, columns=['A', 'B', 'C', 'D', 'E', 'F'])
    features_df['loc_label'] = loc_labels
    #print(features_df)

    return features_df


In [8]:
traceNames=os.listdir(dir_traces)
#traceNames=['train_trace_092.json','train_trace_063.json','train_trace_188.json']
#traceNames=['train_trace_092.json']

X = pd.DataFrame()
y = pd.DataFrame()
for traceName in traceNames[0:20]:
    if traceName[-5:] == '.json':
        trace = Recording(dir_traces+ traceName, no_labels=False, mute=True)
        
        # Get features of data
        features_df = preprocess_and_extract_features(trace)
        

        # Prepare data for classification
        X_trace = features_df.drop('loc_label', axis=1)
        y_trace = features_df['loc_label']
        X = pd.concat([X, X_trace], axis=0).reset_index(drop=True)
        y = pd.concat([y, y_trace], axis=0).reset_index(drop=True)

        
        
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
print(y_train)



       0
432  2.0
326  1.0
445  2.0
104  2.0
114  2.0
..   ...
71   2.0
106  2.0
270  0.0
435  2.0
102  2.0

[505 rows x 1 columns]


In [9]:
y_train=y_train.reset_index(drop=True)
print(y_train)
y_train=np.array(y_train)

       0
0    2.0
1    1.0
2    2.0
3    2.0
4    2.0
..   ...
500  2.0
501  2.0
502  0.0
503  2.0
504  2.0

[505 rows x 1 columns]


In [10]:
print(type(X_train))

<class 'numpy.ndarray'>


In [14]:
# Check the number of classes in y_train
'''if len(set(y_train)) < 2:
    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))
'''
# Train the SVM classifier with RBF kernel
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
grid.fit(X_train,np.ravel(y_train))
# Predict the location
y_pred = grid.predict(X_test)

# Evaluate the classifier
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.0s
[CV] END ..................................C=0.1

In [16]:
# Check the number of classes in y_train
if len(set(np.ravel(y_train))) < 2:
    raise ValueError("The number of classes has to be greater than one; got %d class" % len(set(y_train)))

# Train the SVM classifier with RBF kernel
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
grid.fit(X_train, np.ravel(y_train))

# Predict the location
y_pred = grid.predict(X_test)

# Evaluate the classifier
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.0s
[CV] END ..................................C=0.1

In [17]:
traceNames=os.listdir(dir_traces)
#traceNames=['train_trace_092.json','train_trace_063.json','train_trace_188.json']
#traceNames=['train_trace_092.json']

for traceName in traceNames[0:20]:
    if traceName[-5:] == '.json':
        trace = Recording(dir_traces + traceName, no_labels=False, mute=True)
        
        # Get features of data
        features_df = preprocess_and_extract_features(trace)
        

'''        # Prepare data for classification
        X = features_df.drop('label', axis=1)
        y = features_df['label']

        # Normalize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

        # Train the SVM classifier with RBF kernel
        param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
        grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit=True, verbose=2)
        grid.fit(X_train, y_train)

        # Predict the location
        y_pred = grid.predict(X_test)

# Evaluate the classifier
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))'''


KeyboardInterrupt: 