In [101]:
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation as R
from scipy.fft import fft, fftfreq

In [102]:
# read cata from csv files
# data_floor1 = pd.read_csv('raw_data/OnTable.csv')
# data_floor2 = pd.read_csv('raw_data/onIpad.csv')
# data_floor3_1 = pd.read_csv('raw_data/side1.csv')
# data_floor3_2 = pd.read_csv('raw_data/record.csv')

data = pd.read_csv('raw_data/handheld1.csv')

In [103]:
def rotate_accelerometer_to_world_frame(sensor_df):
    acc_data = sensor_df[['acc_x', 'acc_y', 'acc_z']].copy()
    acc_gravity = sensor_df[['acc_gx', 'acc_gy', 'acc_gz']].copy()
    rotation_rate = sensor_df[['gyro_x', 'gyro_y', 'gyro_z']].copy()

    columns = ['acc_x', 'acc_y', 'acc_z', 'acc_gx', 'acc_gy', 'acc_gz', 'mean_magnitude']
    result_df = pd.DataFrame(columns=columns)
    for acc, acc_g, gyro in zip(acc_data.itertuples(index=False), acc_gravity.itertuples(index=False), rotation_rate.itertuples(index=False)):
        roll = gyro.gyro_x
        pitch = gyro.gyro_y
        yaw = gyro.gyro_z
        r = R.from_euler('zyx', [yaw, pitch, roll], degrees=False)

        acc_world = r.apply([acc.acc_x, acc.acc_y, acc.acc_z])
        gravity_world = r.apply([acc_g.acc_gx, acc_g.acc_gy, acc_g.acc_gz])
        mean_magnitude = np.linalg.norm(acc_world)

        result = pd.DataFrame([list(acc_world) + list(gravity_world) + [mean_magnitude]], columns=columns)
        result_df = pd.concat([result_df, result], ignore_index=True)

    return result_df


In [104]:
def compute_frequency_domain(signal, interval):
    N = len(signal)
    fs = 1000 / interval
    frequencies = fftfreq(N, d=1/fs)
    fft_values = np.abs(fft(signal))
    
    pos_mask = frequencies > 0
    frequencies = frequencies[pos_mask]
    fft_values = fft_values[pos_mask]
    
    power = fft_values**2
    return np.sum(frequencies * power) / np.sum(power), frequencies[np.argmax(fft_values)]
    

## Preprocessing function

In [105]:
def csv_preprocess(data, window_size=4000, data_interval=500):
    features = [
        'avg_acc_x', 'median_acc_x', 'std_acc_x', 'min_x', 'max_x', 'mean_abs_x',
        'avg_acc_y', 'median_acc_y', 'std_acc_y', 'min_y', 'max_y', 'mean_abs_y',
        'avg_acc_z', 'median_acc_z', 'std_acc_z', 'min_z', 'max_z', 'mean_abs_z',
        'avg_acc_gx', 'avg_acc_gy', 'avg_acc_gz',
        'gyro_z_mean', 'gyro_z_std', 'gyro_z_max', 'gyro_z_min',
        'mean_magnitude', 'signal_magnitude_area',
        'mean_freq_x', 'dominant_freq_x',
        'mean_freq_y', 'dominant_freq_y',
        'mean_freq_z', 'dominant_freq_z',
        'lat_diff', 'lon_diff',
        'label'
    ]
    result_df = pd.DataFrame(columns=features)
    
    # window sliding approach
    n_row = window_size // data_interval
    for window_start in range(0, len(data) - n_row + 1):
        window_end = window_start + n_row
        this_window = data.iloc[window_start:window_end]
        
        rotated_df = rotate_accelerometer_to_world_frame(this_window)
        
        # accelerometer of this window frame
        mean_acc_x = rotated_df['acc_x'].mean()
        median_acc_x = rotated_df['acc_x'].median()
        std_acc_x = rotated_df['acc_x'].std()
        min_acc_x = rotated_df['acc_x'].min()
        max_acc_x = rotated_df['acc_x'].max()
        mean_abs_x = rotated_df['acc_x'].abs().mean()
        # ------------------------------------
        mean_acc_y = rotated_df['acc_y'].mean()
        median_acc_y = rotated_df['acc_y'].median()
        std_acc_y = rotated_df['acc_y'].std()
        min_acc_y = rotated_df['acc_y'].min()
        max_acc_y = rotated_df['acc_y'].max()
        mean_abs_y = rotated_df['acc_y'].abs().mean()
        # ------------------------------------
        mean_acc_z = rotated_df['acc_z'].mean()
        median_acc_z = rotated_df['acc_z'].median()
        std_acc_z = rotated_df['acc_z'].std()
        min_acc_z = rotated_df['acc_z'].min()
        max_acc_z = rotated_df['acc_z'].max()
        mean_abs_z = rotated_df['acc_z'].abs().mean()
        
        # accelerometer (including gravity) of this window frame
        mean_acc_gx = rotated_df['acc_gx'].mean()
        # std_acc_gx = rotated_df['acc_gx'].std()
        # min_acc_gx = rotated_df['acc_gx'].min()
        # max_acc_gx = rotated_df['acc_gx'].max()
        # ------------------------------------
        mean_acc_gy = rotated_df['acc_gy'].mean()
        # std_acc_gy = rotated_df['acc_gy'].std()
        # min_acc_gy = rotated_df['acc_gy'].min()
        # max_acc_gy = rotated_df['acc_gy'].max()
        # ------------------------------------
        mean_acc_gz = rotated_df['acc_gz'].mean()
        # std_acc_gz = rotated_df['acc_gz'].std()
        # min_acc_gz = rotated_df['acc_gz'].min()
        # max_acc_gz = rotated_df['acc_gz'].max()
        # ------------------------------------
        gyro_z = this_window['gyro_z']
        gyro_z_mean = gyro_z.mean()
        gyro_z_std = gyro_z.std()
        gyro_z_max = gyro_z.max()
        gyro_z_min = gyro_z.min()
        
        # other features
        mean_magnitude = rotated_df['mean_magnitude'].mean()
        # ------------------------------------
        signal_magnitude_area = np.sum(np.abs([mean_acc_x, mean_acc_y, mean_acc_z]))
        # ------------------------------------
        mean_freq_x, dominant_freq_x = compute_frequency_domain(rotated_df['acc_x'], data_interval)
        mean_freq_y, dominant_freq_y = compute_frequency_domain(rotated_df['acc_y'], data_interval)
        mean_freq_z, dominant_freq_z = compute_frequency_domain(rotated_df['acc_z'], data_interval)
        # ------------------------------------
        lats = list(this_window['gps_lat'])
        lons = list(this_window['gps_lon'])
        diff_lat = lats[0] - lats[len(lats) - 1]
        diff_lon = lons[0] - lons[len(lons) - 1]
        
        label_map = {'Halt': 0, 'Forward': 1, 'Turn': 2}
        label = label_map.get(this_window['action'].mode()[0], -1)

        row = pd.DataFrame([[mean_acc_x, median_acc_x, std_acc_x, min_acc_x, max_acc_x, mean_abs_x, 
                             mean_acc_y, median_acc_y, std_acc_y, min_acc_y, max_acc_y, mean_abs_y, 
                             mean_acc_z, median_acc_z, std_acc_z, min_acc_z, max_acc_z, mean_abs_z, 
                             mean_acc_gx, mean_acc_gy, mean_acc_gz, 
                             gyro_z_mean, gyro_z_std, gyro_z_max, gyro_z_min,
                             mean_magnitude, signal_magnitude_area, 
                             mean_freq_x, dominant_freq_x, 
                             mean_freq_y, dominant_freq_y, 
                             mean_freq_z, dominant_freq_z, 
                             diff_lat, diff_lon, 
                             label]], columns=features)
        result_df = pd.concat([result_df, row], ignore_index=True)
        
    return result_df


## Model Evaluation

In [106]:
from sklearn.model_selection import  cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [107]:
# preprocessed_data = csv_preprocess(data_floor1)
# preprocessed_data = pd.concat([preprocessed_data, csv_preprocess(data_floor2)], ignore_index=True)
# preprocessed_data = pd.concat([preprocessed_data, csv_preprocess(data_floor3_1)], ignore_index=True)
# preprocessed_data = pd.concat([preprocessed_data, csv_preprocess(data_floor3_2)], ignore_index=True)

preprocessed_data = csv_preprocess(data)

  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, row], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result], ignore_index=True)
  result_df = pd.concat([result_df, result]

In [108]:
X = preprocessed_data.drop('label', axis=1)
y = preprocessed_data['label'].astype(int)

In [109]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
scoring = ['accuracy', 'f1_macro', 'f1_weighted']

### Decision Tree Classifier

In [110]:
model = DecisionTreeClassifier(class_weight='balanced', random_state=69)

In [111]:
cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

print("Accuracy scores:", cv_results['test_accuracy'])
print("Mean accuracy:", cv_results['test_accuracy'].mean())

print("F1 macro scores:", cv_results['test_f1_macro'])
print("Mean F1 macro:", cv_results['test_f1_macro'].mean())

Accuracy scores: [0.9637883  0.95543175 0.94707521 0.95543175 0.97214485]
Mean accuracy: 0.9587743732590528
F1 macro scores: [0.9641121  0.95577126 0.94693315 0.95583629 0.97232226]
Mean F1 macro: 0.958995014082553


### Random Forest Classifier

In [112]:
model = RandomForestClassifier(class_weight='balanced', random_state=69)

In [113]:
cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

print("Accuracy scores:", cv_results['test_accuracy'])
print("Mean accuracy:", cv_results['test_accuracy'].mean())

print("F1 macro scores:", cv_results['test_f1_macro'])
print("Mean F1 macro:", cv_results['test_f1_macro'].mean())

Accuracy scores: [0.98050139 0.97771588 0.96657382 0.97771588 0.98607242]
Mean accuracy: 0.9777158774373259
F1 macro scores: [0.98072048 0.97789638 0.96660192 0.97760698 0.9860932 ]
Mean F1 macro: 0.9777837928138101


### LightGBM

In [114]:
model = LGBMClassifier(class_weight='balanced', random_state=69)

In [115]:
cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7869
[LightGBM] [Info] Number of data points in the train set: 1436, number of used features: 35
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7851
[LightGBM] [Info] Number of data points in the train set: 1436, number of used features: 35
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 secon

In [116]:
print("Accuracy scores:", cv_results['test_accuracy'])
print("Mean accuracy:", cv_results['test_accuracy'].mean())

print("F1 macro scores:", cv_results['test_f1_macro'])
print("Mean F1 macro:", cv_results['test_f1_macro'].mean())

Accuracy scores: [0.97771588 0.97771588 0.97771588 0.98050139 0.98607242]
Mean accuracy: 0.9799442896935933
F1 macro scores: [0.97789276 0.9776855  0.97777243 0.98030923 0.98610248]
Mean F1 macro: 0.9799524789773468


## Model Training
We decided to go with LightGBM model since it has high accuracy score and F1 score.

In [117]:
import pickle

In [118]:
model = LGBMClassifier(class_weight='balanced', random_state=69)
model.fit(X, y)

file_path = 'Models/lightGBM-model_v1.pkl'
with open(file_path, 'wb') as f:
    pickle.dump(model, f)
    
print(f'The model has been saved to: { file_path }')
    

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7889
[LightGBM] [Info] Number of data points in the train set: 1795, number of used features: 35
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
The model has been saved to: Models/lightGBM-model_v1.pkl
