In [1]:
import pandas as pd
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact
import plotly.graph_objects as go
import ruptures as rpt
from itertools import combinations as comb
from statsmodels.stats import power
import numpy as np

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.spatial.distance import cdist
import colorsys
import warnings
warnings.filterwarnings('ignore')

In [2]:
cpd_model = rpt.Binseg(model = 'l2',jump=500, min_size=4500)
def change_point_detection(df, throw_away = 0.1):
    """
    Changepoint Detection
    Input: - df:  from read_data
           - key: which is the filename
           - throw_away: everything smaller than 10% of the total length of data is thrown away

    Returns: - return_dict: A dictionary with all the data and the format key_cnt
    """
    signal = df[['accelerometer_x','accelerometer_y','accelerometer_z']].values
    length = len(df)
    algo = cpd_model.fit(signal)
    result = [0]
    result += algo.predict(pen=1000)
    if result[-1] != length:
        result += [length]
        
    return_df = pd.DataFrame()
    for i in range(len(result)-1):
        if result[i+1]-result[i] > length*throw_away:
            if return_df.empty:
                return_df = df.iloc[result[i]:result[i+1]]
            else:
                pd.concat([return_df, df.iloc[result[i]:result[i+1]]],ignore_index=True)
            
    """
    This is the seperated version, so we dont have windows that belong to different changepoint sections
    return_dict = {}
    cnt = 1
    for i in range(len(result)-1):
        if result[i+1]-result[i] > length*throw_away:
            return_dict[key+'_'+str(cnt)] = df.iloc[result[i]:result[i+1]]
            cnt+=1
    """
    return return_df

In [3]:
folder_path = 'data'
def read_data(filename):
    accelerometer = pd.read_csv(os.path.join(folder_path, filename, 'Accelerometer.csv'),sep=';')
    accelerometer['Time (s)'] = pd.to_datetime(accelerometer['Time (s)'], unit='s')
    accelerometer = accelerometer.set_index('Time (s)')
    accelerometer = accelerometer.resample('2.5ms').mean()
    accelerometer.rename(columns={'Acceleration x (m/s^2)':'accelerometer_x','Acceleration y (m/s^2)':'accelerometer_y','Acceleration z (m/s^2)':'accelerometer_z'}, inplace=True)
    #accelerometer =accelerometer[9000:-9000]
    accelerometer.reset_index(inplace=True)

    gyroscope = pd.read_csv(os.path.join(folder_path, filename,'Gyroscope.csv'),sep=';')
    gyroscope['Time (s)'] = pd.to_datetime(gyroscope['Time (s)'], unit='s')
    gyroscope =gyroscope.set_index('Time (s)')
    gyroscope = gyroscope.resample('2.5ms').mean()
    gyroscope.rename(columns={'Gyroscope x (rad/s)':'gyroscope_x','Gyroscope y (rad/s)':'gyroscope_y','Gyroscope z (rad/s)':'gyroscope_z'}, inplace=True)
    #gyroscope = gyroscope[9000:-9000]
    gyroscope.reset_index(inplace=True)
    merged = pd.merge(accelerometer,gyroscope, on= 'Time (s)', how='inner')
    return change_point_detection(merged)

In [4]:
# regex to delete the number at the end of the string:
def delete_number(string):
    return ''.join([i for i in string if not i.isdigit()])

In [18]:
data_dict = {}
name_to_idx = {} # name -> idx
i = 0

for name in os.listdir(folder_path):
    data_dict[name] = read_data(name)
    
    # add label
    if delete_number(name) not in name_to_idx.keys():
        name_to_idx[delete_number(name)] = i
        i+=1        

In [6]:
verification_keys = ["nick2", "till2", "uta2", "paula2"]

In [7]:
name_to_idx

{'chris': 0,
 'felix': 1,
 'katarina': 2,
 'leon': 3,
 'lucas': 4,
 'luisa': 5,
 'nele': 6,
 'nick': 7,
 'paula': 8,
 'rebecca': 9,
 'till': 10,
 'uta': 11}

In [19]:
def create_sliding_windows(data, window_length, stride, label):
    labels = np.ones(data.shape[0])*label
    num_windows = (len(data) - window_length) // stride + 1

    # Create sliding windows using numpy stride tricks
    windows = np.lib.stride_tricks.sliding_window_view(data, (window_length, data.shape[1]))
    windows = windows[::stride, 0, :, :]
    y_windows = labels[np.arange(0, num_windows * stride, stride)]  # Class labels at the start of each window

    return windows, y_windows

window_length = 100
stride = 100

X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []


for name, df in data_dict.items():
    if name in verification_keys:
        continue
    label = name_to_idx[delete_number(name)]
    train = df[:int(0.8*len(df))]
    # add normalization for training data
    train_normalized = (train - train.min()) / (train.max()-train.min())

    test = df[int(0.8*len(df)):]

    test_normalized = (test - train.min()) / (train.max()-train.min())   # normalize test data with training data mean and std


    X_train, y_train = create_sliding_windows(train, window_length, stride, label)
    X_test, y_test = create_sliding_windows(test, window_length, stride, label)
    
    X_train_list.append(X_train)
    y_train_list.append(y_train)
    X_test_list.append(X_test)
    y_test_list.append(y_test)

X_train = np.concatenate(X_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
X_test = np.concatenate(X_test_list, axis=0)
y_test = np.concatenate(y_test_list, axis=0)

In [20]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16072, 100, 6), (16072,), (4017, 100, 6), (4017,))

In [21]:
n_samples, window_length, n_features = X_train.shape
X_train_reshaped = X_train.reshape(n_samples, window_length * n_features)
X_test_reshaped = X_test.reshape(X_test.shape[0], window_length * n_features)

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train_reshaped, y_train)

In [22]:
y_pred = model.predict(X_test_reshaped)
y_pred_proba = model.predict_proba(X_test_reshaped)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("X_train shape:", X_train_reshaped.shape)
print("X_test shape:", X_test_reshaped.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Accuracy: 0.9843166542195668
X_train shape: (16072, 600)
X_test shape: (4017, 600)
y_train shape: (16072,)
y_test shape: (4017,)


In [23]:
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       282
         1.0       1.00      0.96      0.98       643
         2.0       0.98      0.99      0.99       167
         3.0       0.98      1.00      0.99       263
         4.0       0.99      1.00      0.99       336
         5.0       1.00      0.88      0.93        56
         6.0       0.99      0.99      0.99       415
         7.0       0.96      1.00      0.98       872
         8.0       1.00      0.54      0.70        28
         9.0       1.00      1.00      1.00       246
        10.0       0.99      0.99      0.99       282
        11.0       0.99      0.99      0.99       427

    accuracy                           0.98      4017
   macro avg       0.99      0.94      0.96      4017
weighted avg       0.98      0.98      0.98      4017



In [24]:
y_pred_proba.min(), y_pred_proba.max()

(4.302113e-08, 0.9999789)

# Verification with all second runs

In [31]:
X_verification_list = []
y_verification_list = []
for name, df in data_dict.items():
    if name in verification_keys:
        label = name_to_idx[delete_number(name)]
        try:
            df.drop(columns=['Time (s)','label'], inplace=True)
        except:
            pass
        train = df[:int(0.8*len(df))]
        train_normalized = (train - train.min()) / (train.max()-train.min())
        test = df[int(0.8*len(df)):]
        test_normalized = (test - train.min()) / (train.max()-train.min()) 
        X_ver, y_ver = create_sliding_windows(df, window_length, stride, label)
        X_verification_list.append(X_ver)
        y_verification_list.append(y_ver)
X_verification = np.concatenate(X_verification_list, axis=0)
y_verification = np.concatenate(y_verification_list, axis=0)

In [32]:
n_samples, window_length, n_features = X_verification.shape
X_verification = X_verification.reshape(n_samples, window_length * n_features)

In [33]:
y_pred_ver = model.predict(X_verification)
y_pred_proba_ver = model.predict_proba(X_verification)
accuracy = accuracy_score(y_pred_ver, y_verification)
accuracy

0.834698275862069

In [34]:
report = classification_report(y_verification, y_pred_ver)

print(report)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00         0
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0
         6.0       0.00      0.00      0.00         0
         7.0       0.91      0.97      0.94      2280
         8.0       1.00      0.91      0.95       140
         9.0       0.00      0.00      0.00         0
        10.0       0.96      0.88      0.92      1705
        11.0       0.69      0.06      0.10       515

    accuracy                           0.83      4640
   macro avg       0.30      0.23      0.24      4640
weighted avg       0.90      0.83      0.84      4640



Verification with Till2

In [20]:
y_pred_ver = model.predict(X_verification)
y_pred_proba_ver = model.predict_proba(X_verification)
accuracy = accuracy_score(y_pred_ver, y_verification)

In [21]:
y_pred_proba_ver.min(), y_pred_proba_ver.max()

(9.011673e-08, 0.99984455)

In [22]:
accuracy

0.5899757120777214