# Data Construction

The single `.csv` file used in the machine learning models for Case Study 2 is created using this file.

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## Regression Dataset

The original data can be found [here](https://zenodo.org/records/8415066). Please download this data locally, then set the `data_dir` variable to point to the directory the data is contained in.

In [10]:
data_dir = "raw_data/"  # DATA/WSD4FEDSRM/

In [11]:
data = pd.read_csv(data_dir + "Borg data/borg_data.csv")

# Drop all columns of time greater than 90 sec
data_temp = data.copy()
for col in data_temp.columns:
    if '_sec' in str(col) and int(col.split('_')[0]) > 90 or 'end_' in str(col):
        data = data.drop(col,axis=1)

# Remove all rows containing external rotation task
data = data[~data['task_order'].str.endswith('e')]

data['subject'] = data['subject'].ffill()  # Replace NA subjects with last seen

# Remove all rows of subjects who did not complete full 90 seconds
data = data.dropna()

# Pivot time columns into one column
fatigue_cols = ['before_task', '10_sec', '20_sec', '30_sec', '40_sec', '50_sec', '60_sec', '70_sec', '80_sec', '90_sec']
data = pd.melt(data, id_vars=['subject', 'task_order'], value_vars=fatigue_cols, var_name='time', value_name='fatigue')

# Make time number instead of string
data['time'] = data['time'].str.replace('_sec', '', regex=False)
data['time'] = data['time'].replace('before_task', 0.01)
data['time'] = data['time'].astype(float)

In [12]:
# Load dataframe of demographic data
demo_anthro = pd.read_csv(data_dir + "/Demographic and antropometric data/demographic&anthro.csv")
demo_anthro = demo_anthro[['subject', 'age', 'height(cm)', 'dominant_hand', 'sex', 'BMI(kg/m2)']]

data = data.merge(demo_anthro, on='subject').sort_values(['subject', 'task_order', 'time'])
data.to_csv("final_data/data.csv", index=False)

In [5]:
sensor_data_dir = data_dir + "EMG, IMU, and PPG data/"

# All directory lists created
sensor_dir_list = list(filter(lambda x: x.split()[1] == 'internal', list(os.listdir(sensor_data_dir))))
subject_list = [s.split('_')[0].title() + " " + s.split('_')[1] for s in data['subject'].unique()]
sensors = [s.lower() for s in list(os.listdir(sensor_data_dir + sensor_dir_list[0] + '/' + subject_list[0] + '/IMU data')) if s != 'Pelvis']
sensor_data_types = ['acc', 'gyr']

# Dictionary that contains subjects as keys and list of tasks completed as value
subject_tasks_dict = dict(zip(list(data['subject'].sort_values().unique()), list(data.groupby('subject')['task_order'].unique())))



# Create dataframe to store sensor data
sensor_df_cols = ['subject', 'task_order', 'sensor', 'type', 'time', 'x', 'y', 'z']
sensor_df = pd.DataFrame(columns=sensor_df_cols)

# Nightmare fuel
for subject in tqdm(subject_list, desc="Subjects"):
    sensor_dir_list_counter = 0
    for d in sensor_dir_list:
        sensor_dir_list_counter += 1
        if f"task{sensor_dir_list_counter}_{d[0]}5i" in subject_tasks_dict[subject.lower().replace(' ', '_')]:
            for sensor in sensors:
                for t in sensor_data_types:
                    # CSV directory created, csv read using this dir
                    csv_dir = sensor_data_dir + d + '/' + subject + '/IMU data/' + sensor + '/' + t + '_' + sensor.replace(' ', '_') + '.csv'
                    csv_df = pd.read_csv(csv_dir)

                    # Get 
                    length = 10
                    x, y, z = csv_df.iloc[::1000, 0].head(length), csv_df.iloc[::1000, 1].head(length), csv_df.iloc[::1000, 2].head(length)

                    # Create lists to add to row section to be added
                    task_order = [f"task{sensor_dir_list_counter}_{d[0]}5i"] * length  # Task list
                    subject_lst = [subject.lower().replace(' ', '_')] * length  # Subject list
                    sensor_lst = [sensor] * length  # Sensor list
                    t_lst = [t] * length  # Type list
                    time_lst = [0.01] + list(range(10, 100, 10))  # Time list
                    new_row = pd.DataFrame({
                        'subject': subject_lst, 'task_order': task_order, 'sensor': sensor_lst, 'type': t_lst, 'time':time_lst,
                        'x': x, 'y': y, 'z': z
                    })

                    # Add row to dataframe
                    sensor_df = pd.concat([sensor_df, new_row], ignore_index=True)

sensor_df.to_csv("final_data/sensor_data.csv", index=False)

  sensor_df = pd.concat([sensor_df, new_row], ignore_index=True)
Subjects: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]


In [6]:
# Merge data with sensor data
data = data.merge(sensor_df, on=['subject', 'task_order', 'time'])

In [7]:
# Fancy code to turn gyr/acc column into separate x/y/z columns
data = data.pivot_table(
    index=['subject', 'task_order', 'time', 'fatigue', 'age', 'height(cm)', 'dominant_hand', 'sex', 'BMI(kg/m2)', 'sensor'], 
    columns='type', values=['x', 'y', 'z']).reset_index()
data.columns = ['_'.join(col).strip() if col[1] else col[0] for col in data.columns.values]

# Convert sex to 0/1 format
data['sex'] = data['sex'].replace(to_replace=['male', 'female'], value=[0, 1])
data['dominant_hand'] = data['dominant_hand'].replace(to_replace=['left', 'right'], value=[0, 1])

  data['sex'] = data['sex'].replace(to_replace=['male', 'female'], value=[0, 1])


In [8]:
# Save data to CSV
data.to_csv('final_data/final_data.csv', index=False)

## Classification Dataset

This reformats the data so that the response is in binary format (fatigued/not fatigued). It also retains only the first and last 20% of each subject under each task.

Note: Needs the regression dataset first to be created.

In [20]:
data = pd.read_csv("final_data/final_data.csv")  # Load regression dataset

In [21]:
def get_quantile_rows(group):
    n = len(group)
    first = group.iloc[:int(n * 0.2)]
    last = group.iloc[int(n * 0.8):]

    # Ensures that subject's final fatigue exceeds 14, so that
    #   there is theoretically sufficent difference between the 
    #   non-fatigued and fatigued data.
    if max(last['fatigue']) > (20 - 6) / 2:
        first['fatigue'] = 0
        last['fatigue'] = 1

        return pd.concat([first, last])

g = data.groupby(['subject', 'task_order'])
result_df = g.apply(get_quantile_rows).reset_index(drop=True)

  result_df = g.apply(get_quantile_rows).reset_index(drop=True)


In [22]:
result_df.to_csv("final_data/final_data_classif.csv", index=False)  # Save data