In [1]:
import copy
import os 
import glob
import numpy as np
import pandas as pd
from lib import feature_extraction as fe
from lib import models

# Data Loader

- Selecting and loading the required data instances
- Loading all data from the LiftingAssessment Task

In [2]:
# Root folder that holds the recorded data, relative to the working directory
data_path = os.path.join(os.getcwd(), "data")
# Folder of the lifting-assessment task (Task-2)
weight_lifting = os.path.join(data_path, "LiftingAssessment")
# Paths (relative to the task folder) of every ".csv" file found below it
all_parsed_files = glob.glob("**/*.csv", root_dir=weight_lifting, recursive=True)

# Map: absolute file path -> {"daq_time": int, "df": DataFrame}
loaded_data = {}
for relative_path in all_parsed_files:
    full_path = os.path.join(weight_lifting, relative_path)

    # The first line of each file carries the DAQ time as its last
    # space-separated token; the final two characters of that token are
    # dropped before the integer conversion.
    with open(full_path, "r") as file_handle:
        header_tokens = file_handle.readline().split(" ")
    daq_time = int(header_tokens[-1][0:-2])

    # Everything after the first line is a regular CSV table with a header row
    loaded_data[full_path] = {
        "daq_time": daq_time,
        "df": pd.read_csv(full_path, header="infer", skiprows=1),
    }


In [3]:
# Report how many recordings were loaded into memory
n_loaded_files = len(loaded_data)
print(f"Total number of files loaded - {n_loaded_files}")

Total number of files loaded - 600


In [4]:
# One group per (box type, weight level) pair, keyed as "<box>-<weight>"
box_types = ["Crate", "CardboardBox"]
weight_levels = ["W2", "W5", "W10", "W15", "W30"]
labelled_data = {
    f"{box}-{weight}": []
    for box in box_types
    for weight in weight_levels
}

# The box type and the weight level are read from the directory names that sit
# four and three levels from the end of each file path.
for file_id in loaded_data:
    path_parts = file_id.split(os.sep)
    labelled_data[path_parts[-4] + "-" + path_parts[-3]].append(file_id)
    

In [5]:
# Show how many files were assigned to every class
for class_instance, class_files in labelled_data.items():
    print(f"For the class - {class_instance}, total number of items are {len(class_files)}")

For the class - Crate-W2, total number of items are 60
For the class - Crate-W5, total number of items are 60
For the class - Crate-W10, total number of items are 60
For the class - Crate-W15, total number of items are 60
For the class - Crate-W30, total number of items are 60
For the class - CardboardBox-W2, total number of items are 60
For the class - CardboardBox-W5, total number of items are 60
For the class - CardboardBox-W10, total number of items are 60
For the class - CardboardBox-W15, total number of items are 60
For the class - CardboardBox-W30, total number of items are 60


In [6]:
# Per-file sampling-rate estimates, grouped by the sentinel that recorded the file
sentinels_samplingRate = {"DAQSentinel01": [],
                          "DAQSentinel02": [],
                          "DAQSentinel03": []}
# Mean sampling rate of every sentinel, rounded to two decimals
sampling_rates = {}
for file_path, data in loaded_data.items():
    # The file name starts with the sentinel name followed by "_".
    # os.path.basename is used (instead of splitting on "/") so the lookup also
    # works where the path separator is not "/", matching the os.sep-based
    # parsing used by the other cells.
    sentinel = os.path.basename(file_path).split("_")[0]

    # Samples per unit of DAQ time (presumably seconds, giving Hz -- verify
    # against the file header format)
    total_time = data["daq_time"]
    samples = data["df"].shape[0]
    sentinels_samplingRate[sentinel].append(samples / total_time)

for sentinel, rate_estimates in sentinels_samplingRate.items():
    # Compute the statistics once and reuse them for both the report and the
    # stored value
    mean_rate = round(np.mean(rate_estimates), 2)
    std_rate = round(np.std(rate_estimates), 2)
    print("Sampling Rate for " + sentinel + " with mean " + str(mean_rate) +
          " and std of " + str(std_rate))

    # Get the mean sampling rate
    sampling_rates[sentinel] = mean_rate

Sampling Rate for DAQSentinel01 with mean 396.8 and std of 13.25
Sampling Rate for DAQSentinel02 with mean 388.31 and std of 12.9
Sampling Rate for DAQSentinel03 with mean 390.16 and std of 13.0


In [7]:
class_combined_dfs = {}
sentinels = ["DAQSentinel01", "DAQSentinel02", "DAQSentinel03"]

# Amount of signal discarded at the start and at the end of every recording,
# expressed in the time unit the sampling rates are measured per
TRIM_DURATION = 4

# Group dataframes together: class -> sentinel -> one combined dataframe
for class_instance, class_files in labelled_data.items():
    # Trimmed recordings of this class, grouped by the sentinel that produced them
    frames_by_sentinel = {}

    for file_id in class_files:
        # The file name starts with the sentinel name followed by "_"
        sentinel = file_id.split(os.sep)[-1].split("_")[0]

        # Remove the starting and ending data instances. The slice is not
        # copied here; pd.concat below builds the new combined frame.
        recording = loaded_data[file_id]["df"]
        trim = TRIM_DURATION * sampling_rates[sentinel]
        trimmed = recording.iloc[int(trim):int(recording.shape[0] - trim)]
        frames_by_sentinel.setdefault(sentinel, []).append(trimmed)

    # Concatenate once per sentinel instead of re-concatenating the growing
    # frame after every file.
    class_combined_dfs[class_instance] = {
        sentinel_name: pd.concat(frames, ignore_index=True)
        for sentinel_name, frames in frames_by_sentinel.items()
    }

    # Assert at the end of every class
    for frames in frames_by_sentinel.values():
        assert len(frames) == 20, "Each sentinel should add upto 20 counts for two individuals"


# Segmentation

- 1 second segments with 75% overlap between consecutive segments (i.e. a 250 ms step between segment starts)

In [8]:
# To ensure order
# These columns are selected, in this order, from every combined dataframe
# before it is converted to a numpy array and segmented.
data_cols_considered = ["acc-X", "acc-Y", "acc-Z", "gyr-X", "gyr-Y", "gyr-Z"]

In [9]:
def segment_data(data_array: np.ndarray, segment_window: float, overlap: float, sampling_rate: float) -> np.ndarray:
    """Cut a signal into fixed-length, overlapping windows along its first axis.

    Parameters
    ----------
    data_array : np.ndarray
        Samples ordered along axis 0; trailing axes are carried over unchanged.
    segment_window : float
        Window duration, in the time unit ``sampling_rate`` is expressed per.
    overlap : float
        Fraction of a window shared with the following window
        (``0`` = back-to-back windows; must be below ``1``).
    sampling_rate : float
        Number of samples per unit of time.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_segments, window_size, *data_array.shape[1:])``.
        Only complete windows are returned; ``n_segments`` is 0 when the
        signal is shorter than one window.

    Raises
    ------
    ValueError
        If the window length or the step between windows is not at least
        one sample.
    """
    window_size = int(segment_window * sampling_rate)
    if window_size <= 0:
        raise ValueError("segment_window * sampling_rate must be at least one sample")

    step = int(window_size * (1 - overlap))
    if step <= 0:
        raise ValueError("overlap is too large: the step between windows must be at least one sample")

    n_samples = data_array.shape[0]
    if n_samples < window_size:
        # Keep the window/channel axes so callers can still inspect the shape
        return np.empty((0, window_size) + data_array.shape[1:], dtype=data_array.dtype)

    # "+ 1" keeps the last window when it ends exactly on the final sample
    starting_points = range(0, n_samples - window_size + 1, step)
    return np.stack([data_array[start:start + window_size, ...] for start in starting_points])
    

In [10]:
# Segment the data: class -> sentinel -> array of windows.
# Every combined dataframe is reduced to the selected columns and cut with a
# window length of 1.0 and an overlap fraction of 0.75, using the mean
# sampling rate of its sentinel.
sentinel_segmented_data = {
    class_instance: {
        sentinel: segment_data(
            sentinel_df[data_cols_considered].to_numpy(),
            1.0,
            0.75,
            sampling_rates[sentinel],
        )
        for sentinel, sentinel_df in sentinel_dfs.items()
    }
    for class_instance, sentinel_dfs in class_combined_dfs.items()
}


# Feature Extraction

Extracting key features from the accelerometer and gyroscope data

## Time-frequency features

The following groups of features are extracted, for a total of 17 features per signal channel:

- Time domain
- Frequency domain
- Time-frequency domain

In [11]:
# class -> sentinel -> array with one row of features per segment
features_extracted_data = {}
for class_instance, segments_by_sentinel in sentinel_segmented_data.items():
    features_extracted_data[class_instance] = {}
    for sentinel, segments in segments_by_sentinel.items():
        # Select arguments based on sentinel (only the last entry depends on
        # it, through the sampling rate). How the three entries map onto
        # individual features is defined by lib.feature_extraction -- TODO confirm.
        freq_args = [{"axis": 0}, {"axis": 0}, {"axis": 0, "nperseg": 200, "noverlap": 100, "fs": sampling_rates[sentinel]}]
        freq_time_args = [{"wavelet": "db1"}, {"wavelet": "db1"}, {"wavelet": "db1"}]

        # Collect one feature vector per segment and stack once at the end,
        # instead of re-allocating the whole array with np.append per segment.
        segment_features = []
        for segment in segments:
            channel_features = []
            for channel in range(segments.shape[-1]):
                # Features of every channel are laid out one after another
                channel_features += fe.compute_all_features(segment[:, channel], freq_args=freq_args, freq_time_args=freq_time_args)
            segment_features.append(np.array(channel_features).T)

        # As before, a sentinel without any segment gets no entry
        if segment_features:
            features_extracted_data[class_instance][sentinel] = np.stack(segment_features, axis=0)



# Model Development

- Choose among the 10 available models
- Set the parameters appropriately
- Train the model and get the metrics


In [12]:
# Keyword arguments for every model, keyed by the model name
model_params = dict(
    LogisticRegression=dict(class_weight="balanced", max_iter=5000, n_jobs=4),
    DecisionTreeClassifier=dict(min_samples_split=100),
    KNeighborsClassifier=dict(n_neighbors=10),
    SVC=dict(kernel="rbf", tol=1e-7),
    BaggingClassifier=dict(n_estimators=50),
    RandomForestClassifier=dict(n_estimators=100, min_samples_split=100, class_weight="balanced"),
)

# NIOSH labels
# Binary target per class: the W15 and W30 weight levels map to 1, every other
# weight level maps to 0, for both box types.
labels = {
    f"{box}-{weight}": int(weight in ("W15", "W30"))
    for box in ("Crate", "CardboardBox")
    for weight in ("W2", "W5", "W10", "W15", "W30")
}

## Individual Sentinels

Model development by considering one Sentinel at a time


Choose the sentinel for model training

In [13]:
# Name of the single sentinel whose extracted features are used to build the
# training set in the next cell; it must be a key of the per-class feature
# dictionaries.
sentinel = "DAQSentinel02"

In [14]:
# Construct training data and labels for the selected sentinel.
# Per-class blocks are collected first and concatenated once, which avoids
# re-copying the growing arrays with np.append on every iteration.
feature_blocks = []
label_blocks = []
for class_instance, features_by_sentinel in features_extracted_data.items():
    # Select sentinel
    class_features = features_by_sentinel[sentinel]
    feature_blocks.append(class_features)
    # One label per feature row of this class
    label_blocks.append(np.array([labels[class_instance]] * class_features.shape[0]))

X_train = np.concatenate(feature_blocks, axis=0)
y_train = np.concatenate(label_blocks, axis=0)

# Print results
print(f"Shape of X-train is {X_train.shape}")
print(f"Shape of y-train is {y_train.shape}")


Shape of X-train is (3611, 102)
Shape of y-train is (3611,)


In [15]:
# Fresh model container, populated from the per-model parameter dictionary
models_repo = models.Models()
models_repo.create_models(model_params)

# 10-fold CV
cv_results_summary = models_repo.train_models_cvfolds(
    X_train,
    y_train,
    summarize_results=True,
    standardize=True,
)

Fold - 9

In [16]:
# Models to show, in the column order of the combined table
model_association = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC",
    "BaggingClassifier",
    "RandomForestClassifier"
]

# Work on a copy so the column relabelling below leaves cv_results_summary untouched
temp = copy.deepcopy(cv_results_summary)

# Prefix every model's columns with the model name (two-level column index),
# then place all per-model tables side by side.
labelled_frames = []
for model_name in model_association:
    model_frame = temp[model_name]
    model_frame.columns = pd.MultiIndex.from_product([[model_name], model_frame.columns])
    labelled_frames.append(model_frame)

combined_cv_results = pd.concat(labelled_frames, axis=1)

In [17]:
# Display the combined cross-validation summary (single sentinel) as the cell output
combined_cv_results

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,LogisticRegression,LogisticRegression,LogisticRegression,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,...,BaggingClassifier,BaggingClassifier,BaggingClassifier,BaggingClassifier,BaggingClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier
Unnamed: 0_level_1,average,std,min,max,count,average,std,min,max,count,...,average,std,min,max,count,average,std,min,max,count
accuracy_score,0.708946,0.019758,0.6759,0.745152,10,0.702319,0.034114,0.629834,0.745152,10,...,0.813074,0.013607,0.797784,0.836565,10,0.7599,0.029956,0.722992,0.803324,10
balanced_accuracy_score,0.701476,0.020547,0.666553,0.734963,10,0.67228,0.039396,0.582111,0.724204,10,...,0.78033,0.017517,0.75962,0.810633,10,0.747502,0.033265,0.707564,0.794365,10
f1_score,0.639207,0.024102,0.597938,0.676056,10,0.581923,0.059863,0.436975,0.656716,10,...,0.723828,0.025219,0.689956,0.766798,10,0.689544,0.041327,0.633962,0.747405,10
recall_score,0.668849,0.027448,0.625899,0.697842,10,0.541043,0.072706,0.371429,0.633094,10,...,0.637276,0.041655,0.564286,0.705036,10,0.693299,0.054842,0.604317,0.776978,10
precision_score,0.612288,0.0243,0.572368,0.662069,10,0.632817,0.051928,0.530612,0.682171,10,...,0.840636,0.028961,0.805556,0.88764,10,0.687094,0.036942,0.629139,0.739437,10


## All Sentinels

- Considering all Sentinels in the model training process

In [18]:
# Construct training data and labels from all sentinels.
# The loop variable is deliberately not called `sentinel`, so the notebook-level
# `sentinel` chosen for the single-sentinel experiment is not overwritten when
# this cell runs.
# NOTE(review): rows of different sentinels are paired purely by position;
# confirm that the i-th segment of each sentinel covers the same part of the
# recording before relying on the combined features.
feature_blocks = []
label_blocks = []
for class_instance, features_by_sentinel in features_extracted_data.items():
    # Find the sentinel with min samples; every sentinel is truncated to that
    # length so the rows can be placed side by side.
    min_samples = min(features_by_sentinel[sentinel_name].shape[0] for sentinel_name in sentinels)

    # Feature columns of all sentinels, in the order given by `sentinels`
    sub_X_train = np.concatenate(
        [features_by_sentinel[sentinel_name][0:min_samples, ...] for sentinel_name in sentinels],
        axis=-1,
    )

    feature_blocks.append(sub_X_train)
    # One label per combined row of this class
    label_blocks.append(np.array([labels[class_instance]] * sub_X_train.shape[0]))

# Single concatenation instead of growing the arrays with np.append per class
X_train = np.concatenate(feature_blocks, axis=0)
y_train = np.concatenate(label_blocks, axis=0)

# Print results
print(f"Shape of X-train is {X_train.shape}")
print(f"Shape of y-train is {y_train.shape}")

Shape of X-train is (3609, 306)
Shape of y-train is (3609,)


In [19]:
# Fresh model container, populated from the per-model parameter dictionary
models_repo = models.Models()
models_repo.create_models(model_params)

# 10-fold CV
cv_results_summary = models_repo.train_models_cvfolds(
    X_train,
    y_train,
    summarize_results=True,
    standardize=True,
)

Fold - 9

In [20]:
# Models to show, in the column order of the combined table
model_association = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC",
    "BaggingClassifier",
    "RandomForestClassifier"
]

# Work on a copy so the column relabelling below leaves cv_results_summary untouched
temp = copy.deepcopy(cv_results_summary)

# Prefix every model's columns with the model name (two-level column index),
# then place all per-model tables side by side.
labelled_frames = []
for model_name in model_association:
    model_frame = temp[model_name]
    model_frame.columns = pd.MultiIndex.from_product([[model_name], model_frame.columns])
    labelled_frames.append(model_frame)

combined_cv_results = pd.concat(labelled_frames, axis=1)

In [21]:
# Display the combined cross-validation summary (all sentinels) as the cell output
combined_cv_results

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,LogisticRegression,LogisticRegression,LogisticRegression,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,...,BaggingClassifier,BaggingClassifier,BaggingClassifier,BaggingClassifier,BaggingClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier
Unnamed: 0_level_1,average,std,min,max,count,average,std,min,max,count,...,average,std,min,max,count,average,std,min,max,count
accuracy_score,0.778883,0.025549,0.750693,0.825485,10,0.75395,0.020868,0.706371,0.778393,10,...,0.881968,0.02008,0.858726,0.914127,10,0.832646,0.019766,0.806094,0.869806,10
balanced_accuracy_score,0.771803,0.024416,0.740375,0.809693,10,0.734324,0.022613,0.698052,0.77006,10,...,0.860877,0.022292,0.83403,0.897903,10,0.823731,0.022157,0.79349,0.864557,10
f1_score,0.720997,0.029514,0.681004,0.765799,10,0.669382,0.03116,0.609053,0.71831,10,...,0.833671,0.029228,0.798419,0.881226,10,0.782928,0.027232,0.745387,0.83274,10
recall_score,0.741007,0.034083,0.683453,0.791367,10,0.648921,0.056322,0.532374,0.733813,10,...,0.769065,0.035155,0.719424,0.827338,10,0.784892,0.044407,0.719424,0.841727,10
precision_score,0.703397,0.041945,0.662252,0.792308,10,0.69567,0.038022,0.609272,0.761468,10,...,0.910938,0.031113,0.872,0.949153,10,0.782726,0.032308,0.744681,0.835938,10
