In [None]:
import copy
import os 
import glob
import numpy as np
import pandas as pd
from lib import feature_extraction as fe
from lib import models
from sklearn.ensemble import BaggingClassifier

# Data Loader

- Selecting and loading the required data instances
- Loading all data from the LiftingAssessment Task

In [None]:
# Root data folder, resolved against the current working directory
data_path = os.path.join(os.getcwd(), "data")
# Folder of the lifting-assessment task (Task-2)
weight_lifting = os.path.join(data_path, "LiftingAssessment")
# Paths (relative to the task folder) of every ".csv" file found at any depth below it
all_parsed_files = glob.glob("**/*.csv", root_dir=weight_lifting, recursive=True)

# Maps each file's full path to {"daq_time": int, "df": DataFrame}
loaded_data = {}
for relative_path in all_parsed_files:
    full_path = os.path.join(weight_lifting, relative_path)

    # The first line of the file is a header. Its last space-separated token,
    # with the final two characters removed, is parsed as the integer DAQ time.
    # NOTE(review): presumably those two characters are a unit suffix plus the
    # newline - confirm against a sample file.
    with open(full_path, "r") as file_handle:
        header_tokens = file_handle.readline().split(" ")
    daq_time = int(header_tokens[-1][:-2])

    # Everything after that first line is read as a CSV whose column names
    # are inferred from its first row.
    df = pd.read_csv(full_path, header="infer", skiprows=1)

    loaded_data[full_path] = {"daq_time": daq_time, "df": df}


In [None]:
# Report how many recordings were loaded
print(f"Total number of files loaded - {len(loaded_data)}")

In [None]:
# Class axes: every "<box type>-<weight level>" pair is one class
box_types = ["Crate", "CardboardBox"]
weight_levels = ["W2", "W5", "W10", "W15", "W30"]

# One empty bucket per class, with the box type varying slowest
labelled_data = {
    f"{box_instance}-{weight_instance}": []
    for box_instance in box_types
    for weight_instance in weight_levels
}

# The box type and the weight level are the 4th- and 3rd-from-last path components
for file_id in loaded_data:
    path_parts = file_id.split(os.sep)
    box_instance, weight_instance = path_parts[-4], path_parts[-3]
    labelled_data[f"{box_instance}-{weight_instance}"].append(file_id)
    

In [None]:
# Report how many files landed in each class bucket
for class_instance, class_files in labelled_data.items():
    print(f"For the class - {class_instance}, total number of items are {len(class_files)}")

In [None]:
# Per-sentinel list of sampling-rate estimates, one entry per recording
sentinels_samplingRate = {"DAQSentinel01": [],
                          "DAQSentinel02": [],
                          "DAQSentinel03": []}
# Per-sentinel mean sampling rate, rounded to two decimals
sampling_rates = {}
for file_path, data in loaded_data.items():
    # The file name starts with "<sentinel>_". The name is taken with
    # os.path.basename so the lookup follows the platform's path separator,
    # like the os.sep-based splits used in the other cells, instead of
    # assuming "/".
    sentinel = os.path.basename(file_path).split("_")[0]

    # Rate estimate = number of rows / DAQ time read from the file header
    # NOTE(review): the unit of daq_time is not visible here - presumably seconds.
    total_time = data["daq_time"]
    samples = data["df"].shape[0]
    sentinels_samplingRate[sentinel].append(samples / total_time)

for sentinel in sentinels_samplingRate.keys():
    # Compute each statistic once and reuse it for both the report and the result
    mean_rate = round(np.mean(sentinels_samplingRate[sentinel]), 2)
    std_rate = round(np.std(sentinels_samplingRate[sentinel]), 2)
    print("Sampling Rate for " + sentinel + " with mean " + str(mean_rate) +
          " and std of " + str(std_rate))

    # Get the mean sampling rate
    sampling_rates[sentinel] = mean_rate

In [None]:
class_combined_dfs = {}
sentinels = ["DAQSentinel01", "DAQSentinel02", "DAQSentinel03"]

# Build one combined dataframe per class and sentinel
for class_instance in labelled_data.keys():
    # Trimmed recordings of this class, bucketed by sentinel name
    trimmed_by_sentinel = {}

    for file_id in labelled_data[class_instance]:
        # The file name starts with "<sentinel>_"
        sentinel = file_id.split(os.sep)[-1].split("_")[0]

        # Drop the first and the last 4 * sampling_rate rows of the recording.
        # The slice is not copied here: pd.concat below builds new data.
        margin = 4 * sampling_rates[sentinel]
        df = loaded_data[file_id]["df"]
        df = df.iloc[int(margin):int(df.shape[0] - margin)]
        trimmed_by_sentinel.setdefault(sentinel, []).append(df)

    # Concatenate once per sentinel rather than once per file, which avoids
    # re-copying the growing frame on every iteration. Recordings are joined
    # in file order with a fresh 0..n-1 index.
    class_combined_dfs[class_instance] = {
        sentinel_name: pd.concat(frames, ignore_index=True)
        for sentinel_name, frames in trimmed_by_sentinel.items()
    }

    # Assert at the end of every class
    for frames in trimmed_by_sentinel.values():
        assert len(frames) == 20, "Each sentinel should add upto 20 counts for two individuals"


# Segmentation

- 1-second segments with 75% overlap, i.e. consecutive segments start 250 ms apart

In [None]:
# Columns fed to segmentation, listed explicitly so their order is fixed:
# the three "acc-*" columns first, then the three "gyr-*" columns
data_cols_considered = [
    "acc-X", "acc-Y", "acc-Z",
    "gyr-X", "gyr-Y", "gyr-Z",
]

In [None]:
def segment_data(data_array: np.ndarray, segment_window: float, overlap: float, sampling_rate: float) -> np.ndarray:
    """Cut a recording into fixed-length, overlapping windows along axis 0.

    Args:
        data_array: Samples to segment. Axis 0 is the axis being windowed;
            any remaining axes are carried through unchanged.
        segment_window: Window length, expressed in the time unit that
            ``sampling_rate`` is given per. A window holds
            ``int(segment_window * sampling_rate)`` samples.
        overlap: Fraction of a window shared with the next one. Consecutive
            windows start ``int(window_size * (1 - overlap))`` samples apart.
        sampling_rate: Number of samples per time unit.

    Returns:
        Array of shape ``(n_segments, window_size, *data_array.shape[1:])``.
        Only complete windows are returned, including a window that ends
        exactly on the last sample. When the recording is shorter than one
        window, the result has zero segments but keeps the remaining axes.

    Raises:
        ValueError: If the window length or the step between windows rounds
            down to zero samples.
    """
    window_size = int(segment_window * sampling_rate)
    step = int(window_size * (1 - overlap))
    if window_size <= 0 or step <= 0:
        raise ValueError(
            f"window of {window_size} samples with a step of {step} samples cannot be segmented; "
            "check segment_window, overlap and sampling_rate"
        )

    n_samples = data_array.shape[0]
    if n_samples < window_size:
        # No complete window fits: return an empty result with a consistent shape
        return np.empty((0, window_size) + data_array.shape[1:], dtype=data_array.dtype)

    # The "+ 1" keeps the last window when it ends exactly at the final sample
    starting_points = range(0, n_samples - window_size + 1, step)
    return np.stack(
        [data_array[start:start + window_size, ...] for start in starting_points]
    )
    

In [None]:
# Window every combined recording (segment_window=1.0, overlap=0.75) using the
# mean sampling rate of the sentinel it came from. Only the columns listed in
# data_cols_considered are kept, in that order.
sentinel_segmented_data = {
    class_instance: {
        sentinel_name: segment_data(
            combined_df[data_cols_considered].to_numpy(),
            1.0,
            0.75,
            sampling_rates[sentinel_name],
        )
        for sentinel_name, combined_df in per_sentinel.items()
    }
    for class_instance, per_sentinel in class_combined_dfs.items()
}


# Feature Extraction

Extracting key features from the sensor data

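<span style='color:red'> Note: Run only one of the code cells in the feature extraction segment. Each of them rebuilds `features_extracted_data` from scratch, so the cell run last determines the features used for modelling. </span>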

## Time domain Features

Only the features from time-domain are extracted

1. Time Domain - 11
    - RMS
    - Variance
    - Peak Value
    - Crest Factor
    - Kurtosis Factor
    - Clearance Factor
    - Impulse Factor
    - Shape Factor
    - Line Integral
    - Peak to Peak
    - Skewness


In [None]:
# Time-domain features: one feature vector per segment, per class and sentinel
features_extracted_data = {}
for class_instance in sentinel_segmented_data.keys():
    features_extracted_data[class_instance] = {}
    for sentinel in sentinel_segmented_data[class_instance].keys():
        data = sentinel_segmented_data[class_instance][sentinel]

        # Feature vectors of all segments, collected first and stacked once.
        # Growing the array with np.append per segment re-copies everything
        # gathered so far on each iteration.
        segment_features = []
        for row in data:
            # Features of every sensor column of this segment, joined end to end
            computed_segments_sensors = []
            for i in range(data.shape[-1]):
                computed_segments_sensors += fe.compute_time_domain_features(row[:, i])
            segment_features.append(np.array(computed_segments_sensors).T)

        # A class/sentinel pair without segments gets no entry
        if segment_features:
            features_extracted_data[class_instance][sentinel] = np.stack(segment_features, axis=0)
        

## Time Domain + Frequency Domain Features

The features here include a combination of time domain and frequency domain features

1. Time domain features - 11 (Same as above)
2. Frequency domain features - 3
    - Peak FFT
    - Energy FFT
    - Power Spectral Density of FFT

In [None]:
# Time + frequency domain features: one feature vector per segment, per class and sentinel
features_extracted_data = {}
for class_instance in sentinel_segmented_data.keys():
    features_extracted_data[class_instance] = {}
    for sentinel in sentinel_segmented_data[class_instance].keys():
        data = sentinel_segmented_data[class_instance][sentinel]

        # Keyword arguments for the three frequency-domain features; the third
        # set carries the sampling rate of the current sentinel
        freq_args = [{"axis": 0}, {"axis": 0}, {"axis": 0, "nperseg": 200, "noverlap": 100, "fs": sampling_rates[sentinel]}]

        # Feature vectors of all segments, collected first and stacked once.
        # Growing the array with np.append per segment re-copies everything
        # gathered so far on each iteration.
        segment_features = []
        for row in data:
            # Features of every sensor column of this segment, joined end to end
            computed_segments_sensors = []
            for i in range(data.shape[-1]):
                computed_segments_sensors += fe.compute_time_and_frequency_features(row[:, i], freq_args=freq_args)
            segment_features.append(np.array(computed_segments_sensors).T)

        # A class/sentinel pair without segments gets no entry
        if segment_features:
            features_extracted_data[class_instance][sentinel] = np.stack(segment_features, axis=0)

## Frequency Domain + Time-Frequency Domain Features

Includes a total of 6 features

1. Frequency Domain - 3
2. Time-Frequency Domain - 3
    - Energy WPD (Wavelet Packet Decomposition) 1st Order
    - Energy WPD 2nd Order
    - Energy WPD 3rd Order

In [None]:
# Frequency + time-frequency domain features: one feature vector per segment,
# per class and sentinel
features_extracted_data = {}
for class_instance in sentinel_segmented_data.keys():
    features_extracted_data[class_instance] = {}
    for sentinel in sentinel_segmented_data[class_instance].keys():
        data = sentinel_segmented_data[class_instance][sentinel]

        # Keyword arguments for the three frequency-domain features; the third
        # set carries the sampling rate of the current sentinel
        freq_args = [{"axis": 0}, {"axis": 0}, {"axis": 0, "nperseg": 200, "noverlap": 100, "fs": sampling_rates[sentinel]}]
        # Keyword arguments for the three time-frequency features (all use the "db1" wavelet)
        freq_time_args = [{"wavelet": "db1"}, {"wavelet": "db1"}, {"wavelet": "db1"}]

        # Feature vectors of all segments, collected first and stacked once.
        # Growing the array with np.append per segment re-copies everything
        # gathered so far on each iteration.
        segment_features = []
        for row in data:
            # Features of every sensor column of this segment, joined end to end
            computed_segments_sensors = []
            for i in range(data.shape[-1]):
                computed_segments_sensors += fe.compute_frequency_and_time_frequency_features(row[:, i], freq_args=freq_args, freq_time_args=freq_time_args)
            segment_features.append(np.array(computed_segments_sensors).T)

        # A class/sentinel pair without segments gets no entry
        if segment_features:
            features_extracted_data[class_instance][sentinel] = np.stack(segment_features, axis=0)

## Time Domain + Frequency Domain + Time-frequency Domain Features

Extracting the following features, a total of 17 features

1. Time Domain Features - 11
2. Frequency Domain Features - 3
3. Time-frequency Domain Features - 3

In [None]:
# Time + frequency + time-frequency domain features: one feature vector per
# segment, per class and sentinel
features_extracted_data = {}
for class_instance in sentinel_segmented_data.keys():
    features_extracted_data[class_instance] = {}
    for sentinel in sentinel_segmented_data[class_instance].keys():
        data = sentinel_segmented_data[class_instance][sentinel]

        # Keyword arguments for the three frequency-domain features; the third
        # set carries the sampling rate of the current sentinel
        freq_args = [{"axis": 0}, {"axis": 0}, {"axis": 0, "nperseg": 200, "noverlap": 100, "fs": sampling_rates[sentinel]}]
        # Keyword arguments for the three time-frequency features (all use the "db1" wavelet)
        freq_time_args = [{"wavelet": "db1"}, {"wavelet": "db1"}, {"wavelet": "db1"}]

        # Feature vectors of all segments, collected first and stacked once.
        # Growing the array with np.append per segment re-copies everything
        # gathered so far on each iteration.
        segment_features = []
        for row in data:
            # Features of every sensor column of this segment, joined end to end
            computed_segments_sensors = []
            for i in range(data.shape[-1]):
                computed_segments_sensors += fe.compute_all_features(row[:, i], freq_args=freq_args, freq_time_args=freq_time_args)
            segment_features.append(np.array(computed_segments_sensors).T)

        # A class/sentinel pair without segments gets no entry
        if segment_features:
            features_extracted_data[class_instance][sentinel] = np.stack(segment_features, axis=0)



# Model Development

- Choose among the 10 available models
- set the parameters appropriately
- Train the model and get the metrics


In [None]:
# Constructor arguments per model, keyed by the model's class name
# (values come from the hyperparameter search)
model_params = {
    "LogisticRegression": dict(class_weight="balanced", max_iter=5000, n_jobs=4, tol=0.0001),
    "DecisionTreeClassifier": dict(class_weight="balanced", max_depth=50, min_samples_leaf=20, min_samples_split=20),
    "KNeighborsClassifier": dict(n_neighbors=10, weights="uniform"),
    "SVC": dict(class_weight="balanced", kernel="poly", tol=1e-07),
    "BaggingClassifier": dict(n_estimators=100),
    "RandomForestClassifier": dict(class_weight="balanced", max_depth=50, min_samples_leaf=1, min_samples_split=100, n_estimators=100),
    # "GradientBoostingClassifier": {"loss": "log_loss", "learning_rate": 0.001, "n_estimators": 100, "min_samples_split": 50, "min_samples_leaf": 5, "max_depth": 5, "verbose": 0, "tol":1e-7},
    # NOTE(review): scikit-learn renamed AdaBoostClassifier's `base_estimator`
    # argument to `estimator` (deprecated in 1.2) - confirm which scikit-learn
    # version lib.models targets.
    "AdaBoostClassifier": dict(base_estimator=BaggingClassifier(n_estimators=100), n_estimators=500, learning_rate=0.001),
    "MLPClassifier": dict(hidden_layer_sizes=(50, 20), max_iter=200),
}

# NIOSH labels: for both box types, W2/W5/W10 map to 0 and W15/W30 map to 1
labels = {
    f"{box}-{weight}": label
    for box in ("Crate", "CardboardBox")
    for weight, label in (("W2", 0), ("W5", 0), ("W10", 0), ("W15", 1), ("W30", 1))
}

## Individual Sentinels

Model development by considering one Sentinel at a time


Choose the sentinel for model training

In [None]:
# Sentinel whose features are used to build the single-sentinel training set
sentinel = "DAQSentinel02"

In [None]:
# Construct training data and labels for the selected sentinel.
# Per-class pieces are collected and concatenated once, so X_train / y_train
# are always rebuilt from the current features and never extend or keep
# arrays left over from an earlier run.
feature_blocks = []
label_blocks = []
for class_instance, per_sentinel in features_extracted_data.items():
    class_features = per_sentinel[sentinel]
    feature_blocks.append(class_features)
    # One label per feature row of this class
    label_blocks.append(np.full(class_features.shape[0], labels[class_instance]))

X_train = np.concatenate(feature_blocks, axis=0)
y_train = np.concatenate(label_blocks, axis=0)

# Print results
print(f"Shape of X-train is {X_train.shape}")
print(f"Shape of y-train is {y_train.shape}")


In [None]:
# Build a fresh model collection configured with the parameters chosen above
models_repo = models.Models()
models_repo.create_models(model_params)

# Cross-validate every model with 10 folds and keep the summarized results
cv_results_summary = models_repo.train_models_cvfolds(
    X_train,
    y_train,
    kfolds=10,
    summarize_results=True,
    standardize=True,
)

In [None]:
# Model names, in the order their results appear in the combined table
model_association = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC",
    "BaggingClassifier",
    "RandomForestClassifier",
    "AdaBoostClassifier",
    "MLPClassifier"
]

# Place the per-model summaries side by side in a single concat. `keys` adds
# the model name as the top column level, so the summaries in
# cv_results_summary are neither copied nor modified.
combined_cv_results = pd.concat(
    [cv_results_summary[model_name] for model_name in model_association],
    axis=1,
    keys=model_association,
)

In [None]:
# Display the combined cross-validation summary for the single-sentinel models
combined_cv_results

## All Sentinels

- Considering all Sentinels in the model training process

In [None]:
# Construct training data and labels from all sentinels.
# The loop variable is named `sentinel_name` so this cell does not overwrite
# the notebook-level `sentinel` selection used by the single-sentinel cells.
feature_blocks = []
label_blocks = []
for class_instance, per_sentinel in features_extracted_data.items():

    # Sentinels can yield different numbers of segments; keep only as many
    # rows as the shortest one so the matrices can be joined column-wise.
    # NOTE(review): rows are paired across sentinels purely by position -
    # confirm that equal positions correspond to the same moment in time.
    min_samples = min(per_sentinel[sentinel_name].shape[0] for sentinel_name in sentinels)

    # Features of the sentinels side by side, in the order given by `sentinels`
    sub_X_train = np.concatenate(
        [per_sentinel[sentinel_name][0:min_samples, ...] for sentinel_name in sentinels],
        axis=-1,
    )

    feature_blocks.append(sub_X_train)
    # One label per feature row of this class
    label_blocks.append(np.full(sub_X_train.shape[0], labels[class_instance]))

X_train = np.concatenate(feature_blocks, axis=0)
y_train = np.concatenate(label_blocks, axis=0)

# Print results
print(f"Shape of X-train is {X_train.shape}")
print(f"Shape of y-train is {y_train.shape}")

In [None]:
# Build a fresh model collection configured with the parameters chosen above
models_repo = models.Models()
models_repo.create_models(model_params)

# Cross-validate every model with 10 folds on the all-sentinel training set
cv_results_summary = models_repo.train_models_cvfolds(
    X_train,
    y_train,
    kfolds=10,
    summarize_results=True,
    standardize=True,
)

In [None]:
# Model names, in the order their results appear in the combined table
model_association = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC",
    "BaggingClassifier",
    "RandomForestClassifier",
    "AdaBoostClassifier",
    "MLPClassifier"
]

# Place the per-model summaries side by side in a single concat. `keys` adds
# the model name as the top column level, so the summaries in
# cv_results_summary are neither copied nor modified.
combined_cv_results = pd.concat(
    [cv_results_summary[model_name] for model_name in model_association],
    axis=1,
    keys=model_association,
)

In [None]:
# Display the combined cross-validation summary for the all-sentinel models
combined_cv_results

# Hyperparameters Optimization

In [None]:
# Starting constructor arguments per model, keyed by the model's class name
model_params = {
    "LogisticRegression": dict(class_weight="balanced", max_iter=5000, n_jobs=4),
    "DecisionTreeClassifier": dict(min_samples_split=20),
    "KNeighborsClassifier": dict(n_neighbors=10),
    "SVC": dict(kernel="rbf", tol=1e-7),
    "BaggingClassifier": dict(n_estimators=50),
    "RandomForestClassifier": dict(n_estimators=100, min_samples_split=100, class_weight="balanced"),
    # "GradientBoostingClassifier": {"loss": "log_loss", "learning_rate": 0.001, "n_estimators": 100, "min_samples_split": 50, "min_samples_leaf": 5, "max_depth": 5, "verbose": 0, "tol":1e-7},
    "AdaBoostClassifier": dict(n_estimators=100, learning_rate=0.0001),
    "MLPClassifier": dict(hidden_layer_sizes=(100, 50), max_iter=500),
}

# Prospective hyperparameters: for every model, the candidate values of each argument
hp = {
    "LogisticRegression": {
        "tol": [0.0001, 0.00005, 0.0000005],
        "max_iter": [5000, 10000, 20000],
        "multi_class": ["multinomial"],
        "n_jobs": [4],
        "class_weight": ["balanced"],
    },
    "DecisionTreeClassifier": {
        "min_samples_split": [20, 50, 100],
        "max_depth": [None, 5, 10, 15, 50],
        "min_samples_leaf": [1, 20, 100, 200],
        "class_weight": ["balanced"],
    },
    "KNeighborsClassifier": {
        "n_neighbors": [10, 5, 20, 50, 100],
        "weights": ["uniform", "distance"],
    },
    "SVC": {
        "kernel": ["linear", "poly", "rbf"],
        "tol": [1e-7, 1e-3],
        "class_weight": ["balanced"],
    },
    "BaggingClassifier": {
        "n_estimators": [10, 20, 50, 100],
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 50, 200],
        "min_samples_split": [100, 500],
        "max_depth": [None, 5, 10, 15, 50],
        "min_samples_leaf": [1, 100, 500],
        "class_weight": ["balanced"],
    },
    # "GradientBoostingClassifier": {"loss": ["log_loss"], "learning_rate": [0.1, 0.001, 0.0001], "n_estimators": [50, 100, 400], "min_samples_split": [10, 20, 50, 100], "min_samples_leaf": [1, 10, 50], "max_depth": [None, 5, 10, 15, 50], "tol":[1e-7, 1e-3]},
    "AdaBoostClassifier": {
        "n_estimators": [20, 100, 200, 500],
        "learning_rate": [0.01, 0.001],
    },
    "MLPClassifier": {
        "hidden_layer_sizes": [(100, 50), (100, 50, 20), (50, 20)],
        "max_iter": [100, 200, 500],
    },
}

# NIOSH labels: for both box types, W2/W5/W10 map to 0 and W15/W30 map to 1
labels = {
    f"{box}-{weight}": label
    for box in ("Crate", "CardboardBox")
    for weight, label in (("W2", 0), ("W5", 0), ("W10", 0), ("W15", 1), ("W30", 1))
}


In [None]:
# Construct training data and labels from all sentinels.
# The loop variable is named `sentinel_name` so this cell does not overwrite
# the notebook-level `sentinel` selection used by the single-sentinel cells.
feature_blocks = []
label_blocks = []
for class_instance, per_sentinel in features_extracted_data.items():

    # Sentinels can yield different numbers of segments; keep only as many
    # rows as the shortest one so the matrices can be joined column-wise.
    # NOTE(review): rows are paired across sentinels purely by position -
    # confirm that equal positions correspond to the same moment in time.
    min_samples = min(per_sentinel[sentinel_name].shape[0] for sentinel_name in sentinels)

    # Features of the sentinels side by side, in the order given by `sentinels`
    sub_X_train = np.concatenate(
        [per_sentinel[sentinel_name][0:min_samples, ...] for sentinel_name in sentinels],
        axis=-1,
    )

    feature_blocks.append(sub_X_train)
    # One label per feature row of this class
    label_blocks.append(np.full(sub_X_train.shape[0], labels[class_instance]))

X_train = np.concatenate(feature_blocks, axis=0)
y_train = np.concatenate(label_blocks, axis=0)

# Print results
print(f"Shape of X-train is {X_train.shape}")
print(f"Shape of y-train is {y_train.shape}")

In [None]:
# Separate model collection used only for the hyperparameter search
models_repo_hyperopt = models.Models()
models_repo_hyperopt.create_models(model_params)

# Search the candidate values in `hp` for every model
models_repo_hyperopt.optimize_hyperparameters(
    hyperparameters=hp,
    X_train=X_train,
    y_train=y_train,
    standardize=True,
)

# Report the score stored for each model after the search
print("F1-Scores")
for model_name, model_score in models_repo_hyperopt.hyper_opt_model_scores.items():
    print(f"{model_name} - {model_score}")

In [None]:
# Display the parameters stored for each model by the hyperparameter search
models_repo_hyperopt.hyper_opt_model_params