# Model development for multi-class fault classification

In [None]:
import sys
import matplotlib.pyplot as plt
import matplotlib
import os
import pandas as pd
from lib import data_prep, feature_extraction, models
from sklearn.utils import shuffle
import numpy as np
import pickle

# Global matplotlib font settings, applied to every figure drawn afterwards
font = dict(family="sans-serif", weight="bold", size=16)
matplotlib.rc("font", **font)

# Loading and testing the data
- Plotting to ensure validity
- Add more files to the training data as required

In [None]:
# Directory holding the recorded CSV files (relative to the working directory)
data_loc = os.path.join(os.getcwd(), "DATA")

# Recordings available for the study, keyed by their position in this list.
# Append new files at the end so the existing indices stay stable.
file_names = dict(enumerate([
    # 0-3: machine turned ON, and the parameter switch enable error
    "machine_ON_no-ref_start-error_1.csv",
    "machine_ON_no-ref_start-error_2.csv",
    "machine_ON_no-ref_start-error_3.csv",
    "machine_ON_no-ref_start-error_4.csv",
    # 4-13: machine ON, referenced and no-error idling
    "machine_ON_ref_no-error_1.csv",
    "machine_ON_ref_no-error_2.csv",
    "machine_ON_ref_no-error_3.csv",
    "machine_ON_ref_no-error_4.csv",
    "machine_ON_ref_no-error_5.csv",
    "machine_ON_ref_no-error_6.csv",
    "machine_ON_ref_no-error_7.csv",
    "machine_ON_ref_no-error_8.csv",
    "machine_ON_ref_no-error_9.csv",
    "machine_ON_ref_no-error_10.csv",
    # 14-15: machine ON, referenced, overtravel for X negative / positive
    "machine_ON_ref_overtravel-error_x_neg_1.csv",
    "machine_ON_ref_overtravel-error_x_pos_1.csv",
    # 16-17: machine ON, not referenced, overtravel for X negative / positive
    "machine_ON_no-ref_overtravel-error_x_neg_1.csv",
    "machine_ON_no-ref_overtravel-error_x_pos_1.csv",
    # 18-20: referenced and overtravel in X (axes-extreme recordings)
    "machine_ON_ref_overtravel-error_x_neg_axes-extreme_1.csv",
    "machine_ON_ref_overtravel-error_x_neg_axes-extreme_2.csv",
    "machine_ON_ref_overtravel-error_x_pos_axes-extreme_1.csv",
    # 21-24: machine ON, referenced, overtravel in Y
    "machine_ON_ref_overtravel-error_y_neg_axes-extreme_1.csv",
    "machine_ON_ref_overtravel-error_y_neg_1.csv",
    "machine_ON_ref_overtravel-error_y_pos_1.csv",
    "machine_ON_ref_overtravel-error_y_pos_axes-extreme_1.csv",
    # 25-28: machine ON, referenced, overtravel in Z
    "machine_ON_ref_overtravel-error_z_neg_1.csv",
    "machine_ON_ref_overtravel-error_z_neg_axes-extreme_1.csv",
    "machine_ON_ref_overtravel-error_z_pos_1.csv",
    "machine_ON_ref_overtravel-error_z_pos_axes-extreme_1.csv",
    # 29-30: machine ON, not referenced
    "machine_ON_no-ref_1.csv",
    "machine_ON_no-ref_2.csv",
]))

In [None]:
# Load one recording (selected by its key in file_names) and plot the
# last 120 samples of its "PowerSum" column as a quick validity check
index = 5
csv_path = os.path.join(data_loc, file_names[index])
df = pd.read_csv(csv_path, header="infer", index_col="no")

# Wide figure with a single axes filling the whole canvas
fig = plt.figure(figsize=(25, 5))
axs = fig.add_axes([0, 0, 1, 1])

power_sum_tail = df["PowerSum"][-120:]
power_sum_tail.plot(ax=axs)

# Data preparation


## Segmentation

- Control different segmentation length
    - 2 mins
    - 1 min
    - 30 secs
    - 15 secs

- Refer to the spreadsheet in the results directory

In [None]:
# Segment length in seconds; passed as segment_secs to data_prep.segment_data
segment_secs = 60
# Supplied as the "nperseg" entry of the frequency-domain feature arguments
wavelet_nperseg = 15

In [None]:
# Do not list "no" and "sample_time" here; per the original note they are
# added to the beginning by the segmentation step.
# Chosen - three different power components for the three phases
chosen_cols = [
    "Power1", "Power2", "Power3",
    "PowerReac1", "PowerReac2", "PowerReac3",
    "PowerApp1", "PowerApp2", "PowerApp3",
]

# Segment every recording and keep the result under its file name
segmented_data = {}
for file_name in file_names.values():
    segments = data_prep.segment_data(
        file_name=os.path.join(data_loc, file_name),
        col_names=chosen_cols,
        segment_secs=segment_secs,
    )
    # Drop the first entry along axis 1 (the sample_time column)
    segmented_data[file_name] = segments[:, 1:, :]

In [None]:
# Report the shape of every segmented recording as a sanity check
for file_name, segments in segmented_data.items():
    sys.stdout.write(f"For the file-{file_name} the shape-{segments.shape}\n")

## Determine classes

Choose the number of classes to be used in this study.

In [None]:
# Associations between the classes and the files in this study.
# Every file listed in file_names belongs to exactly one class.
class_file_association = {
    # Referenced, no-error idling: recordings 1 through 10.
    # Generated rather than typed out so no recording can be listed twice
    # or left out (recording 10 was previously missing and 9 duplicated).
    "on-ref": [f"machine_ON_ref_no-error_{run}.csv" for run in range(1, 11)],

    # Not referenced: start-error recordings plus plain no-ref recordings
    "on-noref-error": [
        "machine_ON_no-ref_start-error_1.csv",
        "machine_ON_no-ref_start-error_2.csv",
        "machine_ON_no-ref_start-error_3.csv",
        "machine_ON_no-ref_start-error_4.csv",
        "machine_ON_no-ref_1.csv",
        "machine_ON_no-ref_2.csv",
    ],

    # Overtravel along X (referenced and not referenced, both directions)
    "overtravel-x": [
        "machine_ON_ref_overtravel-error_x_neg_1.csv",
        "machine_ON_ref_overtravel-error_x_pos_1.csv",
        "machine_ON_no-ref_overtravel-error_x_neg_1.csv",
        "machine_ON_no-ref_overtravel-error_x_pos_1.csv",
        "machine_ON_ref_overtravel-error_x_neg_axes-extreme_1.csv",
        "machine_ON_ref_overtravel-error_x_neg_axes-extreme_2.csv",
        "machine_ON_ref_overtravel-error_x_pos_axes-extreme_1.csv",
    ],

    # Overtravel along Y
    "overtravel-y": [
        "machine_ON_ref_overtravel-error_y_neg_1.csv",
        "machine_ON_ref_overtravel-error_y_pos_1.csv",
        "machine_ON_ref_overtravel-error_y_neg_axes-extreme_1.csv",
        "machine_ON_ref_overtravel-error_y_pos_axes-extreme_1.csv",
    ],

    # Overtravel along Z
    "overtravel-z": [
        "machine_ON_ref_overtravel-error_z_neg_1.csv",
        "machine_ON_ref_overtravel-error_z_pos_1.csv",
        "machine_ON_ref_overtravel-error_z_neg_axes-extreme_1.csv",
        "machine_ON_ref_overtravel-error_z_pos_axes-extreme_1.csv",
    ],
}

In [None]:
# Merge the segmented recordings of each class into a single array by
# joining them along the last axis, in the order the files are listed.
class_segmented_data = {}
for class_instance, class_files in class_file_association.items():
    if not class_files:
        # A class without files would otherwise vanish silently from the dataset
        raise ValueError(f"No files associated with the class '{class_instance}'")

    # One concatenate call instead of growing the array file by file,
    # which re-copied everything accumulated so far on every step.
    class_segmented_data[class_instance] = np.concatenate(
        [segmented_data[file_name] for file_name in class_files],
        axis=-1,
    )

In [None]:
# Reverse the axis order of every class array: (a0, a1, a2) -> (a2, a1, a0).
# NOTE: this permutation is its own inverse, so running the cell a second
# time restores the previous layout.
for class_instance, stacked_segments in class_segmented_data.items():
    class_segmented_data[class_instance] = np.transpose(stacked_segments, axes=(2, 1, 0))

In [None]:
# Report the shape of the combined data of every class as a sanity check
for class_instance, class_data in class_segmented_data.items():
    sys.stdout.write(f"The class-{class_instance} has the shape-{class_data.shape}\n")

## Feature Extraction

- Time domain
- Frequency domain
- Time-frequency domain

In [None]:
# Ask which family of features to extract; the answer must be one of
# 'all', 'time', 'freq' or 'time-freq' (checked in the extraction cell)
feature_prompt = (
    "Features to select\n"
    "\t--All features 'all'\n"
    "\t--Time domain features => 'time'\n"
    "\t--Frequency domain features => 'freq'\n"
    "\t--Time-Frequency domain features => 'time-freq'\n"
    "--Choice: "
)
features_to_select = input(feature_prompt)

In [None]:
def _build_feature_extractor(choice):
    """Return a callable computing the features selected by *choice* for one signal.

    The choice is checked once, before any feature is computed; an unknown
    value terminates through ``sys.exit`` with the same message as before.
    """

    # The argument lists are rebuilt for every call, as the per-column loop
    # used to do, so each feature_extraction call still receives fresh dicts.
    def freq_args():
        return [{"axis": 0}, {"axis": 0}, {"axis": 0, "nperseg": wavelet_nperseg}]

    def freq_time_args():
        return [{"wavelet": "db1"}, {"wavelet": "db1"}, {"wavelet": "db1"}]

    extractors = {
        "all": lambda signal: feature_extraction.compute_all_features(
            signal, freq_args=freq_args(), freq_time_args=freq_time_args()
        ),
        "time": lambda signal: feature_extraction.compute_time_domain_features(signal),
        "freq": lambda signal: feature_extraction.compute_frequency_domain_features(
            signal, args=freq_args()
        ),
        "time-freq": lambda signal: feature_extraction.compute_time_frequency_features(
            signal, args=freq_time_args()
        ),
    }

    if choice not in extractors:
        sys.exit("Unknown Choice on the feature selection")
    return extractors[choice]


# Resolve the user's choice once instead of re-testing it for every column
extract_features = _build_feature_extractor(features_to_select)

class_dataset_features = {}
for class_instance, class_data in class_segmented_data.items():
    dataset_features = []
    for row in class_data:
        # Features of all entries of this row, joined into one flat list
        computed_features = []
        for col in row:
            computed_features += extract_features(col)

        dataset_features.append(computed_features)

    # One feature row per entry of the class array
    class_dataset_features[class_instance] = np.array(dataset_features)

In [None]:
# Report the shape of the extracted feature matrix of every class
sys.stdout.write("After feature extraction process\n\n")
for class_instance, class_features in class_dataset_features.items():
    sys.stdout.write(f'For the class-{class_instance} , the extracted features has the shape={class_features.shape}\n')

## Generate training data

- Combine all with appropriate labels
- Using KFold

In [None]:
# Integer label assigned to each class
class_label_associations = {
    "on-ref": 0,
    "on-noref-error": 1,
    "overtravel-x": 2,
    "overtravel-y": 3,
    "overtravel-z": 4
}

# Collect the feature matrix of every class together with a label vector
# of matching length, then join everything with a single concatenate each
# (instead of growing X and y with repeated copies).
feature_blocks = []
label_blocks = []
for class_instance, class_features in class_dataset_features.items():
    feature_blocks.append(class_features)
    label_blocks.append(
        np.repeat(class_label_associations[class_instance], class_features.shape[0])
    )

X = np.concatenate(feature_blocks, axis=0)
# Labels are kept 1-D throughout, so no squeeze is needed afterwards
y = np.concatenate(label_blocks, axis=0)

# Shuffle features and labels together; fixed seed for reproducibility
X, y = shuffle(X, y, random_state=42)

sys.stdout.write(f"The final combined shape-{X.shape}\n")

# Model development
- Choose among the available models
- Set the parameters appropriately
- Train the model and get the metrics

## Hyper-parameters optimization

- Search for the best hyperparameters for each of the models

In [None]:
# Manually chosen starting parameters, one entry per model
# NOTE(review): "multi_class" appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version.
model_params = {
    "LogisticRegression": {
        "class_weight": "balanced",
        "max_iter": 5000,
        "multi_class": "multinomial",
        "n_jobs": 4,
    },
    "DecisionTreeClassifier": {
        "min_samples_split": 100,
        "class_weight": "balanced",
    },
    "KNeighborsClassifier": {
        "n_neighbors": 10,
    },
    "SVC": {
        "kernel": "rbf",
        "tol": 1e-7,
        "class_weight": "balanced",
    },
    "BaggingClassifier": {
        "n_estimators": 50,
    },
    "RandomForestClassifier": {
        "n_estimators": 100,
        "min_samples_split": 100,
        "class_weight": "balanced",
    },
}

In [None]:
# Candidate values for the hyper-parameter search, one grid per model
hyper_params = {
    "LogisticRegression": {
        "tol": [0.0001, 0.00005, 0.0000005],
        "max_iter": [5000, 10000, 20000],
        "multi_class": ["multinomial"],
        "n_jobs": [4],
        "class_weight": ["balanced"],
    },
    "DecisionTreeClassifier": {
        "min_samples_split": [100, 500],
        "max_depth": [None, 5, 10, 15, 50],
        "min_samples_leaf": [1, 100, 500],
        "class_weight": ["balanced"],
    },
    "KNeighborsClassifier": {
        "n_neighbors": [10, 5, 20, 50, 100],
        "weights": ["uniform", "distance"],
    },
    "SVC": {
        "kernel": ["linear", "poly", "rbf"],
        "tol": [1e-7, 1e-3],
        "class_weight": ["balanced"],
    },
    "BaggingClassifier": {
        "n_estimators": [10, 20, 50, 100],
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 50, 200],
        "min_samples_split": [100, 500],
        "max_depth": [None, 5, 10, 15, 50],
        "min_samples_leaf": [1, 100, 500],
        "class_weight": ["balanced"],
    },
}

In [None]:
# Dedicated model repository for the hyper-parameter search,
# initialised from the manually chosen parameters
models_repo_hyperopt = models.Models()
models_repo_hyperopt.create_models(model_params)

# Search the candidate grids for every model on the full dataset
models_repo_hyperopt.optimize_hyperparameters(
    hyperparameters=hyper_params,
    X_train=X,
    y_train=y,
    standardize=True,
)

# Report the score recorded for each optimised model
print("F1-Scores")
for model_name, model_score in models_repo_hyperopt.hyper_opt_model_scores.items():
    print(f"{model_name} - {model_score}")

In [None]:
# Show the best-performing parameters per model, one blank line between models
for model_name, best_params in models_repo_hyperopt.hyper_opt_model_params.items():
    print(f"Model name: {model_name}", best_params, "", sep="\n")


## CV Fold training

- Training across 10-fold CV

In [None]:
# Set this if you are optimizing before this step
# model_params = models_repo_hyperopt.hyper_opt_model_params

# Otherwise use the hard-coded, previously optimized hyperparameters below
model_params = {
    "LogisticRegression": {
        "max_iter": 5000,
        "multi_class": "multinomial",
        "n_jobs": 4,
        "tol": 0.0001,
        "class_weight": "balanced",
    },
    "DecisionTreeClassifier": {
        "class_weight": "balanced",
        "max_depth": None,
        "min_samples_leaf": 1,
        "min_samples_split": 100,
    },
    "KNeighborsClassifier": {
        "n_neighbors": 20,
    },
    "SVC": {
        "class_weight": "balanced",
        "kernel": "linear",
        "tol": 1e-07,
    },
    "BaggingClassifier": {
        "n_estimators": 50,
    },
    "RandomForestClassifier": {
        "class_weight": "balanced",
        "max_depth": 50,
        "min_samples_leaf": 1,
        "min_samples_split": 100,
        "n_estimators": 200,
    },
}

In [None]:
# Create a fresh repository of models for the cross-validation run
# (a separate instance from the one used for hyper-parameter optimization)
models_repo = models.Models()
# Initialize the models from the current model_params
models_repo.create_models(model_params)

# 10-fold CV (per the notebook notes) to determine the effective performance
# of all models, with standardization of the data enabled.
# The returned summary is indexed by model name in a later cell.
cv_results_summary = models_repo.train_models_cvfolds(X, y, summarize_results=True, standardize=True)

In [None]:
# Numeric id -> model name, numbered in the order the models are listed
model_association = dict(enumerate((
    "LogisticRegression",
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC",
    "BaggingClassifier",
    "RandomForestClassifier",
)))

In [None]:
# Pick a model by its numeric id and display its cross-validation summary
# so the metrics can be recorded
model_id = 5
selected_model_name = model_association[model_id]
cv_results_summary[selected_model_name]

# Save the data

Saved for future reference; the data can be used later to analyze the results.

In [None]:
# Destination of the pickled training data
train_save_dir = os.path.join(os.getcwd(), "results")
train_save_file_name = "training_data.pkl"

# Create the output directory on first use; without it the open() call
# below raises FileNotFoundError when "results" does not exist yet.
os.makedirs(train_save_dir, exist_ok=True)

# Training data: shuffled feature matrix and matching label vector
train_data = {
    "X": X,
    "y": y
}

with open(os.path.join(train_save_dir, train_save_file_name), "wb") as file_handle:
    pickle.dump(train_data, file_handle, protocol=pickle.HIGHEST_PROTOCOL)