# Model Training for PHME21 Tasks

In [None]:
import pickle
import pandas as pd
import numpy as np
from numpy import nan
from os import listdir
from os.path import isfile, join
from os import listdir
import re

import warnings
warnings.filterwarnings("ignore")

In [None]:
fields_path = 'input/training_validation_2/fields.csv'
fields_df = pd.read_csv(fields_path)
fields_df.columns = ['name', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6']

# fields_dict maps the row position in fields.csv to the signal name of that row
# and to the list of "<name>_<suffix>" column names built from its non-empty
# f0..f6 cells.
fields_dict = {}
suffix_columns = fields_df.columns[1:]

for idx in range(len(fields_df)):
    row = fields_df.loc[idx]
    name = row['name']

    # Empty CSV cells are read as NaN, whose string form is 'nan'; skip those.
    _fields = [
        name + "_" + str(row[col])
        for col in suffix_columns
        if str(row[col]) != 'nan'
    ]

    fields_dict[idx] = {'name': name, 'fields': _fields}

# fields_dict

In [None]:
# Get class id and run id from filename
_CLASS_FNAME_PATTERN = re.compile(r"^class[^\d]*(\d+)_(\d+).*\.csv")


def parse_class_name(fname):
    """Extract the class id and the run id embedded in a data file name.

    The name has to start with ``class``, optionally followed by non-digit
    characters, then ``<class id>_<run id>`` and, somewhere after that, the
    literal text ``.csv``.

    Args:
        fname: File name without directory, e.g. ``class_0_12_data.csv``.

    Returns:
        Tuple ``(class_id, run_id)`` of digit strings.

    Raises:
        ValueError: If ``fname`` does not follow the naming scheme.
    """
    m = _CLASS_FNAME_PATTERN.match(fname)
    if m is None:
        # Fail with a message that names the offending file instead of an
        # AttributeError on None.
        raise ValueError(
            "Cannot parse class id and run id from file name: {!r}".format(fname)
        )

    return m.groups()

In [None]:
def impute_df(df):
    """Return a copy of ``df`` with NaNs filled by interpolation.

    Interpolation runs in both directions, so NaNs at the start and at the
    end of a column are filled as well.
    """
    return df.interpolate(limit_direction='both')

In [None]:
# Load one data file and return in a data frame
def load_data_file(path, fname):
    """Read one run file and expand it into one column per signal field.

    Row ``f`` of the CSV holds, in its second column, the serialized 2-D data
    of the signal described by ``fields_dict[f]``. Each signal is expanded
    into a frame whose columns are named after ``fields_dict[f]['fields']``,
    and all signal frames are placed side by side.

    Args:
        path: Directory containing the file.
        fname: File name; the class id and run id are parsed from it.

    Returns:
        DataFrame with one column per field plus integer ``class`` and ``run``
        columns.
    """
    fullpath = join(path, fname)
    df = pd.read_csv(fullpath)
    df.columns = ['name', 'data']

    dfx = []

    for f, spec in fields_dict.items():
        fields = spec['fields']

        # SECURITY NOTE: eval() executes whatever expression the file contains,
        # so only load files from a trusted source. It is kept because the
        # serialized arrays presumably contain bare `nan` tokens (resolved via
        # the file-level `from numpy import nan`), which ast.literal_eval
        # would reject.
        data = eval(df.loc[f, 'data'])  # convert data to array

        new_df = pd.DataFrame(data)
        if (f == 33) and (new_df.shape[1] == 6):  # NumberFuseDetected has a special case!
            # Only 6 of the 7 expected columns are present: move the last one
            # to position 6 and leave position 5 empty.
            new_df[6] = new_df[5]
            new_df[5] = np.nan  # np.NaN alias was removed in NumPy 2.0

        new_df.columns = fields

        dfx.append(new_df)

    merged_df = pd.concat(dfx, axis=1)  # Merge columns

    # # Do some imputation on the data file
    # merged_df = impute_df(merged_df.copy())

    c, r = parse_class_name(fname)  # Get class id and run id

    # Add class labels and run id
    merged_df['class'] = int(c)
    merged_df['run'] = int(r)

    return merged_df

In [None]:
# Load data files from a directory and return merged data frame
def load_data_files(path):
    """Load every regular file in ``path`` whose name starts with ``class``.

    Each file is read with ``load_data_file`` and the resulting frames are
    stacked row-wise. Progress is printed for the directory and for each file.
    """
    print("In", path)

    class_files = [
        entry
        for entry in listdir(path)
        if isfile(join(path, entry)) and entry.startswith("class")
    ]

    frames = []
    for fname in class_files:
        print("Loading:", fname)
        frames.append(load_data_file(path, fname))

    return pd.concat(frames, axis=0)  # Merge data frames

In [None]:
# Load all class files of each of the three input directories into one frame per directory.
data_df_1 = load_data_files("input/training_validation_1/")
data_df_2 = load_data_files("input/training_validation_2/")
data_df_3 = load_data_files("input/ModelRefinement/")

In [None]:
# Stack the three frames row-wise with a fresh 0..n-1 index, then continue on a
# deep copy so data_df itself stays unmodified.
data_df = pd.concat([data_df_1, data_df_2, data_df_3], axis=0, ignore_index=True)
df = data_df.copy(deep=True)


In [None]:
# Prints True if at least one column of df contains a NaN.
print(df.isnull().sum().any())


In [None]:
# Keep only the columns in which at least 70% of the rows are non-NaN and
# remember the names of the columns that were removed.
first_cols = df.columns.tolist()
min_non_nan = int(df.shape[0] * 0.7)
df = df.dropna(thresh=min_non_nan, axis=1)  # Drop column if it does not have at least x values that are **not** NaN
print("col: ", df.shape)
droped_cols = list(set(first_cols).difference(df.columns))

In [None]:
# Show the names of the columns removed so far.
print(droped_cols)

In [None]:
def fill_nan_values(data, name, fields):
    """Return the ``fields`` columns of ``data`` with NaNs linearly interpolated.

    Interpolation runs in both directions, so leading and trailing NaNs are
    filled too. ``data`` itself is left untouched: pandas does not support
    functions that mutate the group handed to ``groupby(...).apply``.

    Args:
        data: Frame (typically one group of a groupby) containing ``fields``.
        name: Signal name; unused, kept for call compatibility.
        fields: List of column names to fill.

    Returns:
        DataFrame with exactly the ``fields`` columns, indexed like ``data``.
    """
    field_df = data[fields]

    if field_df.isnull().values.any():
        field_df = field_df.interpolate(method='linear', limit_direction='both')

    return field_df

In [None]:
# Interpolate the remaining NaNs one signal at a time, separately inside every
# (class, run) group so values are never interpolated across two runs.
for f in fields_dict:

    name = fields_dict[f]['name']
    # Columns of this signal that survived the earlier column drop.
    fields = [col for col in fields_dict[f]['fields'] if col not in droped_cols]
    if not fields:
        continue  # every column of this signal was dropped

#     print("\nname:", name, "fields:", fields)
    # group_keys=False keeps the original row labels on the result, so the
    # assignment below aligns every filled row with the row it came from.
    # Resetting the index instead pairs rows purely by position, which is wrong
    # whenever the result comes back in group order (pandas >= 2.0 default).
    df_ = df.groupby(["class", "run"], group_keys=False).apply(fill_nan_values, name, fields)
    df[fields] = df_[fields]

In [None]:
# Prints True if at least one column of df contains a NaN.
print(df.isnull().sum().any())

In [None]:
# Remove feature columns that carry no information: a single distinct value or
# only NaNs. "class" and "run" are never removed.
for column in df.columns:
    if column in ("class", "run"):
        continue

    col_values = df[column]

    if len(col_values.unique()) == 1 or col_values.isnull().all():
        df.drop(column, inplace=True, axis=1)
        droped_cols.append(column)
        print(column, "droped-unique")
        continue

    # NOTE(review): the number of zero rows can never reach 50 times the number
    # of rows of a non-empty frame, so this branch does not fire as written.
    # Presumably 0.5 (i.e. 50%) was meant - confirm before changing, because
    # the hard-coded feature list used later relies on the surviving columns.
    zero_count = (col_values == float(0)).sum()
    if zero_count >= df.shape[0] * 50:
        df.drop(column, inplace=True, axis=1)
        droped_cols.append(column)
        print(column, "droped-zero")

In [None]:
# Display 10 randomly sampled rows of df (notebook cell output).
df.sample(10)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
# Combine class and run into one run identifier: 1000 * class + run.
df['runId'] = 1000 * df['class'] + df['run']

# Keep a copy of the per-row run number before the column is removed from df.
run_df_ = df['run'].copy()

# One row per distinct (class, runId) pair with a clean 0..n-1 index.
run_df = df[['class', 'runId']].drop_duplicates().reset_index(drop=True)

del df['run']

In [None]:
# Load the stored feature-importance table and keep only the features whose
# mean importance is strictly positive.
importance_df = pd.read_pickle("feature_importance_df.pkl")
importance_df = importance_df.loc[(importance_df["importance_mean"] > 0)]

# Feature names and their mean importances, in the row order of the table.
sorted_features_imp = list(importance_df["feature"].values)
value_features_imp = list(importance_df["importance_mean"].values)
# (feature name, mean importance) pairs.
f_imp = [(name, value) for name, value in zip(sorted_features_imp, value_features_imp)]

print(len(sorted_features_imp), sorted_features_imp)
# NOTE: sorted_features_imp is reassigned right after this by a hard-coded list
# literal, so the list computed here only feeds the print above.

sorted_features_imp = ['Temperature_value', 'Humidity_value', 'DurationTestBenchClosed_vFreq', 'FuseHeatSlopeNOK_vMin', 'Pressure_vMax', 'FeederAction4_vCnt', 'ProcessCpuLoadNormalized_value', 'Vacuum_vFreq', 'TotalMemoryConsumption_vMin', 'CpuTemperature_vStd', 'SmartMotorPositionError_vMax', 'FuseCycleDuration_vMax', 'LightBarrierActiveTaskDuration1_vTrend', 'DurationRobotFromFeederToTestBench_vMin', 'SmartMotorSpeed_vMax', 'FusePicked_vFreq', 'DurationPickToPick_vFreq', 'DurationTestBenchClosed_value', 'DurationPickToPick_vTrend', 'VacuumValveClosed_vStd', 'LightBarrierPassiveTaskDuration1_vCnt', 'DurationRobotFromTestBenchToFeeder_vMin', 'NumberFuseEstimated_vFreq', 'VacuumFusePicked_vFreq', 'FuseTestResult_vStd', 'EPOSCurrent_vStd', 'DurationTestBenchClosed_vMin', 'VacuumFusePicked_vMin', 'TemperatureThermoCam_value', 'SmartMotorPositionError_vFreq', 'LightBarrierActiveTaskDuration1_vMax', 'IntensityTotalThermoImage_vMax', 'DurationRobotFromFeederToTestBench_vStd', 'TemperatureThermoCam_vTrend', 'EPOSVelocity_vStd', 'ProcessMemoryConsumption_vStd', 'VacuumFusePicked_vCnt', 'FuseHeatSlopeNOK_vFreq', 'FuseCycleDuration_vCnt', 'Vacuum_vTrend', 'LightBarrierActiveTaskDuration1_vStd', 'FuseCycleDuration_vMin', 'DurationRobotFromTestBenchToFeeder_vTrend', 'FuseHeatSlope_vMax', 'FusePicked_vMin', 'DurationTestBenchClosed_vCnt', 'SmartMotorPositionError_vTrend', 'SmartMotorSpeed_vFreq', 'Vacuum_vMax', 'Pressure_vMin', 'Vacuum_value', 'VacuumFusePicked_value', 'Pressure_vFreq', 'FuseOutsideOperationalSpace_vCnt', 'FuseHeatSlopeNOK_vMax', 'EPOSVelocity_vTrend', 'NumberFuseEstimated_vCnt', 'Vacuum_vStd', 'EPOSCurrent_vTrend', 'FuseOutsideOperationalSpace_vFreq', 'EPOSVelocity_vFreq', 'VacuumValveClosed_value', 'LightBarrierActiveTaskDuration1_vCnt', 'FuseHeatSlope_value', 'ValidFrame_vCnt', 'Vacuum_vCnt', 'TotalMemoryConsumption_vMax', 'Vacuum_vMin', 'TotalMemoryConsumption_vStd', 'ProcessCpuLoadNormalized_vMax', 'EPOSCurrent_vMax', 'TotalCpuLoadNormalized_vStd', 
'ValidFrameOptrisPIIRCamera_vFreq', 'FusePicked_vTrend', 'DurationRobotFromTestBenchToFeeder_vFreq', 'FuseCycleDuration_value', 'EPOSCurrent_vMin', 'VacuumValveClosed_vMin', 'FeederAction2_vCnt', 'DurationPickToPick_vMax', 'FuseCycleDuration_vFreq', 'LightBarrierPassiveTaskDuration1_vStd', 'NumberFuseDetected_vFreq', 'DurationTestBenchClosed_vTrend', 'LightBarrierActiveTaskDuration1_vMin', 'DurationRobotFromFeederToTestBench_vMax', 'FusePicked_vCnt', 'EPOSVelocity_vCnt', 'DurationPickToPick_vCnt', 'FuseHeatSlope_vMin', 'SmartMotorPositionError_vStd', 'SmartMotorPositionError_vMin', 'EPOSVelocity_vMax', 'EPOSPosition_vMin', 'TemperatureThermoCam_vStd', 'DurationRobotFromFeederToTestBench_vFreq', 'FuseHeatSlope_vTrend', 'EPOSVelocity_vMin', 'SmartMotorSpeed_value', 'LightBarrierActiveTaskDuration1_value', 'FuseTestResult_value', 'VacuumFusePicked_vTrend', 'FusePicked_vStd', 'EPOSPosition_vTrend', 'VacuumValveClosed_vFreq', 'LightBarrierPassiveTaskDuration1_vTrend', 'ValidFrameOptrisPIIRCamera_vCnt', 'DurationRobotFromTestBenchToFeeder_vCnt', 'LightBarrierActiveTaskDuration1_vFreq', 'FuseTestResult_vFreq', 'EPOSPosition_vFreq', 'IntensityTotalImage_vFreq', 'EPOSCurrent_vFreq', 'FeederBackgroundIlluminationIntensity_vCnt', 'DurationRobotFromTestBenchToFeeder_value', 'EPOSCurrent_value', 'DurationPickToPick_vStd', 'SmartMotorSpeed_vTrend', 'FuseCycleDuration_vStd', 'DurationRobotFromFeederToTestBench_vCnt', 'FuseHeatSlope_vStd', 'Pressure_vCnt', 'FuseIntoFeeder_vCnt', 'ValidFrame_vFreq', 'ProcessMemoryConsumption_vMin', 'TotalCpuLoadNormalized_vMax', 'DurationTestBenchClosed_vStd', 'DurationPickToPick_value', 'CpuTemperature_vMin', 'LightBarrierPassiveTaskDuration1_vFreq', 'DurationPickToPick_vMin', 'CpuTemperature_vMax', 'VacuumValveClosed_vMax', 'CpuTemperature_value', 'TotalCpuLoadNormalized_vMin', 'DurationTestBenchClosed_vMax', 'VacuumFusePicked_vStd', 'ProcessMemoryConsumption_vMax', 'EPOSCurrent_vCnt', 'Pressure_value', 'ProcessMemoryConsumption_value']

In [None]:
# split a sequence into samples
def create_sequence(sequence, n_steps):
    """Return all windows of ``n_steps`` consecutive items of ``sequence``.

    Windows start at every position and advance by one item; a window that
    would run past the end of ``sequence`` is not produced. The windows are
    returned stacked in one NumPy array (an empty array if there are none).
    """
    total = len(sequence)
    # Number of start positions whose window still fits into the sequence.
    window_count = min(total, total - n_steps + 1)

    windows = [sequence[start:start + n_steps] for start in range(window_count)]

    return np.array(windows)

def create_dataset_for_run(df, ws):
    """Build flattened sliding-window samples and labels for one run.

    Only the columns listed in the module-level ``sensor_list`` are used.
    Every window of ``ws`` consecutive rows is flattened into one sample row.

    Args:
        df: Rows of a single run; must contain a ``class`` column.
        ws: Window size in rows.

    Returns:
        Tuple ``(seq, labels)``: ``seq`` is a 2-D array with one row per
        window, ``labels`` holds the ``class`` value of the first
        ``len(seq)`` rows. A run with fewer than ``ws`` rows yields an empty
        ``(0, ws * n_features)`` array and no labels.
    """
    sensors_df = df.filter(sensor_list)

    # Calculate seq of windows_size len
    seq = create_sequence(sensors_df.values, n_steps=ws)
    #     seq = np.transpose(seq, axes=(0, 2, 1))
    seq_count = seq.shape[0]

    if seq_count == 0:
        # Run shorter than one window: reshape((0, -1)) cannot infer the second
        # dimension of an empty array, so build the empty result explicitly.
        seq = np.empty((0, ws * sensors_df.shape[1]), dtype=sensors_df.values.dtype)
    else:
        seq = seq.reshape((seq_count, -1))  # for 1D

    labels = df['class'].values[:seq_count]

    return seq, labels

def create_datasets(df, ws):
    """Build the windowed samples of every run in ``df`` and stack them.

    Runs are processed in order of first appearance of their ``runId``.

    Returns:
        Tuple ``(X, y, run_list, l_len_runs)``: the stacked sample matrix, the
        flat label vector, the array of distinct run ids, and the number of
        samples contributed by each run (same order as ``run_list``).
    """
    run_list = df['runId'].unique()

    feature_frames = []
    label_frames = []
    l_len_runs = []

    for run_id in run_list:
        # print ("--> r: ", run_id, ...)
        windows, labels = create_dataset_for_run(df[df['runId'] == run_id], ws)

        # Work on a copy of the window matrix; no padding is applied.
        run_features = windows.copy()

        feature_frames.append(pd.DataFrame(run_features))
        label_frames.append(pd.DataFrame(labels))
        l_len_runs.append(len(run_features))

    stacked_features = pd.concat(feature_frames, axis=0)  # Merge data frames
    stacked_labels = pd.concat(label_frames, axis=0)  # Merge data frames

    return stacked_features.values, stacked_labels.values.flatten(), run_list, l_len_runs


In [None]:
# Use every feature of the importance-ordered list (as an independent copy),
# and start with an empty report table.
sensor_list = list(sorted_features_imp)
df_report = pd.DataFrame()

In [None]:
# K-fold Model evaluations
#
# The split is made on runs (run_df holds one row per run), stratified by class,
# so all windows of a run end up on the same side of the split. Per fold the
# pipeline is: RobustScaler -> sliding windows -> LDA -> XGBoost.

fold_num = 3
cv = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=41)

ws = 40

acc_sum_ = 0
f1_sum_ = 0
mcc_sum_ = 0

# Rows of the excluded runs are removed once; this does not depend on the fold.
df_ = df.loc[~(df["runId"].isin([56, 74, 49, 23, 35, 6, 83, 54]))]

print("_______________________________________________________________")

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print("----------------------------------------------------------")

    report_index = "ws_{}_fold_{}".format(ws, fold + 1)
    print("Fold: ", report_index)

    # cv.split yields row positions, hence positional indexing.
    training_runIds = run_df.iloc[training_indices]['runId']
    validation_runIds = run_df.iloc[validation_indices]['runId']

    model_cols = sensor_list + ["class", "runId"]
    X_train_df = df_.loc[df_['runId'].isin(training_runIds), model_cols].copy()
    X_val_df = df_.loc[df_['runId'].isin(validation_runIds), model_cols].copy()

    scaler_cols = sensor_list.copy()

    # The scaler is fitted on the training runs only and then applied to the
    # validation runs.
    scaler = RobustScaler()
    scaler_data_tr = scaler.fit_transform(X_train_df[scaler_cols])
    scaler_data_tr = pd.DataFrame(scaler_data_tr, index=X_train_df.index, columns=scaler_cols)
    X_train_df = pd.concat([X_train_df[["class", "runId"]], scaler_data_tr], axis=1)

    scaler_data_ts = scaler.transform(X_val_df[scaler_cols])
    scaler_data_ts = pd.DataFrame(scaler_data_ts, index=X_val_df.index, columns=scaler_cols)
    X_val_df = pd.concat([X_val_df[["class", "runId"]], scaler_data_ts], axis=1)

    X_train, y_train, runList_tr, l_len_runs_tr = create_datasets(X_train_df, ws)
    X_val, y_val, runList_val, l_len_runs_val = create_datasets(X_val_df, ws)

    lda = LinearDiscriminantAnalysis()
    X_train = lda.fit_transform(X_train, y_train)
    X_val = lda.transform(X_val)

    param = {
        'objective': 'multi:softprob',  # error evaluation for multiclass training
        'num_class': len(pd.unique(y_train))}  # the number of classes that exist in this dataset

    # The settings must be passed as keyword arguments: handing the dict over
    # positionally binds it to the first constructor parameter (or is rejected
    # outright by versions with a keyword-only constructor).
    xgb_model = XGBClassifier(**param, random_state=35535, verbosity=0)
    xgb_model.fit(X_train, y_train)
    pred = xgb_model.predict(X_val)

    acc_val = round(accuracy_score(y_val, pred), 3)
    f1_val = round(f1_score(y_val, pred, average='weighted'), 3)
    mcc_val = round(matthews_corrcoef(y_val, pred), 3)
    cm = confusion_matrix(y_val, pred)

    acc_sum_ += acc_val
    f1_sum_ += f1_val
    mcc_sum_ += mcc_val

    print(cm)
    # Fold number is 1-based here, matching report_index.
    print("XGBClassifier Fold:", fold + 1, "ACC:", acc_val, "F1:", f1_val, "MCC:", mcc_val)
    df_report.loc[report_index, "XGB_ACC"] = acc_val
    df_report.loc[report_index, "XGB_F1"] = f1_val
    df_report.loc[report_index, "XGB_MCC"] = mcc_val

# Averages of the (already rounded) per-fold scores.
ave_acc = round(acc_sum_ / fold_num, 3)
ave_f1_score = round(f1_sum_ / fold_num, 3)
ave_mcc = round(mcc_sum_ / fold_num, 3)

print("\nXGBM Avg ACC:", ave_acc, "Avg F1:", ave_f1_score, "Avg MCC:", ave_mcc)

In [None]:
# Display the per-fold metrics table (notebook cell output).
df_report

In [None]:
# Final Training
#
# Same pipeline as the K-fold evaluation (RobustScaler -> sliding windows ->
# LDA -> XGBoost), fitted on all non-excluded runs.
ws = 40

acc_sum_4 = 0
f1_sum_4 = 0
mcc_sum_4 = 0

df_ = df.loc[~(df["runId"].isin([56, 74, 49, 23, 35, 6, 83, 54]))]

scaler_cols = sensor_list.copy()

scaler = RobustScaler()
scaler_data_tr = scaler.fit_transform(df_[scaler_cols])
scaler_data_tr = pd.DataFrame(scaler_data_tr, index=df_.index, columns=scaler_cols)
X_train_df = pd.concat([df_[["class", "runId"]], scaler_data_tr], axis=1)

X_train, y_train, runList_tr, l_len_runs_tr = create_datasets(X_train_df, ws)

lda = LinearDiscriminantAnalysis()
X_train = lda.fit_transform(X_train, y_train)

# Run id of every training sample, in the row order of X_train.
l_index = []
for run_, num_run in zip(runList_tr, l_len_runs_tr):
    l_index.extend([run_] * num_run)

param = {
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': len(pd.unique(y_train))}  # the number of classes that exist in this dataset

# The settings must be passed as keyword arguments: handing the dict over
# positionally binds it to the first constructor parameter (or is rejected
# outright by versions with a keyword-only constructor).
xgb_final_model = XGBClassifier(**param, random_state=41, verbosity=0)
xgb_final_model.fit(X_train, y_train)
pred = xgb_final_model.predict(X_train)

# NOTE: these scores are computed on the data the model was fitted on, so they
# describe the fit, not the generalization performance.
acc_val = round(accuracy_score(y_train, pred), 3)
f1_val = round(f1_score(y_train, pred, average='weighted'), 3)
mcc_val = round(matthews_corrcoef(y_train, pred), 3)

cm = confusion_matrix(y_train, pred)

print("XGBClassifier:", "ACC:", acc_val, "F1:", f1_val, "MCC:", mcc_val)

print(cm)


# Model Training for PHME21 Clustering Task

In [None]:
from sklearn.cluster import KMeans
from collections import Counter


In [None]:
# Drop the rows of the excluded runs, then fit a fresh RobustScaler on the
# selected sensor columns and keep the scaled values as a labelled frame.
excluded_run_ids = [56, 74, 49, 23, 35, 6, 83, 54]
is_excluded = df["runId"].isin(excluded_run_ids)
df_ = df.loc[~is_excluded]

scaler_cols = sensor_list.copy()

scaler = RobustScaler()
scaler_data_tr = pd.DataFrame(
    scaler.fit_transform(df_[scaler_cols]),
    index=df_.index,
    columns=scaler_cols,
)


In [None]:
# Clustering input: only the scaled Temperature_value and Humidity_value columns.
# Note that this rebinds X_train.
X_train = scaler_data_tr[['Temperature_value', 'Humidity_value']]
X_train.sample(10)  # notebook cell output: 10 random rows

In [None]:
# Two-cluster KMeans on the scaled features. The seed and the explicit number of
# initializations make the fitted centers and the label numbering reproducible,
# like the other seeded models in this notebook.
cluster_model = KMeans(n_clusters=2, n_init=10, random_state=41).fit(X_train)
y_pred = cluster_model.labels_
Counter(y_pred)  # notebook cell output: number of samples per cluster

# Dump final models

In [None]:
# Everything needed at inference time, in the order the loader has to unpack it:
# field layout, feature list, fitted scaler, fitted LDA, classifier, cluster model.
models_to_submit = (fields_dict, sensor_list, scaler, lda, xgb_final_model, cluster_model)

with open('models_to_submit.pkl', 'wb') as handle:
    pickle.dump(models_to_submit, handle, protocol=pickle.HIGHEST_PROTOCOL)