In [None]:
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
from sacred import Experiment
from sacred.commands import print_config
from sacred.observers import FileStorageObserver

# Sacred experiment handle used by the @ex.config / @ex.main cells below.
# interactive=True is passed because the experiment is defined and run from
# notebook cells rather than from a script.
ex = Experiment('PHME21', interactive=True)
# File logging is currently disabled.
# NOTE(review): the path says "lgbm_logs" although this notebook trains an SVC —
# confirm the target directory before re-enabling.
# ex.observers.append(FileStorageObserver('./lgbm_logs/'))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC

In [None]:
@ex.config
def configuration_settings():
    # Default values that sacred injects into ex_main by parameter name.
    # Individual runs override them through ex.run(config_updates=...).
    fill_missing_ = True  # interpolate / back-fill missing readings before windowing
    scale_type = "standard"  # accepted by ex_main but not read in its body
    window_size_ = 5  # number of consecutive rows per sliding window
    train_model = True  # accepted by ex_main but not read in its body
    eval_model = True  # accepted by ex_main but not read in its body
    subsample = 10 # for sampling
    stride = 5 # for stride

    # SVC hyper-parameters.
    c_ = 1
    kernel_ = 'rbf'
    # Use the real None (no class re-weighting). The string 'None' is not a
    # value SVC accepts for class_weight (only None, 'balanced' or a dict).
    class_weight_ = None
                

In [None]:
### import data
# Read the three CSV exports and stack them row-wise into one training frame.
data_df_1 = pd.read_csv("../../data/training_validation_1.csv")
data_df_2 = pd.read_csv("../../data/training_validation_2.csv")
data_df_3 = pd.read_csv("../../data/model_refinement.csv")
train_df = pd.concat([data_df_1, data_df_2, data_df_3], axis=0)

# Composite run key: the class label scaled by 1000 plus the run number.
train_df['runId'] = train_df['class'] * 1000 + train_df['run']

labels = train_df['class']
runs = train_df['runId']

# One row per distinct (class, runId) pair, indexed 0..n-1 so that the positional
# indices produced by the cross-validation splitter can be used as labels.
run_df = (
    train_df[['class', 'runId']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# 'run' is now folded into 'runId'; 'class' stays because the windowing code
# reads it from the frame.
train_df = train_df.drop(columns='run')

train_df.shape

% skip 
# Every column except the target ('class') and the run key ('runId') is a
# candidate feature column.
sensor_list = [*train_df.columns]
sensor_list.remove('class')
sensor_list.remove('runId')
len(sensor_list)

In [None]:
# Hand-picked subset of feature columns (names end in _value or _vCnt).
# Presumably the subset used for the LightGBM experiments, judging by the name —
# it is not referenced again in the visible part of this notebook, where
# sensor_list is set to sensor_list_svm instead.
sensor_list_lgbm = ['CpuTemperature_value',
 'DurationPickToPick_vCnt',
 'DurationRobotFromFeederToTestBench_vCnt',
 'DurationRobotFromFeederToTestBench_value',
 'DurationRobotFromTestBenchToFeeder_value',
 'DurationTestBenchClosed_vCnt',
 'DurationTestBenchClosed_value',
 'EPOSCurrent_value',
 'EPOSPosition_vCnt',
 'EPOSPosition_value',
 'EPOSVelocity_vCnt',
 'EPOSVelocity_value',
 'FeederAction1_vCnt',
 'FeederAction2_vCnt',
 'FeederAction3_vCnt',
 'FeederAction4_vCnt',
 'FeederBackgroundIlluminationIntensity_vCnt',
 'FuseCycleDuration_vCnt',
 'FuseCycleDuration_value',
 'FuseHeatSlopeNOK_vCnt',
 'FuseHeatSlopeNOK_value',
 'FuseHeatSlopeOK_vCnt',
 'FuseHeatSlopeOK_value',
 'FuseHeatSlope_value',
 'FuseIntoFeeder_vCnt',
 'FuseOutsideOperationalSpace_vCnt',
 'FuseOutsideOperationalSpace_value',
 'FusePicked_vCnt',
 'FusePicked_value',
 'FuseTestResult_vCnt',
 'FuseTestResult_value',
 'Humidity_value',
 'IntensityTotalImage_value',
 'IntensityTotalThermoImage_value',
 'LightBarrierActiveTaskDuration1_vCnt',
 'LightBarrierActiveTaskDuration1_value',
 'LightBarrierPassiveTaskDuration1_value',
 'NumberFuseDetected_value',
 'NumberFuseEstimated_value',
 'ProcessCpuLoadNormalized_value',
 'ProcessMemoryConsumption_value',
 'SharpnessImage_vCnt',
 'SharpnessImage_value',
 'SmartMotorPositionError_value',
 'SmartMotorSpeed_value',
 'TemperatureThermoCam_value',
 'Temperature_value',
 'TotalCpuLoadNormalized_value',
 'TotalMemoryConsumption_value',
 'VacuumFusePicked_vCnt',
 'VacuumFusePicked_value',
 'VacuumValveClosed_vCnt',
 'VacuumValveClosed_value',
 'Vacuum_vCnt',
 'Vacuum_value',
 'ValidFrameOptrisPIIRCamera_vCnt',
 'ValidFrame_vCnt']

# Display the number of selected columns.
len(sensor_list_lgbm)

In [None]:
# Feature columns fed to the SVM pipeline: a later cell assigns this list to
# sensor_list, which create_dataset_for_run passes to DataFrame.filter.
# NOTE(review): 'LightBarrieActiveTaskDuration2_vCnt' is spelled without the
# second 'r', unlike the neighbouring 'LightBarrier...' names. DataFrame.filter
# keeps only labels that exist, so a misspelt name would be dropped without any
# error — confirm the spelling against the CSV header.
sensor_list_svm = [
    'CpuTemperature_value',
    'DurationPickToPick_vCnt',
    'DurationRobotFromTestBenchToFeeder_value',
    'EPOSVelocity_vCnt',
    'FeederAction4_vCnt',
    'FeederBackgroundIlluminationIntensity_value',
    'FuseCycleDuration_value',
    'FuseHeatSlopeNOK_value',
    'FuseHeatSlope_vCnt',
    'Humidity_value',
    'IntensityTotalImage_value',
    'IntensityTotalThermoImage_value',
    'LightBarrieActiveTaskDuration2_vCnt',
    'LightBarrierPassiveTaskDuration1_vCnt',
    'LightBarrierPassiveTaskDuration1_value',
    'LightBarrierPassiveTaskDuration2_vCnt',
    'NumberFuseDetected_value',
    'ProcessCpuLoadNormalized_value',
    'ProcessMemoryConsumption_value',
    'SmartMotorPositionError_vCnt',
    'SmartMotorSpeed_vCnt',
    'TemperatureThermoCam_value',
    'Temperature_value',
    'TotalCpuLoadNormalized_value',
    'TotalMemoryConsumption_value',
    'VacuumFusePicked_value',
    'ValidFrame_vCnt'
]

# Display the number of selected columns.
len(sensor_list_svm)

In [None]:
# Select the active feature set: create_dataset_for_run reads the module-level
# name sensor_list, so this assignment decides which columns are windowed.
sensor_list = sensor_list_svm

In [None]:
# from catboost import CatBoostClassifier
# from ngboost import NGBClassifier
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Preview the first rows of the merged training frame.
train_df.head()

In [None]:
# split a sequence into samples
def create_sequence(sequence, n_steps):
    """Cut ``sequence`` into all overlapping windows of ``n_steps`` items.

    A window is taken at every start index, advancing one item at a time, for as
    long as the window still fits inside ``sequence``.

    Parameters
    ----------
    sequence : sliceable sequence (list, numpy array, ...)
        Items to window along the first axis.
    n_steps : int
        Number of consecutive items per window.

    Returns
    -------
    numpy.ndarray
        The stacked windows; an empty 1-D array when no window fits.
    """
    total = len(sequence)
    windows = [
        sequence[start:start + n_steps]
        for start in range(total)
        if start + n_steps <= total  # keep only windows that fit completely
    ]
    return np.array(windows)


In [None]:
def create_dataset_for_run(df, ws):
    """Turn the rows of one run into flattened sliding-window samples.

    The columns named in the module-level ``sensor_list`` are cut into windows
    of ``ws`` consecutive rows (via ``create_sequence``), each window is
    flattened into one feature vector, and then only every ``ws // 2``-th
    window is kept (every window when ``ws`` is 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single run. Must contain a ``'class'`` column; names in
        ``sensor_list`` that are not columns of ``df`` are dropped by
        ``DataFrame.filter`` without an error.
    ws : int
        Window length in rows; must be at least 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(seq, labels)``: ``seq`` has shape ``(n_kept, ws * n_sensors)`` and
        ``labels`` holds the ``'class'`` value of the first row of each kept
        window. Both are empty when the run has fewer than ``ws`` rows.

    Raises
    ------
    ValueError
        If ``ws`` is smaller than 1.
    """
    if ws < 1:
        raise ValueError("ws must be >= 1, got %r" % (ws,))

    sensors_df = df.filter(sensor_list)
    n_features = sensors_df.shape[1]

    # All windows of ws consecutive rows.
    seq = create_sequence(sensors_df.values, n_steps=ws)
    seq_count = seq.shape[0]

    # Flatten each window to 1D. The width is spelled out instead of using -1
    # because numpy cannot infer a -1 dimension when there are zero windows
    # (a run shorter than ws), which used to raise here.
    seq = seq.reshape((seq_count, ws * n_features))

    # Keep every (ws // 2)-th window. ws // 2 is 0 for ws == 1 and a zero
    # slice step raises ValueError, so the step is clamped to at least 1.
    step = max(1, ws // 2)
    seq = seq[::step]

    # One label per kept window, taken at the window's first row.
    labels = df['class'].values[:seq_count][::step]

    return seq, labels


 

In [None]:
# Globals

# NOTE(review): the next two comment lines describe a sensor list, but no list
# is defined in this cell — they look like leftovers from an earlier version.
# Main sensor list. 
# List shall include other features such as operating conditions. 
# -1 placeholders; neither name is read anywhere in the visible notebook.
_subsample = -1
_stride = -1

# Disabled settings; scale_type, train_model and eval_model (and window_size_)
# now appear in the sacred config function instead.
# scale_type = "standard"
# window_size = 50
# cv_fold = 3

# train_model = True
# eval_model = True
# Not read anywhere in the visible notebook.
verbose = False

In [None]:
# TODO: X_t, X_tp1, y_t, y_tp1 should be calculated per run.  
# TODO: Then should be merged into one X_t, X_tp1, y_t, y_tp1.
def create_datasets(df, ws):
    """Build the windowed feature matrix and label vector for all runs in ``df``.

    Rows are grouped by ``'runId'`` (in order of first appearance), each run is
    windowed separately with ``create_dataset_for_run`` so that no window spans
    two runs, and the per-run results are stacked.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'runId'`` column plus whatever ``create_dataset_for_run``
        needs.
    ws : int
        Window length in rows, forwarded unchanged.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the stacked window features and the flattened labels.
    """
    feature_frames = []
    label_frames = []

    for run_id in df['runId'].unique():
        run_rows = df[df['runId'] == run_id]
        windows, window_labels = create_dataset_for_run(run_rows, ws)

        # Wrapped as frames so the stacking below goes through pd.concat.
        feature_frames.append(pd.DataFrame(windows))
        label_frames.append(pd.DataFrame(window_labels))

    features = pd.concat(feature_frames, axis=0)
    targets = pd.concat(label_frames, axis=0)

    return features.values, targets.values.flatten()


In [None]:
# cv = StratifiedKFold(n_splits=4, shuffle=True)

In [None]:
# Hyper-parameter grid, first version.
# NOTE(review): the next cell assigns the same names again (without window size
# 1), so when the notebook is run top to bottom these values are overwritten
# before the grid search uses them.
# NOTE(review): window size 1 is hazardous here — create_dataset_for_run slices
# with a step of ws // 2, which is 0 for ws == 1.
fillna_list = [True]
ws_list = [1, 5, 10, 15, 20, 25, 30, 40, 50, 100]

# SVC settings to sweep: regularisation C, kernel and class weighting.
params_c = [1, 0.1, 0.01, 0.001]
params_kernel = ['linear', 'poly', 'rbf', 'sigmoid']
params_class_weight = ['balanced']

# Number of grid points (product of all list lengths).
total_runs = len(fillna_list) * len(ws_list) * len(params_c) * len(params_kernel)  * len(params_class_weight)

total_runs

In [None]:
# Hyper-parameter grid swept by the grid-search cell below.
fillna_list = [True]
# Window lengths in rows.
ws_list = [5, 10, 15, 20, 25, 30, 40, 50, 100]

# SVC settings to sweep: regularisation C, kernel and class weighting.
params_c = [1, 0.1, 0.01, 0.001]
params_kernel = ['linear', 'poly', 'rbf', 'sigmoid']
params_class_weight = ['balanced']

# Number of grid points (product of all list lengths); printed by ex_main as
# the denominator of its progress line.
total_runs = len(fillna_list) * len(ws_list) * len(params_c) * len(params_kernel)  * len(params_class_weight)

total_runs

In [None]:

@ex.main
def ex_main(_run, fill_missing_, scale_type, window_size_, train_model, eval_model, subsample, stride, c_, kernel_, class_weight_):
    """Cross-validate one SVM configuration and record its mean scores.

    The distinct runs in ``run_df`` are split into 3 shuffled, class-stratified
    folds. For each fold the rows of ``train_df`` belonging to the training and
    validation runs are selected, optionally gap-filled, windowed with
    ``create_datasets`` and used to fit and score a
    StandardScaler -> PCA(53) -> SVC pipeline.

    Side effects: increments the global ``run_counter``, adds one row to the
    global ``results_df`` and rewrites ``svm_results.xlsx`` after every run.

    Parameters (all injected by sacred from the experiment config)
    ----------
    fill_missing_ : bool
        Interpolate and back-fill missing values before windowing.
    window_size_ : int
        Window length in rows passed to ``create_datasets``.
    c_, kernel_, class_weight_
        Forwarded to ``SVC`` as ``C``, ``kernel`` and ``class_weight``. The
        string ``'None'`` is treated as ``None``.
    _run, scale_type, train_model, eval_model, subsample, stride
        Accepted but not used in this function.

    Returns
    -------
    None
    """
    global run_counter
    global results_df

    # Single source of truth for the fold count (splitter and averages).
    n_splits = 3

    # SVC accepts None, 'balanced' or a dict; map the string sentinel 'None'
    # onto the real None instead of letting SVC reject it.
    svc_class_weight = None if class_weight_ == 'None' else class_weight_

    print ('===========================================================================')
    print ('Run:', run_counter+1, '/', total_runs)
    print ("Fill missing:", fill_missing_, "Window Size:", window_size_, "SVM Params:", c_, kernel_, class_weight_)

    run_counter += 1

    acc_sum = 0
    f1_sum = 0

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    # Folds are built over runs (rows of run_df), not over individual samples,
    # so windows from one run never end up on both sides of a split.
    for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
        print ("--> Fold: ", fold)

        training_runIds = run_df.loc[training_indices, 'runId']
        validation_runIds = run_df.loc[validation_indices, 'runId']

        X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
        X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

        if fill_missing_:
            # .bfill() replaces the deprecated fillna(method='bfill').
            # NOTE(review): filling runs over the whole fold frame, so values
            # can be interpolated across run boundaries — confirm this is
            # intended.
            X_train_df = X_train_df.interpolate().bfill()
            X_val_df = X_val_df.interpolate().bfill()

        X_train, y_train = create_datasets(X_train_df, window_size_)
        X_val, y_val = create_datasets(X_val_df, window_size_)

        print ("Train data shape:", X_train.shape, y_train.shape)
        print ("Val data shape:", X_val.shape, y_val.shape)

        model = make_pipeline(
            StandardScaler(),
            PCA(n_components=53),
            SVC(C=c_, kernel=kernel_, class_weight=svc_class_weight, gamma='auto'),
        )
        model.fit(X_train, y_train)

        pred = model.predict(X_val)

        acc_val = accuracy_score(y_val, pred)
        f1_val = f1_score(y_val, pred, average='weighted')

        acc_sum += acc_val
        f1_sum += f1_val

        print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

    acc_avg = acc_sum / n_splits
    f1_avg = f1_sum / n_splits

    print ()
    print ("Avg ACC:", acc_avg, "Avg F1:", f1_avg)

    result = {
        # The pipeline above ends in an SVC, so the row is labelled SVM.
        'ML_Algorithm': 'SVM',
        'Fill missing': fill_missing_,
        'Window Size': window_size_,
        'Params_C': c_,
        'Params_kernel': kernel_,
        'Params_class_weight': class_weight_,
        'ACC': acc_avg,
        'F1': f1_avg,
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # The first row replaces the empty table directly so that no empty frame
    # takes part in the concatenation.
    row_df = pd.DataFrame([result])
    if results_df.empty:
        results_df = row_df
    else:
        results_df = pd.concat([results_df, row_df], ignore_index=True)

    # Written after every run so partial results survive an interrupted sweep.
    results_df.to_excel("svm_results.xlsx")

In [None]:
# Grid search: reset the progress counter and the results table, then run the
# sacred experiment once per combination of the grid lists.
run_counter = 0
total_runs = len(fillna_list) * len(ws_list) * len(params_c) * len(params_kernel)  * len(params_class_weight)

results_df = pd.DataFrame()

for fnal in fillna_list:
    for wsl in ws_list:
        for pc in params_c:
            for pk in params_kernel:
                for pcw in params_class_weight:
                    ex.run(config_updates={
                        # Pass the fill flag as well; it was looped over but
                        # never handed to the run, so every run silently used
                        # the config default.
                        'fill_missing_': fnal,
                        'window_size_': wsl,
                        'c_': pc,
                        'kernel_': pk,
                        'class_weight_': pcw
                    })



In [None]:
# Manual re-run of one configuration outside sacred, printing a per-class
# weighted F1 for every fold. After the cell finishes, model, X_val and y_val
# hold the state of the last fold.
fill_missing_ = True
window_size_ = 50
c_ = 0.01
kernel_ = 'linear'
class_weight_ = 'balanced'

# results_df is deliberately not reset here: this cell never writes to it, and
# clearing it would make the next ex.run overwrite svm_results.xlsx with a
# single row.

print ('===========================================================================')
print ('Run:', run_counter+1, '/', total_runs)
print ("Fill missing:", fill_missing_, "Window Size:", window_size_, "SVM Params:", c_, kernel_, class_weight_)

run_counter += 1

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=3, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)

    training_runIds = run_df.loc[training_indices, 'runId']
    validation_runIds = run_df.loc[validation_indices, 'runId']

    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    if fill_missing_:
        # .bfill() replaces the deprecated fillna(method='bfill').
        X_train_df = X_train_df.interpolate().bfill()
        X_val_df = X_val_df.interpolate().bfill()

    X_train, y_train = create_datasets(X_train_df, window_size_)
    X_val, y_val = create_datasets(X_val_df, window_size_)

    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)

    model = make_pipeline(StandardScaler(), PCA(n_components=53), SVC(C=c_, kernel=kernel_, class_weight=class_weight_, gamma='auto'))
    model.fit(X_train, y_train)

    # Predict the fold once and slice per class, instead of running the
    # pipeline again for every class.
    fold_pred = model.predict(X_val)

    for c in train_df['class'].unique():
        class_mask = y_val == c
        if not class_mask.any():
            # A class without validation windows cannot be scored.
            print (c, 'no validation samples')
            continue
        # NOTE(review): y_true contains only class c here, so the weighted
        # average reduces to that class's F1 on its own samples.
        print (c, f1_score(y_val[class_mask], fold_pred[class_mask], average='weighted'))


In [None]:
# Display the pipeline left over from the last fold fitted in the cell above.
model

In [None]:
# Predictions for the validation windows whose true class is 11, using the
# model and validation arrays left over from the last fitted fold.
pred = model.predict(X_val[y_val == 11])
pred

In [None]:
# List the distinct class labels present in the training frame.
train_df['class'].unique()

In [None]:
# Weighted F1 per class for the last fitted model, scored only on the
# validation windows whose true label is that class.
for c in train_df['class'].unique():
    in_class = y_val == c
    class_pred = model.predict(X_val[in_class])
    print (c, f1_score(y_val[in_class], class_pred, average='weighted'))


In [None]:
# Demo of the gap-filling recipe used above: interpolate() fills interior and
# trailing gaps, bfill() then fills the leading gap that interpolate() leaves.
# .bfill() replaces the deprecated fillna(method='bfill').
pd.DataFrame([np.nan, 1, 2, 3, np.nan]).interpolate().bfill()