In [1]:
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
from sacred import Experiment
from sacred.commands import print_config
from sacred.observers import FileStorageObserver

# Sacred experiment handle shared by the config and main cells below.
# interactive=True is required to create an Experiment from a notebook kernel.
ex = Experiment(name='PHME21', interactive=True)
# Optional: persist every run to disk.
# ex.observers.append(FileStorageObserver('./lgbm_logs/'))

In [3]:
# Sacred config scope: the locals assigned in the body are the config entries
# that Sacred injects by name into the function registered with @ex.main.
# Several entries below are accepted by ex_main but never read in its body.
@ex.config
def configuration_settings():
    fill_missing_ = True  # not read by ex_main; a same-named module variable gates back-filling in the scratch cell
    scale_type = "standard"  # not read by ex_main (pipelines there always use StandardScaler)
    window_size_ = 5  # sliding-window length passed to create_datasets; overridden per run via config_updates
    train_model = True  # not read anywhere in the visible notebook
    eval_model = True  # not read anywhere in the visible notebook
    subsample = 10 # for sampling -- not read anywhere in the visible notebook
    stride = 5 # for stride -- not read; the window step is derived from the window size instead
    
    # LightGBM hyper-parameters. ex_main does not forward them to any model.
    # NOTE(review): LightGBM documents num_leaves as > 1, so -1 would need to
    # change before this value is ever passed to LGBMClassifier -- confirm.
    num_leaves_ = -1
    learning_rate_ = 0.1
    n_estimators_ = 100
    

In [4]:
# Load the three imputed CSV exports and stack them row-wise into one frame.
# The original row index of each file is kept (no ignore_index).
data_df_1, data_df_2, data_df_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/imputed_training_validation_1.csv",
        "../../data/imputed_training_validation_2.csv",
        "../../data/imputed_model_refinement.csv",
    )
)

merged_df = pd.concat([data_df_1, data_df_2, data_df_3], axis=0)

In [5]:
# Optional filter (disabled): drop the rows whose class is 0.
# train_df = merged_df[~ (merged_df['class'] == 0)]

# Tag every row with runId = 1000 * class + run, then discard the raw 'run'
# column so it is not picked up as a feature.
# NOTE(review): assumes (class, run) pairs are unique across the three source
# files and run < 1000; otherwise distinct runs would share a runId -- confirm.
train_df = (
    merged_df
    .assign(runId=lambda frame: 1000 * frame['class'] + frame['run'])
    .drop(columns='run')
)

labels, runs = train_df['class'], train_df['runId']

# One row per distinct (class, runId) pair with a fresh 0..n-1 index; the
# cross-validation splits are drawn over this table.
run_df = (
    train_df[['class', 'runId']]
    .drop_duplicates()
    .reset_index(drop=True)
)

train_df.shape

(57971, 249)

In [6]:
# Feature columns: everything in train_df except the label and the run id.
# NOTE(review): any bookkeeping column read from the CSVs (for example a saved
# index such as 'Unnamed: 0') would be treated as a sensor here -- confirm.
sensor_list = [
    column for column in train_df.columns
    if column not in ('class', 'runId')
]
len(sensor_list)

247

In [7]:
# from catboost import CatBoostClassifier
# from ngboost import NGBClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC

In [8]:
# Preview the first five rows of the prepared frame.
train_df.head(n=5)

Unnamed: 0,CpuTemperature_vMax,CpuTemperature_vMin,CpuTemperature_vStd,CpuTemperature_value,DurationPickToPick_vCnt,DurationPickToPick_vFreq,DurationPickToPick_vMax,DurationPickToPick_vMin,DurationPickToPick_vStd,DurationPickToPick_vTrend,...,NumberFuseEstimated_vTrend_na,NumberFuseEstimated_value_na,SharpnessImage_vMax_na,SharpnessImage_vMin_na,SharpnessImage_vStd_na,SharpnessImage_vTrend_na,SharpnessImage_value_na,TemperatureThermoCam_vTrend_na,class,runId
0,56.75,44.75,3.523729,47.833333,0.0,0.0,3.192,2.807,0.1925,0.385,...,1,1,1,1,1,1,1,1,0,0
1,48.75,42.25,2.395308,45.0,2.0,0.198207,3.192,2.807,0.1925,0.385,...,0,0,0,0,0,0,0,0,0,0
2,47.5,42.0,2.085815,43.825,3.0,0.296778,3.23,3.106,0.051674,0.02,...,1,1,1,1,1,1,1,0,0,0
3,48.25,42.25,1.853375,45.2,3.0,0.298541,3.307,3.103,0.085391,0.031,...,1,1,1,1,1,1,1,0,0,0
4,50.0,42.5,2.661766,45.7,3.0,0.298554,3.242,3.153,0.038577,-0.036,...,1,1,1,1,1,1,1,0,0,0


In [9]:
def create_sequence(sequence, n_steps):
    """Split a sequence into overlapping windows of ``n_steps`` consecutive items.

    Window ``i`` is ``sequence[i:i + n_steps]``. Consecutive windows are
    shifted by one position and only complete windows are produced.

    Parameters
    ----------
    sequence : array-like
        Indexable sequence, sliced along its first axis.
    n_steps : int
        Number of consecutive items per window.

    Returns
    -------
    numpy.ndarray
        The stacked windows, shape ``(n_windows, n_steps, *item_shape)``.
        When ``sequence`` is shorter than ``n_steps`` the result is an empty
        array of shape ``(0, n_steps, *item_shape)``, so callers can still
        rely on the trailing dimensions.
    """
    total = len(sequence)
    # Number of start positions whose window fits entirely inside the sequence.
    n_windows = max(0, min(total, total - n_steps + 1))

    if n_windows == 0:
        # np.array([]) would collapse to shape (0,) and lose the window and
        # feature dimensions; build the empty result with an explicit shape.
        template = np.asarray(sequence)
        empty_shape = (0, max(n_steps, 0)) + template.shape[1:]
        return np.empty(empty_shape, dtype=template.dtype)

    return np.array([sequence[start:start + n_steps] for start in range(n_windows)])


In [10]:
def create_dataset_for_run(df, ws):
    """Build flattened sliding-window samples and their labels for one run.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single run. Must contain a ``'class'`` column; the feature
        columns are those named in the module-level ``sensor_list``.
    ws : int
        Window length in rows.

    Returns
    -------
    seq : numpy.ndarray
        One row per kept window, each row being the window's values flattened
        to length ``ws * n_features``.
    labels : numpy.ndarray
        The ``'class'`` value of the first row of each kept window.

    Notes
    -----
    Only every ``ws // 2``-th window is kept, so neighbouring samples overlap
    by roughly half a window. A run shorter than ``ws`` yields zero samples
    instead of raising.
    """
    sensors_df = df.filter(sensor_list)

    # One window per valid start row.
    seq = create_sequence(sensors_df.values, n_steps=ws)
    seq_count = seq.shape[0]

    # Flatten each window into a single feature row. The width is spelled out
    # because NumPy cannot infer a -1 dimension when there are zero windows
    # (which happens for a run shorter than ws).
    seq = seq.reshape((seq_count, ws * sensors_df.shape[1]))

    # Keep every (ws // 2)-th window; clamp to 1 so ws == 1 does not produce
    # an invalid zero slice step.
    step = max(1, ws // 2)
    seq = seq[::step]

    # Window i starts at row i, so its label is the class found at row i.
    labels = df['class'].values[:seq_count][::step]

    return seq, labels


 

In [11]:
# Globals
#
# Module-level defaults kept outside the Sacred config.
# NOTE(review): none of these three names is read anywhere else in the visible
# notebook. scale_type, train_model and eval_model are defined in the Sacred
# config cell instead of here.
_subsample = _stride = -1
verbose = False

In [12]:
def create_datasets(df, ws):
    """Build the windowed feature matrix and label vector for a set of runs.

    Windows are created per run (via ``create_dataset_for_run``) so that no
    window spans two runs, and the per-run results are then stacked in order
    of each run's first appearance in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one or more runs; must contain ``'runId'`` and the columns
        required by ``create_dataset_for_run``.
    ws : int
        Window length in rows.

    Returns
    -------
    X : numpy.ndarray
        2-D array with one flattened window per row.
    y : numpy.ndarray
        1-D array with one label per row of ``X``.

    Raises
    ------
    ValueError
        If ``df`` contains no runs.
    """
    window_blocks = []
    label_blocks = []

    # A single groupby pass replaces one full-frame boolean mask per run.
    # sort=False keeps the runs in order of first appearance.
    for _, run_rows in df.groupby('runId', sort=False):
        sensor_data, label_data = create_dataset_for_run(run_rows, ws)
        window_blocks.append(sensor_data)
        label_blocks.append(label_data)

    if not window_blocks:
        raise ValueError("create_datasets: the input frame contains no runs")

    X_t = np.concatenate(window_blocks, axis=0)
    y_t = np.concatenate(label_blocks, axis=0)

    return X_t, y_t


In [13]:
# cv = StratifiedKFold(n_splits=4, shuffle=True)

In [14]:
# Window sizes to sweep: 5, 10, ..., 50 followed by a second pass at 10.
# NOTE(review): the trailing 10 repeats an earlier entry, so that window size
# is evaluated twice -- confirm this is intended.
ws_list = list(range(5, 51, 5)) + [10]

In [16]:
def _build_voting_classifier():
    """Assemble the hard-voting ensemble that ``ex_main`` fits on each fold.

    Every candidate is a pipeline that standardises the features first. All
    candidates stay defined here so alternatives are easy to swap in, but only
    the ones named in ``active_members`` take part in the vote.
    """
    candidates = {
        'xgb': make_pipeline(StandardScaler(), XGBClassifier()),
        'lgbm1': make_pipeline(StandardScaler(), LGBMClassifier()),
        'lgbm2': make_pipeline(
            StandardScaler(),
            LGBMClassifier(num_leaves=5, learning_rate=0.01, n_estimators=200),
        ),
        'svm': make_pipeline(
            StandardScaler(),
            PCA(n_components=21),
            SVC(class_weight='balanced', gamma='auto'),
        ),
        'gnb': make_pipeline(
            StandardScaler(),
            PCA(n_components=21),
            GaussianNB(),
        ),
    }
    active_members = ('xgb', 'lgbm1', 'svm')

    return VotingClassifier(
        estimators=[(name, candidates[name]) for name in active_members],
        voting='hard', verbose=True, n_jobs=6)


@ex.main
def ex_main(_run, fill_missing_, scale_type, window_size_, train_model, eval_model, subsample, stride, num_leaves_, learning_rate_, n_estimators_):
    """Cross-validate the voting ensemble for one window size and log the result.

    The runs listed in ``run_df`` are split into stratified folds, so all rows
    of a run land on the same side of a split. For every fold the windowed
    datasets are built with ``create_datasets``, the ensemble is fitted, and
    accuracy and weighted F1 are measured on the validation runs. The fold
    averages are added as one row to the global ``results_df`` and the table
    is rewritten to ``voting_results.xlsx``.

    Only ``window_size_`` (and the run's seed, read from ``_run.config``) is
    used. The remaining config parameters are accepted so Sacred can inject
    them, but they are not read here.

    Relies on the module-level ``run_df``, ``train_df``, ``total_runs``,
    ``run_counter`` and ``results_df`` being defined before the run starts.
    """
    global total_runs
    global run_counter
    global results_df

    print ('===========================================================================')
    print ('Run:', run_counter+1, '/', total_runs)
    print ("Window Size:", window_size_)

    run_counter += 1

    n_splits = 3
    # Seed the shuffle from the seed Sacred stores in the run config so the
    # folds can be reproduced; falls back to unseeded if no seed is present.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True,
                         random_state=_run.config.get('seed'))

    acc_scores = []
    f1_scores = []

    for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
        print ("--> Fold: ", fold)

        training_runIds = run_df.loc[training_indices, 'runId']
        validation_runIds = run_df.loc[validation_indices, 'runId']

        # Boolean-mask selection already returns new frames.
        X_train_df = train_df[train_df['runId'].isin(training_runIds)]
        X_val_df = train_df[train_df['runId'].isin(validation_runIds)]

        X_train, y_train = create_datasets(X_train_df, window_size_)
        X_val, y_val = create_datasets(X_val_df, window_size_)

        print ("Train data shape:", X_train.shape, y_train.shape)
        print ("Val data shape:", X_val.shape, y_val.shape)

        voting_clf = _build_voting_classifier()
        voting_clf.fit(X_train, y_train)

        pred = voting_clf.predict(X_val)

        # scikit-learn metrics take (y_true, y_pred). The order matters for
        # weighted F1, whose per-class weights come from the support in y_true.
        acc_val = accuracy_score(y_val, pred)
        f1_val = f1_score(y_val, pred, average='weighted')

        acc_scores.append(acc_val)
        f1_scores.append(f1_val)

        print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

    # Average over the folds actually evaluated instead of a hard-coded 3.0.
    acc_avg = sum(acc_scores) / len(acc_scores)
    f1_avg = sum(f1_scores) / len(f1_scores)

    print ()
    print ("Avg ACC:", acc_avg, "Avg F1:", f1_avg)

    result = {
        'ML_Algorithm': 'voting',
        'Window Size': window_size_,
        'ACC': acc_avg,
        'F1': f1_avg,
    }

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame.
    new_row = pd.DataFrame([result])
    if results_df.empty:
        results_df = new_row
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    results_df.to_excel("voting_results.xlsx")

In [None]:
# Reset the bookkeeping globals that ex_main reads and updates, then launch
# one Sacred run per window size in ws_list.
run_counter, total_runs = 0, len(ws_list)

results_df = pd.DataFrame()

for wsl in ws_list:
    ex.run(config_updates=dict(window_size_=wsl))



INFO - PHME21 - Running command 'ex_main'
INFO - PHME21 - Started


Run: 1 / 11
Window Size: 5
--> Fold:  0
Train data shape: (19318, 1235) (19318,)
Val data shape: (9495, 1235) (9495,)


In [None]:
# Scratch experiment: fit a single LightGBM model on the first fold of a
# 3-fold stratified split over the runs, using a long window.
cv = StratifiedKFold(n_splits=3, shuffle=True)

fill_missing_ = True
window_size_ = 200
num_leaves_ = 4
learning_rate_ = 0.1
n_estimators_ = 500

# Only the first split is used, so take it directly instead of looping and
# breaking after one iteration.
fold = 0
training_indices, validation_indices = next(cv.split(run_df['runId'], run_df['class']))
print ("--> Fold: ", fold)

training_runIds = run_df.loc[training_indices, 'runId']
validation_runIds = run_df.loc[validation_indices, 'runId']

X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

if fill_missing_:
    # DataFrame.bfill() replaces the deprecated fillna(method='backfill').
    # Gaps with no later value to copy from are then set to -1.
    # NOTE(review): the back-fill runs over the whole fold frame, so a gap at
    # the end of one run is filled from the first rows of the next run --
    # confirm this is intended.
    X_train_df = X_train_df.bfill().fillna(-1)
    X_val_df = X_val_df.bfill().fillna(-1)

X_train, y_train = create_datasets(X_train_df, window_size_)
X_val, y_val = create_datasets(X_val_df, window_size_)

print ("Train data shape:", X_train.shape, y_train.shape)
print ("Val data shape:", X_val.shape, y_val.shape)

model = LGBMClassifier(num_leaves=num_leaves_, learning_rate=learning_rate_, n_estimators=n_estimators_)
model.fit(X_train, y_train)
