In [None]:
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
### import data
data_df_1 = pd.read_csv("../../data/training_validation_1.csv")
data_df_2 = pd.read_csv("../../data/training_validation_2.csv")

# Stack both files row-wise; each file keeps its own row labels.
train_df = pd.concat([data_df_1, data_df_2], axis=0)

# Single identifier combining class and run number.
# NOTE(review): only unique per (class, run) if run numbers stay below 1000 - confirm.
train_df['runId'] = train_df['run'] + train_df['class'] * 1000

labels = train_df['class']
runs = train_df['runId']

# One row per distinct (class, runId) pair, renumbered 0..n-1.
run_df = (
    train_df[['class', 'runId']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# 'run' is now encoded in 'runId'; 'class' stays because it is the target.
del train_df['run']

train_df.shape

In [None]:
# Feature columns: every column of train_df except the run identifier and the target.
sensor_list = [column for column in train_df.columns if column not in ('runId', 'class')]
len(sensor_list)

In [None]:
from catboost import CatBoostClassifier
from ngboost import NGBClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Preview the first rows of the merged training frame.
train_df.head()

In [None]:
def create_sequence(sequence, n_steps):
    """Slice ``sequence`` into overlapping windows of ``n_steps`` items.

    Window ``i`` is ``sequence[i:i + n_steps]``. Consecutive windows start one
    item apart, and windows stop as soon as one would run past the end of
    ``sequence``.

    Returns the windows stacked with ``np.array``; the result is an empty
    array when no window fits.
    """
    total = len(sequence)
    # Number of start positions whose window still ends inside the sequence.
    # When n_steps < 1 the end check never triggers, so every position counts.
    window_count = min(total, total - n_steps + 1)
    return np.array([sequence[start:start + n_steps] for start in range(window_count)])


In [None]:
def create_dataset_for_run(df, ws):
    """Turn the rows of one run into flattened sliding-window samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single run. Needs a 'class' column; the feature columns are
        the ones named in the module-level ``sensor_list``.
    ws : int
        Window size, i.e. how many consecutive rows make up one sample.

    Returns
    -------
    seq : numpy.ndarray
        2-D array with one row per window. Each row holds the ``ws``
        consecutive sensor rows of that window flattened row by row, so it has
        ``ws * n_sensors`` entries. When the run has fewer than ``ws`` rows the
        array is empty with shape ``(0, ws * n_sensors)``.
    labels : numpy.ndarray
        The first ``len(seq)`` values of ``df['class']``; entry ``i`` is the
        class of the first row of window ``i``.
    """
    # TODO (carried over from the original cell): What is RUL_Max in this context?
    sensor_values = df.filter(sensor_list).values

    # All windows of ws consecutive rows.
    seq = create_sequence(sensor_values, n_steps=ws)
    seq_count = seq.shape[0]

    if seq_count == 0:
        # No window fits. reshape((0, -1)) cannot infer the second dimension of
        # a size-0 array and would raise, so build the empty matrix explicitly.
        seq = np.empty((0, ws * sensor_values.shape[1]), dtype=sensor_values.dtype)
    else:
        # Flatten every (ws, n_sensors) window into a single feature row.
        seq = seq.reshape((seq_count, -1))

    labels = df['class'].values[:seq_count]

    return seq, labels


 

In [None]:
# Sanity check: 100 items cut into windows of 3 should yield 98 windows.
l = list(range(100))

len(create_sequence(l, 3))

In [None]:
def create_datasets(df, ws):
    """Build one windowed dataset covering every run in ``df``.

    Each distinct 'runId' (in order of first appearance) is passed on its own
    to ``create_dataset_for_run``, so no window mixes rows of two runs. The
    per-run results are then stacked in that same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one or more runs; must contain a 'runId' column.
    ws : int
        Window size forwarded to ``create_dataset_for_run``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the row-wise stacked per-run sample matrices and the
        flattened, concatenated per-run labels.
    """
    per_run = [
        create_dataset_for_run(df[df['runId'] == run_id], ws)
        for run_id in df['runId'].unique()
    ]

    stacked_samples = np.vstack([samples for samples, _ in per_run])
    stacked_labels = np.concatenate([np.ravel(targets) for _, targets in per_run])

    return stacked_samples, stacked_labels


In [None]:
# cv = StratifiedKFold(n_splits=4, shuffle=True)

In [None]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train, y_train = create_datasets(X_train_df, 1)
    X_val, y_val = create_datasets(X_val_df, 1)
    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = XGBClassifier(verbose=False, use_label_encoder=True)
    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)


In [None]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train, y_train = create_datasets(X_train_df, 3)
    X_val, y_val = create_datasets(X_val_df, 3)
    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = XGBClassifier(verbose=False, use_label_encoder=True)
    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)


In [None]:
from sklearn.decomposition import PCA

In [None]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train_df.fillna(method='backfill', inplace=True)
    X_val_df.fillna(method='backfill', inplace=True)

    X_train_df.fillna(-1, inplace=True)
    X_val_df.fillna(-1, inplace=True)

    X_train, y_train = create_datasets(X_train_df, 1)
    X_val, y_val = create_datasets(X_val_df, 1)
    
    pca_model = PCA(n_components=25)
    
    X_train = pca_model.fit_transform(X_train)
    X_val = pca_model.transform(X_val)

    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = XGBClassifier(verbose=False)
    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)
    

In [None]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train_df.fillna(method='backfill', inplace=True)
    X_val_df.fillna(method='backfill', inplace=True)

    X_train_df.fillna(-1, inplace=True)
    X_val_df.fillna(-1, inplace=True)

    X_train, y_train = create_datasets(X_train_df, 6)
    X_val, y_val = create_datasets(X_val_df, 6)
    
    pca_model = PCA(n_components=25)
    
    X_train = pca_model.fit_transform(X_train)
    X_val = pca_model.transform(X_val)

    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = XGBClassifier(verbose=False)
    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)
    

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the feature importances of `model`, i.e. the
# classifier left behind by whichever training cell ran last.
importances = model.feature_importances_

# Use the model's own feature names when it offers a matching set, otherwise
# label the bars by column position.
# NOTE(review): `feature_names_` looks like a CatBoost attribute; an
# XGBClassifier fitted on plain arrays is not expected to have it - confirm.
feature_labels = getattr(model, 'feature_names_', None)
if feature_labels is None or len(feature_labels) != len(importances):
    feature_labels = ["f{}".format(i) for i in range(len(importances))]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_labels, importances)
ax.set(title="Feature importances", xlabel="importance", ylabel="feature");

In [None]:
# Raw importance scores of the most recently fitted `model`, one per input column.
model.feature_importances_

In [None]:
# Column position of the largest importance score.
np.argmax(model.feature_importances_)

In [None]:
    # Fit an NGBoost classifier on whatever X_train / y_train the most recently
    # executed training cell left behind; this rebinds the global `model`.
    model = NGBClassifier(verbose=False)
    model.fit(X_train, y_train)


In [None]:
# Inspect the training labels left behind by the last executed training cell.
y_train