In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold
from lofo import LOFOImportance, Dataset, plot_importance

%matplotlib inline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC

In [None]:
# Load the three imputed CSV exports (paths are relative to this notebook).
data_df_1, data_df_2, data_df_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/imputed_training_validation_1.csv",
        "../../data/imputed_training_validation_2.csv",
        "../../data/imputed_model_refinement.csv",
    )
)

# Stack the three frames row-wise into a single frame.
merged_df = pd.concat([data_df_1, data_df_2, data_df_3], axis="index")


In [None]:
# Keep only the columns whose label matches one of the alternatives in the
# regex (sensor counters/values plus the 'class' and 'run' bookkeeping columns).
train_df = merged_df.filter(
    regex="vCnt|value|class|run",
    axis="columns",
)

In [None]:
# Feature columns = every selected column except the two bookkeeping columns.
# list.remove raises ValueError if either column is missing, which is a useful
# early failure when the input schema changes.
sensor_list = train_df.columns.tolist()
sensor_list.remove('run')
sensor_list.remove('class')
len(sensor_list)

In [None]:
# split a sequence into samples
def create_sequence(sequence, n_steps):
    """Slice ``sequence`` into overlapping windows of ``n_steps`` consecutive items.

    Window ``i`` is ``sequence[i:i + n_steps]``; consecutive windows start one
    position apart, and only windows that fit entirely inside ``sequence`` are
    kept.

    Args:
        sequence: Sliceable object supporting ``len()`` (e.g. a list or a
            NumPy array whose first axis is the position).
        n_steps: Number of consecutive items per window.

    Returns:
        ``np.array`` built from the list of windows. When no window fits
        (``n_steps > len(sequence)``) the list is empty, so the result is an
        empty array.
    """
    total = len(sequence)
    # Window starts are visited in increasing order, so keeping only the starts
    # whose end index stays within bounds is the same as stopping at the first
    # window that would overrun the sequence.
    windows = [
        sequence[start:start + n_steps]
        for start in range(total)
        if start + n_steps <= total
    ]
    return np.array(windows)


In [None]:
def create_dataset_for_run(df, ws):
    """Turn one run's rows into flattened sliding windows plus their labels.

    Args:
        df: Rows of a single run. Must contain a ``'class'`` column; the
            columns named in the module-level ``sensor_list`` are used as
            features (``DataFrame.filter`` keeps only those that are present).
        ws: Window size, i.e. number of consecutive rows per sample.

    Returns:
        Tuple ``(seq, labels)``:
            * ``seq`` -- 2-D array with one row per window; each row is the
              window's ``ws`` rows flattened one after another, so it has
              ``ws * n_features`` entries.
            * ``labels`` -- the ``'class'`` values of the first ``seq_count``
              rows of ``df``, i.e. each window is labelled with the class of
              its first row.
    """
    sensors_df = df.filter(sensor_list)

    # Sliding windows of ``ws`` consecutive rows.
    seq = create_sequence(sensors_df.values, n_steps=ws)
    seq_count = seq.shape[0]

    if seq_count == 0:
        # The run has fewer than ``ws`` rows, so no window fits.
        # ``reshape((0, -1))`` cannot infer the second dimension of an empty
        # array and raises ValueError; build the empty 2-D result explicitly so
        # that short runs simply contribute no samples.
        seq = np.empty(
            (0, ws * sensors_df.shape[1]),
            dtype=sensors_df.values.dtype,
        )
    else:
        # Flatten every (ws, n_features) window into a single 1-D feature row.
        seq = seq.reshape((seq_count, -1))

    # One label per window, taken from the window's first row.
    labels = df['class'].values[:seq_count]

    # TODO: What is RUL_Max in this context?
    return seq, labels


 

In [None]:
# TODO: X_t, X_tp1, y_t, y_tp1 should be calculated per run.  
# TODO: Then should be merged into one X_t, X_tp1, y_t, y_tp1.
def create_datasets(df, ws):
    """Build the windowed feature table for one group of rows.

    Used as the function passed to ``groupby(['class', 'run']).apply`` so that
    windows never span two different runs.

    Args:
        df: Rows of one group; forwarded unchanged to
            ``create_dataset_for_run``.
        ws: Window size forwarded to ``create_dataset_for_run``.

    Returns:
        ``pd.DataFrame`` with one row per window, holding the flattened window
        values. The labels computed by ``create_dataset_for_run`` are not
        returned.
    """
    # Only the window matrix is needed here; the per-window labels are dropped.
    # The array returned by the helper is freshly built, so no defensive copy
    # is required before wrapping it in a DataFrame.
    sensor_data, _ = create_dataset_for_run(df, ws)
    return pd.DataFrame(sensor_data)


In [None]:
# Column labels for the flattened two-step windows: every sensor name gets a
# "_0" suffix for the first row of the window and "_1" for the second row.
# NOTE: this hard-codes two steps, so it only matches a window size of 2.
cols1 = [str(sensor) + "_0" for sensor in sensor_list]
cols2 = [str(sensor) + "_1" for sensor in sensor_list]
cols = ['class', *cols1, *cols2]
len(cols)

In [None]:
ws = 2

# Window every (class, run) group separately, then turn the group keys back
# into columns and drop everything except 'class' and the window values.
# 'level_2' is the name reset_index gives to the unnamed per-group row index.
new_df = (
    train_df
    .groupby(['class', 'run'])
    .apply(create_datasets, ws)
    .reset_index()
    .drop(columns=['level_2', 'run'])
)
new_df.columns = cols

In [None]:
# Draw a reproducible 50% random sample of the windowed rows.
sample_df = new_df.sample(
    frac=0.5,
    random_state=42,
)

In [None]:
# Preview the first rows of the sampled frame.
sample_df.head()

In [None]:
# define the validation scheme
# The shuffle is seeded so the folds (and therefore the LOFO scores) are the
# same on every run; 42 matches the seed used when sampling the data.
cv = KFold(n_splits=3, shuffle=True, random_state=42)
# cv = StratifiedKFold(n_splits=4, shuffle=False, random_state=0, )

In [None]:
# (rows, columns) of the sampled frame that is handed to LOFO below.
sample_df.shape

In [None]:
# define the target column and the features (every column except 'class')
dataset = Dataset(
    df=sample_df,
    target="class",
    features=[name for name in sample_df.columns if name != 'class'],
)

In [None]:
# Model scored by LOFO: standardise the features, reduce them with PCA
# (n_components=0.9995), then fit a class-balanced SVC.
model = make_pipeline(
    StandardScaler(),
    PCA(n_components=0.9995),
    SVC(class_weight='balanced', gamma='auto'),
)


In [None]:
# define the LOFO run: validation scheme, model and scorer.
# A custom model is passed explicitly, so LOFO's default model is not used.
lofo_imp = LOFOImportance(
    dataset,
    cv=cv,
    model=model,
    scoring="f1_micro",
)

In [None]:
# get the mean and standard deviation of the importances in pandas format
# (the 'feature' and 'importance_mean' columns are used further down)
importance_df = lofo_imp.get_importance()

In [None]:
# plot the means and standard deviations of the importances
# (figsize is 12 wide by 20 tall)
plot_importance(importance_df, figsize=(12, 20))

In [None]:
# Count the distinct sensors that have at least one feature with a positive
# mean importance. The last two characters of each feature name (the "_0" /
# "_1" window-step suffix) are stripped before de-duplicating.
len({
    feature_name[:-2]
    for feature_name in importance_df.loc[importance_df.importance_mean > 0, "feature"]
})