In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.decomposition import PCA

%matplotlib inline

In [15]:
### import data
data_df_1 = pd.read_csv("../../content/training_validation_1.csv")
data_df_2 = pd.read_csv("../../content/training_validation_2.csv")
train_df = pd.concat([data_df_1, data_df_2], axis=0) # Merge data frames
# train_df = data_df_2.filter(regex="vCnt|value")

train_df['runId'] = 1000 * train_df['class'] + train_df['run']

labels = train_df['class']
runs = train_df['runId']

run_df = train_df[['class', 'runId']].copy()
run_df.drop_duplicates(inplace=True)
run_df.reset_index(inplace=True)
del run_df['index']

# del train_df['class']
del train_df['run']

train_df.shape

(39611, 249)

In [16]:
sensor_list = list(train_df.columns)
sensor_list.remove('runId')
sensor_list.remove('class')
len(sensor_list)

247

In [17]:
train_df.head()

Unnamed: 0,CpuTemperature_vMax,CpuTemperature_vMin,CpuTemperature_vStd,CpuTemperature_value,DurationPickToPick_vCnt,DurationPickToPick_vFreq,DurationPickToPick_vMax,DurationPickToPick_vMin,DurationPickToPick_vStd,DurationPickToPick_vTrend,DurationPickToPick_value,DurationRobotFromFeederToTestBench_vCnt,DurationRobotFromFeederToTestBench_vFreq,DurationRobotFromFeederToTestBench_vMax,DurationRobotFromFeederToTestBench_vMin,DurationRobotFromFeederToTestBench_vStd,DurationRobotFromFeederToTestBench_vTrend,DurationRobotFromFeederToTestBench_value,DurationRobotFromTestBenchToFeeder_vCnt,DurationRobotFromTestBenchToFeeder_vFreq,DurationRobotFromTestBenchToFeeder_vMax,DurationRobotFromTestBenchToFeeder_vMin,DurationRobotFromTestBenchToFeeder_vStd,DurationRobotFromTestBenchToFeeder_vTrend,DurationRobotFromTestBenchToFeeder_value,DurationTestBenchClosed_vCnt,DurationTestBenchClosed_vFreq,DurationTestBenchClosed_vMax,DurationTestBenchClosed_vMin,DurationTestBenchClosed_vStd,DurationTestBenchClosed_vTrend,DurationTestBenchClosed_value,EPOSCurrent_vCnt,EPOSCurrent_vFreq,EPOSCurrent_vMax,EPOSCurrent_vMin,EPOSCurrent_vStd,EPOSCurrent_vTrend,EPOSCurrent_value,EPOSPosition_vCnt,...,TemperatureThermoCam_vMax,TemperatureThermoCam_vMin,TemperatureThermoCam_vStd,TemperatureThermoCam_vTrend,TemperatureThermoCam_value,TotalCpuLoadNormalized_vMax,TotalCpuLoadNormalized_vMin,TotalCpuLoadNormalized_vStd,TotalCpuLoadNormalized_value,TotalMemoryConsumption_vMax,TotalMemoryConsumption_vMin,TotalMemoryConsumption_vStd,TotalMemoryConsumption_value,Vacuum_vCnt,Vacuum_vFreq,Vacuum_vMax,Vacuum_vMin,Vacuum_vStd,Vacuum_vTrend,Vacuum_value,VacuumFusePicked_vCnt,VacuumFusePicked_vFreq,VacuumFusePicked_vMax,VacuumFusePicked_vMin,VacuumFusePicked_vStd,VacuumFusePicked_vTrend,VacuumFusePicked_value,VacuumValveClosed_vCnt,VacuumValveClosed_vFreq,VacuumValveClosed_vMax,VacuumValveClosed_vMin,VacuumValveClosed_vStd,VacuumValveClosed_vTrend,VacuumValveClosed_value,ValidFrame_vCnt,ValidFrame_vFreq,ValidFrameOptrisPIIRCamera_vCnt,ValidFrameOptrisPIIRCamera_vFreq,class,runId
0,47.25,42.75,1.384437,44.083333,0,0.0,,,,,,0,0.0,,,,,,0,0.0,,,,,,0,0.0,,,,,,17,7.873314,3.0,-6.0,2.273664,-0.022059,0.352941,17,...,,,,,,29.054602,0.0,8.740538,17.025962,12359.023438,12231.925781,50.81458,12312.574653,20,9.34054,-0.008271,-0.770013,0.253385,-0.016897,-0.571374,0,0.0,,,,,,0,0.0,,,,,,90.0,11.433574,258.0,27.249055,0,18
1,47.25,43.25,1.361066,45.15,3,0.29792,3.212,2.857,0.155491,0.1775,3.074333,3,0.29792,0.681,0.641,0.01819,0.0185,0.666667,3,0.29792,0.704,0.651,0.021649,-0.0265,0.677,3,0.29792,0.113,0.104,0.003742,0.003,0.109,77,7.646608,111.0,-202.0,42.435696,-0.051843,-2.415584,77,...,26.444174,26.442893,0.00064,0.001281,26.443533,49.439812,13.887554,9.885918,25.592494,12442.804688,12389.421875,15.801814,12420.210938,92,9.136209,-0.159642,-0.743645,0.130061,-9.6e-05,-0.514538,51,5.064633,-0.354961,-0.638173,0.065301,-0.000864,-0.541308,19,1.886824,-0.394024,-0.740226,0.090882,-0.006102,-0.628689,114.0,11.320947,271.0,26.912077,0,18
2,51.25,42.75,2.532415,45.675,3,0.298495,3.216,2.731,0.226769,-0.2425,2.895333,4,0.397994,0.652,0.625,0.009618,-0.0046,0.638,3,0.298495,0.673,0.653,0.008219,-0.01,0.662333,4,0.397994,0.115,0.104,0.004265,-0.0025,0.11125,76,7.561879,140.0,-95.0,32.462316,-0.087573,10.473684,76,...,26.35505,26.295674,0.029688,0.059376,26.325362,37.471573,15.20042,6.394334,19.421115,12726.859375,12516.011719,69.455168,12635.228516,91,9.054356,-0.189428,-0.722648,0.117296,0.000109,-0.488287,54,5.372914,-0.415021,-0.622547,0.064446,0.000134,-0.49691,19,1.89047,-0.369121,-0.722648,0.105852,-0.009367,-0.608721,113.0,11.243317,271.0,26.964062,0,18
3,47.0,42.5,1.23996,44.25,3,0.298771,3.267,3.145,0.053773,-0.052,3.220333,3,0.298771,0.684,0.666,0.007587,0.009,0.673667,3,0.298771,0.694,0.67,0.010077,0.0085,0.683667,3,0.298771,0.113,0.111,0.000816,-0.001,0.112,77,7.668459,134.0,-111.0,50.360388,0.038172,6.558442,77,...,26.390768,26.354432,0.015579,0.018168,26.375966,33.276146,12.524403,4.973165,21.095983,12837.554688,12634.859375,60.477673,12778.886719,92,9.162314,-0.149388,-0.724113,0.12459,-2.2e-05,-0.529017,48,4.780335,-0.445296,-0.628407,0.044811,0.001442,-0.556118,23,2.290577,-0.30369,-0.7163,0.106673,-0.00558,-0.606412,114.0,11.353282,271.0,26.988943,0,18
4,46.25,43.5,0.842615,45.05,4,0.398254,3.255,3.022,0.085044,0.0722,3.158,3,0.29869,0.658,0.644,0.0066,0.0,0.648667,4,0.398254,0.748,0.645,0.03881,0.022,0.6845,3,0.29869,0.103,0.102,0.000471,-0.0005,0.102667,76,7.566812,135.0,-167.0,51.651243,-0.141312,4.065789,76,...,26.392135,26.388526,0.001525,-0.001805,26.390052,44.253738,12.492103,9.08689,21.677164,12891.308594,12543.34375,108.307696,12746.797266,92,9.159825,-0.179174,-0.738273,0.128842,-0.000237,-0.514623,49,4.878605,-0.436995,-0.636219,0.049341,0.000968,-0.546413,21,2.090831,-0.388165,-0.738273,0.113608,-0.005115,-0.608945,114.0,11.350235,270.0,26.882136,0,18


In [18]:
# split a sequence into samples
def create_sequence(sequence, n_steps):
    X = list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence):
            break
        # gather input and output parts of the pattern
        seq_x = sequence[i:end_ix]
        X.append(seq_x)
    return np.array(X)


In [19]:
def create_dataset_for_run(df, ws):
#     data_data = np.empty((0, ws * len(sensor_list))) # for 1D
#     data_data = np.empty((0, ws, len(sensor_list))) # for 2D
#     data_data = np.empty((0, len(sensor_list), ws)) # for 2D
#     label_data = np.empty((0, 1))

    sensors_df = df.filter(sensor_list)

    # Calculate seq of windows_size len
    seq = create_sequence(sensors_df.values, n_steps=ws)
#     seq = np.transpose(seq, axes=(0, 2, 1))
    seq_count = seq.shape[0]
    seq = seq.reshape((seq_count, -1)) # for 1D

    # add new seq to data_data array
#     data_data = np.vstack((data_data, seq))

    # Calculate RULS
    labels = df['class'].values[:seq_count]

    # add rul to rul_data array
#     rul_data = np.vstack((rul_data, ruls))

# TODO: What is RUL_Max in this context?

#     print ("Shape:", seq.shape, labels.shape)
    return seq, labels

In [20]:
l = list(range(100))

len(create_sequence(l, 3))

98

In [21]:
# TODO: X_t, X_tp1, y_t, y_tp1 should be calculated per run.  
# TODO: Then should be merged into one X_t, X_tp1, y_t, y_tp1.
def create_datasets(df, ws):
    
    run_list = df['runId'].unique()

    X_df_list = []
    y_df_list = []
    
    for r in run_list:
        r_df = df[df['runId'] == r]
#         print ("--> r: ", r, r_df.shape)
        sensor_data, label_data = create_dataset_for_run(r_df, ws)

        # Post Processing for the model

        # Padding for model input 
        padded_sensor_data = sensor_data.copy() #np.hstack((sensor_data, np.zeros((sensor_data.shape[0], 2)))) # for AE     

        # Calculate X(t) and X(t+1) for model input/output 
        X_t = padded_sensor_data[:]

        # Calculate y(t) and y(t+1) for model input/output 
        y_t = label_data[:]

        X_df_list.append(pd.DataFrame(X_t))
        y_df_list.append(pd.DataFrame(y_t))
    
    X_t = pd.concat(X_df_list, axis=0) # Merge data frames
    y_t = pd.concat(y_df_list, axis=0) # Merge data frames

    return X_t.values, y_t.values.flatten()


In [22]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train_df.fillna(method='backfill', inplace=True)
    X_val_df.fillna(method='backfill', inplace=True)

    X_train_df.fillna(-1, inplace=True)
    X_val_df.fillna(-1, inplace=True)

    X_train, y_train = create_datasets(X_train_df, 1)
    X_val, y_val = create_datasets(X_val_df, 1)
    
    pca_model = PCA(n_components=25)
    
    X_train = pca_model.fit_transform(X_train)
    X_val = pca_model.transform(X_val)

    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = RandomForestClassifier(verbose=False)
    #model = DecisionTreeClassifier()

    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)
    

--> Fold:  0
Data shape (28808, 249) (10803, 249)
Train data shape: (28808, 25) (28808,)
Val data shape: (10803, 25) (10803,)
CpuTemperature_vMax 0.1516999723623758
Fold: 0 ACC: 0.726094603350921 F1: 0.820863318332524
--> Fold:  1
Data shape (30268, 249) (9343, 249)
Train data shape: (30268, 25) (30268,)
Val data shape: (9343, 25) (9343,)
CpuTemperature_vMax 0.15908259766317237
Fold: 1 ACC: 0.5416889650005352 F1: 0.5574332339369023
--> Fold:  2
Data shape (30589, 249) (9022, 249)
Train data shape: (30589, 25) (30589,)
Val data shape: (9022, 25) (9022,)
CpuTemperature_vMax 0.12135978417707273
Fold: 2 ACC: 0.7541565063178896 F1: 0.8119060618737197
--> Fold:  3
Data shape (29168, 249) (10443, 249)
Train data shape: (29168, 25) (29168,)
Val data shape: (10443, 25) (10443,)
CpuTemperature_vMax 0.1405972957500686
Fold: 3 ACC: 0.5982955089533659 F1: 0.6037204051990874

Avg ACC: 0.6550588959056779 Avg F1: 0.6984807548355585
CPU times: user 1min 27s, sys: 3.85 s, total: 1min 31s
Wall time: 1min

In [23]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train_df.fillna(method='backfill', inplace=True)
    X_val_df.fillna(method='backfill', inplace=True)

    X_train_df.fillna(-1, inplace=True)
    X_val_df.fillna(-1, inplace=True)

    X_train, y_train = create_datasets(X_train_df, 6)
    X_val, y_val = create_datasets(X_val_df, 6)
    
    pca_model = PCA(n_components=25)
    
    X_train = pca_model.fit_transform(X_train)
    X_val = pca_model.transform(X_val)

    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = RandomForestClassifier(verbose=False)
   #model = DecisionTreeClassifier()

    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)
    

--> Fold:  0
Data shape (28807, 249) (10804, 249)
Train data shape: (28547, 25) (28547,)
Val data shape: (10714, 25) (10714,)
CpuTemperature_vMax 0.1703253029545021
Fold: 0 ACC: 0.6137763673697966 F1: 0.7039880353836576
--> Fold:  1
Data shape (29551, 249) (10060, 249)
Train data shape: (29291, 25) (29291,)
Val data shape: (9970, 25) (9970,)
CpuTemperature_vMax 0.1714400085224247
Fold: 1 ACC: 0.5859578736208626 F1: 0.6190758884396173
--> Fold:  2
Data shape (29882, 249) (9729, 249)
Train data shape: (29617, 25) (29617,)
Val data shape: (9644, 25) (9644,)
CpuTemperature_vMax 0.17594512444720448
Fold: 2 ACC: 0.6492119452509332 F1: 0.7157423271790501
--> Fold:  3
Data shape (30593, 249) (9018, 249)
Train data shape: (30328, 25) (30328,)
Val data shape: (8933, 25) (8933,)
CpuTemperature_vMax 0.2005988998462898
Fold: 3 ACC: 0.5428187618941005 F1: 0.5237812127050846

Avg ACC: 0.5979412370339232 Avg F1: 0.6406468659268524
CPU times: user 1min 51s, sys: 8.43 s, total: 1min 59s
Wall time: 1min 

In [24]:
model.feature_importances_

array([0.2005989 , 0.11623063, 0.05952911, 0.01400047, 0.00873145,
       0.00798868, 0.00742386, 0.007848  , 0.14301671, 0.0827029 ,
       0.00675496, 0.00686895, 0.00660099, 0.00772584, 0.00867093,
       0.00798131, 0.02684637, 0.01697222, 0.06401833, 0.01275705,
       0.01604014, 0.01765742, 0.05861771, 0.03852634, 0.05589073])

In [26]:
np.argmax(model.feature_importances_)

0

In [27]:
    model = RandomForestClassifier(verbose=False)
    model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=False, warm_start=False)

In [28]:
y_train

array([0, 0, 0, ..., 7, 7, 7], dtype=int64)

In [29]:
window_size = 6
cv_fold = 2

In [30]:
%%time
X_train_df = train_df.copy()

X_train, y_train= create_datasets(X_train_df, window_size)

X_train = X_train.astype(np.int64) 
   
print (X_train.shape, y_train.shape)        
print("---------------------------")

print('Done.')

(39261, 1482) (39261,)
---------------------------
Done.
CPU times: user 738 ms, sys: 22 ms, total: 760 ms
Wall time: 765 ms


In [32]:
%%time
# Grid Search for Algorithm Tuning
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 5)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 9, num = 5)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
grid = {'n_estimators': n_estimators, 'max_depth': max_depth,
        'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
        }
# only two cores are used
gsc = GridSearchCV(estimator=model, cv=2, param_grid=grid, n_jobs=6)
gsc.fit(X=X_train, y=y_train[:,])

sorted(gsc.cv_results_.keys())
print(gsc)
# summarize the results of the grid search
print("Best Score :", gsc.best_score_)

GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [31]:
%%time

acc_sum = 0
f1_sum = 0

cv = StratifiedKFold(n_splits=4, shuffle=True)

for fold, (training_indices, validation_indices) in enumerate(cv.split(run_df['runId'], run_df['class'])):
    print ("--> Fold: ", fold)
    
    training_runIds = run_df.loc[training_indices]['runId']
    validation_runIds = run_df.loc[validation_indices]['runId']
    
    X_train_df = train_df[train_df['runId'].isin(training_runIds)].copy()
    X_val_df = train_df[train_df['runId'].isin(validation_runIds)].copy()

    X_train_df.fillna(method='backfill', inplace=True)
    X_val_df.fillna(method='backfill', inplace=True)

    X_train_df.fillna(-1, inplace=True)
    X_val_df.fillna(-1, inplace=True)

    X_train, y_train = create_datasets(X_train_df, 6)
    X_val, y_val = create_datasets(X_val_df, 6)
    
    pca_model = PCA(n_components=25)
    
    X_train = pca_model.fit_transform(X_train)
    X_val = pca_model.transform(X_val)

    
    print ("Data shape", X_train_df.shape, X_val_df.shape)
    print ("Train data shape:", X_train.shape, y_train.shape)
    print ("Val data shape:", X_val.shape, y_val.shape)
    
    model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False)
   #model = DecisionTreeClassifier()

    model.fit(X_train, y_train)
    
    m = np.argmax(model.feature_importances_)
    print (sensor_list[m], model.feature_importances_[m])
    
    pred = model.predict(X_val)
    
    acc_val = accuracy_score(pred, y_val)
    f1_val = f1_score(pred, y_val, average='weighted')
    
    acc_sum += acc_val
    f1_sum += f1_val
    
    print ("Fold:", fold, "ACC:", acc_val, "F1:", f1_val)

print ()
print ("Avg ACC:", acc_sum / 4.0, "Avg F1:", f1_sum / 4.0)
    

--> Fold:  0
Data shape (28090, 249) (11521, 249)
Train data shape: (27830, 25) (27830,)
Val data shape: (11431, 25) (11431,)
CpuTemperature_vMax 0.1735397662754937
Fold: 0 ACC: 0.5006561105765025 F1: 0.499772137299689
--> Fold:  1
Data shape (28843, 249) (10768, 249)
Train data shape: (28583, 25) (28583,)
Val data shape: (10678, 25) (10678,)
CpuTemperature_vMax 0.20267824232162873
Fold: 1 ACC: 0.7000374601985391 F1: 0.7208710489162714
--> Fold:  2
Data shape (30598, 249) (9013, 249)
Train data shape: (30333, 25) (30333,)
Val data shape: (8928, 25) (8928,)
CpuTemperature_vMax 0.17036789674097721
Fold: 2 ACC: 0.6681227598566308 F1: 0.7517341512009674
--> Fold:  3
Data shape (31302, 249) (8309, 249)
Train data shape: (31037, 25) (31037,)
Val data shape: (8224, 25) (8224,)
CpuTemperature_vMax 0.17455097822975923
Fold: 3 ACC: 0.5278453307392996 F1: 0.6079381436339424

Avg ACC: 0.599165415342743 Avg F1: 0.6450788702627175
CPU times: user 1min 53s, sys: 8.2 s, total: 2min 1s
Wall time: 1min 