In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import metrics
import os, sys
from pathlib import Path
# Absolute path to the project root on the author's machine; edit it before running elsewhere.
BASE_DIR = "/Users/tomxu/Documents/nasa_ppmx/"
# Switch the working directory to the project root. Presumably this is what makes the
# local `phm08ds` package importable by the import that follows — TODO confirm.
os.chdir(BASE_DIR)
from phm08ds.models import experiment

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual


## Load Dataset

In [4]:
# Directory that holds the data files, resolved under the project root.
folderpath = os.path.join(BASE_DIR, 'phm08ds/data/files/')
# Read the preprocessed dataset and preview its first rows.
data_completed = pd.read_csv(os.path.join(folderpath, 'data_preprocessed.csv'))
data_completed.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_0,Sensor_1,Sensor_2,Sensor_3,Sensor_4,...,Sensor_13,Sensor_14,Sensor_15,Sensor_16,Sensor_17,Sensor_18,Sensor_19,Sensor_20,Health_state,Operational_condition
0,1,1,10.0047,0.2501,20.0,489.05,604.13,1499.45,1309.95,10.52,...,8120.83,8.6216,0.03,368,2319,100.0,28.58,17.1735,1,1
1,1,2,0.0015,0.0003,100.0,518.67,642.13,1584.55,1403.96,14.62,...,8132.87,8.3907,0.03,391,2388,100.0,38.99,23.3619,1,3
2,1,3,34.9986,0.8401,60.0,449.44,555.42,1368.17,1122.49,5.48,...,8063.84,9.3557,0.02,334,2223,100.0,14.83,8.8555,1,4
3,1,4,20.0031,0.7005,0.0,491.19,607.03,1488.44,1249.18,9.35,...,8052.3,9.2231,0.02,364,2324,100.0,24.42,14.7832,1,2
4,1,5,42.0041,0.8405,40.0,445.0,549.52,1354.48,1124.32,3.91,...,8083.67,9.2986,0.02,330,2212,100.0,10.99,6.4025,1,4


In [5]:
# Feature-only copy of the dataset: both label columns are removed.
data_unlabel = data_completed.drop(columns=['Health_state', 'Operational_condition'])

In [17]:
from phm08ds.features.feature_selection import SelectSensors
# Keep only the listed sensor indices. This list is the complement (within 0..20) of the
# indices passed to RemoveSensor elsewhere in this notebook, i.e. the same twelve sensors.
tf_select_sensors = SelectSensors(kind='custom', sensors=[1,2,3,6,8,10,11,12,13,14,19,20])
# NOTE(review): despite the `_op_1` suffix, the transformer is fit on all of `data_completed`;
# no filtering by operational condition is visible in this cell — confirm the name is intended.
data_op_1 = tf_select_sensors.fit_transform(data_completed)
data_op_1.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_1,Sensor_2,Sensor_3,Sensor_6,Sensor_8,Sensor_10,Sensor_11,Sensor_12,Sensor_13,Sensor_14,Sensor_19,Sensor_20,Operational_condition,Health_state
0,1,1,10.0047,0.2501,20.0,604.13,1499.45,1309.95,394.88,8770.2,45.4,372.15,2388.13,8120.83,8.6216,28.58,17.1735,1,1
1,1,2,0.0015,0.0003,100.0,642.13,1584.55,1403.96,553.67,9045.76,47.29,521.81,2388.15,8132.87,8.3907,38.99,23.3619,3,1
2,1,3,34.9986,0.8401,60.0,555.42,1368.17,1122.49,194.93,8343.91,41.92,183.26,2387.95,8063.84,9.3557,14.83,8.8555,4,1
3,1,4,20.0031,0.7005,0.0,607.03,1488.44,1249.18,334.82,8721.53,44.26,314.84,2388.07,8052.3,9.2231,24.42,14.7832,2,1
4,1,5,42.0041,0.8405,40.0,549.52,1354.48,1124.32,138.24,8314.56,41.79,130.44,2387.89,8083.67,9.2986,10.99,6.4025,4,1


## Data preprocessing

Use the pipeline and mlp

In [6]:
from phm08ds.data.preprocessing import OperationalCondition
# Work on features only: drop both label columns.
data_unlabel = data_completed.drop(labels=['Health_state', 'Operational_condition'], axis=1)
tf_op_cond = OperationalCondition()
# NOTE(review): only the row labelled 0 is passed here, and `op_cond` is not read again in
# the visible cells — confirm this call is still needed.
op_cond = tf_op_cond.fit_transform(data_unlabel.loc[0])

from phm08ds.features.feature_selection import RemoveSensor
tf_select_sensor = RemoveSensor(sensors=[0,4,5,7,9,15,16,17,18])
# Drop the first five columns by position (presumably unit, time_step and the three
# operational settings — verify against RemoveSensor's output).
data_almost = tf_select_sensor.fit_transform(data_unlabel).iloc[:,5:]

from sklearn.preprocessing import StandardScaler
# Keep the fitted scaler and the scaled matrix under separate names. The previous chained
# assignment bound `tf_std` to the transformed array and threw the fitted scaler away.
tf_std = StandardScaler()
data_mlp = tf_std.fit_transform(data_almost)

In [None]:
# NOTE: `mlp_clf` is only assigned in later cells of this notebook, so on a fresh kernel
# this cell raises NameError unless one of those cells has been run first.
# Takes the last class column of the predicted probabilities for every row of `data_mlp`.
mlp_clf.predict_proba(data_mlp)[:,-1]

In [None]:
import joblib

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load classifier files that come from a trusted source.
classifiers = joblib.load(BASE_DIR + 'notebooks/E08_PHM08-train_MLP/classifiers.pkl')
# Pick the entry stored under the 'MLP' key of the loaded dictionary.
mlp_clf = classifiers['MLP']

def plot_series(unit, sensor):
    """Plot one sensor over time for one unit and print an MLP class probability.

    Parameters
    ----------
    unit : int
        Value matched against the ``unit`` column of ``data_completed``.
    sensor : int
        Sensor index; the plotted column is ``'Sensor_<sensor>'``.

    Notes
    -----
    Prints the probability ``mlp_clf`` assigns to class column 2 for the row at
    position 220 of the unit's standardized features. When the unit has fewer
    than 221 rows the last available row is used instead (the original code
    raised IndexError in that case). Nothing is returned; if no rows match
    ``unit`` a message is printed and no figure is drawn.
    """
    from phm08ds.features.feature_selection import RemoveSensor
    from sklearn.preprocessing import StandardScaler

    probe_row = 220    # row position whose prediction is printed
    probe_class = 2    # column of predict_proba that is printed

    unit_rows = data_completed.loc[data_completed['unit'] == unit, :]
    if unit_rows.empty:
        # Fitting the transformers on zero rows would fail; report and stop instead.
        print('No rows found for unit {}'.format(unit))
        return

    unit_features = unit_rows.drop(labels=['Health_state', 'Operational_condition'], axis=1)
    sensor_filter = RemoveSensor(sensors=[0,4,5,7,9,15,16,17,18])
    # Positional slice drops the first five columns that remain after sensor removal.
    kept_features = sensor_filter.fit_transform(unit_features).iloc[:,5:]

    # NOTE(review): the scaler is refit on this unit's rows only. If the classifier was
    # trained on features standardized over the whole dataset, these inputs are on a
    # different scale — confirm this is intended.
    features_std = StandardScaler().fit_transform(kept_features)

    plt.figure(figsize=(30*0.39, 10*0.39))
    sns.lineplot(x='time_step', y='Sensor_' + str(sensor), data=unit_rows, hue='Health_state', palette='Wistia')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Clamp the probe position so units with fewer rows do not raise IndexError.
    row = min(probe_row, len(features_std) - 1)
    if row != probe_row:
        print('Unit {} has only {} rows; using row {} instead of {}'.format(
            unit, len(features_std), row, probe_row))
    print(mlp_clf.predict_proba(features_std[row,:].reshape(1,-1))[0,probe_class])
    
# Build integer sliders from the (min, max, step) tuples — unit 1..100, sensor 1..20 —
# and call plot_series with the selected values.
interact(plot_series, unit=(1,100,1), sensor=(1,20,1))

Select the sensors that I prefer to work with.

Before feeding to the classifier, let's remove unwanted information, such as unit, time_step and operational settings.

In [18]:
from phm08ds.features.feature_selection import RemoveInfo
tf_remove_info = RemoveInfo()
# The preview below shows the selected sensor columns plus Health_state remaining.
data_with_features = tf_remove_info.fit_transform(data_op_1)
data_with_features.head()

Unnamed: 0,Sensor_1,Sensor_2,Sensor_3,Sensor_6,Sensor_8,Sensor_10,Sensor_11,Sensor_12,Sensor_13,Sensor_14,Sensor_19,Sensor_20,Health_state
0,604.13,1499.45,1309.95,394.88,8770.2,45.4,372.15,2388.13,8120.83,8.6216,28.58,17.1735,1
1,642.13,1584.55,1403.96,553.67,9045.76,47.29,521.81,2388.15,8132.87,8.3907,38.99,23.3619,1
2,555.42,1368.17,1122.49,194.93,8343.91,41.92,183.26,2387.95,8063.84,9.3557,14.83,8.8555,1
3,607.03,1488.44,1249.18,334.82,8721.53,44.26,314.84,2388.07,8052.3,9.2231,24.42,14.7832,1
4,549.52,1354.48,1124.32,138.24,8314.56,41.79,130.44,2387.89,8083.67,9.2986,10.99,6.4025,1


## Create a new feature set based on HOS

We need to normalize our data. Let's use Z-score standardization.

In [19]:
from sklearn.preprocessing import StandardScaler
# Z-score every feature column; Health_state is the label, so it is left out of the fit.
tf_std_scaller = StandardScaler()
data_with_features_std = tf_std_scaller.fit_transform(
    data_with_features.drop(columns='Health_state')
)
data_with_features_std

array([[ 0.65613257,  0.7480038 ,  0.87228043, ..., -0.94547527,
         0.78791419,  0.79220892],
       [ 1.67242914,  1.54867518,  1.65864837, ..., -1.25259429,
         1.83904831,  1.83369583],
       [-0.64659916, -0.48715647, -0.69577121, ...,  0.03094771,
        -0.60047142, -0.60768251],
       ...,
       [-0.79369471, -0.57145747, -0.46440287, ...,  0.19986982,
        -1.05081322, -1.05496448],
       [-0.61637771, -0.37538237, -0.5033825 , ...,  0.11806896,
        -0.61359797, -0.63787493],
       [-1.12693301, -1.36704821, -1.14512432, ...,  2.18450507,
        -0.66509446, -0.6654924 ]])

In [20]:
# Target vector: the Health_state column copied into a standalone NumPy array.
labels_op_1 = data_with_features['Health_state'].to_numpy(copy=True)
labels_op_1

array([1, 1, 1, ..., 4, 4, 4])

# Classification steps

## Load Experiment model

In [21]:
from phm08ds.models import experiment

## Define classifiers and their specifications

In [13]:
from sklearn.neural_network import MLPClassifier
import joblib

In [14]:
# MLP with one hidden layer of 50 tanh units, optimised with Adam.
# random_state is pinned so that the estimator's random initialisation is repeatable
# from one run of the notebook to the next.
# NOTE(review): warm_start=True makes each fit() continue from the previous fit's solution.
# If experiment.run_classifiers refits this same instance on every fold, the folds are not
# independent — confirm that it clones the estimator per fold.
mlp_clf = MLPClassifier(hidden_layer_sizes=(50,), activation='tanh', solver='adam', tol=1e-8,
                        warm_start=True, random_state=42)

## Put all clf in a dictionary:

In [15]:
# Map each classifier's display name to its estimator.
classifiers = dict(MLP=mlp_clf)

## Train Classifiers and test them

Stratified k-fold cross-validation is used for model selection.

In [22]:
# Number of folds handed to the experiment runner.
kfolds = 10
# Later cells read clf_outputs['train'|'test']['true'|'pred'] from this result.
clf_outputs = experiment.run_classifiers(data_with_features_std, labels_op_1, classifiers, kfolds)



## Performance assessment

Saving variables in a dictionary:

In [23]:
# Score the train and test predictions with the same call. The leading 4 is passed through
# unchanged (presumably the number of health-state classes — TODO confirm).
results = {
    split: experiment.results_clf(4, clf_outputs[split]['true'], clf_outputs[split]['pred'])
    for split in ('train', 'test')
}

## Save results, models and pipeline to a .pkl file

In [24]:
from sklearn.pipeline import Pipeline
# Chain the already-fitted transformers in the order they were applied above.
# NOTE(review): tf_std_scaller was fit on the features with Health_state dropped, while the
# RemoveInfo preview earlier in this notebook still shows a Health_state column. Confirm
# that the scaler step receives the same columns when this pipeline is reused.
data_preprocessing = Pipeline([('select_sensors', tf_select_sensors),
                               ('remove_info', tf_remove_info),
                               ('std_scaler', tf_std_scaller)
                              ])

In [25]:
# Persist the classifier dictionary and the preprocessing pipeline via the project's
# experiment helpers (output location is decided inside those helpers).
experiment.save_models(classifiers)
experiment.save_pipeline(data_preprocessing)

## Save results to CSVs and figures

In [26]:
# Export the test and train result sets, tagging each with its split name.
# The recorded output shows a directory path and the classifier name printed per call.
experiment.export_results(results['test'], 'test')
experiment.export_results(results['train'], 'train')

/Users/tomxu/Documents/nasa_ppmx
MLP
/Users/tomxu/Documents/nasa_ppmx
MLP
